In [None]:
# Copy the dataset archive (train.tar.gz) from the RDD folder on Google Drive
# into the Colab working area (/content).
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# the mount step is not shown in this part of the notebook - confirm.

!cp -r '/content/drive/MyDrive/RDD/train.tar.gz' '/content/'

In [None]:
# Extract the archive; it unpacks into a train/ folder (see the paths listed
# in the output below).
# NOTE(review): the path is relative, so this presumably runs with /content as
# the working directory - confirm. The -v flag prints every extracted path,
# which is why the output gets truncated; -xf would keep the notebook smaller.

!tar -xvf  'train.tar.gz'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
train/Japan/annotations/xmls/Japan_009766.xml
train/Japan/annotations/xmls/Japan_006254.xml
train/Japan/annotations/xmls/Japan_008726.xml
train/Japan/annotations/xmls/Japan_004664.xml
train/Japan/annotations/xmls/Japan_007945.xml
train/Japan/annotations/xmls/Japan_006391.xml
train/Japan/annotations/xmls/Japan_005903.xml
train/Japan/annotations/xmls/Japan_008200.xml
train/Japan/annotations/xmls/Japan_003663.xml
train/Japan/annotations/xmls/Japan_001949.xml
train/Japan/annotations/xmls/Japan_011141.xml
train/Japan/annotations/xmls/Japan_003141.xml
train/Japan/annotations/xmls/Japan_002870.xml
train/Japan/annotations/xmls/Japan_008094.xml
train/Japan/annotations/xmls/Japan_007381.xml
train/Japan/annotations/xmls/Japan_012963.xml
train/Japan/annotations/xmls/Japan_007843.xml
train/Japan/annotations/xmls/Japan_004889.xml
train/Japan/annotations/xmls/Japan_011099.xml
train/Japan/annotations/xmls/Japan_003745.xml
train/Japan/ann

In [None]:
# importing important libraries
import numpy as np
import pandas as pd
import glob
import os
import cv2
from PIL import Image
import xml.etree.ElementTree as ET
import shutil

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the CSV that pairs every image path with its annotation XML path
# (columns: images, annotations) and preview the first rows.
# NOTE(review): the relative 'drive/...' path presumably relies on /content
# being the working directory - confirm.
final_dataset = pd.read_csv('drive/MyDrive/RDD/final_dataset.csv')
final_dataset.head()

Unnamed: 0,images,annotations
0,train/Czech/images/Czech_000006.jpg,train/Czech/annotations/xmls/Czech_000006.xml
1,train/Czech/images/Czech_000010.jpg,train/Czech/annotations/xmls/Czech_000010.xml
2,train/Czech/images/Czech_000020.jpg,train/Czech/annotations/xmls/Czech_000020.xml
3,train/Czech/images/Czech_000021.jpg,train/Czech/annotations/xmls/Czech_000021.xml
4,train/Czech/images/Czech_000022.jpg,train/Czech/annotations/xmls/Czech_000022.xml


In [None]:
# (rows, columns) of the loaded table: one row per image/annotation pair
final_dataset.shape

(12195, 2)

In [None]:
from sklearn.model_selection import train_test_split

# 80:20 train/test split of the (image, annotation) rows; random_state=42
# fixes the shuffle so re-running the cell yields the same split
train, test = train_test_split(final_dataset, test_size=0.2, random_state=42)
# sanity check: the two row counts should add up to the full dataset
print(train.shape)
print(test.shape)

(9756, 2)
(2439, 2)


In [None]:
def final_dataset_creator(dest_path, xml_list):
  '''
  Write a filtered copy of every annotation XML file into dest_path.

  Each copy keeps the file name of its source file, but only retains the
  <object> elements whose <name> is one of the four damage types
  D00, D10, D20 and D40. Objects with any other label (or with no <name>
  tag at all) are dropped; everything else in the file is left untouched.

  Input : dest_path (string) - directory that receives the new files; it is
          created when it does not exist yet, with or without trailing slash
          xml_list (list of strings) - paths of the source xml files
  '''
  allowed_classes = {"D00", "D10", "D20", "D40"}

  # make sure the output folder exists, otherwise tree.write() fails;
  # an empty dest_path means "current directory" and needs no creation
  if dest_path:
    os.makedirs(dest_path, exist_ok=True)

  for each_xml_file in xml_list:
    # parse the file using ElementTree module
    tree = ET.parse(each_xml_file)
    root = tree.getroot()

    # Element.remove() only works on direct children, so every object is
    # removed from its real parent instead of always from the root.
    # Both lists are snapshots because the tree is modified while looping.
    for parent in list(root.iter()):
      for obj in list(parent.findall('object')):
        # text inside the 'name' tag i.e. the class label (None if missing)
        if obj.findtext('name') not in allowed_classes:
          parent.remove(obj)

    # same file name as the source, placed inside the destination folder
    new_file_path = os.path.join(dest_path, os.path.basename(each_xml_file))
    tree.write(new_file_path, encoding="utf-8")

In [None]:
%%time

# Write the filtered annotation XMLs of the training split into
# data/images/train/
final_dataset_creator('data/images/train/', train['annotations'].tolist())

CPU times: user 2.44 s, sys: 720 ms, total: 3.16 s
Wall time: 3.18 s


In [None]:
%%time

# Write the filtered annotation XMLs of the test split into
# data/images/test/
final_dataset_creator('data/images/test/', test['annotations'].tolist())

CPU times: user 606 ms, sys: 172 ms, total: 778 ms
Wall time: 783 ms


In [None]:
def image_paster(src_path, dst_dir):
  '''
  Copy image files into the folder dst_dir, keeping their file names.

  Input : src_path (list of strings, or a single string) - path(s) of the
          image file(s) to copy
          dst_dir (string) - destination directory, created if it is missing
  '''
  # a lone path would otherwise be iterated character by character
  if isinstance(src_path, (str, os.PathLike)):
    src_path = [src_path]

  # without an existing directory shutil.copy either fails or treats dst_dir
  # as a file name and overwrites that single file once per image
  os.makedirs(dst_dir, exist_ok=True)

  for each_image in src_path:
    shutil.copy(each_image, dst_dir)

In [None]:
%%time

# Copy the training-split images into data/images/train/, the same folder
# that holds their filtered annotation XMLs
image_paster(train['images'].tolist(), 'data/images/train/')

CPU times: user 712 ms, sys: 3.11 s, total: 3.83 s
Wall time: 4.3 s


In [None]:
%%time

# Copy the test-split images into data/images/test/, the same folder that
# holds their filtered annotation XMLs
image_paster(test['images'].tolist(), 'data/images/test/')

CPU times: user 178 ms, sys: 733 ms, total: 911 ms
Wall time: 1.03 s


In [None]:
import tarfile

def make_tarfile(output_filename, source_dir):
  '''
  Create a gzip-compressed tar archive (tar.gz) of a folder.

  The folder is stored in the archive under its own name (the last component
  of source_dir), so extracting the archive recreates that single folder.

  Input : output_filename (string) - path of the tar.gz file to write
          source_dir (string) - folder to archive, with or without a
          trailing slash
  '''
  # Reference : https://stackoverflow.com/questions/2032403/how-to-create-full-compressed-tar-file-using-python
  # os.path.basename('data/') is '' which would drop the top-level folder
  # from the archive; normpath strips the trailing separator first
  top_level_name = os.path.basename(os.path.normpath(source_dir))
  with tarfile.open(output_filename, "w:gz") as tar:
    tar.add(source_dir, arcname=top_level_name)

In [None]:
# Bundle the prepared data/ folder (train and test images plus their filtered
# annotation XMLs) into data.tar.gz
make_tarfile('data.tar.gz', 'data')

In [None]:
# Copy the freshly created data.tar.gz into the RDD folder on Google Drive
!cp -r '/content/data.tar.gz' '/content/drive/MyDrive/RDD'

That's it for dataset preparation; next up is **Model Training**.