# DATA PREPARATION
---
---

## Import Necessary Libraries

In [None]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

## STEP 1 : Load ALL XML files and store in a list

In [None]:
# glob() returns matches in arbitrary order; sorting keeps the file order (and
# therefore positional lookups into the parsed results) stable across runs.
xml_list = sorted(glob("C:/Users/06533V744/Desktop/ML/Udemy Courses/5. Yolo Object Detection/Image Prediction/data_images/*.xml"))

In [None]:
xml_list

## Start Data Cleaning

In [None]:
# Normalise Windows path separators so every path uses forward slashes.
xml_list = [path.replace("\\", "/") for path in xml_list]

In [None]:
xml_list

## STEP 2: Load XML file - Sample Check

In [None]:
tree = et.parse('C:/Users/06533V744/Desktop/ML/Udemy Courses/5. Yolo Object Detection/Image Prediction/data_images/000001.xml')
root = tree.getroot()

In [None]:
root

In [None]:
# Extract File Name

image_name = root.find("filename").text
image_name

In [None]:
# Width and Height of the Image File

width = root.find('size').find('width').text
height = root.find('size').find('height').text

[width, height]

In [None]:
# our first image has 3 objects
# FIRST OBJECT : find the name, xmin, ymin, xmax, ymax

objs = root.findall('object')

obj = objs[0]
obj

In [None]:
name = obj.find('name').text
bndbox = obj.find('bndbox')

xmin = bndbox.find('xmin').text
ymin = bndbox.find('ymin').text
xmax = bndbox.find('xmax').text
ymax = bndbox.find('ymax').text

[name, xmin, ymin, xmax, ymax]

In [None]:
# ALL THE OBJECTS : find the name, xmin, ymin, xmax, ymax

# Print the class name and bounding-box corners of every annotated object.
for obj in objs:
    name = obj.find('name').text
    bndbox = obj.find('bndbox')

    # Read the four corner values in the order they are printed.
    xmin, ymin, xmax, ymax = (
        bndbox.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')
    )

    print([name, xmin, ymin, xmax, ymax])

## STEP 2: Load XML file - Detailed Version

In [None]:
'''

NOTE: Time to combine all above things together

'''

In [None]:
# Extract information like image_name, width, height, name, xmin, ymin, xmax, ymax


# Image-level fields: these are repeated on every object row of this file.
image_name = root.find("filename").text
width = root.find('size').find('width').text
height = root.find('size').find('height').text

# One row per annotated object: image fields, class name, then box corners.
parser = []
for obj in objs:
    name = obj.find('name').text
    bndbox = obj.find('bndbox')

    xmin, ymin, xmax, ymax = (
        bndbox.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')
    )

    parser.append([image_name, width, height, name, xmin, ymin, xmax, ymax])

parser

## Function for this extraction task

In [None]:
### Create a Function for this extraction task

def extract_text(filename):
    """Parse one annotation XML file into a list of object rows.

    Args:
        filename: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A list with one entry per ``<object>`` element, each of the form
        ``[image_name, width, height, name, xmin, ymin, xmax, ymax]``.
        Every value is the element's text, i.e. a string. The list is empty
        when the file contains no ``<object>`` elements.
    """
    root = et.parse(filename).getroot()

    # Image-level fields shared by every row produced from this file.
    size = root.find('size')
    image_fields = [
        root.find("filename").text,
        size.find('width').text,
        size.find('height').text,
    ]

    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    rows = []
    for annotation in root.findall('object'):
        label = annotation.find('name').text
        box = annotation.find('bndbox')
        corners = [box.find(tag).text for tag in corner_tags]
        rows.append(image_fields + [label] + corners)

    return rows



In [None]:
# Parse every annotation file; each entry is the list of rows for one file.
parser_all = [extract_text(xml_path) for xml_path in xml_list]

In [None]:
'''
parser_all = []
for i in xml_list:
    
    parser_all.append(extract_text(i))
    
'''

In [None]:
len(parser_all)

In [None]:
parser_all[4500]

## FLATTEN the Data

In [None]:


# Flatten the per-file lists of rows into one list of rows. A single pass
# avoids re-copying the accumulated list on every step (as repeated x + y
# does) and yields [] instead of raising when parser_all is empty.
data = [row for rows in parser_all for row in rows]
data

## Prepare our DataFrame

In [None]:
df = pd.DataFrame(data, columns = ['file_name', 'width', 'height', 'name', 'xmin', 'ymin', 'xmax', 'ymax'])

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df['name'].value_counts()

## STEP 3 : Find out center_x, center_y, w, h

In [None]:
df.info()

In [None]:
# Conclusion : we need to convert these object-typed columns to numeric

# TYPE CONVERSION


df.columns

In [None]:
cols = ['width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

In [None]:
# center_x & center_y

df['center_x'] = ((df['xmin'] +df['xmax']) /2) /df['width']
df['center_y'] = ((df['ymin'] +df['ymax']) /2) /df['height']


# w & h

df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']


In [None]:
df.head()

## STEP 4 : Split data in train and test

In [None]:
# check how many images 

images = df['file_name'].unique()
len(images)

In [None]:
# 80% train 20% Test

image_df = pd.DataFrame(images, columns=['file_name'])

# Shuffle and pick 80% of the images for training. No random_state is passed
# to sample(), so the split differs from run to run.
image_train = tuple(image_df.sample(frac=0.8)['file_name'])

# Take the remaining 20% for testing. isin() compares the values directly
# instead of interpolating the repr of the whole training tuple into a query
# string that pandas then has to parse.
is_train = image_df['file_name'].isin(image_train)
image_test = tuple(image_df[~is_train]['file_name'])

In [None]:
len(image_train)

In [None]:
len(image_test)

In [None]:
# Select the annotation rows belonging to each split. isin() avoids building
# a query string from the full tuple of file names, and .copy() makes each
# frame independent of df so columns can be added to it later without
# pandas' SettingWithCopyWarning.
train_df = df[df['file_name'].isin(image_train)].copy()
test_df = df[df['file_name'].isin(image_test)].copy()

In [None]:
train_df.head()

In [None]:
test_df.head()

## STEP 5 : LABEL ENCODING

In [None]:
# converting name to ids

# Class name -> integer id. Built once here rather than inside the function,
# which is applied once per DataFrame row.
_LABEL_IDS = {
    'person': 0,
    'car': 1,
    'chair': 2,
    'bottle': 3,
    'pottedplant': 4,
    'bird': 5,
    'dog': 6,
    'sofa': 7,
    'bicycle': 8,
    'horse': 9,
    'boat': 10,
    'motorbike': 11,
    'cat': 12,
    'tvmonitor': 13,
    'cow': 14,
    'sheep': 15,
    'aeroplane': 16,
    'train': 17,
    'diningtable': 18,
    'bus': 19,
}


def Label_Encoding(x):
    """Return the integer class id for the object class name ``x``.

    Args:
        x: Class name string, e.g. ``'person'`` or ``'bus'``.

    Returns:
        The id (0-19) assigned to that class name.

    Raises:
        KeyError: If ``x`` is not one of the 20 known class names.
    """
    return _LABEL_IDS[x]

In [None]:
# assign() returns a new frame with the extra column rather than writing into
# a frame that was sliced out of df, which can raise SettingWithCopyWarning.
train_df = train_df.assign(id=train_df['name'].apply(Label_Encoding))

test_df = test_df.assign(id=test_df['name'].apply(Label_Encoding))

In [None]:
train_df.head()

In [None]:
test_df.head()

## STEP 6 : Save Images and Labels in Text 

In [None]:
import os
from shutil import move

In [None]:
train_folder = 'C:/Users/06533V744/Desktop/ML/Udemy Courses/5. Yolo Object Detection/Image Prediction/data_images/train'
test_folder = 'C:/Users/06533V744/Desktop/ML/Udemy Courses/5. Yolo Object Detection/Image Prediction/data_images/test'

# exist_ok=True lets this cell be re-run after the folders already exist;
# os.mkdir would raise FileExistsError on the second run.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [None]:
cols = ['file_name', 'id', 'center_x', 'center_y', 'w', 'h']

groupby_obj_train = train_df[cols].groupby('file_name')
groupby_obj_test = test_df[cols].groupby('file_name')

In [None]:
def save_data(file_name, folder_path, group_obj,
              src_folder='C:/Users/06533V744/Desktop/ML/Udemy Courses/5. Yolo Object Detection/Image Prediction/data_images/'):
    """Move one image into ``folder_path`` and write its label file beside it.

    Args:
        file_name: Image file name; also the key passed to
            ``group_obj.get_group``.
        folder_path: Destination folder for the image and its ``.txt`` file.
        group_obj: Grouped label rows (a pandas GroupBy keyed on
            ``file_name``) whose groups still contain the ``file_name``
            column.
        src_folder: Folder the image is moved from. Defaults to the dataset
            folder that was previously hard-coded in this function.

    The label file is named after the image with a ``.txt`` extension and
    holds one space-separated line per label row, containing every column
    except ``file_name``, with no header and no index.
    """
    # Move Image
    src = os.path.join(src_folder, file_name)
    dst = os.path.join(folder_path, file_name)
    move(src, dst)

    # Save the labels
    text_filename = os.path.join(folder_path, os.path.splitext(file_name)[0] + '.txt')

    # Drop the key column explicitly so only the label values are written.
    labels = group_obj.get_group(file_name).drop(columns='file_name')
    labels.to_csv(text_filename, sep=' ', index=False, header=False)
    

In [None]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [None]:
filename_series.apply(save_data, args = (train_folder , groupby_obj_train ))

In [None]:
filename_series = pd.Series(groupby_obj_test.groups.keys())
filename_series.apply(save_data, args = (test_folder , groupby_obj_test ))

In [None]:
df['name'].unique()