# Convert XML annotations to Roboflow-compatible YOLO-format .txt annotations

In [None]:
!pip install pandas

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import xml.etree.ElementTree as ET 
import random
import shutil

In [None]:
# Build the path of the .xml annotation file.
# The file is expected at raw/dataset/annotations.xml under the current working directory.
# os.makedirs(..., exist_ok=True) is shell-independent and does not fail when the cell is re-run.
os.makedirs("raw", exist_ok=True)
path = os.path.join(os.getcwd(), "raw", "dataset", "annotations.xml")
print(path)

In [4]:
# Read every annotation file matched by `path` and collect one record per <image>
# element: its file name, pixel size, and the list of its <box> children.
dataset = []

for xml_file in glob.glob(path):
    xml_root = ET.parse(xml_file).getroot()

    for image_node in xml_root.iter("image"):
        image_info = image_node.attrib
        record = {
            "filename": image_info['name'],
            "width": float(image_info['width']),
            "height": float(image_info['height']),
            "boxes": [],
        }

        # Each box keeps its label, occlusion flag and corner coordinates
        # (top-left xtl/ytl, bottom-right xbr/ybr).
        for box_node in image_node.iter("box"):
            box_info = box_node.attrib
            record["boxes"].append({
                "label": box_info['label'],
                "occluded": int(box_info['occluded']),
                "xtl": float(box_info['xtl']),
                "ytl": float(box_info['ytl']),
                "xbr": float(box_info['xbr']),
                "ybr": float(box_info['ybr']),
            })

        dataset.append(record)

In [None]:
# One row per image record in `dataset`; the 'boxes' column holds that image's list of box dicts.
data0=pd.DataFrame(dataset)
display(data0)

In [None]:
# Flatten data0 into one row per bounding box, repeating the image's
# file name and size on every row.
#
# Rows are collected in a list and the DataFrame is built once: calling
# pd.concat inside the loop copies the whole frame on every box (quadratic),
# leaves every row with index 0, and makes the later column assignment fail
# when there are no boxes at all. Passing `columns=` up front handles the
# empty case and gives the rows a unique 0..n-1 index.
box_columns = ['filename', 'width', 'height', 'xtl', 'ytl', 'xbr', 'ybr']
box_rows = [
    (img.filename, img.width, img.height,
     box['xtl'], box['ytl'], box['xbr'], box['ybr'])
    for img in data0.itertuples(index=False)
    for box in img.boxes
]
data2 = pd.DataFrame(box_rows, columns=box_columns)
display(data2)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
data2.info()

In [8]:
# Add the five columns written to the label files, in this order:
# class id, box centre x/y and box width/height, each divided by the image size.
# Every box is assigned class id 0.
data2['label'] = 0

img_w = data2['width']
img_h = data2['height']
left, top, right, bottom = (data2[col] for col in ('xtl', 'ytl', 'xbr', 'ybr'))

data2['Xcent'] = (left + right) / (2 * img_w)
data2['Ycent'] = (top + bottom) / (2 * img_h)
data2['boxW'] = (right - left) / img_w
data2['boxH'] = (bottom - top) / img_h

In [None]:
# Save the extracted and normalised annotations as raw/annotation.csv
display(data2)
# The destination is to_csv's first parameter (path_or_buf). Passing a file name
# positionally and path_or_buf="raw" by keyword raises
# "TypeError: got multiple values for argument 'path_or_buf'",
# so the full target path is built explicitly instead.
data2.to_csv(os.path.join("raw", "annotation.csv"), index=False)

In [None]:
# Create raw/labels for the per-image .txt annotations.
# exist_ok=True keeps the cell re-runnable, and no shell is involved.
os.makedirs(os.path.join("raw", "labels"), exist_ok=True)

In [15]:
# Write one .txt annotation file per image into raw/labels.
# Each line is "<label> <Xcent> <Ycent> <boxW> <boxH>" for one box.
#
# Fixes over the previous version:
#  * files go to raw/labels (the directory that is created for them and that the
#    partition step reads from), not to a non-existent ./labels directory;
#  * the output columns are selected by name instead of by position (iloc[:, 7:]);
#  * the extension is removed with os.path.splitext, so it no longer has to be
#    exactly three characters long (e.g. ".jpeg").
label_dir = os.path.join("raw", "labels")
os.makedirs(label_dir, exist_ok=True)

label_columns = ['label', 'Xcent', 'Ycent', 'boxW', 'boxH']
# The class id is written as an integer, the four normalised values as floats.
fmt = ['%d'] + ['%f'] * (len(label_columns) - 1)

files = data2['filename'].unique().tolist()
for file in files:
    datai = data2.loc[data2['filename'] == file, label_columns]
    name = os.path.splitext(os.path.basename(file))[0]
    np.savetxt(os.path.join(label_dir, name + ".txt"), datai.to_numpy(), fmt=fmt, delimiter=" ")

# Partition dataset for YOLOv5

In [24]:
# Source locations: the images and their generated label files.
raw_root = os.path.join(os.getcwd(), "raw")
path0 = os.path.join(raw_root, "dataset", "images")
path0l = os.path.join(raw_root, "labels")

# Destination folders for the train / valid / test splits.
split_root = os.path.join(raw_root, "datasetsFinal")
path1a, path1b, path1c = (
    os.path.join(split_root, split_name) for split_name in ("train", "valid", "test")
)

In [None]:
# Gather the full path of every file found anywhere under the image folder.
paths0 = [
    os.path.join(folder, image_name)
    for folder, _, image_names in os.walk(path0)
    for image_name in image_names
]

# If using bash in windows, build the list with forward slashes instead:
# paths0 = [
#     os.path.join(folder, image_name).replace('\\', '/')
#     for folder, _, image_names in os.walk(path0)
#     for image_name in image_names
# ]

# Put the paths in random order (sampling all elements yields a shuffled copy).
paths0 = random.sample(paths0, len(paths0))
print(paths0[0:5])

In [None]:
# Create raw/datasetsFinal and its train / valid / test sub-folders.
# os.makedirs creates missing parents, does not fail when the cell is re-run
# (exist_ok=True), and is not affected by spaces in the working-directory path
# the way an unquoted shell `mkdir` is.
os.makedirs(os.path.join("raw", "datasetsFinal"), exist_ok=True)
for split_dir in (path1a, path1b, path1c):
    os.makedirs(split_dir, exist_ok=True)

In [29]:
# Partition dataset for train, valid and test.
# The first 3/5 of the shuffled images go to train, the next 1/5 to valid and
# the rest to test. Each image is copied together with its label file.
#
# Changes over the previous version:
#  * the three identical copy branches are collapsed into one;
#  * shutil.copy replaces the unquoted shell `cp`, which broke on paths with spaces;
#  * the label name is derived with os.path.basename/splitext, so it works with
#    either path separator and with dots inside the file name;
#  * the loop variable no longer overwrites the global `path` (the annotation file path);
#  * an image without a label file is copied on its own instead of triggering a copy error.
n_images = len(paths0)
train_end = n_images * 3 // 5
valid_end = n_images * 4 // 5

# shutil.copy would create a *file* named like the target if the folder were missing.
for split_dir in (path1a, path1b, path1c):
    os.makedirs(split_dir, exist_ok=True)

for i, image_path in enumerate(paths0):
    if i < train_end:
        target_dir = path1a
    elif i < valid_end:
        target_dir = path1b
    else:
        target_dir = path1c

    shutil.copy(image_path, target_dir)

    stem = os.path.splitext(os.path.basename(image_path))[0]
    label_path = os.path.join(path0l, stem + '.txt')
    # Label files only exist for images that had at least one box.
    if os.path.isfile(label_path):
        shutil.copy(label_path, target_dir)