### Import Basic Libraries

In [1]:
import sys
import os
import numpy as np
import pandas as pd
from xml.etree import ElementTree 

In [2]:
xml_path = '../Dataset - Safety Helment/annotations'
img_path = '../Dataset - Safety Helment/Safety Helmat/images'

<b>Lets take a look on ".xml" file</b>

- In the below image we can observe, three main elements
    - filename : This defines the image name
    - size : This defines the image size
    - object : This defines a object inside the image
- object element has two main component
    - name : Defines object type (Helmet/Person/Head)
    - bndbox : Defines the object position in image

<img align="left" src="./sample_image.png" style="border-radius:6px">

In [5]:
# read & process xml files

object_x1 = []       #objects xmin coordinate
object_y1 = []       #objects ymin coordinate
object_x2 = []       #objects xmax coordinate
object_y2 = []       #objects ymax coordinate
labels = []          #objects label/type
file_title = []      #file title

for file in os.listdir(xml_path):
    if '.xml' in file:
        objects = ElementTree.parse(os.path.join(xml_path,file)).findall('object')
        object_x1.append([int(obj.find("bndbox").findtext("xmin")) for obj in objects])
        object_y1.append([int(obj.find("bndbox").findtext("ymin")) for obj in objects])
        object_x2.append([int(obj.find("bndbox").findtext("xmax")) for obj in objects])
        object_y2.append([int(obj.find("bndbox").findtext("ymax")) for obj in objects])
        labels.append([obj.findtext('name') for obj in objects])
        
        file_title.append(file[:-4])

In [6]:
object_dict = {
    'file_name' : file_title,
    'xmin' : object_x1,
    'ymin' : object_y1,
    'xmax' : object_x2,
    'ymax' : object_y2,
    'label' : labels
}

df = pd.DataFrame(object_dict)

df.head()

Unnamed: 0,file_name,xmin,ymin,xmax,ymax,label
0,hard_hat_workers2536,"[239, 102, 102]","[114, 67, 0]","[262, 148, 149]","[149, 121, 38]","[helmet, helmet, helmet]"
1,hard_hat_workers721,"[203, 223, 231, 251, 285, 304, 380, 370, 344]","[190, 196, 205, 184, 183, 164, 183, 210, 194]","[217, 237, 245, 265, 299, 324, 396, 382, 358]","[209, 214, 219, 204, 202, 188, 202, 225, 213]","[helmet, helmet, helmet, helmet, helmet, helme..."
2,hard_hat_workers3425,"[135, 226, 181, 134, 181]","[116, 172, 131, 0, 0]","[162, 251, 207, 162, 208]","[148, 204, 156, 23, 8]","[helmet, helmet, helmet, helmet, helmet]"
3,hard_hat_workers3947,"[113, 142, 184, 223, 269, 306, 335, 134]","[148, 138, 158, 134, 161, 165, 157, 143]","[132, 166, 216, 267, 309, 325, 394, 145]","[174, 176, 198, 187, 210, 192, 226, 173]","[helmet, helmet, helmet, helmet, helmet, helme..."
4,hard_hat_workers1934,"[204, 334]","[70, 69]","[332, 414]","[211, 211]","[helmet, helmet]"


In [7]:
# Save the dataframe for furture purpose

df.to_csv('./Dataset.csv',index=False)

### Extract Useful labels

In [8]:
import pickle

In [9]:
# define function for bounding box
# we will discard 'person' labels 

def getBoundingBox(df):
    
    boundingBoxDict = {}
    for i in range(df.shape[0]):
        label = []
        dimension = []
        boundingBox = {}
        
        dims = df.iloc[i][1:-1].values
        lbl = df.iloc[i][-1]
        for j in range(len(lbl)):
            if lbl[j] != 'person':
                #we will discard person labels
                bbox = [dim[j] for dim in dims]
                dimension.append(bbox)
                label.append(lbl[j])
                
        boundingBox['dimension'] = dimension
        boundingBox['label'] = label
        boundingBoxDict[df.iloc[i][0] + '.png'] = boundingBox
        
    return boundingBoxDict

In [10]:
boundingBoxDict = getBoundingBox(df)

boundingBoxDict['hard_hat_workers0.png']

{'dimension': [[357, 116, 404, 175],
  [4, 146, 39, 184],
  [253, 139, 275, 177],
  [300, 145, 323, 181],
  [116, 151, 138, 180],
  [80, 151, 100, 180],
  [62, 144, 83, 172],
  [322, 141, 345, 178],
  [175, 156, 194, 186],
  [222, 151, 240, 182],
  [200, 146, 216, 173],
  [98, 140, 112, 160],
  [157, 150, 175, 177]],
 'label': ['helmet',
  'helmet',
  'helmet',
  'helmet',
  'helmet',
  'helmet',
  'head',
  'head',
  'head',
  'head',
  'head',
  'helmet',
  'head']}

In [11]:
with open('./useful_labels.pkl','wb') as f:
    pickle.dump(boundingBoxDict,f)