### Uploading labels from xmls for the images from the dataset.

In [1]:
import csv
import cv2
import glob
import os
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import re

In [2]:
xmls="./oilannotation"
xml_files = glob.glob("{}/xmls/*".format(xmls))

In [3]:
alist=[]
def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    return [ atoi(c) for c in re.split(r'(\d+)', text) ]

alist=xml_files
alist.sort(key=natural_keys)

### Reading data from xmls and converting into a dataframe

In [4]:
heights=[]
widths=[]
xmins=[]
ymins=[]
xmaxs=[]
ymaxs=[]
anomalies=[]
outputs=[]
image_names = []

for i, xml_file in enumerate(alist):
        tree = ET.parse(xml_file)
            #print(xml_file)
        
        image_name = tree.findtext("filename")
        height = int(tree.findtext("./size/height"))
        width = int(tree.findtext("./size/width"))
        xmin = int(tree.findtext("./object/bndbox/xmin"))
        ymin = int(tree.findtext("./object/bndbox/ymin"))
        xmax = int(tree.findtext("./object/bndbox/xmax"))
        ymax = int(tree.findtext("./object/bndbox/ymax"))
        anamoly = tree.findtext("./object/name")
        
        image_names.append(image_name)
        widths.append(width)
        xmins.append(xmin)
        xmaxs.append(xmax)
        ymins.append(ymin)
        ymaxs.append(ymax)
        heights.append(height)
        anomalies.append(anamoly)

In [5]:
#Converting lists to series
s_path = pd.Series(alist)
s_images=pd.Series(image_names)
s_widths=pd.Series(widths)
s_xmins=pd.Series(xmins)
s_xmaxs=pd.Series(xmaxs)
s_ymins=pd.Series(ymins)
s_ymaxs=pd.Series(ymaxs)
s_heights=pd.Series(heights)
s_anamoly=pd.Series(anomalies)


In [6]:
# Converting series to dataframe
df_path = pd.DataFrame(s_path)
df_image = pd.DataFrame(s_images)
df_width= pd.DataFrame(s_widths)
df_xmins= pd.DataFrame(s_xmins)
df_xmaxs= pd.DataFrame(s_xmaxs)
df_ymins= pd.DataFrame(s_ymins)
df_ymaxs= pd.DataFrame(s_ymaxs)
df_heights= pd.DataFrame(s_heights)
df_anamoly=pd.DataFrame(s_anamoly)

df_anamoly = pd.concat([df_path,df_image,df_width,df_heights,df_xmins, df_xmaxs, df_ymins,df_ymaxs,df_anamoly], axis=1)

In [7]:
df_anamoly.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8
0,./oilannotation/xmls\oil_spill_1.xml,oil_spill_1.jpg,600,476,23,600,58,435,anomaly
1,./oilannotation/xmls\oil_spill_2.xml,oil_spill_2.jpg,669,700,102,669,101,573,anomaly
2,./oilannotation/xmls\oil_spill_3.xml,oil_spill_3.jpg,470,236,1,470,90,179,anomaly
3,./oilannotation/xmls\oil_spill_4.xml,oil_spill_4.jpg,284,216,82,155,68,192,anomaly
4,./oilannotation/xmls\oil_spill_5.xml,oil_spill_5.jpg,250,288,129,250,39,139,anomaly


In [8]:
df_anamoly.columns = ['path','image','width','height','xmins','xmaxs','ymins','ymaxs','classtype']

In [9]:
df_anamoly['class'] = 1 # as the data extracted from the xmls is only anamoly data.

In [10]:
df_anamoly.shape

(52, 10)

In [11]:
# Extracting data which is not an anamoly
n_files = []
n_height = []
n_width = []
for file in glob.glob("./oilspill/not_oil_spill_*"):
    #print(file)
    try:
        unscaled = cv2.imread(file)
        image_height,image_width, _= unscaled.shape
        n_height.append(image_height)
        n_width.append(image_width)
        files = file.split("\\")
        n_files.append(files[1])
    except AttributeError:
        pass

In [12]:
len(n_files)

33

In [13]:
#Converting lists to series
s_images=pd.Series(n_files)
s_heights = pd.Series(n_height)
s_widths = pd.Series(n_width)
df_image_n= pd.DataFrame(s_images,columns = ['image'])
df_height_n= pd.DataFrame(s_heights,columns = ['height'])
df_width_n= pd.DataFrame(s_widths,columns = ['width'])
df_temp = pd.concat([df_image_n,df_height_n,df_width_n],axis=1)

In [14]:
df_temp

Unnamed: 0,image,height,width
0,not_oil_spill_1.jpg,1005,971
1,not_oil_spill_10.jpg,530,1033
2,not_oil_spill_11.jpg,262,192
3,not_oil_spill_12.jpg,183,275
4,not_oil_spill_13.jpg,182,277
5,not_oil_spill_14.jpg,501,680
6,not_oil_spill_15.jpg,699,820
7,not_oil_spill_16.jpg,540,540
8,not_oil_spill_17.jpg,820,800
9,not_oil_spill_18.jpg,745,680


In [15]:
df_train = pd.concat([df_anamoly,df_temp])
df_train.shape

(85, 10)

In [16]:
df_train.tail()

Unnamed: 0,path,image,width,height,xmins,xmaxs,ymins,ymaxs,classtype,class
28,,not_oil_spill_5.jpg,267,189,,,,,,
29,,not_oil_spill_6.jpg,447,113,,,,,,
30,,not_oil_spill_7.jpg,237,213,,,,,,
31,,not_oil_spill_8.jpg,174,290,,,,,,
32,,not_oil_spill_9.jpg,201,251,,,,,,


### Treating missing values

In [17]:
df_train['xmins']=df_train['xmins'].fillna(1)
df_train['xmaxs']=df_train['xmaxs'].fillna(df_train['width'])
df_train['ymins']=df_train['ymins'].fillna(1)
df_train['ymaxs']=df_train['ymaxs'].fillna(df_train['height'])
df_train['class']=df_train['class'].fillna(0)
df_train['classtype'] = df_train['classtype'].fillna('normal')

In [18]:
df_train.tail()

Unnamed: 0,path,image,width,height,xmins,xmaxs,ymins,ymaxs,classtype,class
28,,not_oil_spill_5.jpg,267,189,1.0,267.0,1.0,189.0,normal,0.0
29,,not_oil_spill_6.jpg,447,113,1.0,447.0,1.0,113.0,normal,0.0
30,,not_oil_spill_7.jpg,237,213,1.0,237.0,1.0,213.0,normal,0.0
31,,not_oil_spill_8.jpg,174,290,1.0,174.0,1.0,290.0,normal,0.0
32,,not_oil_spill_9.jpg,201,251,1.0,201.0,1.0,251.0,normal,0.0


### Converting dataframe to csv

In [19]:
df_train.to_csv('./oilannotation/train_data_n.csv',index=True)