In [60]:
import xml.etree.ElementTree as ET
import pandas as pd 
import os 
from collections import Counter
import plotly.figure_factory as ff
import numpy as np
import plotly.express as px


### Read XML files

In [61]:
# Folder that holds the labelled images together with their XML annotations.
xml_dir = "labeld_images_and_xml"
# Keep only files with the .xml extension. A plain substring test ("xml" in f)
# would also pick up any other file whose name merely contains "xml".
xmls_names = [f for f in os.listdir(xml_dir) if f.endswith(".xml")]

### Count labels in each file

In [62]:
# Build one Counter per XML file: text of every <name> element -> number of
# occurrences, plus a "file" entry recording which file the counts came from.
d = []
for xml_name in xmls_names:
    root = ET.parse(os.path.join(xml_dir, xml_name)).getroot()
    label_counts = Counter(node.text for node in root.iter('name'))
    label_counts["file"] = xml_name
    d.append(label_counts)

## Isolating numeric columns

In [63]:
# One row per XML file, one column per label (plus the "file" column).
df = pd.DataFrame(d)
# Select every numeric column. Matching on dtype == "float64" only works for
# labels that are missing from at least one file (NaN forces float); a label
# present in every file stays int64 and would be silently dropped.
group_labels = df.select_dtypes(include="number").columns.tolist()
group_labels

['freeFS',
 'drinking',
 'eating_24',
 'freeDC',
 'freeD',
 'freeFSN',
 'eating_g',
 'mushroom',
 'drinking_g',
 'drinking_b',
 'eating_l',
 'grouped']

## Labeling errors check 

In [64]:
# Inspect the column names: every distinct label text found in the XML files
# becomes a column, so misspelled labels show up here as extra columns.
df.columns

Index(['freeFS', 'file', 'drinking', 'eating_24', 'freeDC', 'freeD', 'freeFSN',
       'eating_g', 'mushroom', 'drinking_g', 'drinking_b', 'eating_l',
       'grouped'],
      dtype='object')

In [65]:
# Map each mistaken label (key) to the label that should replace it (value).
# NOTE(review): some replacement values are themselves keys of this mapping
# (e.g. "eating_e" -> "eating_s" -> "eating" -> "drinking") -- confirm that
# these chains are intended.
error_map = {
    "eating_e": "eating_s",
    "freeDc": "freeDC",
    "eating_2s": "eating_s",
    "\\": "freeDC",
    "EA": "eating_s",
    "eating": "drinking",
    "eating_g\\": "eating_g",
    "eating_d": "eating",
    "eating_s": "eating",
    "eating_b": "eating",
    "eating_2": "eating_24",
    "freeC": "freeD",
    "eating_sw": "eating",
    "gruoped": "grouped",
    "eating_on": "eating_in",
    "mushtoom": "mushroom",
}






In [66]:
# For every known mistake, collect the files whose annotations contain it.
errors_dict = {}
for bad_label in error_map:
    if bad_label not in df.columns:
        print(f"""error of type -- '{bad_label}' -- not exist any more\n""")
        continue
    # Rows with a non-null count are the files in which the label occurs.
    has_label = df[bad_label].notna()
    errors_dict[bad_label] = df.loc[has_label, "file"].tolist()

# Keep only the mistakes that still appear as a column of df.
error_map = {k: v for k, v in error_map.items() if k in errors_dict}
print(f"""'error_map get' updated to {error_map}\n""")
print(f"The following files contain lables mistakes {errors_dict}")

error of type -- 'eating_e' -- not exist any more

error of type -- 'freeDc' -- not exist any more

error of type -- 'eating_2s' -- not exist any more

error of type -- '\' -- not exist any more

error of type -- 'EA' -- not exist any more

error of type -- 'eating' -- not exist any more

error of type -- 'eating_g\' -- not exist any more

error of type -- 'eating_d' -- not exist any more

error of type -- 'eating_s' -- not exist any more

error of type -- 'eating_b' -- not exist any more

error of type -- 'eating_2' -- not exist any more

error of type -- 'freeC' -- not exist any more

error of type -- 'eating_sw' -- not exist any more

error of type -- 'gruoped' -- not exist any more

error of type -- 'eating_on' -- not exist any more

error of type -- 'mushtoom' -- not exist any more

'error_map get' updated to {}

The following files contain lables mistakes {}


In [67]:
# Walk through the affected files, replace each mistaken label with its
# correction, and write the XML back to the same path.
for error, fix in error_map.items():
    for file_name in errors_dict[error]:
        path = os.path.join(xml_dir, file_name)
        mytree = ET.parse(path)

        for name in mytree.getroot().iter('name'):
            if name.text != error:
                continue
            print(name.text)
            name.text = fix
        mytree.write(file_or_filename=path)

In [68]:
# Report XML files whose raw text contains none of the known labels.
for xml_name in xmls_names:
    path = os.path.join(xml_dir, xml_name)
    # Use a dedicated name for the handle so the loop variable is not
    # overwritten by a (closed) file object.
    with open(path) as xml_file:
        lines = xml_file.read()

    # Plain substring search over the file text; a single hit is enough.
    if not any(label in lines for label in group_labels):
        print(path)
    # Stop at the first path containing "test" (and show it).
    if "test" in path:
        print(path)
        break


In [69]:
# Display the value currently bound to `path` (it is assigned inside the
# file loops above, so this is whichever file was visited last).
path

'labeld_images_and_xml/time=20220520T075000Z_growing_day=11_camera=201.xml'

# Distribution plot 

In [70]:
import plotly.graph_objects as go

# One histogram trace per label column, all on the same figure.
fig = go.Figure(
    data=[go.Histogram(x=df[label], name=label) for label in group_labels]
)

# Draw the histograms on top of each other instead of side by side.
fig.update_layout(barmode="overlay")
# Make every trace semi-transparent so overlapping bars stay visible.
fig.update_traces(opacity=0.6)
fig.show()

### Pie plot 

In [71]:
# Keep a reference to all column names of df (the label columns plus "file").
cols = df.columns

In [72]:
# Preview the first rows of the per-file label counts.
df.head()

Unnamed: 0,freeFS,file,drinking,eating_24,freeDC,freeD,freeFSN,eating_g,mushroom,drinking_g,drinking_b,eating_l,grouped
0,7.0,time=20220521T202200Z_growing_day=12_camera=20...,,,,,,,,,,,
1,1.0,time=20220520T153000Z_growing_day=11_camera=20...,7.0,3.0,14.0,23.0,,,,,,,
2,,time=20220524T030200Z_growing_day=15_camera=30...,,,,,5.0,,,,,,
3,,time=20220520T000000Z_growing_day=11_camera=20...,,,,,5.0,,,,,,
4,,time=20220520T105000Z_growing_day=11_camera=20...,6.0,,5.0,20.0,,3.0,4.0,3.0,1.0,,


In [73]:
# Reshape to long format: one row per (label, count) pair in the columns
# "variable" and "value". The "file" column holds filenames, not counts, so
# it is excluded to keep "value" purely numeric for the pie chart.
dd = df.melt(
        value_vars=[col for col in cols if col != "file"])

In [74]:
# Pie chart: slice names come from "variable", slice sizes from "value".
fig = px.pie(
    data_frame=dd,
    names="variable",
    values="value",
    title="Labels proportion",
)
fig.show()

In [75]:
def copy_xml_paths_names(input_folder: str,
                         output_folder: str,
                         file_name: str,
                         typ: str
                         ) -> None:
    """Copy an XML annotation to another folder, rewriting its file references.

    Reads ``<input_folder>/<file_name>.xml``, then sets

    * every ``<folder>`` element to ``input_folder``,
    * every ``<filename>`` element to ``<file_name>.<typ>``,
    * every ``<path>`` element to ``<cwd>/<input_folder>/<file_name>.<typ>``,

    and writes the result to ``<output_folder>/<file_name>.xml``. The output
    folder is created when it does not exist yet.

    Args:
        input_folder: Folder containing the source XML file.
        output_folder: Folder the rewritten XML file is written to.
        file_name: Base name of the XML file, without extension.
        typ: Extension (without dot) used for the referenced file name.
    """
    xml_input_path = os.path.join(input_folder, f"{file_name}.xml")
    xml_output_path = os.path.join(output_folder, f"{file_name}.xml")

    # Compute the referenced file name once, up front. Building it inside the
    # <filename> loop appended the extension once per <filename> element and
    # left <path> without an extension when no <filename> element existed.
    image_name = f"{file_name}.{typ}"
    image_path = os.path.join(os.getcwd(), input_folder, image_name)

    mytree = ET.parse(xml_input_path)
    myroot = mytree.getroot()

    for folder in myroot.iter('folder'):
        folder.text = input_folder

    for filename in myroot.iter('filename'):
        filename.text = image_name

    for path in myroot.iter('path'):
        path.text = image_path

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    mytree.write(file_or_filename=xml_output_path)

In [20]:
import os 
from PIL import Image

# NOTE(review): `c` is not used in this cell; kept bound in case another cell
# reads it.
c = 0 
# Folder holding the augmented images and their XML annotations.
d = "augmented_images_and_xml"

# Snapshot the folder listing before deleting, so the loop does not iterate
# over a directory that is being modified.
la = os.listdir(d)
# NOTE(review): `lb` was a second, identical listing of the same folder; it is
# kept as a copy of the snapshot in case another cell reads it.
lb = list(la)

# Delete the annotation files of cameras 301 and 201. Match on the end of the
# name rather than on a substring, so that only files actually ending in
# "301.xml" / "201.xml" are removed by this destructive step.
for i in la:
    if i.endswith(("301.xml", "201.xml")):
        os.remove(os.path.join(d, i))
        



TypeError: remove() missing required argument 'path' (pos 1)