# Data Preprocessing

In [1]:
# Loading required packages
import os
import json
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Read the class-description table; header=None makes pandas label the columns 0, 1, ...
class_desc_path = 'class-descriptions-boxable.csv'
classes_desc = pd.read_csv(class_desc_path, header=None)

In [None]:
# Preview the first few rows of the class-description table
classes_desc.head()

In [3]:
# Lookup table mapping each value in column 0 of the class-description
# table to the value in column 1 of the same row
classes_dict = classes_desc.set_index(0)[1].to_dict()

In [None]:
# Opening json file
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding (JSON interchange text is UTF-8).
with open('bbox_labels_600_hierarchy.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

In [None]:
# Function to unstructure the hierarchy
from itertools import chain, starmap

def flatten_json_iterative_solution(dictionary):
    """Collapse nested dicts/lists into a single-level dictionary.

    Top-level keys are kept as they are. Each nested level appends
    ``'_' + <child key>`` (for dicts) or ``'_' + <position>`` (for lists)
    to the key of its parent. Empty dicts and empty lists contribute no
    entries, so their keys disappear from the result. The input mapping is
    not modified; a new dictionary is returned.
    """

    def expand(prefix, node):
        """Return the (key, value) pairs produced by opening ``node`` one level."""
        if isinstance(node, dict):
            return [(prefix + '_' + child_key, child)
                    for child_key, child in node.items()]
        if isinstance(node, list):
            return [(prefix + '_' + str(position), child)
                    for position, child in enumerate(node)]
        # Atomic value: nothing to open, pass it through unchanged
        return [(prefix, node)]

    # Peel off one nesting level per pass until only atomic values remain
    while True:
        flattened = {}
        for name, node in dictionary.items():
            flattened.update(expand(name, node))
        dictionary = flattened

        still_nested = any(isinstance(node, (dict, list))
                           for node in dictionary.values())
        if not still_nested:
            break

    return dictionary

In [None]:
# Getting the classes
# Flatten the hierarchy into one row per entry: column 0 holds the entry's
# value and 'Description' holds its flattened key path.
classes = pd.Series(flatten_json_iterative_solution(json_data)).to_frame()
classes['Description'] = classes.index
# Assign the replaced column back instead of calling replace(inplace=True) on
# classes[0]: a chained in-place call no longer updates the parent frame when
# pandas Copy-on-Write is active.
classes[0] = classes[0].replace(classes_dict)
classes = classes.reset_index(drop=True)
# Drop the first row (presumably the hierarchy's root entry -- confirm)
classes = classes.iloc[1:, :]

In [None]:
# Persist the class table to disk, leaving out the row index
classes.to_csv(path_or_buf='classdesc.csv', index=False)

In [None]:
# The class labels for food and kitchenware were manually filtered from classdesc.csv and saved to final-classes.csv

In [4]:
# Load the hand-picked class list (read without a header row) and label its
# column 'Class'
final_classes_path = 'final-classes.csv'
classes = pd.read_csv(final_classes_path, header=None).set_axis(['Class'], axis=1)

In [5]:
# Plain Python list of the selected class labels
labels = classes['Class'].tolist()

In [6]:
# Number of selected class labels
len(labels)

66

In [7]:
# Load the bounding-box annotations for the train, validation and test splits
annotation_files = (
    'train-annotations-bbox.csv',
    'validation-annotations-bbox.csv',
    'test-annotations-bbox.csv',
)
tr, vl, te = map(pd.read_csv, annotation_files)

In [8]:
# Restrict every split to the image id, the label and the box coordinates
keep_cols = ['ImageID', 'LabelName', 'XMin', 'XMax', 'YMin', 'YMax']
tr, vl, te = (frame[keep_cols] for frame in (tr, vl, te))

In [9]:
# Replacing the encoded label name with the actual name
# assign() builds a new frame with the translated column. The previous
# `frame.LabelName.replace(..., inplace=True)` form is a chained in-place call
# that no longer updates the parent frame when pandas Copy-on-Write is active.
tr = tr.assign(LabelName=tr['LabelName'].replace(classes_dict))
vl = vl.assign(LabelName=vl['LabelName'].replace(classes_dict))
te = te.assign(LabelName=te['LabelName'].replace(classes_dict))

In [10]:
# Keep only the rows whose label is one of the selected classes
train, validation, test = (
    frame[frame['LabelName'].isin(labels)] for frame in (tr, vl, te)
)

In [11]:
# Renumber the rows of each filtered split from 0, discarding the old index
train = train.reset_index(drop=True)
validation = validation.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Save each filtered split to its own csv file (row index omitted) for later use
for frame, out_path in (
    (train, 'train.csv'),
    (validation, 'validation.csv'),
    (test, 'test.csv'),
):
    frame.to_csv(out_path, index=False)