# MultiLabelProcessor

This Notebook is used to convert the labels provided in JSON files to numpy arrays and store them on a local disk.

How to use this Notebook:
1. Check the data variables in cell 2. Make sure those folders exist and you have your downloaded files in there.
2. Run the whole Notebook **once** - it stores the generated arrays on your local disk.
3. Import the following functions from `./multilabel_functions.py` in your Python module or Jupyter Notebook:
 * `get_multilabels_train()`
 * `get_multilabels_validation()`
 
    Those return the labels in the correct format.
    
In developing this Notebook I have used https://www.kaggle.com/anqitu/for-starter-json-to-multilabel-in-24-seconds/notebook.

In [1]:
import pandas as pd
import numpy as np
import json
import os

# Display limits for DataFrames rendered in this notebook.
pd.options.display.max_rows = 600
pd.options.display.max_columns = 50

# Silence every Python warning for the rest of the session. Note that this also
# hides warnings raised by the libraries used in later cells.
import warnings
warnings.simplefilter('ignore')

In [2]:
# All data folders are resolved relative to the directory the notebook is run from.
datadir = os.getcwd()

# input_path: folder holding train.json / test.json / validation.json.
# output_path: folder the generated .npy label arrays are written to.
input_path, output_path = (
    os.path.join(datadir, relative_folder)
    for relative_folder in ('../Fashion/data/json/', '../Fashion/data/multilabels/')
)

## 1. Load data from json

In [3]:
def _load_json(filename):
    """Parse the JSON file `filename` located in `input_path` and return its content."""
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(os.path.join(input_path, filename), encoding='utf-8') as json_file:
        return json.load(json_file)


def _to_dataframe(raw, with_annotations=True):
    """Turn one parsed JSON dict into a DataFrame.

    Parameters
    ----------
    raw : dict
        Parsed JSON with an 'images' list and, when `with_annotations` is
        True, an 'annotations' list. Both lists must carry an 'imageId' key.
    with_annotations : bool
        When True, the annotations are inner-joined onto the images by
        'imageId'. Pass False for data that has no annotations (the test set).

    Returns
    -------
    pandas.DataFrame
        The image columns (plus annotation columns if requested) with
        'imageId' cast to np.uint32.
    """
    frame = pd.DataFrame(raw['images'])
    if with_annotations:
        annotations = pd.DataFrame(raw['annotations'])
        frame = pd.merge(frame, annotations, on='imageId', how='inner')
    # Cast after the merge so the join itself runs on the ids exactly as stored in the JSON.
    frame['imageId'] = frame['imageId'].astype(np.uint32)
    return frame


# Keep the parsed JSON under separate *_raw names so `train`, `test` and
# `validation` only ever refer to DataFrames.
train_raw = _load_json('train.json')
test_raw = _load_json('test.json')
validation_raw = _load_json('validation.json')

print('Train No. of images: %d'%(len(train_raw['images'])))
print('Test No. of images: %d'%(len(test_raw['images'])))
print('Validation No. of images: %d'%(len(validation_raw['images'])))

# JSON TO PANDAS DATAFRAME
train = _to_dataframe(train_raw)
test = _to_dataframe(test_raw, with_annotations=False)  # the test set is built from images only
validation = _to_dataframe(validation_raw)

# The parsed JSON is no longer needed once the DataFrames exist; release the memory.
del train_raw, test_raw, validation_raw

datas = {'Train': train, 'Test': test, 'Validation': validation}

Train No. of images: 1014544
Test No. of images: 39706
Validation No. of images: 9897


In [4]:
# Preview the first five rows of the merged training frame.
train.head(n=5)

Unnamed: 0,imageId,url,labelId
0,1,https://contestimg.wish.com/api/webimage/570f3...,"[95, 66, 137, 70, 20]"
1,2,https://contestimg.wish.com/api/webimage/5468f...,"[36, 66, 44, 214, 105, 133]"
2,3,https://contestimg.wish.com/api/webimage/54641...,"[170, 66, 97, 153, 105, 138]"
3,4,https://contestimg.wish.com/api/webimage/550b9...,"[18, 66, 44, 153, 164]"
4,5,https://contestimg.wish.com/api/webimage/54451...,"[189, 66, 176, 137, 153, 74, 181, 171, 138]"


In [5]:
# Preview the first five rows of the merged validation frame.
validation.head(n=5)

Unnamed: 0,imageId,url,labelId
0,1,https://contestimg.wish.com/api/webimage/568e1...,"[62, 17, 66, 214, 105, 137, 85]"
1,2,https://contestimg.wish.com/api/webimage/5452f...,"[95, 17, 66, 214, 164, 137, 20, 204, 184]"
2,3,https://contestimg.wish.com/api/webimage/54058...,"[122, 19, 66, 186, 180, 44, 154, 20]"
3,4,https://contestimg.wish.com/api/webimage/540c6...,"[190, 222, 66, 153, 164, 226, 53, 184]"
4,5,https://contestimg.wish.com/api/webimage/54477...,"[62, 66, 153, 171, 111, 137, 70, 204, 184]"


In [6]:
# Preview the first five rows of the test frame (images only, no labelId column).
test.head(n=5)

Unnamed: 0,imageId,url
0,1,https://contestimg.wish.com/api/webimage/568e1...
1,2,https://contestimg.wish.com/api/webimage/5452f...
2,3,https://contestimg.wish.com/api/webimage/54058...
3,4,https://contestimg.wish.com/api/webimage/540c6...
4,5,https://contestimg.wish.com/api/webimage/54477...


## 2. MultiLabelBinarizer
Because each `labelId` entry is a list of labels, we convert it into a binary indicator matrix (one row per image, one column per label) that can be fed to our classifiers.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit on the training labels only: the column order (mlb.classes_) is defined
# by train and reused unchanged for validation, so both matrices line up.
mlb = MultiLabelBinarizer()
train_label = mlb.fit_transform(train['labelId'])
validation_label = mlb.transform(validation['labelId'])

# transform() ignores labels it did not see during fit and only emits a warning.
# Warnings are silenced globally in this notebook, so report such labels
# explicitly instead of dropping them without a trace.
known_labels = set(mlb.classes_)
unseen_labels = {label for labels in validation['labelId'] for label in labels} - known_labels
if unseen_labels:
    print('Validation labels not present in train (ignored): %s' % sorted(unseen_labels, key=str))

# Sanity check: label matrices are (n_images, n_labels); test has no labels yet.
for data in [validation_label, train_label, test]:
    print(data.shape)

(9897, 228)
(1014544, 228)
(39706, 2)


In [None]:
# Save as numpy
train_multilabel_filename = 'multilabel_train.npy'
validation_multilabel_filename = 'multilabel_validation.npy'

# np.save does not create missing parent folders. Create the output folder
# first so the (expensive) binarized arrays are not lost to a FileNotFoundError.
os.makedirs(output_path, exist_ok=True)

np.save(os.path.join(output_path, train_multilabel_filename), train_label)
np.save(os.path.join(output_path, validation_multilabel_filename), validation_label)