In [23]:
import pandas as pd 
import numpy as np
import json
import base64
import swifter
from tqdm import tqdm
import csv
import pickle
from sklearn.externals import joblib
import gc
from time import sleep

In [24]:
# Dataset locations, all given relative to the notebook's working directory.
# Training TSV; streamed in chunks by the training-data cell.
TRAIN_PATH = '../data/train/train.tsv'
# Validation TSV; loaded in full by the validation preprocessing cell.
VAL_PATH = '../data/valid/valid.tsv'
# NOTE(review): not referenced by any cell visible in this notebook section.
VAL_ANS_PATH = '../data/valid_answer.json'
# NOTE(review): not referenced by any cell visible in this notebook section.
SAMPLE_PATH = '../data/valid/train.sample.tsv'
# Tab-separated label table parsed by get_label().
LABEL_PATH = '../data/multimodal_labels.txt'
# Test TSV; loaded in full by the test preprocessing cell.
TEST_PATH = '../data/testA/testA.tsv'

In [25]:
def get_label(path):
    """Load the label table and build lookups in both directions.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split on TAB; the first field is parsed as
    the integer id and the second field is taken as the label text. Blank
    lines (for example a trailing empty line at the end of the file) are
    ignored instead of raising ``IndexError``.

    Args:
        path: Path to the tab-separated label file.

    Returns:
        A ``(label2id, id2label)`` tuple: ``label2id`` maps label text to
        its integer id and ``id2label`` maps the integer id back to the
        label text. Both are empty when the file has no data rows.

    Raises:
        IndexError: If a non-blank data line has no TAB-separated second field.
        ValueError: If the first field of a data line is not an integer.
    """
    label2id = {}
    id2label = {}
    with open(path) as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            row = line.split('\n')[0]
            if not row.strip():
                # Tolerate blank / trailing empty lines.
                continue
            # Split each row once and reuse the fields for both mappings.
            fields = row.split('\t')
            label_id = int(fields[0])
            label_name = fields[1]
            label2id[label_name] = label_id
            id2label[label_id] = label_name
    return label2id, id2label

In [26]:
# Build both label lookup tables once; convertLabelWord() reads id2label as a module-level global.
label2id, id2label = get_label(LABEL_PATH)

In [27]:
# Sanity check: show both mappings so the parsed label table can be inspected.
print(id2label, label2id)

{0: 'top clothes (coat, jacket, shirt, etc.)', 1: 'skirt & dress', 2: 'bottom clothes (trousers, pants, etc.)', 3: 'luggage, leather goods', 4: 'shoes', 5: 'accessories (jewelry, clothing accessories, belts, hats, scarves, etc.)', 6: 'snacks, nuts, liquor and tea', 7: 'makeup, perfume, beauty tools and essential oils', 8: 'bottle drink', 9: 'furniture', 10: 'stationery', 11: 'household electrical appliances', 12: 'home decoration', 13: 'household fabric', 14: 'kitchenware', 15: 'home / personal cleaning tools', 16: 'storage supplies', 17: 'motorcycle, motorcycle accessories, vehicles, bicycle and riding equipment', 18: 'outdoor product', 19: 'lighting', 20: 'toys', 21: 'underwear', 22: 'digital supplies', 23: 'bed linens', 24: 'baby products', 25: 'personal care', 26: 'sporting goods', 27: 'clothes (accessories, baby clothing, etc.)', 28: 'others', 29: 'human face', 30: 'arm', 31: 'hair', 32: 'hand'} {'top clothes (coat, jacket, shirt, etc.)': 0, 'skirt & dress': 1, 'bottom clothes (tr

In [28]:
def convertBoxes(num_boxes, boxes):
    """Decode a base64 payload of float32 values into a ``(num_boxes, 4)`` array.

    Args:
        num_boxes: Number of rows the decoded buffer is reshaped to.
        boxes: Base64-encoded buffer holding ``num_boxes * 4`` float32 values.

    Returns:
        A float32 NumPy array of shape ``(num_boxes, 4)`` backed by the decoded bytes.
    """
    raw_bytes = base64.b64decode(boxes)
    flat_values = np.frombuffer(raw_bytes, dtype=np.float32)
    return flat_values.reshape(num_boxes, 4)
def convertFeature(num_boxes, features,):
    """Decode a base64 payload of float32 values into a ``(num_boxes, 2048)`` array.

    Args:
        num_boxes: Number of rows the decoded buffer is reshaped to.
        features: Base64-encoded buffer holding ``num_boxes * 2048`` float32 values.

    Returns:
        A float32 NumPy array of shape ``(num_boxes, 2048)`` backed by the decoded bytes.
    """
    raw_bytes = base64.b64decode(features)
    flat_values = np.frombuffer(raw_bytes, dtype=np.float32)
    return flat_values.reshape(num_boxes, 2048)
def convertLabel(num_boxes, label):
    """Decode a base64 payload of int64 values into a 1-D array of length ``num_boxes``.

    Args:
        num_boxes: Expected number of decoded values.
        label: Base64-encoded buffer holding ``num_boxes`` int64 values.

    Returns:
        An int64 NumPy array of shape ``(num_boxes,)`` backed by the decoded bytes.
    """
    raw_bytes = base64.b64decode(label)
    ids = np.frombuffer(raw_bytes, dtype=np.int64)
    return ids.reshape(num_boxes)
def convertLabelWord(num_boxes, label):
    """Decode base64 int64 label ids and join their names with ``'###'``.

    Each decoded id is looked up in the module-level ``id2label`` mapping.

    Args:
        num_boxes: Expected number of decoded ids.
        label: Base64-encoded buffer holding ``num_boxes`` int64 values.

    Returns:
        The label names, in decoded order, joined by ``'###'``.
    """
    raw_bytes = base64.b64decode(label)
    ids = np.frombuffer(raw_bytes, dtype=np.int64).reshape(num_boxes)
    names = []
    for label_id in ids:
        names.append(id2label[label_id])
    return '###'.join(names)
def convertPos(num_boxes, boxes, H, W):
    """Build a 5-value positional descriptor for each of the first ``num_boxes`` boxes.

    For a box ``b`` the descriptor is
    ``[b[0]/W, b[2]/W, b[1]/H, b[3]/H, ((b[2]-b[0]) * (b[3]-b[1])) / (W*H)]``.

    Args:
        num_boxes: How many leading entries of ``boxes`` to process.
        boxes: Indexable collection of 4-element boxes.
        H: Divisor applied to elements 1 and 3 of each box.
        W: Divisor applied to elements 0 and 2 of each box.

    Returns:
        A list with ``num_boxes`` entries, each a list of five values.
    """
    def _describe(box):
        # Elements 0/2 are scaled by W and 1/3 by H; the last value is the
        # box extent product divided by W*H.
        first_w, first_h, second_w, second_h = box[0], box[1], box[2], box[3]
        return [
            first_w / W,
            second_w / W,
            first_h / H,
            second_h / H,
            ((second_w - first_w) * (second_h - first_h)) / (W * H),
        ]

    return [_describe(boxes[idx]) for idx in range(num_boxes)]

In [29]:
# Chunked reader over the training TSV. nrows caps the read at the first
# 10000 rows, so with chunksize=10000 the reader yields a single chunk.
train = pd.read_csv(
    TRAIN_PATH,
    sep='\t',
    chunksize=10000,
    nrows = 10000,
    quoting=csv.QUOTE_NONE,
)

# Running row count and accumulators that the chunk loop appends to.
LEN = 0
product_set = set()
num_boxes_list, image_h_list, image_w_list = [], [], []
words_len_list, words_list = [], []
label_list, label_words_list = [], []
boxes_list, boxes_feature_list = [], []
pos_list = []

In [30]:
# Stream the training reader chunk by chunk, decode each chunk's base64
# columns row-wise (via swifter) and append the results to the module-level
# accumulator lists. Re-running this cell without re-running the cell that
# resets those lists appends the same data a second time.
# `i` counts processed chunks; it is not read by any cell visible here.
i = 0
for t in tqdm(train):
    print("starting")
    # Collect garbage and pause briefly before working on the new chunk.
    gc.collect()
    sleep(1)
    LEN += len(t)
    # Raw query strings plus their whitespace-separated token counts.
    temp = list(t['query'])
    words_len_list.extend([len(q.split()) for q in temp])
    words_list.extend(temp)
    # '###'-joined label names for every row.
    t['labels_convert_words'] = t.swifter.apply(lambda x: convertLabelWord(x['num_boxes'], x['class_labels']), axis=1)
    temp = list(t['labels_convert_words'])
    label_words_list.extend(temp)
    # Decoded box arrays; must be computed before 'pos', which reads 'boxes_convert'.
    t['boxes_convert'] = t.swifter.apply(lambda x: convertBoxes(x['num_boxes'], x['boxes']), axis=1)
    temp = list(t['boxes_convert'])
    boxes_list.extend(temp)
    # Decoded per-box feature arrays.
    t['feature_convert'] = t.swifter.apply(lambda x: convertFeature(x['num_boxes'], x['features']), axis=1)
    temp = list(t['feature_convert'])
    boxes_feature_list.extend(temp)
    # Positional descriptors derived from the decoded boxes and the image size columns.
    t['pos'] = t.swifter.apply(lambda x: convertPos(x['num_boxes'], x['boxes_convert'], x['image_h'], x['image_w']), axis=1)
    temp = list(t['pos'])
    pos_list.extend(temp)
    # Drop the last temporary list and collect before the long pause.
    del temp
    gc.collect()
    # NOTE(review): a 60 s pause per chunk looks like a memory-pressure workaround — confirm it is still needed.
    sleep(60)
    i += 1

0it [00:00, ?it/s]

starting


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=10000.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=10000.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=10000.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=10000.0, style=ProgressStyle(descripti…




1it [01:17, 77.11s/it]


In [31]:
# Report the number of rows processed and the size of product_set.
# NOTE(review): no cell visible here ever adds to product_set, so the second value stays 0 — confirm whether it should be populated in the chunk loop.
print(LEN, len(product_set))

10000 0


In [32]:
# Assemble the accumulated per-row results into one frame; column order is
# words, label_words, features, pos.
data = pd.DataFrame(dict(
    words=words_list,
    label_words=label_words_list,
    features=boxes_feature_list,
    pos=pos_list,
))

In [33]:
# Preview the first ten rows of the assembled training frame as plain text.
print(data.head(10))

                              words  \
0               treble popular reed   
1         compatible ink cartridges   
2       check new look facial cream   
3         royal commemorative stamp   
4         calendula baby shower gel   
5            domestic folding table   
6  chamomile beauty salon care suit   
7           hypoallergenic earrings   
8               produce special soy   
9          metalworking mini hammer   

                                         label_words  \
0                                    others###others   
1                                             others   
2   makeup, perfume, beauty tools and essential oils   
3         others###others###others###others###others   
4  others###snacks, nuts, liquor and tea###snacks...   
5                                          furniture   
6   makeup, perfume, beauty tools and essential oils   
7  accessories (jewelry, clothing accessories, be...   
8                           others###others###others   
9  others#

In [34]:
# Persist the assembled training frame with joblib, writing into an already-open binary file handle.
# The validation and test cells use pickle.dump instead, so this file is written by a different serializer.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which newer scikit-learn releases no longer provide — confirm the pinned version or switch to the standalone `joblib` package.
with open('../data/temp_data.pkl', 'wb') as outp:
    joblib.dump(data, outp)
print("temp data finish")

temp data finish


In [35]:
# Validation split: load the whole TSV, decode the base64 columns row by row,
# drop the raw encoded columns and pickle the resulting frame.
# NOTE(review): unlike the training reader, this read does not pass
# quoting=csv.QUOTE_NONE — confirm the difference is intentional.
val = pd.read_csv(VAL_PATH, sep='\t')
val['boxes_convert'] = val.swifter.apply(
    lambda row: convertBoxes(row['num_boxes'], row['boxes']),
    axis=1,
)
val['feature_convert'] = val.swifter.apply(
    lambda row: convertFeature(row['num_boxes'], row['features']),
    axis=1,
)
val['labels_convert'] = val.swifter.apply(
    lambda row: convertLabel(row['num_boxes'], row['class_labels']),
    axis=1,
)
val['label_words'] = val.swifter.apply(
    lambda row: convertLabelWord(row['num_boxes'], row['class_labels']),
    axis=1,
)
# 'pos' reads the decoded boxes, so it has to come after 'boxes_convert'.
val['pos'] = val.swifter.apply(
    lambda row: convertPos(row['num_boxes'], row['boxes_convert'], row['image_h'], row['image_w']),
    axis=1,
)
# The raw base64 columns are no longer needed once decoded.
del val['boxes']
del val['features']
del val['class_labels']
with open('../data/val_data.pkl', 'wb') as outp:
    pickle.dump(val, outp)
print("val data finish")

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14720.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14720.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14720.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14720.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14720.0, style=ProgressStyle(descripti…


val data finish


In [36]:
# Test split: load the whole TSV, decode the base64 columns row by row,
# drop the raw encoded columns and pickle the resulting frame.
# NOTE(review): unlike the training reader, this read does not pass
# quoting=csv.QUOTE_NONE — confirm the difference is intentional.
test = pd.read_csv(TEST_PATH, sep='\t')
test['boxes_convert'] = test.swifter.apply(
    lambda row: convertBoxes(row['num_boxes'], row['boxes']),
    axis=1,
)
test['feature_convert'] = test.swifter.apply(
    lambda row: convertFeature(row['num_boxes'], row['features']),
    axis=1,
)
test['labels_convert'] = test.swifter.apply(
    lambda row: convertLabel(row['num_boxes'], row['class_labels']),
    axis=1,
)
test['label_words'] = test.swifter.apply(
    lambda row: convertLabelWord(row['num_boxes'], row['class_labels']),
    axis=1,
)
# 'pos' reads the decoded boxes, so it has to come after 'boxes_convert'.
test['pos'] = test.swifter.apply(
    lambda row: convertPos(row['num_boxes'], row['boxes_convert'], row['image_h'], row['image_w']),
    axis=1,
)
# The raw base64 columns are no longer needed once decoded.
del test['boxes']
del test['features']
del test['class_labels']
with open('../data/test_data.pkl', 'wb') as outp:
    pickle.dump(test, outp)
print("test data finish")

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28830.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28830.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28830.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28830.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28830.0, style=ProgressStyle(descripti…


test data finish
