In [1]:
import os
import pandas as pd

import data_preparation as mdp
from mercari_config import MercariConfig

In [2]:
train_data = mdp.load_data(MercariConfig.TRAINING_SET_PREP_FILE)
val_data = mdp.load_data(MercariConfig.VALIDATION_SET_PREP_FILE)

In [3]:
categories = train_data[['category_name', 'name']]
categories = categories.groupby("category_name").count()

categories.columns = ['count']

In [4]:
categories['category_id'] = [i for i in range(MercariConfig.WORD_I, len(categories) + MercariConfig.WORD_I)]

categories.at[MercariConfig.PAD, 'category_id'] = MercariConfig.PAD_I
categories.at[MercariConfig.START, 'category_id'] = MercariConfig.START_I
categories.at[MercariConfig.OOV, 'category_id'] = MercariConfig.OOV_I
categories.at[MercariConfig.REMOVED_PRICE, 'category_id'] = MercariConfig.REMOVED_PRICE_I
categories.at[MercariConfig.EMPTY_NAME, 'category_id'] = MercariConfig.EMPTY_NAME_I
categories.at[MercariConfig.EMPTY_CAT, 'category_id'] = MercariConfig.EMPTY_CAT_I
categories.at[MercariConfig.EMPTY_BRAND, 'category_id'] = MercariConfig.EMPTY_BRAND_I
categories.at[MercariConfig.EMPTY_DESC, 'category_id'] = MercariConfig.EMPTY_DESC_I

categories['count'].fillna(value=0, inplace=True)
categories.sort_values(by='category_id', inplace=True)
categories = categories.astype('int32')

In [5]:
def index_category(data):
    data['category_id'] = 0

    data_len = len(data)

    progress = 0

    row_iterator = data.iterrows()

    for index, row in row_iterator:
        cat_nm = row['category_name']

        if cat_nm in categories.index:
            data.at[index, 'category_id'] = categories.at[cat_nm, 'category_id']
        else:
            data.at[index, 'category_id'] = MercariConfig.EMPTY_CAT_I


        progress += 1

        if not progress % 10000:
            print("Progress: %3.2f" % (progress * 100.0 / data_len))

In [6]:
%%time

index_category(train_data)

Progress: 4.22
Progress: 8.43
Progress: 12.65
Progress: 16.86
Progress: 21.08
Progress: 25.29
Progress: 29.51
Progress: 33.73
Progress: 37.94
Progress: 42.16
Progress: 46.37
Progress: 50.59
Progress: 54.80
Progress: 59.02
Progress: 63.24
Progress: 67.45
Progress: 71.67
Progress: 75.88
Progress: 80.10
Progress: 84.32
Progress: 88.53
Progress: 92.75
Progress: 96.96
CPU times: user 25.5 s, sys: 628 ms, total: 26.1 s
Wall time: 26.1 s


In [7]:
%%time

index_category(val_data)

Progress: 16.86
Progress: 33.73
Progress: 50.59
Progress: 67.45
Progress: 84.31
CPU times: user 6.36 s, sys: 176 ms, total: 6.54 s
Wall time: 6.54 s


In [8]:
val_data

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,nm0,nm1,nm2,...,id292,id293,id294,id295,id296,id297,id298,id299,id300,category_id
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
315930,Toddler girl Old navy fur boots,3,Kids/Girls 2T-5T/Shoes,___VERY_EMPTY_BRAND___,20.0,0,Toddler girl old navy boots with fur balls Zip...,1,56477,68628,...,0,0,0,0,0,0,0,0,0,519
709258,DRAKE ART IPHONE 5c 5/5s 6/6s 6/6s+,1,"Electronics/Cell Phones & Accessories/Cases, C...",Apple,15.0,0,LISTING IS FOR ONE CASE. YOU GET TO CHOOSE WHA...,1,21223,10213,...,0,0,0,0,0,0,0,0,0,83
989695,Lularoe small Joy blue NWT,1,Women/Coats & Jackets/Vest,___VERY_EMPTY_BRAND___,40.0,1,No description yet,1,36451,78339,...,0,0,0,0,0,0,0,0,0,977
656176,Chanel NOIR Mascara 1ml travel/sample sz,1,Beauty/Makeup/Eyes,Chanel,7.0,1,Chanel travel/sample size Le Volume De Chanel ...,1,18459,40445,...,0,0,0,0,0,0,0,0,0,33
1424939,Lipsense Sheer Berry Diamond/Pearl Gloss,1,Beauty/Makeup/Lips,SeneGence,51.0,1,This listing is for Sheer berry Diamond New Re...,1,35835,51416,...,0,0,0,0,0,0,0,0,0,35
1044165,3 SHIP VS PINK,1,Women/Women's Handbags/Backpack Style,Victoria's Secret,29.0,0,-NWT -SIZE MEDIUM -ALSO AVAILABLE IN SIZE LARGE,1,6692,49129,...,0,0,0,0,0,0,0,0,0,1096
1356311,Too faced Melted,3,Beauty/Makeup/Lips,Too Faced,13.0,1,Brand New Too faced Melted Matte long wear lip...,1,56584,67541,...,0,0,0,0,0,0,0,0,0,35
899599,Indian polki bangles,1,Women/Jewelry/Bracelets,___VERY_EMPTY_BRAND___,19.0,0,Exclusive pair of High quality Designer polki ...,1,31064,2,...,0,0,0,0,0,0,0,0,0,998
1390068,american girl doll game,2,Kids/Toys/Dolls & Accessories,American Girl ®,9.0,0,julie finds a way nintendo ds game,1,61828,68628,...,0,0,0,0,0,0,0,0,0,563
413220,ON FIRE! NAKED3 12pcs Makeup Brush,1,Beauty/Makeup/Face,___VERY_EMPTY_BRAND___,18.0,1,Click on my picture to view all my listings FR...,1,41749,2,...,0,0,0,0,0,0,0,0,0,34


In [9]:
mdp.save_data(train_data, MercariConfig.TRAINING_SET_PREP_FILE)

In [10]:
len(categories)

1105

In [11]:
mdp.save_data(val_data, MercariConfig.VALIDATION_SET_PREP_FILE)