In [68]:
import os
import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer
from ipywidgets import IntProgress
from IPython.display import display
import timer
from mercari_config import MercariConfig

# fix random seed for reproducibility
np.random.seed(7)

nlp = spacy.load('en')
tokenizer = Tokenizer(nlp.vocab)

In [69]:
df = pd.read_csv(filepath_or_buffer=os.path.join(MercariConfig.DATASETS_DIR, MercariConfig.TRAINING_SET_FILE), 
                    header=0, sep='\t', index_col=['train_id'])



  mask |= (ar1 == a)


In [70]:
set_len = len(df)
set_len

1482535

In [71]:
df.count()

name                 1482535
item_condition_id    1482535
category_name        1476208
brand_name            849853
price                1482535
shipping             1482535
item_description     1482531
dtype: int64

In [72]:
df['category_name'].fillna(value=MercariConfig.EMPTY_CAT, inplace=True)

In [73]:
assert(len(df[df.category_name.isnull()]) == 0)

In [74]:
df['brand_name'].fillna(value=MercariConfig.EMPTY_BRAND, inplace=True)

In [75]:
assert(len(df[df.brand_name.isnull()]) == 0)

In [76]:
df[df.item_description.isnull()]

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
511535,Shoes for Michelle,4,Kids/Girls 0-24 Mos/Shoes,___VERY_EMPTY_BRAND___,9.0,0,
861230,Lipgloss,4,Beauty/Makeup/Lips,___VERY_EMPTY_BRAND___,49.0,0,
1224924,Disney Minnie Head band,3,Women/Women's Accessories/Hair Accessories,Disney,9.0,0,
1264242,For Bianca,3,Women/Women's Accessories/Scarves & Wraps,___VERY_EMPTY_BRAND___,10.0,1,


In [77]:
df['item_description'].fillna(value=MercariConfig.EMPTY_DESC, inplace=True)

In [78]:
assert(len(df[df.item_description.isnull()]) == 0)

In [79]:
#df2 = df.head(10000)
#df = df2.append(df.loc[511535])
#set_len = len(df)

In [80]:
word2index = pd.DataFrame(columns=['word', 'word_id', 'count'])
word2index = word2index.astype(dtype={'word': str, 'word_id': int, 'count': int})

word2index.set_index(['word'], inplace=True)

fp = IntProgress(min=0, max=len(df))

display(fp)

word2index.loc[MercariConfig.PAD] = (0, 0)
word2index.loc[MercariConfig.START] = (1, 0)
word2index.loc[MercariConfig.OOV] = (2, 0)
word2index.loc[MercariConfig.EMPTY_DESC] = (3, 0)

max_word_id = 4
max_words_in_col = 0

with timer.Timer():
    for desc in df['item_description']:
        desc_doc = tokenizer(desc)
        words_in_col = len(desc_doc)

        for token in desc_doc:
            if not token.text in word2index.index:
                word2index.loc[token.text] = (max_word_id, 1)
                max_word_id += 1
            else:
                word2index.at[token.text, 'count'] += 1

        max_words_in_col = words_in_col if max_words_in_col < words_in_col else max_words_in_col

        fp.value += 1
            

165.19043970108032


In [86]:
word2index.sort_values(by='word_id').head(10)

Unnamed: 0_level_0,word_id,count
word,Unnamed: 1_level_1,Unnamed: 2_level_1
___PAD___,0,0
___START___,1,0
___OOV___,2,0
___VERY_EMPTY_DESCRIPTION___,3,1
No,4,1277
description,5,601
yet,6,590
This,7,651
keyboard,8,4
is,9,2721


In [82]:
word2index.to_csv(path_or_buf=os.path.join(MercariConfig.DATASETS_DIR, MercariConfig.WORD_2_INDEX_FILE))

In [83]:
fp = FloatProgress(min=0, max=len(df))

display(fp)

#df.assign('word_ids')
word_ids = np.empty(shape=(set_len, max_words_in_col + 1), dtype=int)
i = 0

with timer.Timer():
    for desc in df.item_description:
        desc_doc = tokenizer(desc)
        word_id = np.zeros(shape=(max_words_in_col + 1), dtype=int)
        j = 1

        word_id[0] = 1

        for token in desc_doc:
            word_id[j] = word2index.at[token.text, 'word_id']
            j += 1

        word_ids[i] = word_id
        i += 1

        fp.value += 1

    df['item_desc_word_seq'] = word_ids.tolist()

11.64185643196106


In [85]:
df.to_csv(path_or_buf=os.path.join(MercariConfig.DATASETS_DIR, MercariConfig.PREP_TRAINING_SET_FILE))