In [1]:
import os
import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer

import data_preparation as mdp
from mercari_config import MercariConfig

In [2]:
# Only the vocabulary is needed to construct a bare Tokenizer, so build a blank
# English pipeline instead of loading a full model through the 'en' shortcut
# link (shortcut links are not supported by spaCy v3, and loading the model's
# pipeline components is wasted work here).
nlp = spacy.blank('en')

# A Tokenizer created from the vocab alone carries no prefix/suffix/infix
# rules or special cases, so descriptions are split on whitespace only.
tokenizer = Tokenizer(nlp.vocab)

# Load the prepared training set; later cells read its 'item_description' column.
df = mdp.load_data(MercariConfig.TRAINING_SET_PREP_FILE)

In [3]:
# Token -> occurrence count. The reserved tokens are seeded with a count of 0
# so that each of them has an entry even if it never appears in the text.
word = dict.fromkeys(
    (
        MercariConfig.PAD,
        MercariConfig.START,
        MercariConfig.OOV,
        MercariConfig.EMPTY_DESC,
        MercariConfig.REMOVED_PRICE,
    ),
    0,
)

In [4]:
%%time

set_len = len(df)

max_words_in_item_desc = 0

progress = 0

for desc in df['item_description']:
    desc_doc = tokenizer(desc)
    words_in_col = len(desc_doc)

    for token in desc_doc:
        if not token.text in word:
            word[token.text] = 1
        else:
            word[token.text] += 1

    max_words_in_item_desc = words_in_col if max_words_in_item_desc < words_in_col else max_words_in_item_desc

    progress += 1

    if not progress % 10000:
        print("Progress: %3.2f" % (progress * 100.0 / set_len))

Progress: 6.75
Progress: 13.49
Progress: 20.24
Progress: 26.98
Progress: 33.73
Progress: 40.47
Progress: 47.22
Progress: 53.96
Progress: 60.71
Progress: 67.45
Progress: 74.20
Progress: 80.94
Progress: 87.69
Progress: 94.43
CPU times: user 45.1 s, sys: 2.58 s, total: 47.6 s
Wall time: 42.3 s


In [5]:
# Turn the token -> count dict into a table indexed by token with the columns
# 'word_id' and 'count'.
word2index = pd.Series(word).reset_index()
word2index.columns = ['word', 'count']

# Give every row a running id starting at MercariConfig.WORD_I.
word2index['word_id'] = list(range(MercariConfig.WORD_I, MercariConfig.WORD_I + len(word)))

word2index = word2index.set_index(['word'])[['word_id', 'count']]

# The reserved tokens get their fixed ids, replacing the running ids assigned above.
for reserved_token, reserved_id in (
    (MercariConfig.PAD, MercariConfig.PAD_I),
    (MercariConfig.START, MercariConfig.START_I),
    (MercariConfig.OOV, MercariConfig.OOV_I),
    (MercariConfig.REMOVED_PRICE, MercariConfig.REMOVED_PRICE_I),
    (MercariConfig.EMPTY_DESC, MercariConfig.EMPTY_DESC_I),
):
    word2index.at[reserved_token, 'word_id'] = reserved_id

word2index = word2index.sort_values(by='word_id')

In [6]:
# Hand the finished table to the project's save helper under the configured
# word-to-index file name for item descriptions.
mdp.save_word2index(
    word2index=word2index,
    file_name=MercariConfig.WORD_2_INDEX_4_ITEM_DESC_FILE,
)

In [7]:
# Number of rows in the table, i.e. distinct tokens including the reserved ones.
word2index.shape[0]

186944

In [8]:
# Non-null entries per column; equal to the row count when no word_id or count is missing.
word2index.count()

word_id    186944
count      186944
dtype: int64

In [9]:
# Largest number of tokens found in a single item description by the counting loop.
max_words_in_item_desc

235

In [10]:
# Preview only the lowest word ids (reserved tokens first) instead of
# displaying the entire vocabulary table.
word2index.sort_values(by='word_id').head(10)

Unnamed: 0_level_0,word_id,count
word,Unnamed: 1_level_1,Unnamed: 2_level_1
___PAD___,0,0
___START___,1,0
___OOV___,2,0
[rm],3,16621
___VERY_EMPTY_DESCRIPTION___,4,0
,10,1
!,11,1946
!!,12,505
!!!,13,323
!!!!,14,103
