In [1]:
import spacy
from spacy.tokenizer import Tokenizer

import data_preparation as mdp
from mercari_config import MercariConfig

In [2]:
# Load the spaCy English pipeline; in the cells shown here only its vocab is used.
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version/model link — confirm.
nlp = spacy.load('en')
# The Tokenizer is constructed from the vocab alone: no rules or prefix/suffix/infix
# arguments are passed. index_item_desc() below calls this module-level `tokenizer`.
tokenizer = Tokenizer(nlp.vocab)

In [3]:
# Word -> id lookup for item descriptions. index_item_desc() uses it as a table
# indexed by word with a 'word_id' column.
# presumably max_words_from_index caps the vocabulary size — confirm in data_preparation.
word2index = mdp.load_word2index(
    file_name=MercariConfig.WORD_2_INDEX_4_ITEM_DESC_FILE, 
    max_words_from_index=MercariConfig.MAX_WORDS_FROM_INDEX_4_ITEM_DESC)

# Prepared training / validation sets; the save cells at the end of the notebook
# write back to these same files.
# presumably head=None means "load every row" — confirm in data_preparation.load_data.
train_data = mdp.load_data(MercariConfig.TRAINING_SET_PREP_FILE, head=None)
val_data = mdp.load_data(MercariConfig.VALIDATION_SET_PREP_FILE, head=None)

In [4]:
def index_item_desc(data):
    """Encode each row's ``item_description`` as a fixed-width sequence of word ids.

    The frame is modified in place: the integer columns ``id0`` ..
    ``id<MercariConfig.MAX_WORDS_IN_ITEM_DESC>`` are added (or overwritten if
    they already exist, so re-running the cell is safe).

    * ``id0`` always holds ``MercariConfig.START_I`` (the <START> marker).
    * ``id1`` .. ``idN`` hold the ``word_id`` of the 1st .. Nth token of the
      description, or ``MercariConfig.OOV_I`` (<OOV>) when the token is not in
      ``word2index``. Tokens beyond position N are dropped.
    * Positions past the end of a short description stay 0.

    Rows are processed by position rather than by index label, so a frame
    whose index contains duplicate labels is encoded correctly. A description
    that is not a string (e.g. NaN for a missing value) is treated as empty
    instead of being handed to the tokenizer.

    Relies on the notebook-level names ``tokenizer`` (callable returning an
    iterable of tokens exposing ``.text``), ``word2index`` (table indexed by
    word with a ``word_id`` column) and ``MercariConfig``.

    Args:
        data: pandas DataFrame with an ``item_description`` column.

    Returns:
        None. ``data`` is mutated in place.
    """
    import numpy as np  # local import: the notebook's import cell does not bring in numpy

    max_words_item_desc = MercariConfig.MAX_WORDS_IN_ITEM_DESC
    start_id = MercariConfig.START_I  # <START>
    oov_id = MercariConfig.OOV_I  # <OOV>

    # Build the lookup once: a dict hit per token replaces an Index membership
    # test followed by a separate `.at` read.
    word_ids = word2index['word_id'].to_dict()

    data_len = len(data)

    # Fill a plain integer buffer and attach the columns afterwards; this avoids
    # one label-based DataFrame write per token. Zeros are the padding value.
    ids = np.zeros((data_len, max_words_item_desc + 1), dtype=np.int64)
    ids[:, 0] = start_id

    for row, desc in enumerate(data['item_description']):
        if not isinstance(desc, str):
            # Missing description (NaN/None): keep only the <START> marker.
            desc = ''

        for seq_i, token in enumerate(tokenizer(desc), start=1):
            if seq_i > max_words_item_desc:
                break
            ids[row, seq_i] = word_ids.get(token.text, oov_id)

        progress = row + 1

        if not progress % 10000:
            print("Progress: %3.2f" % (progress * 100.0 / data_len))

    # Positional assignment: a NumPy column is matched to rows by order, not by
    # index label, and `data` itself is updated so callers see the new columns.
    for i in range(max_words_item_desc + 1):
        data['id' + str(i)] = ids[:, i]

In [None]:
%%time

# Adds the id0..idN word-id columns to train_data in place (no return value is used).
index_item_desc(train_data)

Progress: 4.22
Progress: 8.43
Progress: 12.65
Progress: 16.86
Progress: 21.08
Progress: 25.29
Progress: 29.51
Progress: 33.73
Progress: 37.94
Progress: 42.16
Progress: 46.37
Progress: 50.59
Progress: 54.80
Progress: 59.02
Progress: 63.24
Progress: 67.45
Progress: 71.67
Progress: 75.88
Progress: 80.10
Progress: 84.32
Progress: 88.53
Progress: 92.75
Progress: 96.96
CPU times: user 3min 41s, sys: 7.18 s, total: 3min 49s
Wall time: 3min 35s


In [None]:
%%time

# Same encoding for the validation set; val_data is modified in place.
index_item_desc(val_data)

In [None]:
# Writes train_data (now carrying the id columns) to the same file constant it was loaded from.
mdp.save_data(train_data, MercariConfig.TRAINING_SET_PREP_FILE)

In [None]:
# Writes val_data (now carrying the id columns) to the same file constant it was loaded from.
mdp.save_data(val_data, MercariConfig.VALIDATION_SET_PREP_FILE)