In [1]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.symbols import ORTH

import data_preparation as mdp
from mercari_config import MercariConfig

In [2]:
%%time

nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'ner'])  
tokenizer = Tokenizer(nlp.vocab)

CPU times: user 12 s, sys: 444 ms, total: 12.5 s
Wall time: 12.4 s


In [3]:
nlp.pipeline

[]

In [4]:
nlp.tokenizer.add_special_case('[rm]', [{ORTH: '[rm]'}])

In [5]:
%%time

word2index = mdp.load_word2index(
    file_name=MercariConfig.WORD_2_INDEX_4_NAME_FILE, 
    max_words_from_index=MercariConfig.MAX_WORDS_FROM_INDEX_4_NAME)

train_data = mdp.load_data(MercariConfig.TRAINING_SET_PREP_FILE, head=None)
val_data = mdp.load_data(MercariConfig.VALIDATION_SET_PREP_FILE, head=None)

CPU times: user 8.14 s, sys: 736 ms, total: 8.88 s
Wall time: 8.88 s


In [10]:
def walk_tokens_4_indexation(data, index, doc, word2index, seq_i, start, end):
    for i in range(start, end):
        tok = doc[i]

        if tok.text in word2index.index:
            data.at[index, 'nm' + str(seq_i)] = word2index.at[tok.text, 'word_id']
        else:
            data.at[index, 'nm' + str(seq_i)] = MercariConfig.OOV_I # <OOV>
        
        seq_i += 1

        print('Token:', tok.text, tok.i)
    
    return seq_i


def index_name(data):
    max_words_name = MercariConfig.MAX_WORDS_IN_NAME
    
    for i in range(max_words_name + 1):
        data['nm' + str(i)] = 0

    data_len = len(data)

    progress = 0

    row_iterator = data.iterrows()

    for index, _ in row_iterator:
        item = data.at[index, 'name']
        doc = nlp(item)
        tok_cnt = len(doc)
        tok_i = 0
        seq_i = 1
        
        data.at[index, 'nm0'] = MercariConfig.START_I # <START>

        print (doc.ents)
        print (doc)

        for ent in doc.ents:
            if seq_i <= max_words_name:
                seq_i = walk_tokens_4_indexation(data=data, index=index, doc=doc, word2index=word2index, 
                                                 seq_i=seq_i, 
                                                 start=tok_i, end=min(ent.start, tok_i + max_words_name - seq_i + 1))

                if ent.text in word2index.index:
                    data.at[index, 'nm' + str(seq_i)] = word2index.at[ent.text, 'word_id']
                else:
                    data.at[index, 'nm' + str(seq_i)] = MercariConfig.OOV_I # <OOV>

                tok_i = ent.end
                seq_i += 1
                
                print('Entity:', ent.text, ent.start, ent.end, ent.label_)

            else:
                break

        walk_tokens_4_indexation(data, index, doc, word2index, 
                                 seq_i, tok_i, min(tok_cnt, tok_i + max_words_name - seq_i + 1))

        progress += 1

        if not progress % 10000:
            print("Progress: %3.2f" % (progress * 100.0 / data_len))

In [12]:
%%time

index_name(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


()
Gymshark cropped hoodie
Token: Gymshark 0
Token: cropped 1
Token: hoodie 2
()
Nine West dress size 10
Token: Nine 0
Token: West 1
Token: dress 2
Token: size 3
Token: 10 4
()
Nike leggings
Token: Nike 0
Token: leggings 1
()
Nike slides
Token: Nike 0
Token: slides 1
()
Vera's Dolls
Token: Vera 0
Token: 's 1
Token: Dolls 2
()
Long,Beige cardigan
Token: Long 0
Token: , 1
Token: Beige 2
Token: cardigan 3
()
F21 strappy bralette
Token: F21 0
Token: strappy 1
Token: bralette 2
()
Apple Watch Band
Token: Apple 0
Token: Watch 1
Token: Band 2
()
4 neutrogena colorsticks
Token: 4 0
Token: neutrogena 1
Token: colorsticks 2
()
American Eagle Jeans
Token: American 0
Token: Eagle 1
Token: Jeans 2
CPU times: user 696 ms, sys: 8 ms, total: 704 ms
Wall time: 697 ms


In [6]:
%%time

index_name(val_data)

Progress: 33.73
Progress: 67.45
CPU times: user 9.18 s, sys: 268 ms, total: 9.44 s
Wall time: 8.92 s


In [7]:
mdp.save_data(train_data, MercariConfig.TRAINING_SET_PREP_FILE)

In [8]:
mdp.save_data(val_data, MercariConfig.VALIDATION_SET_PREP_FILE)

In [13]:
train_data

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,nm0,nm1,nm2,...,id293,id294,id295,id296,id297,id298,id299,id300,category_id,brand_id
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1081706,Gymshark cropped hoodie,1,Women/Athletic Apparel/Shirts & Tops,Gymshark,65.0,0,SZ XS,1,20262,50811,...,0,0,0,0,0,0,0,0,957,1043
1287829,Nine West dress size 10,2,Women/Dresses/Knee-Length,Nine West,25.0,1,Purging my closet and need some things gone. A...,1,31009,46072,...,0,0,0,0,0,0,0,0,983,1765
775398,Nike leggings,1,"Women/Athletic Apparel/Pants, Tights, Leggings",Nike,17.0,0,Size medium never worn,1,30976,55489,...,0,0,0,0,0,0,0,0,956,1758
213500,Nike slides,2,"Women/Dresses/Above Knee, Mini",___VERY_EMPTY_BRAND___,14.0,0,Says boys 6 Will fit women's 7.5-8,1,30976,60925,...,0,0,0,0,0,0,0,0,980,6
868648,Vera's Dolls,1,Kids/Toys/Dolls & Accessories,___VERY_EMPTY_BRAND___,31.0,0,6 cabbage Patch Kids for Vera! Thank you!,1,45089,26,...,0,0,0,0,0,0,0,0,563,6
1025225,"Long,Beige cardigan",3,Women/Sweaters/Cardigan,H&M,19.0,0,In very good condition.,1,26611,45,...,0,0,0,0,0,0,0,0,1051,1045
349665,F21 strappy bralette,1,Handmade/Clothing/Lingerie,___VERY_EMPTY_BRAND___,9.0,1,Black seamless strappy bralette. Very stretchy...,1,16942,61640,...,0,0,0,0,0,0,0,0,180,6
850620,Apple Watch Band,1,Men/Men's Accessories/Watches,___VERY_EMPTY_BRAND___,14.0,0,Rock your Apple Watch in New way with this 38m...,1,6199,45910,...,0,0,0,0,0,0,0,0,622,6
895116,4 neutrogena colorsticks,1,Beauty/Makeup/Lips,Neutrogena,10.0,1,"They are all sealed Almond nude, pink nude, cl...",1,2846,57266,...,0,0,0,0,0,0,0,0,35,1741
1248193,American Eagle Jeans,3,Women/Jeans/Boot Cut,American Eagle,15.0,1,Women's size 12 long Artist style Super Stretc...,1,5891,16031,...,0,0,0,0,0,0,0,0,986,134
