In [29]:
import pandas as pd
import fastText as ft
import os
from sklearn.utils import shuffle

In [48]:
df = pd.read_csv('amazon_data_shuffled.csv')

In [8]:
df.groupby('category').count()

Unnamed: 0_level_0,product_name,brand,price
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
active,10197,6013,10206
"coats, jackets and vests",10807,6997,10808
dresses,10186,8311,10188
fashion hoodies and sweatshirts,10390,8315,10390
jeans,10314,5275,10310
"jumpsuits, rompers and overalls",10688,9483,10689
leggings,10627,9284,10626
lingerie and sleepwear,10292,7097,10291
pants,10733,7606,10733
shorts,10964,7794,10962


In [49]:
#balance the dataset
l = []
for i in set(df.category):
    #for every unique microtag
    temp = df[df.category==i]
    #if count>10000, get only 10000 samples
    if len(temp)>10000:
        l.append(temp.sample(10000))
    else: #otherwise, get everything
        l.append(temp)


In [50]:
#concatenate the dataset
df = pd.concat(l)
df = df[df.category.notnull()]

In [51]:
df = shuffle(df, random_state = 100)

In [52]:
df.to_csv("amazon_ready_to_train")

In [53]:
#create fasttext specific training data
df['label']  = '__label__'+df.category.apply(lambda x: x.replace(' ','_')+' ')
df['label'] = df['label'] + df.product_name
df[['label']].to_csv('training_data.txt', header=None, index=None, mode='a')

train_data = os.path.join(os.getenv("DATADIR", ''), 'training_data.txt')
model = ft.train_supervised(train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
model.save_model("amazon.bin")

In [55]:
#try several cases
print(model.predict('zara blue shirt'))
print(model.predict('calvin klein thermal underwear'))
print(model.predict('apple macbook pro'))

(('__label__tops_and_tees',), array([0.70327908]))
(('__label__lingerie_and_sleepwear',), array([0.98231769]))
(('__label__socks_and_hosiery',), array([0.68098867]))


In [35]:
model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
model.save_model("amazon.ftz")

In [45]:
#try several cases
print(model.predict('zara blue shirt'))
print(model.predict('calvin klein thermal underwear'))
print(model.predict('shoes'))


(('__label__tops_and_tees',), array([0.95627415]))
(('__label__lingerie_and_sleepwear',), array([0.98613846]))
(('__label__pants',), array([0.92220092]))


In [19]:
model.test('training_data.txt')

(279828, 0.9290278313821347, 0.9290278313821347)