In [29]:
import pandas as pd
import fastText as ft
import os
from sklearn.utils import shuffle

In [71]:
df = pd.read_csv('amazon_data_clean.csv')

In [75]:
df.head(10)

Unnamed: 0,brand,category,price,product_name
0,TOMS,slippers,21.99,women's classics
1,Vionic,slippers,59.95,women's relax slipper (size 11/dark grey zebra)
2,Birkenstock,slippers,89.95,arizona women's birko-flor sandal
3,TOMS,slippers,19.99,men's classic canvas slip-on
4,ULTRAIDEAS,slippers,19.9,women's comfort memory foam slippers wool-like...
5,Shevalues,slippers,10.99,shower sandal slippers quick drying bathroom s...
6,,slippers,13.99,women's comfort slip on memory foam slippers f...
7,Vionic,slippers,49.95,relax - orthaheel orthotic slippers
8,,slippers,12.0,women's terry ballerina slipper with bow for i...
9,,slippers,64.95,ohana slipper - women's blueberry/black 10


In [76]:
#balance the dataset
l = []
for i in set(df.category):
    #for every unique microtag
    temp = df[df.category==i]
    #if count>10000, get only 10000 samples
    if len(temp)>10000:
        l.append(temp.sample(10000))
    else: #otherwise, get everything
        l.append(temp)


In [77]:
#concatenate the dataset
df = pd.concat(l)
df = df[df.category.notnull()]

In [78]:
df = shuffle(df, random_state = 100)

In [79]:
df.tail(10)

Unnamed: 0,brand,category,price,product_name
127754,YVWTUC,swimsuits and cover ups,15.23,plasticity cozy bikini set pcs folkcustom beac...
232502,Jenny Yoo,dresses,115.36,elizabeth chiffon illusion neckline openback l...
69915,Cloudless,mules and clogs,29.99,cute mules for woman fashion pointed toe flat ...
278564,Laurel Burch,socks and hosiery,20.95,crew socks pair polka dot gato black red
201621,NRS,jeans,56.92,dear john denim dear john gisele high rise ski...
5663,Cocominibox,slippers,10.69,women's emoji cotton indoor slippers cartoon p...
170879,,active,35.0,warm thermal fitted run hoodie xsxl plus size xx
261890,thorlos,socks and hosiery,14.99,unisex experia show single pair night berry me...
218572,PRAPRA,tops and tees,15.22,casual loose short sleeve round collar cotton ...
34427,,fashion sneakers,33.54,"women's flex appeal2.0 - newsmaker sneaker,"


In [80]:
df.to_csv("amazon_ready_to_train")

In [113]:
df.head()

Unnamed: 0,brand,category,price,product_name,label
125735,Domy,swimsuits and cover ups,22.99,plus size swimwear retro halter top vintage on...,__label__swimsuits_and_cover_ups plus size swi...
93675,LE CHÂTEAU,pumps,49.97,women's classic leather pointy toe slingback pump,__label__pumps women's classic leather pointy ...
279434,DongDong,dresses,7.82,dress ladies short casual pocket summer sleeve...,__label__dresses dress ladies short casual poc...
255335,,jeans,105.0,super stretch skinny fit distressed jeans wfla...,__label__jeans super stretch skinny fit distre...
92049,,pumps,42.99,women's total motion salima dress pump,__label__pumps women's total motion salima dre...


In [81]:
#create fasttext specific training data
df['label']  = '__label__'+df.category.apply(lambda x: x.replace(' ','_')+' ')
df['label'] = df['label'] + df.product_name
df[['label']].to_csv('training_data.txt', header=None, index=None, mode='a')



In [116]:
#create fasttext unsupervised training data
df[['product_name']].to_csv('training_data_unsupervised.txt',header=None,index=None,mode='a')

In [117]:
train_data = os.path.join(os.getenv("DATADIR", ''), 'training_data.txt')
train_data2 = os.path.join(os.getenv("DATADIR", ''), 'training_data_unsupervised.txt')

In [1]:
#Train unsupervised model
model2 = ft.train_unsupervised(train_data2, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)

NameError: name 'ft' is not defined

In [83]:
#Train supervised model
model = ft.train_supervised(train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
model.save_model("amazon.bin")
model.

In [84]:
#try several cases
print(model.predict('zara blue shirt'))
print(model.predict('calvin klein thermal underwear'))
print(model.predict('apple macbook pro'))

(('__label__tops_and_tees',), array([0.54978126]))
(('__label__lingerie_and_sleepwear',), array([0.97836107]))
(('__label__coats,_jackets_and_vests',), array([0.36347061]))


In [85]:
model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
model.save_model("amazon.ftz")

In [109]:
#try several cases
print(model.predict('zara blue shirt'))
print(model.predict('calvin klein thermal underwear'))
print(model.predict('jack purcell low profile ox sneakers'))


(('__label__tops_and_tees',), array([1.00001001]))
(('__label__lingerie_and_sleepwear',), array([1.00001001]))
(('__label__fashion_sneakers',), array([1.00001001]))


In [58]:
model.test('training_data.txt')

(366402, 0.953635624259693, 0.953635624259693)

In [112]:
model.get_sentence_vector('Rein')

array([  656.0759  ,  -561.1235  ,  -329.5382  ,   115.54471 ,
        -452.32443 ,   278.9113  ,  -328.98642 ,   466.2833  ,
       -1833.0869  ,  -206.65681 ,   584.8116  ,   -44.790337,
         804.19855 ,  -274.65872 ,   966.45087 ,  -111.88818 ,
        -680.48566 ,  -788.7844  ,   944.9018  ,  -333.4626  ,
         443.07907 ,  -504.199   ,   141.8493  ,    92.166954,
         -89.3413  ,   434.30048 ,  1212.2523  ,   951.4056  ,
         979.96234 ,  -189.5821  ,   564.8393  ,   527.22327 ,
        -147.31876 ,  -385.422   , -1374.9117  ,  -423.15045 ,
        -558.5091  ,   436.27032 ,   589.4014  ,  -533.05176 ,
          75.18426 ,  -291.77454 ,  1109.8188  ,   414.12503 ,
         -68.36644 ,   159.42451 ,  -573.76495 ,  -293.90314 ,
         -37.032883,  -191.48688 , -1283.1628  ,  1057.7242  ,
        -118.9082  ,  -294.21207 ,    61.65367 ,   574.1455  ,
        -833.38605 ,   241.08812 ,   -70.14299 ,   315.48724 ,
        1247.2383  ,   624.9851  ,   641.1147  ,  -895.