### FastText

In [66]:
import pandas as pd
import fastText as ft
import os
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score

In [61]:
df = pd.read_csv('amazon_data_clean.csv')
df.shape

(281267, 4)

In [26]:
df[df.category=='active'].sample(10)

Unnamed: 0,brand,category,price,product_name
258747,PIYOGA,active,40.0,petite athleisure yoga pants short straight le...
203431,Campeon,active,15.95,kids skinny soccer pants training sweat sport ...
122376,Areke,active,19.99,multi performance advanced cushion crew socks ...
246730,Unitop,active,19.99,quick dry cargo crop hiking pants drawstring
219716,Leonisa,active,50.0,activelife power lift firm compression highwai...
210603,Bise,active,14.99,yoga sports bra high active impact support dou...
264126,,active,14.99,short sleeve athletic dept logo tshirt prime e...
223904,,active,30.0,swoosh sports bra carbon heatheranthraciteblac...
213047,,active,29.11,dancer drapey mesh tshirt
226201,Augusta Sportswear,active,10.33,fanatic tee


In [24]:
df.category.value_counts()

flats                              20256
slippers                           11658
boots                              11174
shorts                             10964
socks and hosiery                  10932
skirts                             10829
athletic shoes                     10813
coats, jackets and vests           10809
tops and tees                      10775
pants                              10735
jumpsuits, rompers and overalls    10691
leggings                           10628
pumps                              10592
swimsuits and cover ups            10552
suiting and blazers                10500
fashion hoodies and sweatshirts    10391
jeans                              10314
lingerie and sleepwear             10293
active                             10206
dresses                            10188
sandals                            10170
outdoor shoes                      10112
fashion sneakers                   10072
sweaters                            9939
mules and clogs 

### Balance the dataset

In [3]:
l = []
for i in set(df.category):
    #for every unique microtag
    temp = df[df.category==i]
    #if count>10000, get only 10000 samples
    if len(temp)>10000:
        l.append(temp.sample(10000))
    else: #otherwise, get everything
        l.append(temp)
df = pd.concat(l)
df = df[df.category.notnull()]

### Shuffle the dataset

In [4]:
df = shuffle(df, random_state = 100)
df.head()

Unnamed: 0,brand,category,price,product_name
233413,KASCLINO,dresses,9.99,vintage dresswomens summer casual dresses line...
242142,,"coats, jackets and vests",30.8,en pointe satin jacket
73011,,mules and clogs,104.9,'cinnamon' mary jane clogs womens - copper / c...
194658,ouxiuli,suiting and blazers,13.87,sleeve one button slim fit blazer jacket
245557,DOLLY LAMB,"coats, jackets and vests",129.99,leather jacket motorcycle biker jacket black c...


### Create FastText training.txt

In [None]:
df['label']  = '__label__'+df.category.apply(lambda x: x.replace(' ','_')+' ')
df['label'] = df['label'] + df.product_name
df[['label']].to_csv('training_data.txt', header=None, index=None, mode='a')

In [None]:
train_data = os.path.join(os.getenv("DATADIR", ''), 'training_data.txt')
train_data2 = os.path.join(os.getenv("DATADIR", ''), 'training_data_unsupervised.txt')

### Train FastText supervised movdel

In [None]:
model = ft.train_supervised(train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
model.save_model("amazon_ver2.bin")


### Test several product names:

In [None]:
print(model.predict('purple tshirt'))
print(model.predict('reebok sneakers'))
print(model.predict('grey cotton sweat pants'))
print(model.predict('blue uniqlo jacket'))

### Quantize Model and save

In [None]:
model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
model.save_model("amazon.bin")

### Test product names again:

In [None]:
print(model.predict('purple tshirt'))
print(model.predict('reebok sneakers'))
print(model.predict('grey cotton sweat pants'))
print(model.predict("Syktkmx Womens Mary Jane Wedges Pumps Ankle Strap Closed Toe Heeled Walking Work Shoes"))

### Get sentence embeddings from FastText
#### Add into dataframe as sent_embed

In [5]:
model = ft.load_model('amazon.ftz')

In [6]:
embed = []
for i in df['product_name']:
    embed.append(model.get_sentence_vector((str(i))))

In [7]:
df['sent_embed'] = embed

### Change Labels into integers

In [8]:
c ={'active': 1,
 'athletic shoes': 2,
 'boots': 3,
 'coats, jackets and vests': 4,
 'dresses': 5,
 'fashion hoodies and sweatshirts': 6,
 'fashion sneakers': 7,
 'flats': 8,
 'jeans': 9,
 'jumpsuits, rompers and overalls': 10,
 'leggings': 11,
 'lingerie and sleepwear': 12,
 'mules and clogs': 13,
 'outdoor shoes': 14,
 'oxford shoes': 15,
 'pants': 16,
 'pumps': 17,
 'sandals': 18,
 'shorts': 19,
 'skirts': 20,
 'slippers': 21,
 'socks and hosiery': 22,
 'suiting and blazers': 23,
 'sweaters': 24,
 'swimsuits and cover ups': 25,
 'tops and tees': 26}

#### Add Column "Label" as integer

In [9]:
df['label']=df['category'].apply(lambda x: int(c[x]))

In [10]:
#Separate Data into X and y:
X = np.array(df.sent_embed.tolist())
y = df['label']

In [11]:
df.head()

Unnamed: 0,brand,category,price,product_name,sent_embed,label
233413,KASCLINO,dresses,9.99,vintage dresswomens summer casual dresses line...,"[-53.521038, -16.029465, -145.3494, 30.784159,...",5
242142,,"coats, jackets and vests",30.8,en pointe satin jacket,"[322.8789, 84.392006, -280.41544, -151.54898, ...",4
73011,,mules and clogs,104.9,'cinnamon' mary jane clogs womens - copper / c...,"[-37.236427, 157.44653, 138.99718, 217.8964, -...",13
194658,ouxiuli,suiting and blazers,13.87,sleeve one button slim fit blazer jacket,"[49.38974, 11.347481, -55.440914, 284.82578, -...",23
245557,DOLLY LAMB,"coats, jackets and vests",129.99,leather jacket motorcycle biker jacket black c...,"[-62.573383, 295.50958, 327.38187, 545.0122, 1...",4


### Split Data using train_test_split

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

### Instantiate KNN Classifier and find best K Value

In [13]:
#Instantiate model
from sklearn.neighbors import KNeighborsClassifier

In [None]:
error_rate = []
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors =i,n_jobs=3)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.figure(figsize=(10,6))
plt.plot(range(1,10),error_rate,color='blue',linestyle='dashed',marker='o',markerfacecolor = 'red',markersize=10)
plt.title('Error Rate vs N_Value')
plt.xlabel('K')
plt.ylabel('Error Rate')


### Choose 4 as n_neighbor value and print classification_report

In [15]:
knn = KNeighborsClassifier(n_neighbors =4,n_jobs=3)
knn.fit(X_train,y_train)
pred_i = knn.predict(X_test)

print(classification_report(y_test,pred_i))

             precision    recall  f1-score   support

          1       0.72      0.79      0.76      2965
          2       0.66      0.76      0.71      2971
          3       0.80      0.85      0.83      2990
          4       0.92      0.92      0.92      2934
          5       0.89      0.98      0.93      3014
          6       0.93      0.94      0.94      2996
          7       0.88      0.83      0.85      2989
          8       0.90      0.91      0.90      2923
          9       0.96      0.98      0.97      3072
         10       0.96      0.97      0.97      2984
         11       0.94      0.95      0.95      2979
         12       0.94      0.91      0.92      3026
         13       0.93      0.93      0.93      2767
         14       0.68      0.65      0.67      3060
         15       0.94      0.89      0.91      2568
         16       0.92      0.90      0.91      2961
         17       0.93      0.94      0.93      2982
         18       0.88      0.84      0.86   

In [67]:
print(accuracy_score(y_test, pred_i))

0.8976372858547694


### Try Random Forest classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
forest_model = RandomForestClassifier(n_jobs=2,random_state=0,n_estimators = 30)
forest_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [47]:
predictforest = forest_model.predict(X_test)

In [37]:
print(accuracy_score(y_test,predictforest))

0.8817995962941877


In [48]:
print(classification_report(y_test,predictforest)) ### with 10 estimators

             precision    recall  f1-score   support

          1       0.70      0.71      0.71      2965
          2       0.68      0.67      0.67      2971
          3       0.82      0.80      0.81      2990
          4       0.92      0.91      0.92      2934
          5       0.91      0.94      0.92      3014
          6       0.93      0.93      0.93      2996
          7       0.84      0.84      0.84      2989
          8       0.90      0.90      0.90      2923
          9       0.97      0.97      0.97      3072
         10       0.97      0.96      0.96      2984
         11       0.93      0.95      0.94      2979
         12       0.91      0.91      0.91      3026
         13       0.94      0.91      0.93      2767
         14       0.65      0.67      0.66      3060
         15       0.93      0.91      0.92      2568
         16       0.89      0.90      0.89      2961
         17       0.93      0.93      0.93      2982
         18       0.84      0.85      0.85   

In [39]:
print(classification_report(y_test,predictforest)) ### with 20 estimators

             precision    recall  f1-score   support

          1       0.69      0.70      0.70      2965
          2       0.67      0.66      0.67      2971
          3       0.81      0.80      0.81      2990
          4       0.91      0.91      0.91      2934
          5       0.89      0.94      0.92      3014
          6       0.93      0.93      0.93      2996
          7       0.84      0.83      0.83      2989
          8       0.89      0.91      0.90      2923
          9       0.96      0.97      0.97      3072
         10       0.96      0.95      0.96      2984
         11       0.92      0.94      0.93      2979
         12       0.91      0.91      0.91      3026
         13       0.93      0.91      0.92      2767
         14       0.64      0.66      0.65      3060
         15       0.92      0.89      0.91      2568
         16       0.88      0.89      0.89      2961
         17       0.93      0.92      0.92      2982
         18       0.84      0.84      0.84   

### Try Logistic Regression

In [31]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions))

0.9299984472853372


In [33]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          1       0.85      0.81      0.83      2965
          2       0.81      0.77      0.79      2971
          3       0.89      0.87      0.88      2990
          4       0.96      0.94      0.95      2934
          5       0.95      0.97      0.96      3014
          6       0.96      0.96      0.96      2996
          7       0.91      0.89      0.90      2989
          8       0.86      0.95      0.90      2923
          9       0.98      0.98      0.98      3072
         10       0.98      0.98      0.98      2984
         11       0.96      0.98      0.97      2979
         12       0.95      0.94      0.95      3026
         13       0.96      0.94      0.95      2767
         14       0.75      0.82      0.79      3060
         15       0.96      0.92      0.94      2568
         16       0.94      0.94      0.94      2961
         17       0.96      0.95      0.95      2982
         18       0.91      0.88      0.90   

### Try SVC

In [49]:
from sklearn.svm import LinearSVC
svcmodel = LinearSVC()
svcmodel.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [51]:
predsvc = svcmodel.predict(X_test)
print(classification_report(y_test,predsvc))

             precision    recall  f1-score   support

          1       0.87      0.55      0.67      2965
          2       0.69      0.74      0.72      2971
          3       0.71      0.93      0.81      2990
          4       0.91      0.94      0.93      2934
          5       0.68      0.98      0.81      3014
          6       0.93      0.91      0.92      2996
          7       0.93      0.75      0.83      2989
          8       0.92      0.88      0.90      2923
          9       0.98      0.97      0.97      3072
         10       0.99      0.92      0.95      2984
         11       0.89      0.97      0.93      2979
         12       0.95      0.83      0.89      3026
         13       0.97      0.85      0.90      2767
         14       0.72      0.65      0.68      3060
         15       0.92      0.89      0.90      2568
         16       0.92      0.86      0.89      2961
         17       0.78      0.98      0.87      2982
         18       0.91      0.82      0.86   

In [68]:
print(accuracy_score(y_test, predsvc))

0.8704518399668755


### Try AdaBoost

In [56]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [58]:
predAda = ada.predict(X_test)
print(classification_report(y_test,predAda))

             precision    recall  f1-score   support

          1       0.72      0.68      0.70      2965
          2       0.70      0.68      0.69      2971
          3       0.76      0.77      0.76      2990
          4       0.88      0.89      0.88      2934
          5       0.86      0.87      0.87      3014
          6       0.90      0.85      0.88      2996
          7       0.83      0.82      0.82      2989
          8       0.81      0.78      0.80      2923
          9       0.95      0.94      0.94      3072
         10       0.93      0.91      0.92      2984
         11       0.85      0.82      0.84      2979
         12       0.85      0.92      0.88      3026
         13       0.86      0.86      0.86      2767
         14       0.68      0.64      0.66      3060
         15       0.85      0.89      0.87      2568
         16       0.83      0.83      0.83      2961
         17       0.86      0.89      0.87      2982
         18       0.80      0.83      0.81   

In [69]:
print(accuracy_score(y_test, predAda))

0.8460224626054552


### Save model to disk

In [60]:
import pickle
filename = 'ada_fasttext.sav'
pickle.dump(ada, open(filename, 'wb'))

In [30]:
logmodel = pickle.load(open('finalized_model.sav', 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9299984472853372
