In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

import re
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
# Show complete product names instead of truncating long cell values.
# None means "no limit"; the former -1 sentinel is deprecated since pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

## Load the Amazon Dataset

In [3]:
amazon_fashion = pd.read_csv('amazon_fashion.csv')

In [4]:
amazon_fashion.head()

Unnamed: 0,product_name,brand,price,category
0,Grey/Camo Stingray Escape Bodybuilding Weightlifting MMA & Boxing Shoe,Otomix,133.0,athletic shoes
1,Women's Hexaffect 5.0 MTM Running Shoe,,35.95,athletic shoes
2,Women's Stan Smith Fashion Sneakers,,38.62,fashion sneakers
3,Women's Summer Casual Sleeveless Long Sleeve Mini Plain Pleated Tank Vest Dresses T-Shirt Dress,VERABENDI,9.99,dresses
4,"Evie Nubuck Mule, 6.5",,54.99,mules and clogs


In [5]:
amazon_fashion.shape

(281267, 4)

## Cleaning the Dataset

We are only interested in the 'product_name' and 'category' columns, so we can drop the other two columns.

In [6]:
# Keep only the text and the label columns. Re-binding the name (instead of
# inplace=True) together with errors='ignore' makes this cell safe to re-run:
# a second execution no longer raises KeyError for columns already removed.
amazon_fashion = amazon_fashion.drop(columns=['brand', 'price'], errors='ignore')

First let's check if the product_name column includes any nan values

In [7]:
amazon_fashion.isnull().sum(axis = 0)

product_name    29
category        0 
dtype: int64

In [8]:
# Discard rows that have no product name. With a single-column subset the
# how='all' flag was redundant, and re-binding avoids mutating the frame in place.
amazon_fashion = amazon_fashion.dropna(subset=['product_name'])

In [9]:
amazon_fashion.isnull().sum(axis = 0)

product_name    0
category        0
dtype: int64

Now that we eliminated the unwanted columns and the rows with nan values, we can split the data into X and y.

In [10]:
X = amazon_fashion['product_name']

In [11]:
y = amazon_fashion['category']

## Preparation of the data into word vectors and embeddings

We are going to test 2 distinct approaches for the data. The first one using Bag of Words & TF-IDF, and the second one using Embeddings from fastText

### Bag of Words & TF-IDF

Before turning the product names into a bag of words, let's do some pre-processing first, which includes: applying lower case, removing numbers and special characters, and removing the most frequent words that won't help the classification.

In [12]:
bow_amazon_fashion = X.str.lower()

In [13]:
bow_amazon_fashion.head()

0    grey/camo stingray escape bodybuilding weightlifting mma & boxing shoe                         
1    women's hexaffect 5.0 mtm running shoe                                                         
2    women's stan smith fashion sneakers                                                            
3    women's summer casual sleeveless long sleeve mini plain pleated tank vest dresses t-shirt dress
4    evie nubuck mule, 6.5                                                                          
Name: product_name, dtype: object

In [14]:
# Keep ASCII letters and spaces only; digits, punctuation and every other
# character are deleted outright (they are not replaced by a space).
non_letter_pattern = re.compile(r'[^a-zA-Z ]')
bow_amazon_fashion = bow_amazon_fashion.map(lambda name: non_letter_pattern.sub('', str(name)))

In [15]:
# Remove a leading run of digits from each product name.
# NOTE(review): the non-letter filter in cell In [14] already deletes every digit,
# so this substitution looks like a no-op when the cells run in order — confirm.
leading_digits_pattern = re.compile(r'^[\d]*')
bow_amazon_fashion = bow_amazon_fashion.map(lambda name: leading_digits_pattern.sub('', str(name)))

In [16]:
# NLTK's English stop-word list, stored as a set so the per-token membership
# test below is a hash lookup rather than a scan of the whole list.
stop_words = set(stopwords.words('english'))

# Rebuild every product name without its stop words (tokens are split on whitespace).
bow_amazon_fashion = bow_amazon_fashion.apply(
    lambda name: " ".join(word for word in name.split() if word not in stop_words))

In [17]:
# Flatten all cleaned names into one token list and show the 20 most common tokens.
all_tokens = ' '.join(bow_amazon_fashion).split()
most_freq = pd.Series(all_tokens).value_counts().iloc[:20]
most_freq

womens    205123
women     35759 
shoes     27282 
sleeve    23510 
long      22917 
casual    22593 
dress     21995 
high      19843 
pants     17193 
size      16100 
flat      15786 
toe       14216 
short     14082 
top       13305 
skirt     13118 
waist     12951 
plus      12179 
lace      12030 
socks     11311 
jacket    11265 
dtype: int64

We can observe that some words are indicative of the actual category the item belongs to, while others won't help the classification process. Generic words such as 'womens', 'women' and 'size' will hence be removed.

In [18]:
# Generic tokens that carry no category information; note this re-binds
# `most_freq` from the frequency table to a plain list of words to drop.
most_freq = ['womens', 'women', 'size']

bow_amazon_fashion = bow_amazon_fashion.apply(
    lambda name: " ".join(token for token in name.split() if token not in most_freq))

Now we need to split the data into train and test datasets

In [19]:
# Hold out 33% of the cleaned names for evaluation; the fixed seed keeps the
# split identical between runs.
bow_train_data, bow_test_data, bow_train_labels, bow_test_labels = train_test_split(
    bow_amazon_fashion,
    y,
    test_size=0.33,
    random_state=42,
)

In [20]:
bow_train_data.shape

(188429,)

In [21]:
bow_test_data.shape

(92809,)

And finally, create the TF_IDF vector

In [22]:
# Word-level TF-IDF features. The vocabulary and IDF weights are learned from
# the training split only; the test split is merely transformed with them.
# NOTE(review): max_features=29765 is an unexplained constant — document its origin.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=29765)

# fit_transform learns the vocabulary and vectorises the training texts in a
# single pass instead of tokenising them twice (fit, then transform).
bow_train_data = tfidf_vect.fit_transform(bow_train_data)
bow_test_data = tfidf_vect.transform(bow_test_data)

### Get Embeddings from fastText

## Defining and Training the Models

In [23]:
def modeling(ml_model, train_data, train_labels, test_data, test_labels):
    """Fit an estimator on the training split and score it on the test split.

    Parameters
    ----------
    ml_model : estimator object exposing ``fit`` and ``predict``; it is fitted
        in place.
    train_data, train_labels : features and targets passed to ``fit``.
    test_data, test_labels : features passed to ``predict`` and the targets the
        predictions are compared against.

    Returns
    -------
    The value returned by ``accuracy_score(test_labels, predictions)``.
    """
    # Train on the training split, then predict the held-out samples.
    ml_model.fit(train_data, train_labels)
    predictions = ml_model.predict(test_data)

    # Score the predictions against the true test labels.
    return accuracy_score(test_labels, predictions)

### Run the models with the bag of words

#### SVM Classifier

In [26]:
# Train and score a linear SVM on the TF-IDF features. A fixed random_state
# pins the solver's internal shuffling so the reported accuracy is reproducible.
linear_svc_acc = modeling(LinearSVC(random_state=42), bow_train_data, bow_train_labels, bow_test_data, bow_test_labels)

In [27]:
# Turn the accuracy into a two-decimal percentage string and report it.
# Note: this re-binds the float to a string, so the cell cannot be re-run
# without first re-running the training cell above.
linear_svc_acc = "{:.2f}".format(linear_svc_acc * 100)
print("Accuracy on the dataset using the SVC Classifier: " + linear_svc_acc)

Accuracy on the dataset using the SVC Classifier: 81.86


#### Logistic Regression Classifier

In [28]:
log_reg_acc = modeling(LogisticRegression(), bow_train_data, bow_train_labels, bow_test_data, bow_test_labels)

In [29]:
# Turn the accuracy into a two-decimal percentage string and report it.
# Note: this re-binds the float to a string, so the cell cannot be re-run
# without first re-running the training cell above.
log_reg_acc = "{:.2f}".format(log_reg_acc * 100)
print("Accuracy on the dataset using the Logistic Regression Classifier: " + log_reg_acc)

Accuracy on the dataset using the Logistic Regression Classifier: 81.78


#### Random Forest Classifier

In [30]:
# Train and score a random forest on the TF-IDF features. Random forests are
# stochastic (bootstrap samples, feature sub-sampling), so the seed is fixed to
# make the reported accuracy reproducible.
random_forest_acc = modeling(RandomForestClassifier(random_state=42), bow_train_data, bow_train_labels, bow_test_data, bow_test_labels)

In [31]:
# Turn the accuracy into a two-decimal percentage string and report it.
# Note: this re-binds the float to a string, so the cell cannot be re-run
# without first re-running the training cell above.
random_forest_acc = "{:.2f}".format(random_forest_acc * 100)
print("Accuracy on the dataset using the Random Forest Classifier: " + random_forest_acc)

Accuracy on the dataset using the Random Forest Classifier: 78.96


#### Naive Bayes Classifier

In [32]:
# Train and score Gaussian Naive Bayes; both TF-IDF matrices are converted to
# dense arrays with .toarray() before being handed to the model.
# NOTE(review): with ~188k training rows and up to 29,765 features the dense
# copies may need tens of GB of RAM — confirm this fits in memory, or consider
# a Naive Bayes variant that accepts sparse input.
naive_bayes_acc = modeling(GaussianNB(), bow_train_data.toarray(), bow_train_labels, bow_test_data.toarray(), bow_test_labels)

In [33]:
# Turn the accuracy into a two-decimal percentage string and report it.
# Note: this re-binds the float to a string, so the cell cannot be re-run
# without first re-running the training cell above.
naive_bayes_acc = "{:.2f}".format(naive_bayes_acc * 100)
print("Accuracy on the dataset using the Naive Bayes Classifier: " + naive_bayes_acc)

Accuracy on the dataset using the Naive Bayes Classifier: 33.80


#### XGBoost Classifier

In [34]:
xgboost_acc = modeling(XGBClassifier(), bow_train_data, bow_train_labels, bow_test_data, bow_test_labels)

  if diff:


In [35]:
# Turn the accuracy into a two-decimal percentage string and report it.
# Note: this re-binds the float to a string, so the cell cannot be re-run
# without first re-running the training cell above.
xgboost_acc = "{:.2f}".format(xgboost_acc * 100)
print("Accuracy on the dataset using the XGBoost Classifier: " + xgboost_acc)

Accuracy on the dataset using the XGBoost Classifier: 77.77


In [None]:
# Collect the reported accuracies in the order the models were evaluated.
# The LinearSVC score lives in `linear_svc_acc`; `svc_acc` is never defined in
# this notebook and raised NameError.
accuracies = [linear_svc_acc, log_reg_acc, random_forest_acc, naive_bayes_acc, xgboost_acc]

## Class for testing

Training the selected model

In [36]:
# Fit a fresh logistic-regression model on the TF-IDF training matrix; this is
# the classifier that get_category_proba queries.
final_model = LogisticRegression()
final_model.fit(X=bow_train_data, y=bow_train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
def _normalize_product_name(product_name):
    """Apply the training-time text clean-up to a single product name.

    Mirrors the notebook's pre-processing cells: lower-case, delete every
    character that is not an ASCII letter or a space, then drop the stop words
    and the generic words held in the notebook-level ``stop_words`` and
    ``most_freq`` collections.
    """
    text = re.sub(r'[^a-zA-Z ]', '', str(product_name).lower())
    dropped_words = set(stop_words) | set(most_freq)
    return " ".join(word for word in text.split() if word not in dropped_words)


def get_category_proba(product_name):
    """Predict the category of a product name and describe the result.

    The name is cleaned exactly like the training data before it is vectorised,
    so that e.g. "T-Shirt" maps to the same token ("tshirt") the model was
    trained on. Relies on the notebook-level ``tfidf_vect`` (fitted vectoriser)
    and ``final_model`` (fitted classifier exposing ``predict_proba`` and
    ``classes_``).

    Parameters
    ----------
    product_name : str
        Raw product title as a user would type it.

    Returns
    -------
    str
        A sentence naming the most probable category and its probability as a
        percentage with two decimals.
    """
    features = tfidf_vect.transform([_normalize_product_name(product_name)])

    # One predict_proba call yields both the label and its probability, which
    # guarantees the two always refer to the same class.
    class_probabilities = final_model.predict_proba(features)[0]
    best_index = int(np.argmax(class_probabilities))
    category = final_model.classes_[best_index]
    prob = "{:.2f}".format(class_probabilities[best_index] * 100)

    # Format the scalar label, not the one-element array, so the message reads
    # "jeans" rather than "['jeans']".
    result = "The item named {} was fitted under the {} category, with a probability of {}%".format(
        product_name, category, prob)

    return result

In [61]:
result = get_category_proba('Levis High Rise Skinny Jeans')
print(result)

The item named Levis High Rise Skinny Jeans was fitted under the ['jeans'] category, with a probability of 94.37%


In [62]:
result = get_category_proba('Long Sleeve Knit Cardigan')
print(result)

The item named Long Sleeve Knit Cardigan was fitted under the ['sweaters'] category, with a probability of 93.76%
