In [97]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.utils import shuffle

import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

In [30]:
# Show full cell contents when frames are displayed. ``None`` disables
# truncation; the former ``-1`` sentinel was deprecated in pandas 1.0 in
# favour of ``None``.
pd.set_option('display.max_colwidth', None)

## Load the Amazon Dataset

In [47]:
# Load amazon_data_clean.csv (path is relative to the working directory).
amazon_fashion = pd.read_csv(filepath_or_buffer='amazon_data_clean.csv')

In [48]:
# Preview the first five rows of the loaded frame.
amazon_fashion.head(n=5)

Unnamed: 0,brand,category,price,product_name
0,TOMS,slippers,21.99,women's classics
1,Vionic,slippers,59.95,women's relax slipper (size 11/dark grey zebra)
2,Birkenstock,slippers,89.95,arizona women's birko-flor sandal
3,TOMS,slippers,19.99,men's classic canvas slip-on
4,ULTRAIDEAS,slippers,19.9,"women's comfort memory foam slippers wool-like plush fleece lined house shoes w/indoor, outdoor anti-skid rubber sole"


In [49]:
# Sanity check: (rows, columns) of the frame at this point.
amazon_fashion.shape

(281211, 4)

## Cleaning the Dataset

We are only interested in the 'product_name' and 'category' columns, so we can drop the other two columns ('brand' and 'price').

In [50]:
# Keep only the text and label columns. Reassigning (rather than mutating with
# inplace=True) together with errors='ignore' makes this cell safe to re-run:
# the original raised KeyError once 'brand' and 'price' were already gone.
amazon_fashion = amazon_fashion.drop(columns=['brand', 'price'], errors='ignore')

First, let's check whether the product_name column contains any NaN values.

In [51]:
# Count the missing values in each column.
amazon_fashion.isna().sum()

category        0
product_name    0
dtype: int64

In [36]:
# Discard every row whose product_name is missing.
amazon_fashion = amazon_fashion.dropna(how='all', subset=['product_name'])

In [55]:
# Shuffle the rows with a fixed seed so the order is repeatable.
amazon_fashion = shuffle(amazon_fashion, random_state = 100)

# Keep the null count as the cell's LAST expression so it is actually rendered;
# placed before the assignment it was evaluated and silently discarded.
# The counts do not depend on row order, so checking after the shuffle is fine.
amazon_fashion.isnull().sum(axis = 0)

Now that we have eliminated the unwanted columns and the rows with NaN values, we can split the data into the features X and the labels y.

In [56]:
# Model input: the product_name column.
X = amazon_fashion.loc[:, 'product_name']

In [57]:
# Target labels: the category column.
y = amazon_fashion.loc[:, 'category']

## Preparation of the data into word vectors and embeddings

We are going to test two distinct approaches for representing the data: the first uses Bag of Words & TF-IDF, and the second uses embeddings from fastText.

### Bag of Words & TF-IDF

Before turning the product names into a bag of words, let's do some pre-processing first, which includes: applying lower case, removing numbers and special characters, and removing the most frequent words that won't help the classification.

In [58]:
# Lower-case every product name; this Series is the working copy that the
# following bag-of-words cleaning cells keep rebinding.
bow_amazon_fashion = X.str.lower()

In [59]:
# Preview the first five lower-cased product names.
bow_amazon_fashion.head(n=5)

47729     womens spark kiska slip on ballet flat shoes                      
169500    dashiki cut print zipfront totem retro package hip skirt          
102270    womens amela grand pump pointed toe classic pumps                 
193662    one piece swimsuits athletic bathing suits training sport swinwear
83994     mens cooper                                                       
Name: product_name, dtype: object

In [60]:
def _strip_non_letters(text):
    """Return ``text`` as a string with everything but ASCII letters and spaces removed."""
    # Characters are deleted outright (nothing is inserted in their place),
    # so e.g. hyphenated words are joined together.
    return re.sub(r'[^a-zA-Z ]', '', str(text))


bow_amazon_fashion = bow_amazon_fashion.map(_strip_non_letters)

In [61]:
def _strip_leading_digits(text):
    """Return ``text`` as a string with any run of digits at its start removed."""
    return re.sub(r'^[\d]*', '', str(text))


# NOTE(review): the preceding cell already deletes every non-letter character,
# digits included, so this pass looks like a no-op -- confirm and consider
# dropping it.
bow_amazon_fashion = bow_amazon_fashion.map(_strip_leading_digits)

Now we need to split the data into training and test sets.

In [62]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
(bow_train_data, bow_test_data,
 bow_train_labels, bow_test_labels) = train_test_split(
    bow_amazon_fashion,
    y,
    random_state=100,
    test_size=0.30,
)

In [63]:
# Shape of the training split.
bow_train_data.shape

(196847,)

In [64]:
# Shape of the held-out test split.
bow_test_data.shape

(84364,)

And finally, create the TF-IDF vectors.

In [65]:
# Word-level TF-IDF: a token is any run of one or more word characters, and the
# vocabulary is capped at 29765 terms.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=29765)

# Learn the vocabulary and IDF weights from the training split only, and
# vectorise it in the same pass. fit_transform is equivalent to fit followed by
# transform but avoids tokenising the training texts twice.
bow_train_data = tfidf_vect.fit_transform(bow_train_data)

# The test split is only transformed with the already-fitted vectoriser.
bow_test_data = tfidf_vect.transform(bow_test_data)

## Defining and Training the Models

In [83]:
def modeling(ml_model, train_data, train_labels, test_data, test_labels):
    """Fit ``ml_model`` on the training split and score it on the test split.

    Parameters
    ----------
    ml_model : object exposing ``fit(data, labels)`` and ``predict(data)``.
        It is fitted in place (the same object that was passed in).
    train_data, train_labels : features and labels handed to ``fit``.
    test_data : features handed to ``predict``.
    test_labels : true labels the predictions are compared against.

    Returns
    -------
    Whatever ``accuracy_score(test_labels, predictions)`` returns.
    """
    # Train on the training split, then predict the held-out rows.
    ml_model.fit(train_data, train_labels)
    predictions = ml_model.predict(test_data)

    # Score the predictions against the true test labels.
    return accuracy_score(test_labels, predictions)

### Run the models with the bag of words

#### SVM Classifier

In [85]:
# Train a linear SVM on the TF-IDF features and keep its test accuracy.
linear_svc_acc = modeling(
    ml_model=LinearSVC(),
    train_data=bow_train_data,
    train_labels=bow_train_labels,
    test_data=bow_test_data,
    test_labels=bow_test_labels,
)

In [86]:
# Rebind linear_svc_acc to its percentage rendered with two decimals, then
# report it. After this cell the name holds a string, not the raw score.
linear_svc_acc = '%.2f' % (100 * linear_svc_acc)
print("Accuracy on the dataset using the SVC Classifier: " + linear_svc_acc)

Accuracy on the dataset using the SVC Classifier: 81.94


#### Logistic Regression Classifier

In [87]:
# Train a logistic regression on the TF-IDF features and keep its test accuracy.
log_reg_acc = modeling(
    ml_model=LogisticRegression(),
    train_data=bow_train_data,
    train_labels=bow_train_labels,
    test_data=bow_test_data,
    test_labels=bow_test_labels,
)

In [88]:
# Rebind log_reg_acc to its percentage rendered with two decimals, then report
# it. After this cell the name holds a string, not the raw score.
log_reg_acc = '%.2f' % (100 * log_reg_acc)
print("Accuracy on the dataset using the Logistic Regression Classifier: " + log_reg_acc)

Accuracy on the dataset using the Logistic Regression Classifier: 82.08


#### Random Forest Classifier

In [89]:
# Train a random forest on the TF-IDF features and keep its test accuracy.
# The forest is seeded (same seed as the shuffle and the train/test split) so
# that the reported score is reproducible from run to run; unseeded, each run
# grows different trees.
random_forest_acc = modeling(
    RandomForestClassifier(random_state=100),
    bow_train_data,
    bow_train_labels,
    bow_test_data,
    bow_test_labels,
)

In [90]:
# Rebind random_forest_acc to its percentage rendered with two decimals, then
# report it. After this cell the name holds a string, not the raw score.
random_forest_acc = '%.2f' % (100 * random_forest_acc)
print("Accuracy on the dataset using the Random Forest Classifier: " + random_forest_acc)

Accuracy on the dataset using the Random Forest Classifier: 79.07


#### AdaBoost Classifier

In [92]:
# Train AdaBoost on the TF-IDF features and keep its test accuracy.
# Seeded (same seed as the shuffle and the train/test split) so the reported
# score is reproducible from run to run.
adaboost_acc = modeling(
    AdaBoostClassifier(random_state=100),
    bow_train_data,
    bow_train_labels,
    bow_test_data,
    bow_test_labels,
)

In [93]:
# Rebind adaboost_acc to its percentage rendered with two decimals, then report
# it. After this cell the name holds a string, not the raw score.
adaboost_acc = '%.2f' % (100 * adaboost_acc)
print("Accuracy on the dataset using the adaboost Classifier: " + adaboost_acc)

Accuracy on the dataset using the adaboost Classifier: 64.71


In [None]:
# Collect the scores of the four classifiers evaluated above, in the order they
# were run. The previous list referenced svc_acc, naive_bayes_acc and
# xgboost_acc, none of which is assigned in the preceding cells, so the cell
# raised NameError on a fresh run.
accuracies = [linear_svc_acc, log_reg_acc, random_forest_acc, adaboost_acc]