# Product Category Classification using Machine Learning Approaches
## In this notebook, different machine learning approaches are compared for classifying a product's category from its title
* Preprocessing
* Create Word Vector using Count Vector & TF-IDF
* Training different models such as SVM, Naive Bayes, Logistic Regression, etc., and comparing the results

In [36]:
# Import Necessary Packages
import os
import re
import string
import scipy
import seaborn as sns
import numpy as np
import pandas as pd
import time
import nltk
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import ParameterGrid
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from utils.helper_functions import read_dataframes

# Fetch the NLTK stop-word corpus; the recorded output shows it is skipped when already up-to-date.
nltk.download('stopwords')
# Relative path to the data directory.
# NOTE(review): not referenced by any cell shown in this notebook — confirm it is still needed.
ROOT_DATA_PATH = '../data/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Unpack the three frames returned by the project helper read_dataframes().
# NOTE(review): test_df is not used by any cell shown in this notebook.
train_df, val_df, test_df  = read_dataframes()

In [3]:
# Display the training frame; the output shows the columns product_title, cluster_label and category.
train_df

Unnamed: 0,product_title,cluster_label,category
0,apple iphone 8 plus 5 5 single sim 4g 64gb grey,apple iphone 8 plus 64gb,7
1,samsung s5610,samsung gt s5610,7
2,bosch kil42vs30g series 4 built single door fr...,bosch kil42vs30g integrated,5
3,samsung ue49nu7100 49 ultra hd certified hdr s...,samsung ue49nu7100,8
4,siemens standsp ler sr25e205eu 45cm,siemens sr25e205eu white,2
...,...,...,...
21181,samsung rz32m7120bc freezer frost free 315litr...,samsung rz32m7120bc eu black,3
21182,smeg ukud7140lsp integrated counter fridge,smeg ukud7140lsp integrated,5
21183,neff gi1213f30g built freezer 87x54cm 3 freeze...,neff gi1213f30g integrated,3
21184,hisense h49n5700 49 inch smart 4k ultra hd hdr...,hisense h49n5700,8


# Count Vector Vs TF-IDF

In [4]:
# Bag-of-words vectorizer with default settings; it is fitted on the training titles in the next cell.
count_vec = CountVectorizer()

In [5]:
# Fit the vocabulary on the training titles only, then reuse it for the validation titles.
# .toarray() yields a dense ndarray directly instead of going through np.matrix via .todense().
# NOTE(review): dense (n_titles, vocabulary_size) arrays are memory-hungry; consider keeping
# the sparse matrices if every downstream estimator accepts them.
x_train_bow = count_vec.fit_transform(train_df['product_title']).toarray()
y_train = train_df['category'].to_numpy()
x_val_bow = count_vec.transform(val_df['product_title']).toarray()
y_val = val_df['category'].to_numpy()

In [6]:
# Unigram TF-IDF features, fitted on the training titles and applied unchanged to validation.
# .toarray() yields a dense ndarray directly instead of going through np.matrix via .todense().
tfidf = TfidfVectorizer(ngram_range = (1, 1))
x_train_tfidf = tfidf.fit_transform(train_df['product_title']).toarray()
x_val_tfidf = tfidf.transform(val_df['product_title']).toarray()

In [7]:
# Baseline: logistic regression with default settings, fitted on the count-vector features.
model_logistic = LogisticRegression()
model_logistic.fit(x_train_bow, y_train)

In [8]:
# Predict on the validation count-vector features with the model fitted in the previous cell.
y_pred_val = model_logistic.predict(x_val_bow)

In [9]:
# Validation accuracy of the count-vector baseline.
print('Accuracy for Count Vector ', accuracy_score(y_true=y_val, y_pred=y_pred_val))

Accuracy for Count Vector  0.9551118663268195


In [10]:
# Macro-averaged validation F1 of the count-vector baseline.
print('F1 Score for Count Vector', f1_score(y_true=y_val, y_pred=y_pred_val, average='macro'))

F1 Score for Count Vector 0.9566667123834044


In [11]:
# Same default logistic regression, now fitted on the TF-IDF features.
# This rebinds model_logistic, so the count-vector model fitted earlier is replaced.
model_logistic = LogisticRegression()
model_logistic.fit(x_train_tfidf, y_train)

In [12]:
# Predict on the validation TF-IDF features; y_pred_val now holds the TF-IDF model's predictions.
y_pred_val = model_logistic.predict(x_val_tfidf)

In [13]:
# Validation accuracy of the TF-IDF model (y_pred_val comes from the TF-IDF predict cell).
print('Accuracy for TF-IDF ', accuracy_score(y_val, y_pred_val))

Accuracy for Count Vector  0.9528462192013594


In [14]:
# Macro-averaged validation F1 of the TF-IDF model (y_pred_val comes from the TF-IDF predict cell).
print('F1 Score for TF-IDF', f1_score(y_val, y_pred_val, average='macro'))

F1 Score for Count Vector 0.954459748301382


## Count Vector and TF-IDF perform almost identically, with Count Vector slightly ahead, so Count Vector features are used from here on

In [15]:
# Candidate classifiers keyed by the name used in log messages and pickle file names.
# The two ensemble models get a fixed random_state so the comparison table is reproducible
# across kernel restarts.
models_dict = {
    'logistic_regression' : LogisticRegression(max_iter = 200),
    'multinomial_naive_bayes' : MultinomialNB(),
    'svm_with_kernal' : svm.SVC(kernel = 'linear'),
    'random_forest' : RandomForestClassifier(random_state = 42),
    'xgboost' : XGBClassifier(random_state = 42),
}

In [48]:
def train_model(x_train, y_train, x_val, y_val, model_name, model, model_dir='model_files'):
    """Fit ``model``, score it on both splits and pickle the fitted estimator.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``model.fit``.
    x_val, y_val
        Validation features and labels, used only for scoring.
    model_name : str
        Label used in the progress message and in the pickle file name.
    model
        Estimator exposing ``fit`` and ``predict``.
    model_dir : str, optional
        Directory receiving ``{model_name}_classification.pkl``; it is
        created when missing. Defaults to ``'model_files'``.

    Returns
    -------
    tuple
        ``(train_accuracy, train_f1, val_accuracy, val_f1)``, with F1
        macro-averaged.
    """
    # Only the fit call is timed; prediction and scoring are excluded.
    started = time.time()
    model.fit(x_train, y_train)
    elapsed = time.time() - started
    print(f"Training Finished for {model_name} in {elapsed} seconds")

    y_train_pred = model.predict(x_train)
    y_val_pred = model.predict(x_val)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='macro')
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred, average='macro')

    # Ensure the target directory exists so a fresh checkout does not lose a
    # long training run to FileNotFoundError at the very last step.
    os.makedirs(model_dir, exist_ok=True)
    with open(os.path.join(model_dir, f'{model_name}_classification.pkl'), 'wb') as f:
        pickle.dump(model, f)
    return train_accuracy, train_f1, val_accuracy, val_f1

In [49]:
# Empty summary table plus one accumulator list per metric, filled by the training loop.
result_df = pd.DataFrame()
train_accuracy_arr, train_f1_arr = [], []
val_accuracy_arr, val_f1_arr = [], []

In [50]:
# Train every candidate on the count-vector features and record its four metrics.
# Re-running this cell appends to the same lists again, so reset them first.
for model_name, estimator in models_dict.items():
    trn_acc, trn_f1, vl_acc, vl_f1 = train_model(
        x_train_bow, y_train, x_val_bow, y_val, model_name, estimator
    )
    train_accuracy_arr.append(trn_acc)
    train_f1_arr.append(trn_f1)
    val_accuracy_arr.append(vl_acc)
    val_f1_arr.append(vl_f1)

Training Finished for logistic_regression in 11.949161767959595 seconds
Training Finished for multinomial_naive_bayes in 23.066933155059814 seconds
Training Finished for svm_with_kernal in 464.4019694328308 seconds
Training Finished for random_forest in 78.87743735313416 seconds
Training Finished for xgboost in 79.26292753219604 seconds


In [51]:
# Assemble the comparison table in a single constructor call so the cell is idempotent
# and does not depend on the state of a frame created (or later sorted) in another cell.
result_df = pd.DataFrame({
    'Model': list(models_dict),
    'Train Accuracy': train_accuracy_arr,
    'Train F1': train_f1_arr,
    'Val Accuracy': val_accuracy_arr,
    'Val F1': val_f1_arr,
})

In [52]:
# Rank models by validation accuracy, best first (reassignment instead of inplace mutation).
result_df = result_df.sort_values(by = 'Val Accuracy', ascending = False)

In [53]:
# Display the comparison table, ordered by validation accuracy by the preceding cell.
result_df

Unnamed: 0,Model,Train Accuracy,Train F1,Val Accuracy,Val F1
2,svm_with_kernal,0.99915,0.999201,0.957094,0.958524
3,random_forest,0.999906,0.999895,0.955253,0.956461
0,logistic_regression,0.990843,0.990886,0.955112,0.956667
4,xgboost,0.957519,0.958389,0.945058,0.94632
1,multinomial_naive_bayes,0.967809,0.966381,0.940952,0.938766


## Grid Search for hyperparameter tuning

In [38]:
# Search space: linear-kernel SVC evaluated at three values of C.
svm_params = {
    'kernel': ['linear'],
    'C': [0.1, 1, 10]
}
svm_model = svm.SVC()

# Running best, seeded with the first grid point and a zero score so that any
# candidate with positive validation accuracy replaces it.
best_accuracy = 0
best_params = ParameterGrid(svm_params)[0]
best_model = svm_model

In [39]:
# Manual grid search: refit the shared svm_model for every grid point and keep the
# parameters with the highest validation accuracy. The timing covers fit, both
# predictions and scoring.
# NOTE(review): only best_params / best_accuracy are updated here; best_model is not.
for step, candidate in enumerate(ParameterGrid(svm_params)):
    started = time.time()
    print(f'Grid Search Step {step} using {candidate}')
    svm_model.set_params(**candidate)
    svm_model.fit(x_train_bow, y_train)
    y_train_pred = svm_model.predict(x_train_bow)
    y_val_pred = svm_model.predict(x_val_bow)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    end_time = time.time() - started
    print(f'Training accuracy: {train_accuracy}, Validation accuracy: {val_accuracy}, Finished in {end_time} seconds')
    print('')
    if best_accuracy < val_accuracy:
        best_params = candidate
        best_accuracy = val_accuracy

Grid Search Step 0 using {'C': 0.1, 'kernel': 'linear'}
Training accuracy: 0.9621448126121024, Validation accuracy: 0.9415179835740584, Finished in 1185.2666008472443 seconds

Grid Search Step 1 using {'C': 1, 'kernel': 'linear'}
Training accuracy: 0.9991503823279524, Validation accuracy: 0.9570943075615973, Finished in 1073.1937501430511 seconds

Grid Search Step 2 using {'C': 10, 'kernel': 'linear'}
Training accuracy: 0.9998583970546587, Validation accuracy: 0.956952704616256, Finished in 1128.784733057022 seconds



In [43]:
# Build an SVC configured with the winning grid point and report the search result.
# NOTE(review): this estimator is freshly constructed and is not fitted in this cell.
best_score = best_accuracy
best_model = svm.SVC(**best_params)
print(f"Best model: {best_model}")
print(" ")
print(f"Best parameters: {best_params}")
print(f"Best validation accuracy: {best_score}")

Best model: SVC(C=1, kernel='linear')
 
Best parameters: {'C': 1, 'kernel': 'linear'}
Best validation accuracy: 0.9570943075615973


## Save Count Vector Object for Prediction

In [37]:
# Persist the count vectorizer so the same vocabulary can be reused at prediction time.
# The directory is created first so the write cannot fail on a fresh checkout.
os.makedirs('model_files', exist_ok=True)
with open('model_files/count_vectorizer.pkl', 'wb') as file:
    pickle.dump(count_vec, file)