In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import spacy
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled training titles from disk and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
train_csv_path = "D:\\Data science\\Final project\\category\\training_data.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

Unnamed: 0,title,category
0,The Three Amigos,
1,Home Essentials Blue Floral Glass Vintage Styl...,Home & Kitchen
2,Cooper Wiring Quiet Toggle Switch Single Pole ...,Tools & Home Improvement
3,Baseboarders&reg; Wall Brackets,Tools & Home Improvement
4,The Great Wave Off Kanagawa Custom Rectangle M...,Office Products


In [3]:
# Read the titles that still need a category from disk and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
test_csv_path = "D:\\Data science\\Final project\\category\\testing_data.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,title,category
0,Pom Pom Hair Band Rabbit Light Grey Decorative...,
1,Mariposa Golf Ball Napkin Weight,
2,Mediterranean Snack Food Roasted Garlic Hummuz...,
3,John Deere 0071750GX22269 Genuine Original Equ...,
4,"Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...",


In [4]:
# Show the (rows, columns) size of the training frame as loaded.
df_train.shape

(20188, 2)

In [5]:
# List the distinct values present in the category column.
df_train['category'].unique()

array(['None', 'Home & Kitchen', 'Tools & Home Improvement',
       'Office Products', 'Grocery & Gourmet Food', 'Electronics',
       'Industrial & Scientific'], dtype=object)

In [6]:
# Count how many rows carry each category value, to see the class balance.
df_train['category'].value_counts()

None                        10123
Home & Kitchen               4960
Tools & Home Improvement     2080
Office Products              1144
Grocery & Gourmet Food       1102
Industrial & Scientific       588
Electronics                   191
Name: category, dtype: int64

In [7]:
# Count missing values per column before any cleaning.
df_train.isnull().sum()

title       0
category    0
dtype: int64

In [8]:
# Turn the literal string 'None' in the category column into a real missing value
# so that the rows without a label can be dropped with dropna().
# The result is assigned back to the column rather than calling an in-place method on
# df_train.category: with pandas Copy-on-Write, an in-place call on that intermediate
# Series modifies a temporary object and leaves df_train itself unchanged.
df_train['category'] = df_train['category'].replace(to_replace=['None'], value=np.nan)

In [9]:
# Re-count missing values per column now that 'None' categories have been converted to NaN.
df_train.isnull().sum()

title           0
category    10123
dtype: int64

In [10]:
# Drop every row that contains a missing value; df_train itself is modified in place.
df_train.dropna(inplace=True)

In [11]:
# Show the size of the training frame after the rows with missing values were removed.
df_train.shape

(10065, 2)

In [12]:
# Collect the index labels of rows whose category is a whitespace-only string.
# Each row tuple unpacks as (index, title, category); only the category field is inspected.
blanks = [
    row_label
    for row_label, _title, category in df_train.itertuples()
    if type(category) == str and category.isspace()
]

print(blanks)

[]


In [13]:
# Feature (product title) and label (category) columns used for modelling.
X, y = df_train['title'], df_train['category']

In [14]:

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [16]:
# Baseline model: default TF-IDF features feeding a linear SVM.
# The classifier is seeded with the same random_state=42 used by the grid-search pipeline
# and model2, so the baseline is reproducible across runs and comparable with them.
model1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC(random_state=42))])

In [17]:
# Fit the baseline pipeline on the training split.
model1.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [18]:
# Predict categories for the held-out split with the baseline model.
prediction1 = model1.predict(X_test)

In [19]:
# Confusion matrix of the baseline predictions against the held-out labels.
print(confusion_matrix(y_test, prediction1))

[[  28    0   11    1    6   13]
 [   0  286   34    2    0    1]
 [   0   12 1411    7   17   52]
 [   1    3   31   83   10   41]
 [   3    2   69    9  267   16]
 [   1    0   96    8   18  481]]


In [20]:
# Per-category precision, recall and F1 for the baseline model.
print(classification_report(y_test, prediction1))

                          precision    recall  f1-score   support

             Electronics       0.85      0.47      0.61        59
  Grocery & Gourmet Food       0.94      0.89      0.91       323
          Home & Kitchen       0.85      0.94      0.90      1499
 Industrial & Scientific       0.75      0.49      0.59       169
         Office Products       0.84      0.73      0.78       366
Tools & Home Improvement       0.80      0.80      0.80       604

                accuracy                           0.85      3020
               macro avg       0.84      0.72      0.77      3020
            weighted avg       0.84      0.85      0.84      3020



In [21]:
# Overall accuracy of the baseline model on the held-out split.
print(accuracy_score(y_test, prediction1))

0.8463576158940397


In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Pipeline to tune: TF-IDF features feeding a linear SVM with a fixed seed.
model_grid_search = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

# Vectorizer settings to try (3 * 4 * 3 = 36 combinations).
tfidf_grid = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__max_features': (None, 5000, 10000, 50000),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
}

# Classifier settings to try (3 * 2 = 6 combinations).
clf_grid = {
    'clf__C': [0.00001, 0.0001, 0.0005],
    'clf__dual': (True, False),
}

# Passing a list of grids makes GridSearchCV explore each grid on its own
# (36 + 6 = 42 candidates), not the cross product of vectorizer and classifier settings.
parameters = [tfidf_grid, clf_grid]

grid_search = GridSearchCV(
    estimator=model_grid_search,
    param_grid=parameters,
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [26]:
# Run the cross-validated search over all candidate settings on the training split.
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 42 candidates, totalling 168 fits


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', LinearSVC(random_state=42))]),
             n_jobs=-1,
             param_grid=[{'tfidf__max_df': (0.25, 0.5, 0.75),
                          'tfidf__max_features': (None, 5000, 10000, 50000),
                          'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
                         {'clf__C': [1e-05, 0.0001, 0.0005],
                          'clf__dual': (True, False)}],
             verbose=1)

In [27]:
# Show the pipeline configured with the best-scoring settings found by the search.
grid_search.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.25)),
                ('clf', LinearSVC(random_state=42))])

In [28]:
# Show the cross-validated score of the best candidate.
grid_search.best_score_

0.8218610150176513

In [29]:
# Show the parameter values of the best candidate.
grid_search.best_params_

{'tfidf__max_df': 0.25,
 'tfidf__max_features': None,
 'tfidf__ngram_range': (1, 1)}

In [30]:
# Second model: TF-IDF with max_df=0.25, at most 50,000 features and unigrams plus
# bigrams, feeding a seeded linear SVM.
# NOTE(review): the grid search reported best_params_ of max_features=None and
# ngram_range=(1, 1); the values used here differ from that result. Confirm this is intentional.
tfidf_step = TfidfVectorizer(max_df=0.25, max_features=50000, ngram_range=(1, 2))
svm_step = LinearSVC(random_state=42)
model2 = Pipeline([('tfidf', tfidf_step), ('clf', svm_step)])

In [31]:
# Fit the second pipeline on the training split.
model2.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.25, max_features=50000,
                                 ngram_range=(1, 2))),
                ('clf', LinearSVC(random_state=42))])

In [32]:
# Predict categories for the held-out split with the second model.
prediction2 = model2.predict(X_test)

In [33]:
# Confusion matrix of the second model's predictions against the held-out labels.
print(confusion_matrix(y_test, prediction2))

[[  30    0   11    1    8    9]
 [   0  290   31    0    0    2]
 [   0   16 1414    6   11   52]
 [   1    3   31   89    6   39]
 [   3    4   75    8  261   15]
 [   1    3   94   11   13  482]]


In [34]:
# Per-category precision, recall and F1 for the second model.
print(classification_report(y_test, prediction2))

                          precision    recall  f1-score   support

             Electronics       0.86      0.51      0.64        59
  Grocery & Gourmet Food       0.92      0.90      0.91       323
          Home & Kitchen       0.85      0.94      0.90      1499
 Industrial & Scientific       0.77      0.53      0.63       169
         Office Products       0.87      0.71      0.78       366
Tools & Home Improvement       0.80      0.80      0.80       604

                accuracy                           0.85      3020
               macro avg       0.85      0.73      0.78      3020
            weighted avg       0.85      0.85      0.84      3020



In [35]:
# Overall accuracy of the second model on the held-out split.
print(accuracy_score(y_test, prediction2))

0.8496688741721854


In [36]:
# Print the classification reports of both models one after the other for comparison.
print(
    "First Prediction:\n",
    classification_report(y_test, prediction1),
    "\n\nSecond Prediction:\n",
    classification_report(y_test, prediction2),
)

First Prediction:
                           precision    recall  f1-score   support

             Electronics       0.85      0.47      0.61        59
  Grocery & Gourmet Food       0.94      0.89      0.91       323
          Home & Kitchen       0.85      0.94      0.90      1499
 Industrial & Scientific       0.75      0.49      0.59       169
         Office Products       0.84      0.73      0.78       366
Tools & Home Improvement       0.80      0.80      0.80       604

                accuracy                           0.85      3020
               macro avg       0.84      0.72      0.77      3020
            weighted avg       0.84      0.85      0.84      3020
 

Second Prediciton:
                           precision    recall  f1-score   support

             Electronics       0.86      0.51      0.64        59
  Grocery & Gourmet Food       0.92      0.90      0.91       323
          Home & Kitchen       0.85      0.94      0.90      1499
 Industrial & Scientific       

In [37]:
# Title (feature) and category (label) columns of the test frame.
X_t, y_t = df_test['title'], df_test['category']

In [38]:
# Predict a category for every title in the test frame with the second model.
getting_categories = model2.predict(X_t)

In [39]:
# Compare the number of predictions with the number of test rows.
len(getting_categories), len(df_test)

(10094, 10094)

In [40]:
# Overwrite the test frame's category column with the predicted categories.
df_test['category'] = getting_categories

In [43]:
# Preview the first 20 test rows together with their predicted categories.
df_test.head(20)

Unnamed: 0,title,category
0,Pom Pom Hair Band Rabbit Light Grey Decorative...,Home & Kitchen
1,Mariposa Golf Ball Napkin Weight,Home & Kitchen
2,Mediterranean Snack Food Roasted Garlic Hummuz...,Grocery & Gourmet Food
3,John Deere 0071750GX22269 Genuine Original Equ...,Home & Kitchen
4,"Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...",Home & Kitchen
5,tic tac Holiday Candy Cane and Cherry Apple Tw...,Grocery & Gourmet Food
6,DENIED Red Office Stock Self-Inking Rubber Stamp,Office Products
7,Replacement Message Board Face with Tracks for...,Office Products
8,Bundle - 3 Items: Sweet Baby Ray's Buffalo Win...,Grocery & Gourmet Food
9,BravoVision Fashion Custom Chicago Scenery Wat...,Home & Kitchen
