In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report

import pickle
from joblib import dump, load
     

In [2]:
df = pd.read_csv('DATA1.csv')
df.head()

Unnamed: 0,Product,Item,Category
0,"""Prada Striped Shell Belt Bag""",Bags,Accessories
1,"""Falke - Lhasa Wool And Cashmere-blend Socks -...",Socks,Accessories
2,"""Thom Browne Navy 4-Bar Rib Gloves""",Gloves,Accessories
3,"""Alice Made This - Bayley Round Patina-brass C...",Cufflinks,Accessories
4,"""Bode Off-White Sherpa Half-Zip Mittens""",Gloves,Accessories


In [3]:
df.shape

(11179, 3)

In [7]:
df.Item.unique()

array(['Bags', 'Socks', 'Gloves', 'Cufflinks', 'Tech Accessories', 'Ties',
       'Eyewear', 'Pocket Squares', 'Hats', 'Pins and Clips', 'Belts',
       'Wallets', 'Scarves', 'Active Pants', 'Sweatshirts/Hoodies',
       'Shirts', 'Diapers', 'Baby Creams & Lotions', 'Baby Powder',
       'Baby Wipes', 'Baby Oil & Shampoo', 'Baby Laundry',
       'Baby Gift Sets', 'Baby Health', 'Baby Bath',
       'Combs, Brushes, Clippers', 'Maternity Personal Care',
       'Baby Oral Care', 'Paneer, Tofu & Cream', 'Milk',
       'Butter & Margarine', 'Curd', 'Dairy Free (Vegan)',
       'Flavoured, Soya Milk', 'Cheese', 'Bakery Biscuits, Cookies',
       'Croissants, Bagels', 'Tea Cakes & Slice Cakes', 'Rusks',
       'Khari & Cream Rolls', 'Yogurt & Shrikhand', 'Ice Creams',
       'Premium Cookies', 'Doughnuts & Mousses', 'Bread Sticks & Lavash',
       'Breadcrumbs & Croutons', 'Muffins & Cup Cakes', 'Gourmet Bread',
       'Pastries & Brownies', 'Bathing Bars & Soaps', 'Toothpaste',
       'Antis

In [9]:
df.Product.unique()

array(['"Prada Striped Shell Belt Bag"',
       '"Falke - Lhasa Wool And Cashmere-blend Socks - Mens - Navy"',
       '"Thom Browne Navy 4-Bar Rib Gloves"', ...,
       '"Men\'s Barocco Logo Bath Robe"',
       '"Men\'s Majestic International Stretch Out Shawl Collar Robe"',
       '"Men\'s Polo Ralph Lauren Boulder Cotton Boxers"'], dtype=object)

In [8]:
df['Category'].value_counts()

Beauty & Hygiene            3284
Kitchen, Garden & Pets      1233
Accessories                 1154
Shoes                        811
Shirts                       540
Activewear                   503
Pants                        487
Cleaning & Household         485
Foodgrains, Oil & Masala     445
Gourmet & World Food         393
Snacks & Branded Foods       304
coats                        289
Eggs, Meat & Fish            281
Underwear and Nightwear      212
Suits                        160
Sweaters                     132
Bakery, Cakes & Dairy        119
Jewelry                       97
Beverages                     96
Baby Care                     94
Fruits & Vegetables           59
Category                       1
Name: Category, dtype: int64

In [7]:
import re


def clean_text(text):
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text
clean_text("<div>This is not a sentence.<\div>").split()

['this', 'is', 'not', 'a', 'sentence']

In [8]:
# df1 = df.copy()
# df1['description'] = df1['description'].apply('clean_text', text=df1['description'])

In [10]:

target = {
       'Beauty & Hygiene' : 0,
       'Kitchen, Garden & Pets': 1, 
       'Accessories' : 2,
       'Shoes' : 3,
       'Shirts' : 4,
       'Activewear' : 5,
       'Pants' : 6,
       'Cleaning & Household' : 7,
       'Foodgrains, Oil & Masala' : 8,
       'Gourmet & World Food' : 9,
       'Snacks & Branded Foods' : 10,
       'Coats' : 11,
       'Eggs, Meat & Fish' : 12,
       'Underwear and Nightwear' : 13,
       'Suits' : 14,
       'Sweaters' : 15,
       'Bakery, Cakes & Dairy' : 16,
       'Jewelry' : 17,
       'Beverages' : 18,
       'Baby Care' : 19,
       'Fruits & Vegetables' : 20 
}

In [11]:
df['target']=df['Category'].map(target)
df = df.dropna()

In [12]:
# df.head(-5)
df.target.unique()

array([ 2.,  5., 19., 16.,  0., 18.,  7., 12.,  8., 20.,  9., 17.,  1.,
        6.,  4.,  3., 10., 14., 15., 13.])

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, test_size=0.2, random_state=111)

In [14]:

stemmer = nltk.stem.SnowballStemmer('english')

In [15]:
file_path = "stopwords\english" 
with open(file_path, "r") as file:
    stopwords = file.readlines()

stop = [line.rstrip('\n') for line in stopwords]
stop[::10]

['i',
 "you've",
 'himself',
 'they',
 'that',
 'been',
 'a',
 'while',
 'through',
 'in',
 'here',
 'few',
 'own',
 'just',
 're',
 'doesn',
 'ma',
 "shouldn't"]

In [16]:
porter = PorterStemmer()

In [17]:
def preprocess_data(text):
    ''' The function to remove punctuation,
    stopwords and apply stemming'''
    
    words = re.sub("[^a-zA-Z]", " ", text)
    words = [word.lower() for word in text.split() if word.lower() not in stop]
    words = [porter.stem(word) for word in words]
    return " ".join(words)

In [18]:
df['Product'] = df['Product'].apply(preprocess_data)

In [19]:
df['Product'][459]

'"illesteva - murdoch round acet sunglass - men - black"'

In [19]:
# tokens = df['description'][0]

# nltk.pos_tag(word_tokenize(tokens))

In [20]:
vectorizer = CountVectorizer()
vectorizer.fit(df['Product'])
vector = vectorizer.transform(df['Product'])

In [21]:
print(vector.shape)
print(vector.toarray())

(10889, 8329)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [22]:
tfidf_converter = TfidfTransformer()
X_tfidf = tfidf_converter.fit_transform(vector).toarray()
X_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
X = df['Product']
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8166,), (2723,), (8166,), (2723,))

In [25]:
def get_models(X_train, X_test, y_train, y_test):
    models = pd.DataFrame()
    classifiers = [
        LogisticRegression(),
        LinearSVC(),
        MultinomialNB(),
        RandomForestClassifier(n_estimators=50),
        GradientBoostingClassifier(n_estimators=50), ]

    for classifier in classifiers:
        pipeline = Pipeline(steps=[('vect', CountVectorizer(
                               min_df=5, ngram_range=(1, 2))),
                                    ('tfidf', TfidfTransformer()),
                                    ('classifier', classifier)])
        pipeline.fit(X_train, y_train)
        score = pipeline.score(X_test, y_test)
        param_dict = {
                      'Model': classifier.__class__.__name__,
                      'Score': score
                     }
        models = models.append(pd.DataFrame(param_dict, index=[0]))

    models.reset_index(drop=True, inplace=True)
    print(models.sort_values(by='Score', ascending=False))

In [26]:
get_models(X_train, X_test, y_train, y_test)

  models = models.append(pd.DataFrame(param_dict, index=[0]))
  models = models.append(pd.DataFrame(param_dict, index=[0]))
  models = models.append(pd.DataFrame(param_dict, index=[0]))
  models = models.append(pd.DataFrame(param_dict, index=[0]))


                        Model     Score
1                   LinearSVC  0.925083
3      RandomForestClassifier  0.897539
0          LogisticRegression  0.895703
4  GradientBoostingClassifier  0.850900
2               MultinomialNB  0.838414


  models = models.append(pd.DataFrame(param_dict, index=[0]))


In [27]:
# log_acc = accuracy_score(pred, y_test)
# svm_acc = accuracy_score(y_pred, y_test)
# nb_acc = accuracy_score(pred_y, y_test)
# rf_acc = accuracy_score(preds, y_test)
# gb_acc = accuracy_score(predicted, y_test)
# models = pd.DataFrame({
#                       'Model': ['Logistic Regression', 'SVC', 'Naive Bayes', 'Random Forest', 'Gradient Boosting'],
#                       'Score': [log_acc, svm_acc, nb_acc, rf_acc, gb_acc]})
# models.sort_values(by='Score', ascending=False)

In [29]:
svc = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
               ('tfidf', TfidfTransformer()),
               ('model',LinearSVC()),
               ])

svc.fit(X_train, y_train)

ytest = np.array(y_test)
y_pred = svc.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(ytest, y_pred))

accuracy 0.9250826294528094
                          precision    recall  f1-score   support

             Accessories       0.97      0.98      0.97       295
              Activewear       0.93      0.87      0.90       130
               Baby Care       1.00      0.79      0.88        28
   Bakery, Cakes & Dairy       0.79      0.54      0.64        28
        Beauty & Hygiene       0.95      0.98      0.96       833
               Beverages       0.71      0.57      0.63        21
    Cleaning & Household       0.88      0.84      0.86       131
       Eggs, Meat & Fish       0.98      0.95      0.97        61
Foodgrains, Oil & Masala       0.79      0.81      0.80       110
     Fruits & Vegetables       1.00      0.72      0.84        18
    Gourmet & World Food       0.70      0.70      0.70        97
                 Jewelry       0.92      0.86      0.89        28
  Kitchen, Garden & Pets       0.94      0.95      0.94       304
                   Pants       0.94      0.90  

In [30]:
pickle.dump(svc, open('SVC_model.pkl', 'wb'))

In [31]:
with open('SVC_model.pkl', 'rb') as f:
    model = load(f)

In [32]:
text = ["shirt red"]
prediction = model.predict(text)

print(prediction)

['Shirts']
