# Importing Libraries
-----------------


In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB

# Reading Data
-------------------

In [2]:
df = pd.read_csv('set.csv')

In [3]:
df.head()

Unnamed: 0,category,description
0,__label__Clothing-,key features of alisha solid women cycling sho...
1,__label__Furniture-,fabhomedecor fabric double sofa bed finish col...
2,__label__Footwear-,key features of aw bellies sandals wedges heel...
3,__label__Clothing-,key features of alisha solid women cycling sho...
4,__label__Pet-Supplies-,specifications of sicons all purpose arnica do...


# Preprocessing Data
-----------------

In [4]:
df['description'] = df['description'].fillna('').astype(str)

In [5]:
df['category_id'] = df['category'].factorize()[0]
category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
df.head()

Unnamed: 0,category,description,category_id
0,__label__Clothing-,key features of alisha solid women cycling sho...,0
1,__label__Furniture-,fabhomedecor fabric double sofa bed finish col...,1
2,__label__Footwear-,key features of aw bellies sandals wedges heel...,2
3,__label__Clothing-,key features of alisha solid women cycling sho...,0
4,__label__Pet-Supplies-,specifications of sicons all purpose arnica do...,3


In [6]:
len(df.category.unique())

266

In [7]:
df.category_id.unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [9]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.description).toarray()
labels = df.category_id
features.shape

(20000, 25139)

In [10]:
N = 2
for Product, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  #print("# '{}':".format(Product))
  #print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  #print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# Modelling and Evaluation
------------

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['description'], df['category'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [14]:
print(clf.predict(count_vect.transform(["key features of alisha solid women cycling shorts cotton lycra black red specifications of alisha solid women cycling shorts shorts details number of contents in sales package pack of fabric cotton lycra type cycling shorts general details pattern solid ideal for women fabric care gentle machine wash in lukewarm water do not bleach additional details style code altght_ in the box shorts"])))

['__label__Clothing-']


In [17]:
print("The model accuracy is: ",clf.score(count_vect.transform(X_test), y_test)*100)

The model accuracy is:  74.32


### The Naive Bayes model gives us an accuracy of 74.32 