# Using Naive-Bayes Algorithm for Multi-class classification on textual data 

In [1]:
import pandas as pd
df = pd.read_csv('option_1.csv')
df

Unnamed: 0,Text,Cat
0,Hi! I'm Martine Redman and I created this puzz...,puzzles
1,120 colors? I say 120 sticks of fun! And a fre...,arts crafts
2,"What a wonderful creation. Art, in our house, ...",arts crafts
3,"Meowth is the coolest, lots of play value. Gre...",action toy figures
4,You can give a MagnaDoodle to any kid who's ab...,arts crafts
...,...,...
4995,I am always looking for foods that have not be...,snack food
4996,Critical item for our household which includes...,baby food
4997,I ordered these based on the positive reviews ...,snack food
4998,I was a little leery having to buy a box with ...,snack food


In [2]:
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

In [3]:
df['new_Text'] = df['Text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['new_Text'].head()

0    hi! i'm martine redman and i created this puzz...
1    120 colors? i say 120 sticks of fun! and a fre...
2    what a wonderful creation. art, in our house, ...
3    meowth is the coolest, lots of play value. gre...
4    you can give a magnadoodle to any kid who's ab...
Name: new_Text, dtype: object

In [4]:
import nltk
nltk.download('stopwords')
stop = stopwords.words('english')
df['new_Text'] = df['new_Text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head(20)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sunil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,Cat,new_Text
0,Hi! I'm Martine Redman and I created this puzz...,puzzles,hi! i'm martine redman created puzzle briarpat...
1,120 colors? I say 120 sticks of fun! And a fre...,arts crafts,120 colors? say 120 sticks fun! free sharpener...
2,"What a wonderful creation. Art, in our house, ...",arts crafts,"wonderful creation. art, house, comes many col..."
3,"Meowth is the coolest, lots of play value. Gre...",action toy figures,"meowth coolest, lots play value. great use kid..."
4,You can give a MagnaDoodle to any kid who's ab...,arts crafts,give magnadoodle kid who's able grasp stylus; ...
5,Mitchum Clear Gel Anti-Perspirant is an effect...,personal care,mitchum clear gel anti-perspirant effective de...
6,I loved the flavor of the bars!!! I really did...,nutrition wellness,loved flavor bars!!! really expect good are!!!...
7,My 5 year old daughter loved following the lif...,learning education,5 year old daughter loved following life cycle...
8,On my way to a great vacation in the Pocono's ...,electronics for kids,way great vacation pocono's son family. niita ...
9,These things are absolutely marvelous! Serious...,household supplies,"things absolutely marvelous! seriously folks, ..."


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.new_Text).toarray()
labels = df.Cat


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(df['new_Text'], df['Cat'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

MultinomialNB()

In [7]:
print(X_train_tfidf)

  (0, 14182)	0.18830534793615428
  (0, 13738)	0.13367237451953598
  (0, 13360)	0.13194325554294412
  (0, 10353)	0.15822083741116807
  (0, 9696)	0.2344863506695177
  (0, 9500)	0.32362824946958924
  (0, 9499)	0.2710730857331412
  (0, 9278)	0.2557016857306
  (0, 9268)	0.24985775067205893
  (0, 9134)	0.203146521994152
  (0, 7828)	0.14665636606538907
  (0, 7661)	0.18455395156374824
  (0, 6798)	0.26660786759628274
  (0, 6000)	0.09119016049133763
  (0, 5893)	0.10122729485724666
  (0, 5312)	0.3140646439730568
  (0, 3244)	0.2710730857331412
  (0, 2876)	0.20159893825852515
  (0, 2077)	0.16778281354284771
  (0, 1860)	0.20084821002276876
  (0, 1577)	0.2518544684572913
  (0, 785)	0.10980779936089004
  (1, 14851)	0.06760157877956305
  (1, 14765)	0.06760157877956305
  (1, 14549)	0.05364528084212625
  :	:
  (3749, 14134)	0.08932803146170602
  (3749, 13629)	0.22240862740086054
  (3749, 10714)	0.14855102309490137
  (3749, 10518)	0.13996715700088203
  (3749, 10129)	0.08300071710811902
  (3749, 10058)	0.1

In [8]:
#validating the model using a new review 
print(clf.predict(count_vect.transform(['Hi I love these puzzles, I spend all my days playing with them ']))) 

['snack food']


In [9]:
X_test_counts = count_vect.transform(X_test)
tfidf_transformer = TfidfTransformer()
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)

In [10]:
import numpy as np 
from sklearn.metrics import accuracy_score

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
clf.fit(X_train_tfidf,y_train)

(3750, 14937)
(1250, 14937)


MultinomialNB()

In [11]:
y_hat = clf.predict(X_test_tfidf)
y_true = y_test
clf_2 = accuracy_score(y_hat,y_true)

In [12]:
ypred=clf.predict(X_test_tfidf)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, ypred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, ypred)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,ypred)
print("Accuracy:",result2)

Confusion Matrix:
[[ 25   0   0   0   3   0   0   0   5   0   0   0   4   5   2   0   0  23
    0   2]
 [  1  26   0   0   0   0   0   0   0   0   0   0   6   2   0   0   2  27
    3   0]
 [  0   0   0   0   0   0   0   0   0   0   1   0   0   0   1   0   0   6
    0   0]
 [  0   0   0   5   0   0   0   0   0   0   0   1   0   0  12   0   0  58
    0   0]
 [  5   0   0   0  16   0   0   0   1   0   0   0   3   8   0   0   1  13
    2   0]
 [  0   0   0   0   0  41   0   0   0   0   0   2   0   5   3   0   0   8
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   0   0  21
    0   0]
 [  0   0   0   0   0   2   0  24   0   0   0   1   0  11   4   0   0  21
    0   0]
 [  1   0   0   0   0   1   0   1  27   0   0   0   0   6   0   0   0   8
    1   1]
 [  0   0   0   0   0   0   0   0   0  20   0   0   0  10  11   0   0  10
    0   0]
 [  0   0   0   0   0   0   0   0   0   0  22   0   0  20  13   2   0  16
    0   0]
 [  0   0   0   0   0   1   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy achieved for the given model when we used naive bayes is 51.28%

# Using  KNN for Multi-Class Classification

In [14]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [15]:
df['Cat'].unique()

array(['puzzles', 'arts crafts', 'action toy figures', 'personal care',
       'nutrition wellness', 'learning education', 'electronics for kids',
       'household supplies', 'stuffed animals plush', 'health care',
       'medical supplies equipment', 'building toys',
       'sports outdoor play', 'cats', 'dogs', 'breakfast foods',
       'cooking baking supplies', 'snack food', 'baby food',
       'fish aquatic pets'], dtype=object)

In [16]:
df.isnull().values.any()

False

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df['new_Text'], df['Cat'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_train_tfidf,y_train)

KNeighborsClassifier()

In [18]:

print(knn_clf.predict(count_vect.transform(['Hi I love these puzzles, I spend all my days playing with them '])))

['snack food']


In [19]:
y_hat = knn_clf.predict(X_test_tfidf)
y_true = y_test
clf_3 = accuracy_score(y_hat,y_true)

In [20]:
clf_3

0.5856

In [21]:
ypred=knn_clf.predict(X_test_tfidf)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, ypred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, ypred)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,ypred)
print("Accuracy:",result2)

Confusion Matrix:
[[33  1  0  0  8  2  0  0  8  1  0  1  4  0  1  0  2  2  5  1]
 [ 5 32  0  2  5  0  0  0  3  0  1  1  7  0  1  1  4  0  5  0]
 [ 1  0  6  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  1  2 41  0  0  2  1  0  0  0  0  0  0 20  0  0  9  0  0]
 [ 7  2  0  1 29  0  0  0  4  0  1  0  3  1  0  0  1  0  0  0]
 [ 0  1  0  0  1 45  0  2  0  1  2  1  0  1  1  2  1  1  0  0]
 [ 0  1  1  4  0  0  8  0  0  0  0  0  1  0  3  1  0  5  0  0]
 [ 0  2  2  2  0  7  0 42  0  0  0  2  0  3  0  2  0  0  0  1]
 [ 4  3  0  0  0  1  0  1 30  0  2  0  2  1  0  0  1  0  1  0]
 [ 1  0  1  2  0  0  0  1  0 39  1  1  0  3  0  0  0  1  0  1]
 [ 2  2  1  2  2  5  0  3  1  1 34  1  0  5  7  4  0  1  2  0]
 [ 3  4  1  1  2  1  4  4  2  1  5 58  1  5  1  0  0  1  0  0]
 [ 4  9  1  0  3  0  1  2  3  0  0  1 13  2  0  0  4  0  0  0]
 [ 1  5  2  3  1  6  2  7  2  2  5  4  1 64  1  1  0  1  0  0]
 [ 0  0  3 22  0  4  0  2  1  0  7  5  0  3 53  1  0  5  0  0]
 [ 1  0  0  1  1  0  0  0  2  1  2  1

Accuracy acheived for KNN is 58.56% , KNN does better than Naive-bayes for the given dataset 

# Using Decision Trees for Multi Class Cassification 

In [23]:
from sklearn.tree import DecisionTreeClassifier
clf_gini=DecisionTreeClassifier(criterion="gini",random_state=100,max_depth=3,min_samples_leaf=5)

In [24]:
clf_gini.fit(X_train_tfidf,y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=100)

In [25]:
clf_entropy=DecisionTreeClassifier(criterion="entropy",random_state=100,max_depth=3,min_samples_leaf=5)

In [26]:
clf_entropy.fit(X_train_tfidf,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5,
                       random_state=100)

In [27]:
y_pred_gini=clf_gini.predict(X_test_tfidf)

In [28]:
y_pred_entropy=clf_entropy.predict(X_test_tfidf)

In [29]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred_gini)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred_gini)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,y_pred_gini)
print("Accuracy:",result2)

Confusion Matrix:
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  69
    0   0]
 [  0   0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0  65
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   8
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  76
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  49
    0   0]
 [  0   0   0   0   0  24   0   7   0   0   0   0   0   0   0   0   0  28
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  24
    0   0]
 [  0   0   0   0   0   1   0  37   0   0   0   0   0   0   0   0   0  25
    0   0]
 [  0   0   0   0   0   1   0   1   0   0   0   0   0   0   0   0   0  44
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  51
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  73
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred_entropy)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred_entropy)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,y_pred_entropy)
print("Accuracy:",result2)

Confusion Matrix:
[[ 29   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  40
    0   0]
 [ 13   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0  53
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   8
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  76
    0   0]
 [ 17   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  32
    0   0]
 [  1   0   0   0   0   0   0   6   0   0   0   0   0   0   0   0   0  52
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  24
    0   0]
 [ 10   0   0   0   0   0   0  31   0   0   0   0   0   0   0   0   0  22
    0   0]
 [ 18   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  28
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  51
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  73
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  

In [31]:
# Accuracy of Decision tree is v low maybe I didnt prepare my data the right way 

# Using logistic Regression 

In [37]:
print(X_train_tfidf)

  (0, 14182)	0.18830534793615428
  (0, 13738)	0.13367237451953598
  (0, 13360)	0.13194325554294412
  (0, 10353)	0.15822083741116807
  (0, 9696)	0.2344863506695177
  (0, 9500)	0.32362824946958924
  (0, 9499)	0.2710730857331412
  (0, 9278)	0.2557016857306
  (0, 9268)	0.24985775067205893
  (0, 9134)	0.203146521994152
  (0, 7828)	0.14665636606538907
  (0, 7661)	0.18455395156374824
  (0, 6798)	0.26660786759628274
  (0, 6000)	0.09119016049133763
  (0, 5893)	0.10122729485724666
  (0, 5312)	0.3140646439730568
  (0, 3244)	0.2710730857331412
  (0, 2876)	0.20159893825852515
  (0, 2077)	0.16778281354284771
  (0, 1860)	0.20084821002276876
  (0, 1577)	0.2518544684572913
  (0, 785)	0.10980779936089004
  (1, 14851)	0.06760157877956305
  (1, 14765)	0.06760157877956305
  (1, 14549)	0.05364528084212625
  :	:
  (3749, 14134)	0.08932803146170602
  (3749, 13629)	0.22240862740086054
  (3749, 10714)	0.14855102309490137
  (3749, 10518)	0.13996715700088203
  (3749, 10129)	0.08300071710811902
  (3749, 10058)	0.1

In [57]:
from sklearn.linear_model import LogisticRegression
LR_clf=LogisticRegression(multi_class='multinomial')
LR_clf.fit(X_train_tfidf,y_train)

LogisticRegression(multi_class='multinomial')

In [58]:
ypred=LR_clf.predict(X_test_tfidf)

In [59]:
print(len(ypred))
len(np.unique(ypred))

1250


19

In [60]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, ypred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, ypred)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,ypred)
print("Accuracy:",result2)

Confusion Matrix:
[[40  0  0  0  7  0  0  0  5  0  0  0  7  3  2  0  0  2  2  1]
 [ 3 34  0  0  1  0  0  2  0  0  1  1  6  1  0  0  0  6 10  2]
 [ 0  0  0  1  0  0  0  0  0  0  3  0  0  0  1  0  0  2  0  1]
 [ 0  0  0 27  0  0  0  0  0  0  1  1  0  0 16  0  0 31  0  0]
 [ 3  0  0  0 33  0  0  0  2  0  0  1  4  4  0  0  1  0  1  0]
 [ 0  0  0  0  0 45  0  4  0  1  3  2  0  2  0  0  0  2  0  0]
 [ 0  2  0  1  0  0  1  0  0  0  0  1  0  1  3  1  0 14  0  0]
 [ 1  0  0  0  0  2  0 46  1  0  0  4  0  9  0  0  0  0  0  0]
 [ 2  1  0  0  0  1  0  1 31  0  2  0  1  4  0  0  0  1  1  1]
 [ 0  0  0  0  0  0  0  0  0 39  1  1  0  6  2  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  0  1 41  2  0 19  6  2  0  2  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  4 68  0 13  2  1  0  4  0  1]
 [ 3  5  0  0  2  0  0  0  2  0  2  1 15  7  3  0  2  0  1  0]
 [ 1  0  0  0  0  0  0  1  0  0  6  4  0 92  3  0  0  1  0  0]
 [ 0  0  0  6  0  1  0  1  0  0  4  3  0  4 74  1  0 12  0  0]
 [ 1  0  0  0  0  0  0  0  1  0  5  2

  _warn_prf(average, modifier, msg_start, len(result))


So far Logistic Regression has given us the highest accuracy of 65.84% it looks like it's performing pretty well on the given datset as compared to other algorithms 