# Kudzai Sibanda

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import distributions as dist
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
os.chdir(r'C:\Users\Carlos\Onedrive\Desktop\AML')

In [3]:
data = pd.read_excel(r'reviews.xlsx',encoding = 'latin-1',sep ="|")
data.head

<bound method NDFrame.head of             Id                                             Review  Label
0            0                               good and interesting      5
1            1  This class is very helpful to me. Currently, I...      5
2            2  like!Prof and TAs are helpful and the discussi...      5
3            3  Easy to follow and includes a lot basic and im...      5
4            4  Really nice teacher!I could got the point eazl...      4
...        ...                                                ...    ...
107013  107013  Trendy topic with talks from expertises in the...      4
107014  107014  Wonderful! Simple and clear language, good ins...      5
107015  107015   an interesting and fun course. thanks. dr quincy      5
107016  107016  very broad perspective, up to date information...      4
107017  107017  An informative course on the social and financ...      4

[107018 rows x 3 columns]>

# PREPROCESSING

In [4]:
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

In [5]:
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Carlos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
vocab = [] 
  
for i in range(0,107018 ): 
    text = re.sub('[^a-z A-Z]','' , data['Review'][i]) 
    text = text.lower() 
    text = text.split() 
    ps = PorterStemmer() 
    text = ''.join(text) 
    vocab.append(text) 
    

In [7]:
vec = MultiLabelBinarizer()
Xd = vec.fit_transform(vocab) #transform into binary form

In [8]:
#variable definitions
data['Review'] = data['Review'].astype(str) #convert to string format
Yd = data['Label']
N = Xd.shape[0]  #total number of points 
M = np.c_[Xd, Yd] #joining reviews and labels

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xd,Yd, random_state=0)

In [10]:
#unique words in reviews data
vocabulari = Xd.shape[1]
vocabulari

26

# NAIVE BAYES CALCULATIONS

In [11]:
#NAIVE BAYES FUNCTION
def NaiveBayes (likelihood, prior):
    return(likelihood * prior)

In [12]:
#sums of features in each label/rating class
N_1 = (Yd==1).sum()
print('Label 1: ',N_1)
N_2 = (Yd==2).sum()
print('Label 2: ',N_2)
N_3 = (Yd==3).sum()
print('Label 3: ',N_3)
N_4 = (Yd==4).sum()
print('Label 4: ',N_4)
N_5 = (Yd==5).sum()
print('Label 5: ',N_5)

Label 1:  2469
Label 2:  2251
Label 3:  5071
Label 4:  18054
Label 5:  79173


In [13]:
#number of times a feature/word is observed in each class
N_j1 = np.sum(M[Yd==1] [:,:Xd.shape[1]] ,axis=0)
N_j2 = np.sum(M[Yd==2] [:,:Xd.shape[1]] ,axis=0)
N_j3 = np.sum(M[Yd==3] [:,:Xd.shape[1]] ,axis=0)
N_j4 = np.sum(M[Yd==4] [:,:Xd.shape[1]] ,axis=0)
N_j5 = np.sum(M[Yd==5] [:,:Xd.shape[1]] ,axis=0)

In [14]:
#probabilities of a feature belonging to each of the classes
π_1 = N_1/N
π_2 = N_2/N
π_3 = N_3/N
π_4 = N_4/N
π_5 = N_5/N

In [15]:
#calculating the likelihood for each class with laplace smoothing added
likelihood_1 = (np.product(N_j1) + 1)/(N_1 + abs(vocabulari))
likelihood_2 = (np.product(N_j2) + 1)/(N_2 + abs(vocabulari))
likelihood_3 = (np.product(N_j3) + 1)/(N_3 + abs(vocabulari))
likelihood_4 = (np.product(N_j4) + 1)/(N_4 + abs(vocabulari))
likelihood_5 = (np.product(N_j5) + 1)/(N_5 + abs(vocabulari))

In [16]:
#classifying for each class
Class_Label1 = NaiveBayes(likelihood_1, π_1)
Class_Label2 = NaiveBayes(likelihood_2, π_2)
Class_Label3 = NaiveBayes(likelihood_3, π_3)
Class_Label4 = NaiveBayes(likelihood_4, π_4)
Class_Label5 = NaiveBayes(likelihood_5, π_5)

# Model Evaluation

In [17]:
model1 = NaiveBayes(likelihood_1, π_1)
model2 = NaiveBayes(likelihood_2, π_2)
model3 = NaiveBayes(likelihood_3, π_3)
model4 = NaiveBayes(likelihood_4, π_4)
model5 = NaiveBayes(likelihood_5, π_5)

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
nb = MultinomialNB()

nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
y_pred = nb.predict(X_test)

In [40]:
from sklearn import metrics 

In [41]:
print(" Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

 Naive Bayes model accuracy(in %): 74.09456176415623


In [72]:
#test values
print(y_test.value_counts())


5    19824
4     4522
3     1275
1      599
2      535
Name: Label, dtype: int64


In [66]:
#Cross Validation training data
from sklearn.model_selection import cross_val_score

scores = cross_val_score(nb, X_train, y_train, cv=5, scoring='accuracy')
print(scores)

[0.73942565 0.73942565 0.73942565 0.73947172 0.73940942]


In [67]:
#cross validation testing data
scores = cross_val_score(nb, X_test, y_test, cv=5, scoring='accuracy')
print(scores)

[0.74098299 0.74098299 0.74098299 0.74098299 0.74079611]
