In [38]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

In [39]:
data = pd.read_csv("Restaurant_Reviews.tsv", delimiter = "\t", quoting=3)

In [40]:
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [41]:
# Cleaning a dataset corpus
corpus= []
for i in range(1000):
    review = re.sub('[^a-zA-Z]', ' ', data["Review"][i])   #removed the non alphabets and replaced with space
    review = review.lower()
    review=review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words("english")
    all_stopwords.remove("not")
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]   #stem all words that are not stop words
    review = ' '.join(review)
    corpus.append(review)

In [42]:
#creating the Bag of Words
cv = CountVectorizer(max_features=1600)   #This will select 1450 most frequent words
x = cv.fit_transform(corpus).toarray()
y = data["Liked"].values

In [43]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)

In [44]:
classifier = GaussianNB()
classifier.fit(xtrain, ytrain)

In [45]:
ypred= classifier.predict(xtest)
print(np.concatenate((ypred.reshape(len(ypred), 1),ytest.reshape(len(ytest),1)),1))

[[0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 1]
 [1 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]

In [46]:
cnf = confusion_matrix(ytest,ypred)

In [47]:
cnf

array([[73, 61],
       [25, 91]])

In [48]:
print("Accuracy: ",accuracy_score(ytest, ypred))

Accuracy:  0.656


In [49]:
rf_classifier = RandomForestClassifier(n_estimators=50)

In [50]:
rf_classifier.fit(xtrain, ytrain)

In [51]:
rf_ypred = rf_classifier.predict(xtest)

In [52]:
rf_cnf = confusion_matrix(ytest, rf_ypred)
rf_cnf

array([[115,  19],
       [ 25,  91]])

In [53]:
print(accuracy_score(ytest, rf_ypred))

0.824


In [54]:
# trying sentimental analysis for a user given review.

string = input()
string = re.sub('[^a-zA-Z]', ' ', string)
string = string.lower()
string = string.split()
string = [ps.stem(word) for word in string if not word in set(all_stopwords)]
string = ' '.join(string)
new_corpus = [string]
new_xtest = cv.transform(new_corpus).toarray()
new_ypred = classifier.predict(new_xtest)
rf_newypred = rf_classifier.predict(new_xtest)
print("Naive bayes classifier: ", new_ypred)
print("Random Forest classifier: ", rf_newypred)



Naive bayes classifier:  [0]
Random Forest classifier:  [1]
