### Import Libraries and Load Data

In [9]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import cross_val_score, train_test_split

# Fetch the tokenizer model and stop-word list used by the preprocessing cells.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Raksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the tab-separated restaurant reviews into a DataFrame.
df = pd.read_csv('data/Restaurant_Reviews.tsv', delimiter='\t')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 2)

In [5]:
# Preview the first rows to check the review text and label columns.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
# Distinct label values present in the 'Liked' column.
print(df['Liked'].unique())

[1 0]


In [10]:
# Look at the first ten raw review strings before any cleaning.
df['Review'].head(10)

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
5       Now I am getting angry and I want my damn pho.
6                Honeslty it didn't taste THAT fresh.)
7    The potatoes were like rubber and you could te...
8                            The fries were great too.
9                                       A great touch.
Name: Review, dtype: object

### Data Preprocessing


In [11]:
# Build the cleaned corpus: one lowercase, stop-word-free, stemmed string per
# review, in the same order as the rows of df.
#
# The stop-word set and the stemmer do not change between reviews, so they are
# created once here instead of once per word / once per review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# NOTE(review): stop-word removal also drops negations -- e.g. the review
# "Crust is not good." is reduced to "crust good". Consider keeping "not" if
# negation matters for the sentiment model; behaviour is left unchanged here.
corpus = []
# Iterate over the column itself rather than range(1000) + label indexing, so
# the cell keeps working if the number of rows or the index changes.
for raw_review in df['Review']:
    # replace every non-letter character with a space
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=raw_review)
    # lowercase, then tokenize
    review_words = word_tokenize(review.lower())
    # drop stop words, stem what is left, and re-join into one string
    review = " ".join(ps.stem(word) for word in review_words if word not in stop_words)

    corpus.append(review)
    

In [12]:
# Inspect the first ten cleaned reviews.
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: vocabulary capped at 1500 terms, returned as a dense array.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Target labels: the 'Liked' column (second column of df).
y = df['Liked'].values

In [15]:
# Show the first five feature rows followed by their labels.
print(X[:5], y[:5], sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 0 0 1 1]


### Splitting into Train and Test Sets

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Model Building


In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [23]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Baseline: Gaussian Naive Bayes fitted on the training split and scored on
# the held-out test split.
gnb = GaussianNB().fit(x_train, y_train)
y_pred_gnb = gnb.predict(x_test)
gnb_score = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
print("GaussianNB Accuracy:", gnb_score)


GaussianNB Accuracy: 0.67


In [24]:
# Multinomial Naive Bayes with default smoothing, fitted on the training split
# and scored on the held-out test split.
mnb = MultinomialNB().fit(x_train, y_train)
y_pred_mnb = mnb.predict(x_test)
mnb_score = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print("MultinomialNB Accuracy:", mnb_score)

MultinomialNB Accuracy: 0.745


In [25]:
# Hyperparameter tuning the Naive Bayes Classifier
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.1,1.1,0.1):
  temp_classifier = MultinomialNB(alpha=i)
  temp_classifier.fit(x_train, y_train)
  temp_y_pred = temp_classifier.predict(x_test)
  score = accuracy_score(y_test, temp_y_pred)
  print("Accuracy score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
  if score>best_accuracy:
    best_accuracy = score
    alpha_val = i
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 74.0%
Accuracy score for alpha=0.2 is: 74.0%
Accuracy score for alpha=0.3 is: 73.5%
Accuracy score for alpha=0.4 is: 74.5%
Accuracy score for alpha=0.5 is: 74.0%
Accuracy score for alpha=0.6 is: 74.0%
Accuracy score for alpha=0.7 is: 74.0%
Accuracy score for alpha=0.8 is: 74.0%
Accuracy score for alpha=0.9 is: 74.0%
Accuracy score for alpha=1.0 is: 74.5%
--------------------------------------------
The best accuracy is 74.5% with alpha value as 0.4


In [29]:
def predict_sentiment(sample_review):
  """Predict the sentiment label for a single raw review string.

  The text goes through the same cleaning steps as the training corpus
  (non-letters replaced by spaces, lowercasing, word_tokenize, stop-word
  removal, Porter stemming), is vectorized with the already-fitted
  CountVectorizer `cv`, and is classified by the fitted model `mnb`.

  Parameters
  ----------
  sample_review : str
      Raw review text.

  Returns
  -------
  The array returned by `mnb.predict` for this one sample (a single label;
  in this dataset the 'Liked' column takes the values 1 and 0).
  """
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ', string = sample_review)
  sample_review = sample_review.lower()
  # Tokenize with word_tokenize, exactly as the training corpus was built.
  # A plain str.split() tokenizes some words differently (e.g. "cannot",
  # "gonna"), so the features seen at prediction time would not match the
  # vocabulary learned during training.
  sample_review_words = word_tokenize(sample_review)
  # Build the stop-word set once per call rather than once per word.
  stop_words = set(stopwords.words('english'))
  sample_review_words = [word for word in sample_review_words if word not in stop_words]
  ps = PorterStemmer()
  final_review = ' '.join(ps.stem(word) for word in sample_review_words)

  temp = cv.transform([final_review]).toarray()
  return mnb.predict(temp)

In [30]:
# Predicting values
sample_review = 'The food is really good here.'

if predict_sentiment(sample_review):
  print('This is a POSITIVE review.')
else:
  print('This is a NEGATIVE review!')

This is a POSITIVE review.
