In [1]:
# Importing essential libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix
from tabulate import tabulate

In [2]:
# Loading the dataset
# The file is tab-separated; quoting=3 (csv.QUOTE_NONE) makes the parser treat
# quote characters inside a review as ordinary text.
reviews_path = '/content/Restaurant_Reviews.tsv'
df = pd.read_csv(reviews_path, sep='\t', quoting=3)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(1000, 2)

In [4]:
# Column labels of the loaded frame
df.columns

Index(['Review', 'Liked'], dtype='object')

In [5]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# **Data Preprocessing**

In [6]:
# Importing essential libraries for performing Natural Language Processing on 'Restaurant_Reviews.tsv' dataset
import nltk
import re
# Fetch the NLTK stop-word corpus; it has to be available before
# stopwords.words('english') is called by the cleaning code.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Cleaning the reviews
# Build the stop-word set and the stemmer once, outside the loop, instead of
# rebuilding the set for every token and the stemmer for every review.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000) so
# the cell keeps working when the dataset has a different number of rows.
for raw_review in df['Review']:

  # Replacing every non-letter character with a space
  review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=raw_review)

  # Converting the entire review into lower case
  review = review.lower()

  # Tokenizing the review by words
  review_words = review.split()

  # Removing the stop words
  # NOTE(review): negations are dropped here as well ('Crust is not good.'
  # ends up as 'crust good'), which discards sentiment information. Changing
  # that requires the same change in predict_sentiment, so it is left as is.
  review_words = [word for word in review_words if word not in english_stopwords]

  # Stemming the words
  review = [ps.stem(word) for word in review_words]

  # Joining the stemmed words
  review = ' '.join(review)

  # Creating a corpus
  corpus.append(review)

In [8]:
# Inspect the first ten cleaned reviews
corpus[0:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [9]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is capped at 1500 features; the sparse count matrix is
# expanded to a dense array for the estimators used later.
cv = CountVectorizer(max_features=1500)
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

# Target vector: the second column of the frame ('Liked').
y = df['Liked'].values

# **Model Building**

In [10]:
# Candidate models, keyed by the label shown in the comparison table.
# Both tree ensembles get a fixed random_state so repeated runs of the
# notebook produce the same scores; the remaining estimators keep their
# library defaults.
classifiers = {
    'random forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'Svm': SVC(),
    'MultinomialNB': MultinomialNB(),
    'Gradient boost': GradientBoostingClassifier(random_state=42),
    'Logistic regression': LogisticRegression(),
}

In [11]:
def performance(x_train, x_test, y_train, y_test, classifier):
    """Fit ``classifier`` on the training data and score it on the test data.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        classifier: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``, where precision, recall
        and F1 are computed with ``average='weighted'``.
    """
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    accuracy = accuracy_score(y_test, predictions)
    weighted_scorers = (precision_score, recall_score, f1_score)
    weighted_scores = [
        scorer(y_test, predictions, average='weighted')
        for scorer in weighted_scorers
    ]
    return (accuracy, *weighted_scores)

In [12]:
def find_optimal_split_ratio(x, y, classifiers, splitting_ratios):
    """Evaluate every (test ratio, classifier) pair and report the most accurate one.

    For each ratio the data is split with ``train_test_split`` (``random_state=0``)
    and every classifier is fitted and scored through ``performance``. A summary
    line, the best model name and a grid table of all results are printed.

    Args:
        x: Feature matrix.
        y: Label vector.
        classifiers: Mapping of display name -> estimator. The estimators are
            refitted in place for every ratio.
        splitting_ratios: Iterable of ``test_size`` values to try.

    Returns:
        tuple: ``(optimal_ratio, best_cls, max_accuracy)``. ``optimal_ratio`` and
        ``best_cls`` are ``None`` (and ``max_accuracy`` is ``0.0``) when no
        combination was evaluated or none scored above zero.

    NOTE(review): the winner is chosen by accuracy on the held-out split
    itself, so the reported best accuracy is an optimistic estimate.
    """
    optimal_ratio = None
    best_cls = None
    max_accuracy = 0.0
    header = ["Test Ratio", "Classifier", "Accuracy", "Precision", "Recall", "F1-score"]
    table_data = []
    for test_ratio in splitting_ratios:
        xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_ratio, random_state=0)
        for classifier_name, classifier in classifiers.items():
            accuracy, precision, recall, f1 = performance(xtrain, xtest, ytrain, ytest, classifier)
            row_data = [f"{test_ratio:.2f}", classifier_name, f"{accuracy:.4f}", f"{precision:.4f}", f"{recall:.4f}", f"{f1:.4f}"]
            table_data.append(row_data)
            # Strict '>' keeps the first combination on ties.
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                optimal_ratio = test_ratio
                best_cls = classifier_name

    if optimal_ratio is None:
        # Formatting None with ':.2f' would raise TypeError; report it instead.
        print("Optimal Splitting Ratio: n/a (no combination scored above 0)")
    else:
        print(f"Optimal Splitting Ratio:{optimal_ratio:.2f} ")
    print('best model: ', best_cls)
    print(tabulate(table_data, headers=header, tablefmt="grid"))

    # Hand the winner back so callers can reuse the exact (unrounded) ratio.
    return optimal_ratio, best_cls, max_accuracy

In [13]:
# 50 evenly spaced test ratios from 0.30 to 0.70 (np.linspace's default count, made explicit)
splitting_ratios = np.linspace(0.3, 0.7, num=50)
# The function prints its own report; wrapping the call in print() would
# additionally echo its return value after the table, so capture it instead.
split_search_result = find_optimal_split_ratio(X, y, classifiers, splitting_ratios)

Optimal Splitting Ratio:0.41 
best model:  MultinomialNB
+--------------+---------------------+------------+-------------+----------+------------+
|   Test Ratio | Classifier          |   Accuracy |   Precision |   Recall |   F1-score |
|         0.3  | random forest       |     0.6833 |      0.7281 |   0.6833 |     0.6722 |
+--------------+---------------------+------------+-------------+----------+------------+
|         0.3  | Svm                 |     0.72   |      0.7697 |   0.72   |     0.7105 |
+--------------+---------------------+------------+-------------+----------+------------+
|         0.3  | MultinomialNB       |     0.7533 |      0.7538 |   0.7533 |     0.7534 |
+--------------+---------------------+------------+-------------+----------+------------+
|         0.3  | Gradient boost      |     0.7367 |      0.7874 |   0.7367 |     0.7281 |
+--------------+---------------------+------------+-------------+----------+------------+
|         0.3  | Logistic regression |     

In [14]:
# Re-create the split with a 0.41 test ratio and fit the baseline Naive Bayes model.
# NOTE(review): 0.41 is the two-decimal ratio printed by the search; confirm it
# yields the same split as the unrounded ratio the search actually evaluated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.41, random_state=0)
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[160  42]
 [ 58 150]]


In [15]:
# Hyperparameter tuning the Naive Bayes Classifier
# NOTE(review): alpha is selected on X_test/y_test, the same data used to
# report accuracy, so the best accuracy printed below is optimistic; a
# separate validation split or cross-validation would remove that bias.
best_accuracy = 0.0
alpha_val = 0.0
# linspace pins both endpoints and the number of candidates (0.1, 0.2, ..., 1.0);
# np.arange with a float step can gain or lose the last value to rounding.
for alpha in np.linspace(0.1, 1.0, 10):
  temp_classifier = MultinomialNB(alpha=alpha)
  temp_classifier.fit(X_train, y_train)
  temp_y_pred = temp_classifier.predict(X_test)
  score = accuracy_score(y_test, temp_y_pred)
  print("Accuracy score for alpha={} is: {}%".format(round(alpha,1), round(score*100,2)))
  # Strict '>' keeps the smallest alpha among equally accurate candidates.
  if score>best_accuracy:
    best_accuracy = score
    alpha_val = alpha
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 74.63%
Accuracy score for alpha=0.2 is: 75.12%
Accuracy score for alpha=0.3 is: 74.88%
Accuracy score for alpha=0.4 is: 74.63%
Accuracy score for alpha=0.5 is: 74.15%
Accuracy score for alpha=0.6 is: 73.9%
Accuracy score for alpha=0.7 is: 74.39%
Accuracy score for alpha=0.8 is: 74.39%
Accuracy score for alpha=0.9 is: 75.12%
Accuracy score for alpha=1.0 is: 75.61%
--------------------------------------------
The best accuracy is 75.61% with alpha value as 1.0


# **Predictions**

In [17]:
def predict_sentiment(sample_review):
  """Classify one raw review string with the notebook's fitted model.

  The text goes through the same steps as the training corpus (replace
  non-letters with spaces, lower-case, drop English stop words, Porter-stem)
  before it is vectorised with the notebook-level ``cv`` and scored by the
  notebook-level ``classifier``.

  Args:
    sample_review: Raw review text.

  Returns:
    The array returned by ``classifier.predict`` for the single review; a
    truthy element means the review was classified as liked (label 1).
  """
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ', string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()

  # Build the stop-word set once per call instead of once per token.
  english_stopwords = set(stopwords.words('english'))
  sample_review_words = [word for word in sample_review_words if word not in english_stopwords]

  ps = PorterStemmer()
  final_review = ' '.join(ps.stem(word) for word in sample_review_words)

  # transform (not fit_transform): reuse the vocabulary learned from the corpus.
  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [18]:
# Predicting values
sample_review = 'The food is really good here.'

# The prediction's truthiness picks which message is shown.
is_positive = predict_sentiment(sample_review)
print('This is a POSITIVE review.' if is_positive else 'This is a NEGATIVE review!')

This is a POSITIVE review.


In [19]:
# Predicting values
sample_review = 'Food was pretty bad and the service was very slow.'

# The prediction's truthiness picks which message is shown.
is_positive = predict_sentiment(sample_review)
print('This is a POSITIVE review.' if is_positive else 'This is a NEGATIVE review!')

This is a NEGATIVE review!


In [20]:
# Predicting values
sample_review = 'The food was absolutely wonderful, from preparation to presentation, very pleasing.'

# The prediction's truthiness picks which message is shown.
is_positive = predict_sentiment(sample_review)
print('This is a POSITIVE review.' if is_positive else 'This is a NEGATIVE review!')

This is a POSITIVE review.


# **Saving the models**

In [21]:
import pickle

# Persist the fitted CountVectorizer so the same vocabulary can be reloaded
# for inference.
with open('count_vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [24]:
# Refit Naive Bayes with the alpha selected by the tuning loop (alpha_val)
# instead of a hard-coded 1.0, so the saved model stays in sync if the
# search ever picks a different value.
model = MultinomialNB(alpha=alpha_val)
model.fit(X_train, y_train)
with open('multinomialNB.sav', 'wb') as file:
    pickle.dump(model, file)