# **Importing Important Libraries**

---



In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t", quoting=3)
data.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


# **Data Wrangling**


---


<ul>
<li>Cleaning only the alphabetical data.</li>
<li>Making all data lowercase.</li>
<li>Removing Stopwords.</li>
</ul>

---

In [3]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as ps

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nanus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
corpus = []

for i in range(0,1000):

  review = re.sub(pattern='[^a-zA-Z]',repl=' ',string=data['Review'][i])
  review = review.lower()
  review_words = review.split()
  review_words = [ word for word in review_words if not word in set(stopwords.words('english'))]
  review = [ps().stem(word) for word in review_words]
  review = ' '.join(review)
  corpus.append(review)

corpus[:1000]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberri mmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'place worth time let alon vega',
 'like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street taco friendli staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final blow',
 'found place acc

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = data.iloc[: ,1].values

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# **Multinomial Naive Baise Classifier**

---




In [7]:
from sklearn.naive_bayes import MultinomialNB
classifier =  MultinomialNB()
classifier.fit(X_train,y_train)

In [8]:
y_predict = classifier.predict(X_test)
y_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1])

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 76.5%
The precision score for the model is 76.41509433962264%
The recall score for the model is 78.64077669902912%


<h3>Hyperparameter Tuning</h3>

In [10]:
best_score = 0.0
alp = 0.0
for i in np.arange (0.1,1.1,0.1):
  classifier = MultinomialNB(alpha = i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    alp = i

print("Best accuracy Score is "+str(best_score*100)+" for alpha "+str(alp))

Best accuracy Score is 78.5 for alpha 0.2


<h3>Using the best model as per hyperparameter tuning</h3>

In [11]:
from sklearn.naive_bayes import MultinomialNB
classifier =  MultinomialNB(alpha=0.2)
classifier.fit(X_train,y_train)

In [12]:
y_predict = classifier.predict(X_test)
y_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1])

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 78.5%
The precision score for the model is 77.77777777777779%
The recall score for the model is 81.55339805825243%


<h3>Testing the model Against Random Inputs</h3>

In [14]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [15]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Negative
The review 3 is Negative
The review 4 is Positive
The review 5 is Negative


# **Logistic Regression**

---



In [16]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

classifier = LogisticRegression()
classifier.fit(X_train,y_train)

In [17]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1])

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 78.0%
The precision score for the model is 78.0%
The recall score for the model is 78.0%


<h3>Testing the model against random inputs.</h3>

In [19]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [20]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Positive
The review 3 is Negative
The review 4 is Positive
The review 5 is Negative


# **Decision Tree Classifier**


---



In [21]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", max_depth=1)
classifier.fit(X_train,y_train)

In [22]:
y_predict = classifier.predict(X_test)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 57.99999999999999%
The precision score for the model is 100.0%
The recall score for the model is 16.0%


<h3>Hyperparameter Tuning</h3>

In [24]:
best_score = 0.0
dep = 0
for i in range (10,100):
  classifier = DecisionTreeClassifier(criterion="entropy", max_depth=i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    dep = i

print("Best accuracy Score is "+str(best_score*100)+" for max_depth "+str(dep))

Best accuracy Score is 79.5 for max_depth 35


<h3>Using the best model as per hyperparameter tuning</h3>

In [25]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", max_depth=30)
classifier.fit(X_train,y_train)

In [26]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0])

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 79.0%
The precision score for the model is 89.1891891891892%
The recall score for the model is 66.0%


<h3>Testing the model against random inputs.</h3>

In [28]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [29]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Positive
The review 3 is Negative
The review 4 is Negative
The review 5 is Negative


# **Random Forest Classifier**

---



In [30]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(criterion="entropy")
classifier.fit(X_train,y_train)

In [31]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0])

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 77.0%
The precision score for the model is 80.68181818181817%
The recall score for the model is 71.0%


<h3>Hyperparameter tuning</h3>

In [33]:
best_score = 0.0
dep = 0
for i in range (1,50):
  classifier = RandomForestClassifier(criterion="entropy", max_depth=i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    dep = i

print("Best accuracy Score is "+str(best_score*100)+" for max_depth "+str(dep))

Best accuracy Score is 80.5 for max_depth 34


<h3>Using the best model as per hyperparameter tuning.</h3>

In [34]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(criterion="entropy", max_depth=dep)
classifier.fit(X_train,y_train)

In [35]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0])

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 80.0%
The precision score for the model is 85.71428571428571%
The recall score for the model is 72.0%


<h3>Testing the model against random input.</h3>

In [37]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [38]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Negative
The review 3 is Negative
The review 4 is Negative
The review 5 is Negative


# **Extra Trees Classifier**

---



In [39]:
from sklearn.ensemble import ExtraTreesClassifier
classifier = ExtraTreesClassifier()
classifier.fit(X_train,y_train)

In [40]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1])

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 80.5%
The precision score for the model is 82.79569892473118%
The recall score for the model is 77.0%


<h3>Hyperparameter Tuning</h3>

In [42]:
best_score = 0.0
dep = 0
for i in range (1,50):
  classifier = ExtraTreesClassifier(criterion="entropy", max_depth=i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    dep = i

print("Best accuracy Score is "+str(best_score*100)+" for max_depth "+str(dep))

Best accuracy Score is 81.5 for max_depth 46


In [43]:
from sklearn.ensemble import ExtraTreesClassifier
classifier = ExtraTreesClassifier(criterion="entropy", max_depth=dep)
classifier.fit(X_train,y_train)

In [44]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1])

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 80.0%
The precision score for the model is 83.33333333333334%
The recall score for the model is 75.0%


<h3>Testing the model against random input</h3>

In [46]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [47]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Negative
The review 3 is Negative
The review 4 is Positive
The review 5 is Negative


# **KNeighborsClassifier**

---



In [48]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train,y_train)

In [49]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [50]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 63.0%
The precision score for the model is 72.41379310344827%
The recall score for the model is 42.0%


<h3>Hyperparameter Tuning</h3>

In [51]:
best_score = 0.0
n = 0
for i in range (1,100):
  classifier = KNeighborsClassifier(n_neighbors=i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    n = i

print("Best accuracy Score is "+str(best_score*100)+" for neighbours "+str(n))

Best accuracy Score is 65.0 for neighbours 3


<h3>Using the best model as per hyperparameter tuning</h3>

In [52]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=n)
classifier.fit(X_train,y_train)

In [53]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [54]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 65.0%
The precision score for the model is 74.19354838709677%
The recall score for the model is 46.0%


<h3> Testing the model against random inputs.

In [55]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [56]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Negative
The review 2 is Positive
The review 3 is Negative
The review 4 is Positive
The review 5 is Negative


# **Support Vector Classification (SVC)**

---



In [57]:
from sklearn.svm import SVC
classifier = SVC(kernel='sigmoid')
classifier.fit(X_train,y_train)

In [58]:
y_predict = classifier.predict(X_test)
y_predict

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1])

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

score1 = accuracy_score(y_test,y_predict)
score2 = precision_score(y_test,y_predict)
score3 = recall_score(y_test,y_predict)

print(f"The accuracy score for the model is {score1*100}%")
print(f"The precision score for the model is {score2*100}%")
print(f"The recall score for the model is {score3*100}%")

The accuracy score for the model is 78.5%
The precision score for the model is 80.0%
The recall score for the model is 76.0%


<h3>Hyperparameter Tuning</h3>

In [60]:
best_score = 0.0
g = 0
for i in range (0,100):
  classifier = SVC(kernel='linear',gamma=i)
  classifier.fit(X_train,y_train)
  y_predict = classifier.predict(X_test)
  score = accuracy_score(y_test,y_predict)
  if score > best_score:
    best_score = score
    g = i

print("Best accuracy Score is "+str(best_score*100)+" for neighbours "+str(g))

Best accuracy Score is 77.0 for neighbours 0


*Note : The best model is before we do hyperparameter tuning so we will use the same model before the tuning.*

<h3>Testing the model against random inputs.</h3>

In [61]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ',string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [ word for word in sample_review_words if not word in set(stopwords.words('english'))]
  final_review = [ps().stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [62]:
sample_review = ['The food is really wonderful',
                 'The food is bad and service is also not good',
                 'Not tasty and the texture was just nasty',
                 'Highly recommended',
                 'The worst was the salmon sashimi']
i=1
for sample in sample_review:
  if predict_sentiment(sample):
    print(f'The review {i} is Positive')
  else:
    print(f'The review {i} is Negative')
  i+=1

The review 1 is Positive
The review 2 is Negative
The review 3 is Negative
The review 4 is Positive
The review 5 is Negative


# **Conclusion**
*   Extra Trees Classifier Algorithm is the best algorithm amongst all the algorithm used for this dataset with the highest accuracy score of 81%
* KNeighbours Classifier Algorithm is the worst algorithm amongst all
the algorithm used for this dataset with the lowest accuracy score of 67.5%.


In [63]:
import pickle

# After training your model
with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)  # `classifier` is your model object



In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1500)
tfidf.fit(corpus)

with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)  # `tfidf` is your fitted vectorizer


In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample training data (use your own dataset)
corpus = [
    'I love this food',
    'This restaurant is terrible',
    'Amazing experience',
    'Worst service ever'
]

# 1. Create and fit the vectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)  # This trains the TF-IDF

# 2. Now save it using pickle
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)


In [67]:
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)  # `tfidf` is your fitted vectorizer


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample training data (use your own dataset)
corpus = [
    'I love this food',
    'This restaurant is terrible',
    'Amazing experience',
    'Worst service ever'
]

# 1. Create and fit the vectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)  # This trains the TF-IDF

# 2. Now save it using pickle
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)


In [77]:
%run train_model.py



✅ Model and vectorizer saved successfully!
