In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the tab-separated review file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being treated as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [3]:
# Print the frame's index range, column names, non-null counts, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(1000, 2)

In [5]:
# NLTK supplies the stop-word list and the Porter stemmer used by the text-cleaning cell.
import nltk
# Fetch the stop-word corpus into the local nltk_data directory; the call
# reports "already up-to-date" and does nothing more when it is present.
nltk.download('stopwords')
# `re` is used to strip non-letter characters from each review.
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Build the cleaned corpus: one normalised, stemmed string per review.
#
# The stemmer and the stop-word set are the same for every review, so they are
# created once here rather than once per review / once per word.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# NOTE(review): NLTK's English stop-word list is believed to include negations
# such as "not"; removing them may weaken the sentiment signal -- confirm and
# consider excluding them from `stop_words`.

corpus = []
# Iterate over the column itself instead of range(0, 1000) so the cell keeps
# working when the file has a different number of rows.
for raw_review in dataset['Review']:
    # Replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # Lower-case the text and split it into whitespace-separated tokens
    review = review.lower().split()

    # Drop stop words and reduce each remaining token to its Porter stem
    review = [ps.stem(word) for word in review if word not in stop_words]

    # Re-join the stems into one space-separated string and store it
    corpus.append(' '.join(review))

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# turn every review into a dense row of token counts.
# NOTE(review): the vocabulary is fitted on all reviews, before the
# train/test split performed in a later cell.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# Target labels: the second column of the frame (position 1).
y = dataset.iloc[:, 1].values

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a Gaussian naive Bayes model on the training portion.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [11]:
# Predict a label for every held-out review with the fitted naive Bayes model.
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Confusion matrix for the naive Bayes predictions:
# rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [13]:
# Fraction of held-out reviews the naive Bayes model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

In [15]:
# Per-class precision, recall, F1 and support for the naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



In [16]:
# Random forest: 500 trees, splits chosen by information gain (entropy).
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the train/test split) so that the forest,
# and therefore every metric computed from it, is identical on each
# Restart & Run All. Metrics recorded from earlier unseeded runs may differ
# slightly from what this seeded model produces.
classifier1 = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=0,
)
classifier1.fit(X_train, y_train)

In [18]:
# Predict a label for every held-out review with the fitted random forest.
y_pred1 = classifier1.predict(X_test)

In [19]:
# Confusion matrix for the random-forest predictions (rows: true labels,
# columns: predicted labels). This rebinds `cm`, replacing the naive Bayes matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred1)
cm

array([[87, 10],
       [47, 56]], dtype=int64)

In [20]:
# Fraction of held-out reviews the random forest labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred1)

0.715