In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv("Restaurant_Reviews.csv",on_bad_lines="skip")
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1.0
1,Crust is not good.,0.0
2,Not tasty and the texture was just nasty.,0.0
3,Stopped by during the late May bank holiday of...,1.0
4,The selection on the menu was great and so wer...,1.0
...,...,...
711,the presentation of the food was awful.,0.0
712,I can't tell you how disappointed I was.,0.0
713,I think food should have flavor and texture an...,0.0
714,Appetite instantly gone.,0.0


## Preprocessing

In [4]:
import re

import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()  # separates words into their stems 

from nltk.corpus import stopwords

In [6]:
clean_data = []

for i in range(716):
    datas = re.sub("[^a-zA-Z]"," ",data.iloc[i,0]) # removing alphanumerical characters
    datas = datas.lower().split()
    datas = [ps.stem(word) for word in datas if not word in set(stopwords.words("english"))] # removing stopwords and suffixes
    datas = " ".join(datas) 
    clean_data.append(datas)

In [7]:
clean_data

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'like',
 'burritto blah',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'name',
 'worst salmon sashimi',
 'like final blow',
 'found place accid could happier',
 'redeem qualiti restaur inexpens',
 'ampl portion good price',
 'first visit hiro delight',
 'servic suck',
 'shrimp tender moist',
 'deal good enough would drag establish',
 'hard judg whether side good gross melt styrofoam want eat fear get sick',
 'thing like prime rib dessert sectio

## Feature extraction  (Bag of Words (BOW))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000)

In [9]:
X = cv.fit_transform(clean_data).toarray()
Y = data.iloc[:,1]

In [10]:
Y.fillna(Y.mean(), inplace=True)

In [11]:
Y = Y.astype(int)

## Building model

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [14]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [15]:
y_pred = gnb.predict(X_test)
y_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0])

In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
cm

array([[45, 33],
       [ 5, 61]])

I used Gaussian Bayes to create a classification model for this project. But you can try another algorithms. So the confusion matrix changes.