In [121]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [122]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter= '\t', quoting = 3)
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [131]:
#Libraries used for Cleaning the Texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajatbhalla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
#Cleaning the Texts

ps = PorterStemmer()

corpus = []

for i in range(0,1000):
    review = re.sub('[^a-zA-Z]',' ',df['Review'][i])   # replace the words with space 
    review = review.lower()                            # convert all letters in lowercase
    review = review.split()                            # split the words to list of words
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]    # stopwords are removed
    review = ' '.join(review)                           # join the words 
    corpus.append(review)
    

In [133]:
corpus

[u'wow love place',
 'crust good',
 u'tasti textur nasti',
 u'stop late may bank holiday rick steve recommend love',
 u'select menu great price',
 u'get angri want damn pho',
 u'honeslti tast fresh',
 u'potato like rubber could tell made ahead time kept warmer',
 u'fri great',
 'great touch',
 u'servic prompt',
 'would go back',
 u'cashier care ever say still end wayyy overpr',
 u'tri cape cod ravoli chicken cranberri mmmm',
 u'disgust pretti sure human hair',
 u'shock sign indic cash',
 u'highli recommend',
 u'waitress littl slow servic',
 u'place worth time let alon vega',
 'like',
 u'burritto blah',
 u'food amaz',
 u'servic also cute',
 u'could care less interior beauti',
 u'perform',
 'right red velvet cake ohhh stuff good',
 u'never brought salad ask',
 u'hole wall great mexican street taco friendli staff',
 u'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 u'also combo like burger fri beer decent deal',
 'like final

In [137]:
# Creating Bags of Words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)      # choosed 1500 most frequent words out of 1565
x = cv.fit_transform(corpus).todense()       # creation of sparse matrix
x

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [140]:
y = df.iloc[:,1].values
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [145]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [146]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [147]:
model.fit(X_train,y_train)

GaussianNB(priors=None)

In [148]:
y_pred = model.predict(X_test)

In [150]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
cm

array([[55, 42],
       [12, 91]])

In [158]:
model.score(X_test,y_test)

0.73