In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk

In [2]:
df = pd.read_csv('./Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# df_copy = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
df_copy = df.copy()

In [5]:
df_copy.shape

(1000, 2)

Stemming: Chops off the ends of words based on heuristis like 'SS' -> 'S'
Lemmatization: Respects words and reduces to morphological root

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
#nltk.download('stopwords')

### The original stop words block out the negatives - the above code creates a new stopwords sw which adds them back

In [8]:
#create a new variable sw containing  the stop words from sn but avoiding negative words
sn= set(stopwords.words('english'))

In [9]:
sn

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
negwords=["not","mustn","doesn","won't","wasn't","hadn't","shouldn't","shan't","doesn't","haven't","isn't","hasn't","needn","shouldn","didn","weren't","wouldn","mightn't","hadn","hasn","haven","couldn","needn't","didn't","wouldn't"]
sw=list()
for x in sn:
    if x not in negwords:
        sw.append(x)

In [13]:
#stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")
#stemmer = LancasterStemmer("english")
def rep(review):
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    reviews = review.split()
    reviewsnew=list()
    #nlpreviews=nlp(review)
    reviews = [stemmer.stem(x) for x in reviews if x not in sw]
    # reviews = [x for x in reviews if x not in sw]
    review = ' '.join(reviews)
    return review

In [14]:
df['Review'] = df['Review'].apply(rep)
df.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust not good,0
2,not tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [15]:
corpus = df['Review'].tolist()
corpus[:10]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti didn tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

In [17]:
X.shape

(1000, 1500)

In [19]:
y = df['Liked'].tolist()

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

clf = GaussianNB()
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
print('Training accuracy : {0: .1f}% ' .format( 100*accuracy_score(y_train,y_train_pred)))

print ('Test Accuracy:{0: .1f}%'.format(clf.score(X_test, y_test) * 100))

Training accuracy :  92.4% 
Test Accuracy: 74.0%


### Display the results

In [22]:
predictions = clf.predict(X)

In [23]:
pred=pd.DataFrame(predictions, columns=['Predicted'])

In [25]:
df_copy.join([ pred])

Unnamed: 0,Review,Liked,Predicted
0,Wow... Loved this place.,1,1
1,Crust is not good.,0,0
2,Not tasty and the texture was just nasty.,0,0
3,Stopped by during the late May bank holiday of...,1,1
4,The selection on the menu was great and so wer...,1,1
5,Now I am getting angry and I want my damn pho.,0,1
6,Honeslty it didn't taste THAT fresh.),0,0
7,The potatoes were like rubber and you could te...,0,0
8,The fries were great too.,1,1
9,A great touch.,1,1


In [26]:
# some mis-predictions could be because the critical word was not in the training set.

word = 'awful'
df['contains'] = df['Review'].apply(lambda x: 1 if word in x else 0)

In [30]:
df[df.contains == 1]

Unnamed: 0,Review,Liked,contains
