In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk

In [2]:
# Load the restaurant reviews from a tab-separated file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside the review text are
# treated as ordinary characters instead of field delimiters.
reviews = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

In [3]:
# Preview the first rows to check that the file was parsed into the expected columns.
reviews.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# (rows, columns) of the loaded frame.
reviews.shape

(1000, 2)

In [5]:
# Importing the NLTK stop-word corpus and the Porter stemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
# Data cleaning: turn every raw review into a single normalised string
# (letters only, lower-cased, stop words removed, remaining words stemmed).
corpus = []
ps = PorterStemmer()
# Build the stop-word set once. The original rebuilt it for every word of
# every review, which is loop-invariant work.
stop_words = set(stopwords.words('english'))
# Iterate over the column itself instead of a hard-coded range(0, 1000), so the
# cell still processes every row if the dataset size changes.
for text in reviews['Review']:
    # Replace every non-letter character with a space, then lower-case and
    # split on whitespace to get the individual words.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words, stem what is left, and glue the stems back together.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

In [7]:
# Bag-of-Words Approach
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 terms.
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary from the cleaned corpus and count term occurrences;
# the sparse result is then expanded into a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# Target vector: the second column of the reviews frame.
y = reviews.iloc[:, 1].values

In [8]:
# Creating the model
# train_test_split is imported from sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Metrics come from the public sklearn.metrics namespace rather than the
# private sklearn.metrics.classification submodule.
from sklearn.metrics import accuracy_score, confusion_matrix



In [9]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [10]:
# Gaussian Naive Bayes classifier with default settings.
# NOTE(review): the features are word counts; MultinomialNB may be a better fit — worth comparing.
GB_model = GaussianNB()

In [11]:
# Fit the classifier on the training split.
GB_model.fit(X_train, y_train)

GaussianNB(priors=None)

In [12]:
# Predict labels for the held-out test split.
y_pred = GB_model.predict(X_test)

In [13]:
# Evaluate on the held-out split. Both metrics are called as (y_true, y_pred):
# the original passed the accuracy arguments reversed, which gives the same
# number (accuracy is symmetric) but contradicts the documented order and the
# confusion_matrix call next to it.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [14]:
# Report the results. cm was built from (y_test, y_pred), so its rows are the
# true labels and its columns the predicted labels.
print('Confusion Matrix: ', cm)
print('Accuracy: ', accuracy)

Confusion Matrix:  [[55 42]
 [12 91]]
Accuracy:  0.73
