In [34]:
# Importing the libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle

In [2]:
#Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter= '\t', quoting = 3)

In [3]:
dataset.head(20)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [4]:
dataset.shape

(1000, 2)

# Cleaning the dataset

In [11]:
corpus = []
for i in range(len(dataset)):
    review = re.sub(r"\s+[a-zA-Z]\s+", " ", dataset['Review'][i])
    review = re.sub(r"^[a-zA-Z]", " ", dataset['Review'][i])
    review = re.sub(r"[a-zA-Z]$", " ", dataset['Review'][i])
    review = re.sub(r"\W", " ", dataset['Review'][i])
    review = re.sub(r"\d", " ", dataset['Review'][i])
    review = re.sub(r"\s+", " ", dataset['Review'][i])
    review = re.sub(r"^s+", " ", dataset['Review'][i])
    review = re.sub(r"\s+$", " ", dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Bag Of Words Model

In [24]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
Y = dataset.iloc[:, 1].values

# Splitting The dataset

In [25]:
# Splitting the dataset into the Training set and Test set
text_train, text_test, sent_train, sent_test = train_test_split(X, 
                                                                Y, 
                                                                test_size = 0.20, 
                                                                random_state = 0)

# Building our Classifier

In [26]:
classifier = LogisticRegression()
classifier.fit(text_train, sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
y_pred = classifier.predict(text_test)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0], dtype=int64)

In [31]:
#Comparing the results
cm = confusion_matrix(sent_test, y_pred)
cm

array([[83, 14],
       [35, 68]], dtype=int64)

In [32]:
acc = accuracy_score(sent_test, y_pred)
print("Hence our models accuracy is: {}%".format(acc*100))

Hence our models accuracy is: 75.5%


# Building a text report showing the main classification metrics

In [33]:
cr = classification_report(sent_test, y_pred)
print(cr)

             precision    recall  f1-score   support

          0       0.70      0.86      0.77        97
          1       0.83      0.66      0.74       103

avg / total       0.77      0.76      0.75       200



# Saving Our Model and the Classifier

In [35]:
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)