# Import libraries and open file with proper delimiter and parameters for the TSV file

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Since we are reading a Tab Seperated Values file - its important that we mention the delimiter.  Option of quoting 3 is to 
#ignore double quotes
ds = pd.read_csv('Restaurant_Reviews.tsv', delimiter ='\t', quoting = 3)

In [11]:
ds

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


# Clean the data

In [3]:
#In the next two steps the code nltk.download("stopwords") and set(stopwords.words('english')) has been commented out
#the reason for this is that we were facing a connection error to nltk.download - which referred to the URL: 
# https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
# Instead the following zip file was directly downloaded : https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
# The english file was copied to the root directory here.  Another way to make it work is to do an nltk.download("all")
import re
import nltk
nltk.download("all")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\n

[nltk_data]    |   Unzipping corpora\product_reviews_2.zip.
[nltk_data]    | Downloading package pros_cons to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\pros_cons.zip.
[nltk_data]    | Downloading package qc to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\qc.zip.
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package rte to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\rte.zip.
[nltk_data]    | Downloading package semcor to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     C:\Users\E3000527\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to
[nlt

In [4]:
corpus = []
for i in range(0, 1000):
    #What the next line does is take in the 'Review' column of our input dataset ds and only retain chars from a to z and A to Z
    #All other characters that are removed get replaced with a space
    review = re.sub('[^a-zA-Z]', ' ', ds['Review'][i])
    #What the next line will do is to change all upper case characters to lower case
    review = review.lower()
    #What the next line would do is to convert each sentence into a list of its composite words
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #The next line would remove words that are stopwords like 'this', 'is' etc. that are in English.  It also does the 
    #stemming process - which changes words to their root word.  Like loved would be changed to love etc.
    #review = [ps.stem(word) for word in review if not word in set('english')]
    #In the next line all the words in each row are joint together into a single phrase/string with a space seperating individual
    #words
    review = ' '.join(review)
    #In the next line all lines of the input are appended to an empty corpus after the above cleaning process is done
    corpus.append(review)

In [9]:
# Creating the Bag of Words model
# A bag of words is a table matrix where each review corresponds to a row and each column corresponds to a unique word across all 
# reviews together.  The cells will have a number which corresponds to the number of times a word appears  in that particular
# review.  Therefore in this table you will see a lot of cells with zeroes.  So a matrix with a lot of zeroes in it is called
# as a SPARSE MATRIX.  This entire process of creating a SPARSE MATRIX is called TOKENIZATION.
# Also note that CountVectorizer (below) can do all of the data cleaning that re can do above.  However its better to use re in
# above step in a step-by-step fashion
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
X.shape

(1000, 1565)

In [7]:
# The above X.shape reveals that there are 1000 rows (something we already knew as there were 1000 reviews) and 1565 columns (one
# column for each word).  Some words may be unimportant words.  Hence its safe to choose 1500 columns/features.  So we redo the
# code like this
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
#Now in the above step - X becomes the list of features or the independent variables



In [14]:
#So here below we go back and check the values of the original dataset.  
ds.values

array([['Wow... Loved this place.', 1],
       ['Crust is not good.', 0],
       ['Not tasty and the texture was just nasty.', 0],
       ...,
       ['Overall I was not impressed and would not go back.', 0],
       ["The whole experience was underwhelming, and I think we'll just go to Ninja Sushi next time.",
        0],
       ["Then, as if I hadn't wasted enough of my life there, they poured salt in the wound by drawing out the time it took to bring the check.",
        0]], dtype=object)

In [16]:
# In the next step we would be interested only in the second column i.e. wether a review is positive or not.  This is the label
# or the dependent variable
y = ds.iloc[:, 1].values

In [17]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


#The three most common algos used for NLP are Naive Bayes, Decision Tree Classification and Random Forest Classification. 
# CART, C5.0, Maximum Entropy:  CART or Classification And Regression Trees are a broad category of models., not one specific 
# model. Also C5.0 is a methodology used in applying Decision trees, and is one of the most common approaches. 
# Maximum Entropy is a method that multiple models use to establish decision trees and model learning. 
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm



array([[55, 42],
       [12, 91]], dtype=int64)

In [8]:
review

'then as if hadn t wast enough of my life there they pour salt in the wound by draw out the time it took to bring the check'

In [9]:
corpus

['wow love thi place',
 'crust is not good',
 'not tasti and the textur wa just nasti',
 'stop by dure the late may bank holiday off rick steve recommend and love it',
 'the select on the menu wa great and so were the price',
 'now am get angri and want my damn pho',
 'honeslti it didn t tast that fresh',
 'the potato were like rubber and you could tell they had been made up ahead of time be kept under a warmer',
 'the fri were great too',
 'a great touch',
 'servic wa veri prompt',
 'would not go back',
 'the cashier had no care what so ever on what had to say it still end up be wayyy overpr',
 'tri the cape cod ravoli chicken with cranberri mmmm',
 'wa disgust becaus wa pretti sure that wa human hair',
 'wa shock becaus no sign indic cash onli',
 'highli recommend',
 'waitress wa a littl slow in servic',
 'thi place is not worth your time let alon vega',
 'did not like at all',
 'the burritto blah',
 'the food amaz',
 'servic is also cute',
 'could care less the interior is just beau

In [10]:
cm

array([[55, 42],
       [12, 91]], dtype=int64)