In [87]:
import pandas as pd
import numpy as np 

import boto3
import sagemaker.amazon.common as smac

In [88]:
# Function to download data from S3

def download_from_s3(filename, bucket, key):
    """Download one S3 object into a local file.

    Args:
        filename: Path of the local file to create (opened in binary write mode).
        bucket: Name of the S3 bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        The result of the boto3 ``download_fileobj`` call.
    """
    # Open the destination first, in binary mode, so the bytes are written unchanged.
    with open(filename, 'wb') as local_file:
        s3_object = boto3.resource('s3').Bucket(bucket).Object(key)
        return s3_object.download_fileobj(local_file)

In [89]:
# Fetch the raw reviews file from S3 and store it locally as 'Reviews'.
download_from_s3(
    filename='Reviews',
    bucket='sara-ml-sagemaker',
    key='NLP/Restaurant_Reviews.tsv',
)

In [90]:
# The file is tab-separated. quoting=3 is csv.QUOTE_NONE: quote characters are
# kept as ordinary text instead of being interpreted as field quoting.
restaurant_review = pd.read_csv('Reviews', sep='\t', quoting=3)

In [91]:
# Preview the first five rows (head() defaults to n=5).
restaurant_review.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [92]:
import matplotlib.pyplot as plt

In [93]:
import math
import re
import nltk
# Download the NLTK 'stopwords' package; the stop-word list is used when cleaning the reviews.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [94]:
# Cleaning the text
# For each review, replace every character that is not a lowercase or uppercase letter with a space.
# Signature for reference: re.sub(pattern, repl, string, count=0, flags=0)
# Split each review into words so that uninformative words such as 'the', 'is', 'that' can be filtered out
# by comparing them against the nltk 'stopwords' list and removing them from the review.
# Iterate over each word in the review and keep only the words that are not present in the stopwords list;
# we use a for loop with an `if not` (present in stopwords list) condition to achieve this.
# Use set() so that the membership test is faster.
# The next step is stemming: we take the root of each word instead of its various tenses and forms.
# This is done so that we don't end up with many different words that mean the same thing.
# Otherwise the vocabulary would become unnecessarily long;
# having several forms of the same word adds no value and wastes computing resources.

In [95]:
corpus = []  # Cleaned reviews, one space-joined string per original review.

# Build the stop-word set and the stemmer once: both are loop-invariant, and
# rebuilding the set for every single word made the cleaning needlessly slow.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# NOTE(review): the stop-word list removes negations such as 'not'
# ('Crust is not good.' becomes 'crust good'), which discards sentiment-bearing
# words. Behaviour is kept as-is here; consider whitelisting negations.

# Iterate over the column itself instead of range(0, 1000) so the loop works for
# any number of reviews and does not depend on the index labels being 0..999.
for raw_review in restaurant_review['Review']:
    # Replace everything that is not an ASCII letter with a space, lower-case
    # the text and split it into words.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words, stem what remains, and join the stems back into a
    # single space-separated string.
    review = ' '.join(ps.stem(word) for word in words if word not in english_stopwords)
    corpus.append(review)

In [96]:
# Preview the first five cleaned reviews.
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [97]:
# Create bag of words model
# Convert it to a matrix 

from sklearn.feature_extraction.text import CountVectorizer
# max_features=1500 caps the vocabulary size at 1500 terms.
cv = CountVectorizer(max_features=1500)

In [98]:
# CountVectorizer tokenizes each review and counts term occurrences;
# the sparse result is then expanded into a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [99]:
# Dimensions of the bag-of-words matrix: (reviews, vocabulary terms).
X.shape

(1000, 1500)

In [100]:
# Create the dependent-variable (label) vector
# Select the label column by name rather than by position, so the target stays
# correct even if the column order of the input file changes.
y = restaurant_review['Liked'].values

In [101]:
# Text classification commonly uses Naive Bayes, Decision Tree, or Random Forest models.

In [102]:
# Here we are using Naive Bayes classification model
# Splitting the data set into training set and test set 
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [103]:
# Naive Bayes model
from sklearn.naive_bayes import GaussianNB
# Instantiate the Gaussian Naive Bayes model, then fit it on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [104]:
# Predict labels for the held-out test set with the fitted classifier.
y_pred = classifier.predict(X_test)

In [105]:
# Making the confusion matrix
from sklearn.metrics import confusion_matrix
# Compare the true test labels against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [106]:
# Rows are true labels, columns are predicted labels (scikit-learn convention).
cm

array([[55, 42],
       [12, 91]])

In [107]:
# Derive accuracy from the confusion matrix instead of hard-coding its entries,
# so the value stays correct when the data, split, or model changes.
# Correct predictions lie on the diagonal; the sum of all cells is the test-set size.
accuracy = float(np.trace(cm)) / float(cm.sum())

In [108]:
# Show the accuracy on the test set.
accuracy

0.73