# NLP 

In [1]:
# import libraries
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# The reviews are stored tab-separated (.tsv): commas are common inside review text, tabs are not.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review are read as ordinary characters.
dataset = pd.read_csv('Restaurant_reviews.tsv', sep='\t', quoting=3)

In [3]:
# Preview only the first rows instead of rendering the whole DataFrame
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [4]:
# Cleaning the text
# Regex reminder: a leading ^ inside a character class [...] negates it ("everything except ...")
import re
import nltk # NLTK provides the 'stopwords' corpus: very common words that carry little meaning
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /Users/sam1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


End goal: a bag-of-words table in which each unique word has its own column.

In [5]:
# A corpus is a collection of texts; here it holds one cleaned string per review
corpus = []

# Build these once, outside the loop: both are loop-invariant, and rebuilding the
# stopword set for every single word made the original loop needlessly slow.
# NLTK returns the stopwords as a list; converting to a set gives O(1) membership tests.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
# NOTE(review): the English stopword list appears to include negations such as "not",
# so "not good" would be reduced to "good" - confirm this is acceptable for sentiment.

# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in dataset['Review']:
    # replace every character that is not a letter (punctuation, digits) with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # lower-case so that e.g. "Good" and "good" become the same token
    review = review.lower()
    # split into a list of individual words
    review = review.split()
    # drop stopwords ("this", "i", "himself", ...) and stem the remaining words
    # ex: loved -> love, loves -> love
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    # join the stemmed words back into one space-separated string
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# Bag-of-words model: collect the unique words across all reviews and give each
# one its own column; a cell holds how often that word occurs in that review.
# The resulting matrix is sparse (mostly zeros).
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer has its own options (stop_words, lowercase, token_pattern, ...) and
# could have done most of the cleaning above in one line; cleaning by hand first
# gives finer control (e.g. stripping HTML tags such as <html> </html>).
# max_features=1500 keeps only the 1500 most frequent words, dropping rare ones.
cv = CountVectorizer(max_features=1500)
word_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = word_counts.toarray()               # dense features, one row per review
y = dataset['Liked'].values             # labels from the Liked column

In [7]:
# Sparsity can be reduced by 1. max_features and 2. dimensionality reduction.
# Naive Bayes and random forest classifiers are common choices for text,
# but compare models case by case and keep whichever performs best.
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Feature scaling is skipped: every feature is a word count (mostly 0s and 1s).
# Fit a Gaussian Naive Bayes classifier on the training set.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None)

In [9]:
# Predict a label for every review in the held-out test set
y_pred = classifier.predict(X=X_test)

In [10]:
# Build the confusion matrix comparing the true test labels with the predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [11]:
# Display the matrix (scikit-learn convention: rows = actual class, columns = predicted class)
cm

array([[55, 42],
       [12, 91]])

In [12]:
# Accuracy = correct predictions (the diagonal of the confusion matrix) / all predictions
np.trace(cm) / cm.sum()

0.73