# Natural Language Processing (NLP)

## Restaurant Review

### Bag Of Words Model

Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Importing Dataset

In [3]:
# Here delimiter('\t') is used as it is a .tsv file (i.e tab separated)
# Quoting = 3 is used to ignore the quotes in the dataset to ignore during parsing
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

Cleaning The Text

In [23]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
# Simplify the text so it becomes easier for the model to execute bag-of-words
# Convert all letters to lowercase, remove punctuations and apply stemming 
import re       # To simplify texts
# Removing non rlevant words in our reviews like 'this', 'and', 'the' which don' convey Insights
import nltk     # To download symbols of stop words
nltk.download('stopwords')      # Downloads Stop Words
from nltk.corpus import stopwords       #Imports the Stopwords 
from nltk.stem.porter import PorterStemmer      # To perform stemming (loved -> love)
corpus = []     # Empty list containing all our reviews simplified
for i in range(1000):
    review = re.sub('[^a-zA-Z]', ' ',dataset['Review'][i])      # Changing all the non alphabetic words to space
    review = review.lower()             # Changing all letters to lowercase
    review = review.split()             # Spliting each review into its each words
    ps = PorterStemmer()                # Stemming function
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')         # Removing not from stopwords as not as it conveys insights into emotion
    # using stemming on words except stopwords in english language
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]  
    review = ' '.join(review)           # Join the words back to form a string
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(corpus)

Creating Bag-Of-Words Model

In [26]:
from sklearn.feature_extraction.text import CountVectorizer     # Used to create the sparse matrix
cv = CountVectorizer(max_features= 1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

In [27]:
len(x[0])

1500

Splitting The Dataset Into Training Set and Test Set

In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test , y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

Training The Naive Bayes Model On The Training Set

In [29]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB()

Predicting The Test Set Results

In [30]:
y_pred = classifier.predict(x_test)

In [12]:
print(np.concatenate((y_pred.reshape(-1,1),y_test.reshape(-1,1)),1))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

Making The Confusion Matrix

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[55 42]
 [12 91]]


Checking Accuracy Score

In [32]:
print(accuracy_score(y_test, y_pred))

0.73


Finding TP, TN, FP, FN

In [46]:
TP = 55     # True Positive
TN = 91     # True Negative
FP = 42     # False Positive
FN = 12     # False Negative

Checking Accuracy

In [47]:
accuracy = (TP + TN) / (TP + TN + FP + FN)
print(accuracy)

0.73


Checking Precision

In [50]:
precision = TP / (TP + FP)
print(precision)

0.5670103092783505


Checking Recall

In [54]:
recall = TP / (TP + FN)
print(recall)

0.8208955223880597


Checking F1 Score

In [53]:
F1_score = 2 * precision * recall / (precision + recall)
print(F1_score)

0.6707317073170731


### Predicting If A Single Review Is Positive Or Negative

Postive Review

In [75]:
new_review = 'I love this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_x_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_x_test)

In [76]:
print(new_y_pred)

[1]


Negative Review

In [68]:
new_review = 'I hate this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


Training Random Forest Classification On The Training Set

In [40]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rf_classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [41]:
print(confusion_matrix(y_test , y_pred))

[[55 42]
 [12 91]]


In [42]:
print(accuracy_score(y_test, y_pred))

0.73


Taining Kernel SVM On Training Set

In [67]:
from sklearn.svm import SVC
ksvm_classifier = SVC(kernel = 'rbf', random_state = 0)
ksvm_classifier.fit(x_train, y_train)

SVC(random_state=0)

In [44]:
print(confusion_matrix(y_test, y_pred))

[[55 42]
 [12 91]]


In [45]:
print(accuracy_score(y_test, y_pred))

0.73


## New Classification Models

CART Classification