# Natural Language Processing

**1. Importing Data**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# quoting = 3: ignoring "
dataframe = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
dataframe.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


**2. Cleaning 'Review' column**

**Step 1:** Only keeping the letters in the reviews (remove numbers and punctuations and etc.)
<br>
**Step 2:** Covert to lower case letters
<br>
**Step 3:** Split reviews
<br>
**Step 4:** Keep important word and remove the other (Stopwords)
<br>
**Step 5:** Keep roots of the words (Stemming)
<br>
**Step 6:** Covert lists to strings

In [3]:
import re
import nltk

In [4]:
# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
corpus = []

In [7]:
for review in dataframe['Review']:
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    porter_stemmer = PorterStemmer()
    review = [porter_stemmer.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [8]:
for i in range(5):
    print('before: {}'.format(dataframe['Review'][i]))
    print('after: {}'.format(corpus[i]))
    print('*'*10)

before: Wow... Loved this place.
after: wow love place
**********
before: Crust is not good.
after: crust good
**********
before: Not tasty and the texture was just nasty.
after: tasti textur nasti
**********
before: Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
after: stop late may bank holiday rick steve recommend love
**********
before: The selection on the menu was great and so were the prices.
after: select menu great price
**********


**3. Create Bag of Words Model**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(max_features=1500)

In [11]:
X = cv.fit_transform(corpus).toarray()
y = dataframe.iloc[:, 1].values

**4. Splitting the Dataset into the Training Set and Test Set**

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

**Common Models to Use in Natural Language Processing are: Naive Bayes, Decision Tree, Random Forest Classification**

**5. Naive Bayes**

In [14]:
from sklearn.naive_bayes import GaussianNB

In [15]:
naive_bayes_classifier = GaussianNB()

In [16]:
naive_bayes_classifier.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
y_naive_bayes_predict = naive_bayes_classifier.predict(x_test)

**5.1. Confusion Matrix**

In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_naive_bayes_predict)

array([[55, 42],
       [12, 91]], dtype=int64)

In [19]:
# split to True_Negative, False_Positive, False_Negative, True_Positive
tn, fp, fn, tp = confusion_matrix(y_test, y_naive_bayes_predict).ravel()

In [20]:
tn, fp, fn, tp

(55, 42, 12, 91)

In [21]:
accuracy = (tn + tp) / (tn + fp + fn + tp)
precision = tp / (fp + tp)
recall = tp / (fn + tp)
f1_score = (2 * precision * recall) / (precision * recall)

In [22]:
print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 Score: {}'.format(f1_score))

Accuracy: 0.73
Precision: 0.6842105263157895
Recall: 0.883495145631068
F1 Score: 2.0


**6. Decision Tree**

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
decision_tree_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

In [25]:
decision_tree_classifier.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [26]:
y_decision_tree_predict = decision_tree_classifier.predict(x_test)

**6.1. Confusion Matrix**

In [27]:
confusion_matrix(y_test, y_decision_tree_predict)

array([[74, 23],
       [35, 68]], dtype=int64)

In [28]:
tn, fp, fn, tp = confusion_matrix(y_test, y_decision_tree_predict).ravel()

In [29]:
accuracy = (tn + tp) / (tn + fp + fn + tp)
precision = tp / (fp + tp)
recall = tp / (fn + tp)
f1_score = (2 * precision * recall) / (precision * recall)

In [30]:
print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 Score: {}'.format(f1_score))

Accuracy: 0.71
Precision: 0.7472527472527473
Recall: 0.6601941747572816
F1 Score: 2.0
