# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [89]:
# The reviews file is tab-separated. quoting=3 is csv.QUOTE_NONE, so quote
# characters that appear inside a review are read as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [90]:
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleaning the texts

In [95]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stop-word set and the stemmer do not change between reviews, so build
# them once here instead of once per word / once per review.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# One cleaned string per review: lower-cased, stop words removed, stemmed.
corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# the corpus always has exactly one entry per row of the dataset.
for raw_review in dataset['Review']:
    # NOTE(review): punctuation stripping is currently disabled, so tokens keep
    # attached punctuation (e.g. "place.") and a stop word followed by
    # punctuation is not filtered out - confirm this is intentional.
    # review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = raw_review.lower().split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [104]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model restricted to the 1500 most frequent tokens.
# NOTE(review): the vocabulary is learned from every review, i.e. before the
# train/test split further down - confirm that this is acceptable.
cv = CountVectorizer(max_features=1500)
cv.fit(corpus)
# Dense count matrix: one row per review, one column per vocabulary token.
X = cv.transform(corpus).toarray()
# Target values are read from the second column of the dataset.
y = dataset.iloc[:, 1].values

In [105]:
# Sanity check: (number of reviews, number of vocabulary tokens).
X.shape

(1000, 1500)

## Splitting the dataset into the Training set and Test set

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [107]:
# Disabled experiment: reduce the bag-of-words features to 500 components
# with TruncatedSVD. While these lines stay commented out, X_train_svd and
# X_test_svd are never created.
# from sklearn.decomposition import TruncatedSVD
# svd = TruncatedSVD(n_components=500)
# X_train_svd = svd.fit_transform(X_train)
# X_test_svd = svd.transform(X_test)

In [76]:
# NOTE(review): this line is disabled together with the SVD experiment; the
# output stored under this cell appears to come from an earlier run in which
# `svd` existed and will not be reproduced on a fresh run - confirm.
# svd.explained_variance_ratio_.cumsum()[-10:]

array([0.96549756, 0.96578544, 0.96607046, 0.966355  , 0.96663686,
       0.96691504, 0.96719075, 0.96746448, 0.96773579, 0.9680055 ])

## Training the Naive Bayes model on the Training set

In [111]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Only MultinomialNB is used in this cell; the other estimators are imported
# but not used here (presumably kept for trying alternative models).
# The former `classifier = GaussianNB()` line was a dead store - it was
# overwritten immediately - so the model actually trained is created directly.
classifier = MultinomialNB()
# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Predicting the Test set results

In [112]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [113]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Per-class precision, recall and F1, plus overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))

[[73 24]
 [22 81]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76        97
           1       0.77      0.79      0.78       103

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.77      0.77      0.77       200

