# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting =3)

## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(len(dataset)):
  review = re.sub('[^a-zA-Z]', ' ', str(dataset.iloc[i].values))
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove("not")
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,-1].values

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
temp_x, temp_y = X_train, y_train

# Comperative Analysis of differest classifier

In [6]:
results = []

## Decision Tree

In [7]:
X_train, y_train = temp_x, temp_y
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [8]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.755
Precision: 0.6902654867256637
Recall:0.8478260869565217


## GaussianNB

In [9]:
X_train, y_train = temp_x, temp_y
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train) 

GaussianNB(priors=None, var_smoothing=1e-09)

In [10]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.71
Precision: 0.7236842105263158
Recall:0.5978260869565217


## KNeighbors

In [11]:
X_train, y_train = temp_x, temp_y
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [12]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.7
Precision: 0.6311475409836066
Recall:0.8369565217391305


## Logestic Regression

In [13]:
X_train, y_train = temp_x, temp_y
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.765
Precision: 0.7142857142857143
Recall:0.8152173913043478


## Random Forest

In [15]:
X_train, y_train = temp_x, temp_y
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.76
Precision: 0.6929824561403509
Recall:0.8586956521739131


## SVM

In [17]:
X_train, y_train = temp_x, temp_y 
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [18]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.82
Precision: 0.7372881355932204
Recall:0.9456521739130435


## SVM Kernal

In [19]:
X_train, y_train = temp_x, temp_y
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [20]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
TP = cm[0][0]
TN = cm[1][1]
FP = cm[1][0]
FN = cm[0][1]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print(f"Accuracy: {Accuracy}\nPrecision: {Precision}\nRecall:{Recall}")
results.append([str(type(classifier)).split(".")[-1][:-2],Accuracy, Precision,Recall])

Accuracy: 0.785
Precision: 0.7333333333333333
Recall:0.8369565217391305


# Result

In [21]:
results[-1][0] = "SVC Kernal"
pd.DataFrame(results, columns=["Model", "Accuracy", "Precision","Recall"])

Unnamed: 0,Model,Accuracy,Precision,Recall
0,DecisionTreeClassifier,0.755,0.690265,0.847826
1,GaussianNB,0.71,0.723684,0.597826
2,KNeighborsClassifier,0.7,0.631148,0.836957
3,LogisticRegression,0.765,0.714286,0.815217
4,RandomForestClassifier,0.76,0.692982,0.858696
5,SVC,0.82,0.737288,0.945652
6,SVC Kernal,0.785,0.733333,0.836957


# Conclusion

> As per the Accuracy we can select SVC

