# Importing Libraries

In [1]:
%%capture
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pdb
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Importing Dataset

In [2]:
# Load the tab-separated review file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# ordinary text instead of being interpreted as field delimiters.
dataset = pd.read_csv(
    '../data/all_reviews.tsv',
    delimiter='\t',
    quoting=3,
)

FileNotFoundError: File b'all_reviews.tsv' does not exist

# Data Pre-processing

In [None]:
# Build `corpus`: one cleaned, stemmed string per row of `dataset`, in row
# order, so it stays aligned with the label column used later.
#
# NOTE(review): the column label 'Reivew' looks like a misspelling of
# 'Review'. It is left unchanged because it has to match the header in the
# TSV file -- confirm against the data before renaming.
corpus = []

# Loop invariants: build the stemmer and the stop-word set once instead of
# once per review / once per word. Building the set here also means a
# missing NLTK stopwords corpus fails loudly instead of being swallowed and
# silently leaving every review uncleaned.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

for raw in dataset['Reivew']:
    if isinstance(raw, str):
        # Keep letters only, lower-case, split on whitespace, then drop
        # stop words and stem what is left.
        tokens = re.sub('[^a-zA-Z]', ' ', raw).lower().split()
        review = ' '.join(
            ps.stem(word) for word in tokens if word not in stop_words
        )
    else:
        # Non-text cells (e.g. missing values) cannot be cleaned; fall back
        # to their string form so the row is still represented.
        review = str(raw)

    corpus.append(review)

# Splitting the Dataset

In [None]:
# Bag-of-words features: fit the vectoriser on the cleaned corpus and
# convert the resulting count matrix to a dense array.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# Labels come from the second column of the dataset.
y = dataset.iloc[:, 1].values

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Applying ML Classification Algorithms

## Decision Tree Classifier

In [None]:
# Decision tree with the entropy split criterion and a fixed seed: fit on
# the training split, then predict labels for the held-out rows.
dt_classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
dt_classifier.fit(X_train, y_train)
dt_y_pred = dt_classifier.predict(X_test)

## Random Forest Classifier

In [None]:
# Random forest of 10 entropy-criterion trees with a fixed seed: fit on the
# training split, then predict labels for the held-out rows.
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
rf_classifier.fit(X_train, y_train)
rf_y_pred = rf_classifier.predict(X_test)

## Support Vector Machine Classifier

In [None]:
# Support vector classifier with a linear kernel and a fixed seed: fit on
# the training split, then predict labels for the held-out rows.
svm_classifier = SVC(
    kernel='linear',
    random_state=0,
)
svm_classifier.fit(X_train, y_train)
svm_y_pred = svm_classifier.predict(X_test)

## Naive Bayes Classifier

In [None]:
# Gaussian naive Bayes with default settings: fit on the training split,
# then predict labels for the held-out rows.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

nb_y_pred = nb_classifier.predict(X_test)

## KNN Classifier

In [None]:
# k-nearest neighbours with k=5 and the Minkowski metric at p=2 (i.e.
# Euclidean distance): fit on the training split, then predict labels for
# the held-out rows.
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn_classifier.fit(X_train, y_train)
knn_y_pred = knn_classifier.predict(X_test)

# Results

## Decision Tree

In [None]:
# Score the decision tree's predictions against the held-out labels.
dt_cm = confusion_matrix(y_test, dt_y_pred)
dt_acc = accuracy_score(y_test, dt_y_pred)
dt_prec = precision_score(y_test, dt_y_pred)
dt_recall = recall_score(y_test, dt_y_pred)
dt_f1 = f1_score(y_test, dt_y_pred)

# Summary row. confusion_matrix is indexed [true class, predicted class];
# the TP/TN/FN/FP tags below assume index 1 is the positive class -- confirm
# against the label encoding.
dt_df = [
    'Decision Tree',
    dt_cm[1, 1],  # TP: true 1, predicted 1
    dt_cm[0, 0],  # TN: true 0, predicted 0
    dt_cm[1, 0],  # FN: true 1, predicted 0
    dt_cm[0, 1],  # FP: true 0, predicted 1
    dt_acc,
    dt_prec,
    dt_recall,
    dt_f1,
]

## Random Forest

In [None]:
# Score the random forest's predictions against the held-out labels.
rf_cm = confusion_matrix(y_test, rf_y_pred)
rf_acc = accuracy_score(y_test, rf_y_pred)
rf_prec = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

# Summary row. confusion_matrix is indexed [true class, predicted class];
# the TP/TN/FN/FP tags below assume index 1 is the positive class -- confirm
# against the label encoding.
rf_df = [
    'Random Forest',
    rf_cm[1, 1],  # TP: true 1, predicted 1
    rf_cm[0, 0],  # TN: true 0, predicted 0
    rf_cm[1, 0],  # FN: true 1, predicted 0
    rf_cm[0, 1],  # FP: true 0, predicted 1
    rf_acc,
    rf_prec,
    rf_recall,
    rf_f1,
]

## Support Vector Machine

In [None]:
# Score the SVM's predictions against the held-out labels.
svm_cm = confusion_matrix(y_test, svm_y_pred)
svm_acc = accuracy_score(y_test, svm_y_pred)
svm_prec = precision_score(y_test, svm_y_pred)
svm_recall = recall_score(y_test, svm_y_pred)
svm_f1 = f1_score(y_test, svm_y_pred)

# Summary row. confusion_matrix is indexed [true class, predicted class];
# the TP/TN/FN/FP tags below assume index 1 is the positive class -- confirm
# against the label encoding.
svm_df = [
    'Support Vector Machine',
    svm_cm[1, 1],  # TP: true 1, predicted 1
    svm_cm[0, 0],  # TN: true 0, predicted 0
    svm_cm[1, 0],  # FN: true 1, predicted 0
    svm_cm[0, 1],  # FP: true 0, predicted 1
    svm_acc,
    svm_prec,
    svm_recall,
    svm_f1,
]

## Naive Bayes

In [None]:
# Score the naive Bayes predictions against the held-out labels.
nb_cm = confusion_matrix(y_test, nb_y_pred)
nb_acc = accuracy_score(y_test, nb_y_pred)
nb_prec = precision_score(y_test, nb_y_pred)
nb_recall = recall_score(y_test, nb_y_pred)
nb_f1 = f1_score(y_test, nb_y_pred)

# Summary row. confusion_matrix is indexed [true class, predicted class];
# the TP/TN/FN/FP tags below assume index 1 is the positive class -- confirm
# against the label encoding.
nb_df = [
    'Naive Bayes',
    nb_cm[1, 1],  # TP: true 1, predicted 1
    nb_cm[0, 0],  # TN: true 0, predicted 0
    nb_cm[1, 0],  # FN: true 1, predicted 0
    nb_cm[0, 1],  # FP: true 0, predicted 1
    nb_acc,
    nb_prec,
    nb_recall,
    nb_f1,
]

## K-Nearest Neighbour

In [None]:
# Score the k-nearest-neighbour predictions against the held-out labels.
knn_cm = confusion_matrix(y_test, knn_y_pred)
knn_acc = accuracy_score(y_test, knn_y_pred)
knn_prec = precision_score(y_test, knn_y_pred)
knn_recall = recall_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred)

# Summary row. confusion_matrix is indexed [true class, predicted class];
# the TP/TN/FN/FP tags below assume index 1 is the positive class -- confirm
# against the label encoding.
knn_df = [
    'K-Nearest Neighbour',
    knn_cm[1, 1],  # TP: true 1, predicted 1
    knn_cm[0, 0],  # TN: true 0, predicted 0
    knn_cm[1, 0],  # FN: true 1, predicted 0
    knn_cm[0, 1],  # FP: true 0, predicted 1
    knn_acc,
    knn_prec,
    knn_recall,
    knn_f1,
]

## Final Output

In [None]:
# Assemble one row per classifier into a comparison table indexed by
# algorithm name.
#
# Each row list is laid out as
#   [name, cm[1][1], cm[0][0], cm[1][0], cm[0][1], accuracy, precision, recall, f1]
# and scikit-learn's confusion_matrix is indexed [true class, predicted
# class]. So cm[1][0] counts false negatives and cm[0][1] counts false
# positives; the labels below follow that order (the previous labels had
# these two swapped).
result_columns = [
    'Algorithm',
    'True Positives',
    'True Negatives',
    'False Negatives',
    'False Positives',
    'Accuracy',
    'Precision',
    'Recall',
    'F1',
]
result_df = pd.DataFrame(
    [dt_df, rf_df, svm_df, nb_df, knn_df],
    columns=result_columns,
).set_index('Algorithm')

# Drop the 'Algorithm' caption above the index. Assigning None works on every
# pandas version, whereas `del result_df.index.name` raises AttributeError on
# current releases because Index.name has no deleter.
result_df.index.name = None
result_df