In [None]:
import pandas as pd 
import bz2 
import re # regular expressions
import random
import nltk
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
import gc
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np



In [None]:
# Open the bz2-compressed train and test text files as binary streams.
train_file, test_file = (
    bz2.BZ2File(archive) for archive in (r'train.ft.txt.bz2', r'test.ft.txt.bz2')
)

In [None]:
# Read every line of both archives into memory (each line is still a bytes object).
train_file_lines, test_file_lines = train_file.readlines(), test_file.readlines()

In [None]:
# The lines are in memory now, so drop the file-object names and force a
# garbage-collection pass to relieve memory.
del train_file
del test_file
gc.collect()

In [None]:
# Turn the raw bytes lines into UTF-8 text so they can be parsed as strings.
train_file_lines = [str(raw, 'utf-8') for raw in train_file_lines]
test_file_lines = [str(raw, 'utf-8') for raw in test_file_lines]

In [None]:
# Seed the RNG before shuffling so the subsample taken from the head of each
# list (and every metric reported afterwards) is identical on each fresh
# Restart & Run All.
# NOTE: the shuffles are in place; re-running only this cell reshuffles the
# already-shuffled lists, so re-run from the decode cell to get the seeded
# order back.
random.seed(42)
random.shuffle(train_file_lines)
random.shuffle(test_file_lines)


In [None]:
# Keep only the head of each shuffled list: 10000 training lines and 2500 test lines.
train_set, test_set = train_file_lines[:10000], test_file_lines[:2500]

In [None]:
# Spot-check one raw line of the training subset (label prefix and newline still attached).
print(train_set[257])

In [None]:
# Only the subsets are used from here on; drop the full line lists and
# force a garbage-collection pass.
del train_file_lines
del test_file_lines
gc.collect()

In [None]:
# Derive a binary target from the first space-separated token of each line:
# '__label__1' maps to 0, any other label maps to 1.
train_labels = [int(line.split(' ')[0] != '__label__1') for line in train_set]
test_labels = [int(line.split(' ')[0] != '__label__1') for line in test_set]

In [None]:
# Wrap the 0/1 label lists in single-column DataFrames.
Y_train, Y_test = map(pd.DataFrame, (train_labels, test_labels))

In [None]:
# Drop the leading label token and the line terminator, keeping only the
# review text. rstrip('\n') replaces an unconditional [:-1] so that a line
# without a trailing newline (e.g. the last line of a file) does not lose its
# final real character.
train_reviews = [x.split(' ', 1)[1].rstrip('\n') for x in train_set]
test_reviews = [x.split(' ', 1)[1].rstrip('\n') for x in test_set]

In [None]:
# Removing URL-like tokens from reviews
def _strip_url_tokens(reviews):
    """Blank out URL-like tokens in ``reviews``, modifying the list in place.

    A review is only rewritten when it contains one of the hint substrings
    ('www.', 'http:', 'https:', '.com'). In such a review every run of
    non-space characters that ends in a dot followed by three lowercase
    letters is replaced with the empty string.
    """
    for idx, text in enumerate(reviews):
        if any(hint in text for hint in ('www.', 'http:', 'https:', '.com')):
            reviews[idx] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "", text)


_strip_url_tokens(train_reviews)
_strip_url_tokens(test_reviews)

In [None]:
# Delete every run of ASCII digits from the reviews. Slice assignment keeps
# the same list objects, so the update is in place like the original loop.
train_reviews[:] = [re.sub(r'[0-9]+', '', text) for text in train_reviews]
test_reviews[:] = [re.sub(r'[0-9]+', '', text) for text in test_reviews]
    

In [None]:
# Delete every character that is neither a word character nor whitespace,
# updating both review lists in place.
for reviews in (train_reviews, test_reviews):
    for idx, text in enumerate(reviews):
        reviews[idx] = re.sub(r'[^\w\s]', '', text)

In [None]:
# Display one training review after the URL, digit and punctuation stripping above.
train_reviews[456]

In [None]:
# CountVectorizer technique: three bag-of-words feature extractors
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

# Baseline: no stop-word removal, no stemming
cv1 = CountVectorizer()

# English stop words removed
cv2 = CountVectorizer(analyzer = 'word',stop_words='english')

# Snowball stemmer looked up (as a global) by the stemming vectorizers below
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer_train(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``stemmer.stem``."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


class StemmedCountVectorizer_test(CountVectorizer):
    """Same stemming CountVectorizer as ``StemmedCountVectorizer_train``, under a second name."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``stemmer.stem``."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


# With stemming
cv3_train = StemmedCountVectorizer_train()
cv3_test = StemmedCountVectorizer_test()

In [None]:
# Tfidf technique: three TF-IDF feature extractors
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

# Baseline: no stop-word removal, no stemming
tf1 = TfidfVectorizer()

# English stop words removed
tf2 = TfidfVectorizer(analyzer = 'word',stop_words='english')

# Snowball stemmer looked up (as a global) by the stemming vectorizers below
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidf_train(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``stemmer.stem``."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


class StemmedTfidf_test(TfidfVectorizer):
    """Same stemming TfidfVectorizer as ``StemmedTfidf_train``, under a second name."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``stemmer.stem``."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


# With stemming
tf3_train = StemmedTfidf_train()
tf3_test = StemmedTfidf_test()

In [None]:
# Experiment 1: plain term counts (cv1). The vocabulary is fitted on the
# training reviews and then reused to transform the test reviews.
X_train = cv1.fit_transform(train_reviews)
X_test = cv1.transform(test_reviews)

# Parallel result logs: each model cell appends one label and one score per metric.
X_label, X_acc, X_pres, X_recall, X_f1 = [], [], [], [], []

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600)
lr.fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred,digits=4))

import seaborn as sn

# Confusion-matrix heatmap. The heatmap is drawn on an explicit Axes, the
# axis labels are set once (the earlier 'Predicted labels'/'True labels' pair
# was always overwritten), and the counts use the integer format "d".
result_matrix = confusion_matrix(Y_test, y_pred)
labels = ['0','1']
ax = plt.subplot()
sn.heatmap(result_matrix, xticklabels=labels, yticklabels=labels, annot=True,
           fmt="d", linewidths=1.0, square=True, ax=ax)
ax.set_xlabel('Predicted Class');ax.set_ylabel('True Class');

# Log this run so the comparison charts can pick it up.
X_label.append("LRcv1")
X_acc.append(accuracy_score(Y_test, y_pred))
X_pres.append(precision_score(Y_test, y_pred))
X_recall.append(recall_score(Y_test, y_pred))
X_f1.append(f1_score(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMcv1")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNcv1")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Experiment 2
# Term counts with English stop words removed (cv2). The vocabulary is fitted
# on the training reviews only and reused to transform the test reviews.
X_train = cv2.fit_transform(train_reviews) 
X_test = cv2.transform(test_reviews) 

In [None]:
# Display the dimensions of the transformed test matrix.
X_test.shape

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600).fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("LRcv2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMcv2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNcv2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Experiment 3
# Stemmed term counts. The same fitted vectorizer (cv3_train) transforms both
# splits so train and test share one vocabulary; cv3_test is not used here.
X_train = cv3_train.fit_transform(train_reviews) 
X_test = cv3_train.transform(test_reviews) 

In [None]:
# Display the dimensions of the transformed training matrix.
X_train.shape

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600).fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("LRcv3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMcv3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNcv3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Experiment 4
# TF-IDF weights with the default vectorizer (tf1), fitted on the training
# reviews only and reused to transform the test reviews.
X_train = tf1.fit_transform(train_reviews) 
X_test = tf1.transform(test_reviews) 

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600).fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("LRtf1")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMtf1")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNtf1")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Experiment 5
# TF-IDF weights with English stop words removed (tf2), fitted on the training
# reviews only and reused to transform the test reviews.
X_train = tf2.fit_transform(train_reviews) 
X_test = tf2.transform(test_reviews) 

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600).fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("LRtf2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMtf2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNtf2")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Experiment 6
# Stemmed TF-IDF. The same fitted vectorizer (tf3_train) transforms both
# splits so train and test share one vocabulary; tf3_test is not used here.
X_train = tf3_train.fit_transform(train_reviews) 
X_test = tf3_train.transform(test_reviews) 

In [None]:
# Logistic regression on the current X_train / X_test features
lr = LogisticRegression(max_iter=7600).fit(X_train, Y_train.values.ravel())
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("LRtf3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# Support vector classifier (default SVC settings) on the current X_train / X_test features
svc = svm.SVC().fit(X_train, Y_train.values.ravel())
y_pred = svc.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("SVMtf3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
# k-nearest neighbours (k = 4) on the current X_train / X_test features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, Y_train.values.ravel())
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred, digits=4))

# Log this run: one label plus one score per metric list.
X_label.append("KNNtf3")
for history, scorer in (
    (X_acc, accuracy_score),
    (X_pres, precision_score),
    (X_recall, recall_score),
    (X_f1, f1_score),
):
    history.append(scorer(Y_test, y_pred))

In [None]:
print(X_label)
print(X_acc)
print(X_pres)
print(X_recall)
print(X_f1)

# figure.figsize is read when a figure is created, so it is set before the
# first bar call; setting it after plotting left the first chart at the
# previous size.
plt.rcParams["figure.figsize"] = (15,5)


def _plot_metric_bars(scores, title, bow_color, tfidf_color, split=9):
    """Draw one bar per logged run for a single metric and show the figure.

    Parameters
    ----------
    scores : list of float
        One score per entry of ``X_label``, in the same order.
    title : str
        Figure title.
    bow_color, tfidf_color : str
        Bar colours for the bag-of-words runs and the TF-IDF runs.
    split : int, default 9
        Number of leading entries that are bag-of-words runs (3 classifiers
        for each of cv1, cv2, cv3); the remaining entries are TF-IDF runs.
        The earlier ``[:10]`` / ``[9:]`` slices drew index 9 twice.
    """
    plt.figure()
    plt.bar(X_label[:split], scores[:split], width = 0.75, color = bow_color)
    plt.bar(X_label[split:], scores[split:], width = 0.75, color = tfidf_color)
    plt.title(title,fontsize=15)
    plt.ylim(0.5,0.9)
    plt.show()


_plot_metric_bars(X_acc, "Accuracy Scores", "blue", "purple")
_plot_metric_bars(X_pres, "Precision Scores", "red", "orange")
_plot_metric_bars(X_recall, "Recall Scores", "green", "limegreen")
_plot_metric_bars(X_f1, "F1 Scores", "teal", "lime")

In [None]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Raises ZeroDivisionError when ``lst`` is empty.
    """
    total = sum(lst)
    return total / len(lst)

# Series names and metric names used by the comparison chart.
# ("Accuarcy" was a misspelled axis label and is corrected here.)
retrie = ["Bag of Words","TFIDF"]
stat = ["Accuracy","Precision","Recall","F1"]

# The result lists hold 18 runs in logging order: indices 0-8 are the nine
# bag-of-words runs (cv1, cv2, cv3 x LR/SVM/KNN) and indices 9-17 are the nine
# TF-IDF runs. The earlier [:10] slice pulled the first TF-IDF run ("LRtf1")
# into every bag-of-words average.
# Each list is ordered like ``stat``: accuracy, precision, recall, F1.
BoW = [Average(scores[:9]) for scores in (X_acc, X_pres, X_recall, X_f1)]
Tfidf = [Average(scores[9:]) for scores in (X_acc, X_pres, X_recall, X_f1)]

In [None]:
# Print the averaged [accuracy, precision, recall, F1] lists for bag-of-words and TF-IDF.
print(BoW,Tfidf)

In [None]:
# Grouped bar chart of the per-family averages.
# figure.figsize is read at figure creation, so it is set before plotting.
plt.rcParams["figure.figsize"] = (10,5)
plt.figure()
# Both series are edge-aligned on the same tick: a negative width puts the
# bag-of-words bar to the left of the tick and a positive width puts the
# TF-IDF bar to the right. Mixing edge and centre alignment made the two bars
# overlap by half a bar width.
app1 = plt.bar(stat,BoW, width = -0.25, align = "edge", color = "teal")
app2 = plt.bar(stat,Tfidf, width = 0.25, align = "edge", color = "gold")
plt.legend(retrie)
plt.title("Bag of Words v. TFIDF",fontsize=15)
plt.ylim(0.7,0.825)
plt.show()

In [None]:

# Take shallow copies of the run labels and accuracies so later filtering
# leaves the original result lists untouched.
new_label = list(X_label)
new_acc = list(X_acc)

print(new_acc, new_label, sep="\n")


In [None]:
# Build the KNN-free label/accuracy lists by filtering on the run label
# instead of popping hard-coded indices in place. The result is the same on a
# first run, and the cell is now idempotent: re-running it no longer removes
# LR/SVM entries or raises IndexError.
non_knn = [(name, acc) for name, acc in zip(X_label, X_acc) if not name.startswith("KNN")]
new_label = [name for name, _ in non_knn]
new_acc = [acc for _, acc in non_knn]

print(new_acc)
print(new_label)

In [None]:
appr = ["Bag of Words","TFIDF"]

# Plot the KNN-free lists. The chart previously used X_label / X_acc, which
# still contain the KNN runs and therefore coloured bag-of-words runs as TF-IDF.
# new_label / new_acc hold 12 runs: the first 6 are bag-of-words (LR and SVM
# for cv1, cv2, cv3) and the last 6 are TF-IDF.
plt.rcParams["figure.figsize"] = (10,5)
plt.bar(new_label[:6],new_acc[:6], width = 0.75, color = "teal")
plt.bar(new_label[6:],new_acc[6:], width = 0.75, color = "gold")
plt.title("Accuracy Scores (w/o KNN)",fontsize=15)
plt.ylim(0.82,0.88)
plt.legend(appr)
plt.show()

In [None]:
# Split the KNN-free accuracies into one group per preprocessing variant.
# Each group starts as a full copy of new_acc and has the entries of the
# other two variants popped out, one index at a time.
defa, stop, stem = (list(new_acc) for _ in range(3))

# Pop sequences; indices are relative to the list as it shrinks. With the
# 12-entry layout (LR, SVM for cv1, cv2, cv3, tf1, tf2, tf3) they leave:
#   defa -> cv1 + tf1 runs, stop -> cv2 + tf2 runs, stem -> cv3 + tf3 runs.
num1=[2,2,2,2,4,4,4,4]
num2=[0,0,2,2,2,2,4,4]
num3=[0,0,0,0,2,2,2,2]

for group, drop_sequence in ((defa, num1), (stop, num2), (stem, num3)):
    for position in drop_sequence:
        group.pop(position)

In [None]:
print(defa,stop,stem)

# Mean accuracy per preprocessing variant, in the same order as ``method`` below.
avg_all = [Average(group) for group in (defa, stop, stem)]

print(avg_all)

method = ["No Filters","Stop","Stemming"]

app1 = plt.bar(method, avg_all, color = "navy", width = 0.75)
# Set after the bars are drawn, exactly as before.
plt.rcParams["figure.figsize"] = (10,5)
plt.title("Average Accuracy of Filter Methods",fontsize=15)
plt.ylim(0.83,0.87)
plt.show()