## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


## Data loading

In [None]:
# Read the pre-cleaned review dataset; the code below uses its 'review' (text)
# and 'sentiment' (0/1 label) columns.
# NOTE(review): the 'Unnamed: 0' column in the output looks like an index that
# was written into the CSV when it was saved - TODO confirm.
DATA_PATH = 'clean_data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
49995,thought movie right good job creative original...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school nu...,0
49998,going disagree previous comment side maltin on...,0


In [None]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
reviews = df['review'].values
labels = df['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

## Word Vectors


In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [None]:
import spacy

# Pipeline used below to turn each review into a document vector (`.vector`).
SPACY_MODEL = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Process every review with spaCy. `nlp.pipe` streams the texts through the
# pipeline in batches, which is considerably faster than calling `nlp(text)`
# once per review on a corpus of this size; the resulting Doc objects are the
# same.
train_docs = list(nlp.pipe(X_train))
test_docs = list(nlp.pipe(X_test))

In [None]:
# One dense vector per review, taken from each spaCy Doc.
# Stored under their own names instead of rebinding X_train / X_test: the
# TF-IDF section needs those two variables to still hold the raw review
# strings, otherwise the notebook fails on Restart & Run All.
# To train a model on the embeddings, pass X_train_vec / X_test_vec to it.
X_train_vec = np.vstack([doc.vector for doc in train_docs])
X_test_vec = np.vstack([doc.vector for doc in test_docs])

In [None]:
# Sanity check: sizes of (train features, train labels, test features, test labels).
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

(40000, 40000, 10000, 10000)

## Bag of words - TF-IDF (better)


In [None]:
# Fit the TF-IDF vocabulary / IDF weights on the training reviews only, then
# apply the fitted vectorizer to the test reviews so no test information leaks
# into the features.
# NOTE: X_train / X_test must contain raw review strings here (the vectorizer
# tokenizes text); both names are rebound to the TF-IDF matrices afterwards.
vectorizer = TfidfVectorizer()
X_train, X_test = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)



In [None]:
from collections import Counter

# Class balance check: count the positive (1) and negative (0) labels in each split.
y_train_counts, y_test_counts = Counter(y_train), Counter(y_test)

positive_train, negative_train = y_train_counts[1], y_train_counts[0]
positive_test, negative_test = y_test_counts[1], y_test_counts[0]

print("Train Data: Positive:", positive_train, "Negative:", negative_train)
print("Test Data: Positive:", positive_test, "Negative:", negative_test)


Train Data: Positive: 19961 Negative: 20039
Test Data: Positive: 5039 Negative: 4961


## Classification SVM

### SVM (RBF kernel)

In [None]:
# Support-vector classifier with an RBF kernel and C=8, trained on the current
# X_train features and evaluated on the held-out test set.
clf_svm = SVC(kernel='rbf', C=8)
y_pred = clf_svm.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set metrics for the SVM. With average=None the F1 / recall / precision
# functions return one score per class rather than a single aggregate.
acc_svm = accuracy_score(y_test, y_pred)
f1_svm, recall_svm, precision_svm = (
    metric(y_test, y_pred, average=None)
    for metric in (f1_score, recall_score, precision_score)
)

print(f'Test F1 score: {f1_svm}.')
print(f'Test Accuracy score: {acc_svm}.')
print(f'Test Recall score: {recall_svm}.')
print(f'Test Precision score: {precision_svm}.')

Test F1 score: [0.90164437 0.90567147].
Test Accuracy score: 0.9037.
Test Recall score: [0.88973997 0.91744394].
Test Precision score: [0.91387164 0.89419729].


In [None]:
# Random-forest baseline on the same features.
# random_state pins the bootstrap / feature sampling so the reported scores are
# reproducible (42 matches the seed used for the train/test split);
# n_jobs=-1 builds the trees on all cores and does not change the result.
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Per-class scores (average=None) plus overall accuracy on the test set.
f1_rf = f1_score(y_test, y_pred, average=None)
acc_rf = accuracy_score(y_test, y_pred)
recall_rf = recall_score(y_test, y_pred, average=None)
precision_rf = precision_score(y_test, y_pred, average=None)
print(f'Test F1 score: {f1_rf}.')
print(f'Test Accuracy score: {acc_rf}.')
print(f'Test Recall score: {recall_rf}.')
print(f'Test Precision score: {precision_rf}.')

Test F1 score: [0.85791475 0.85808515].
Test Accuracy score: 0.858.
Test Recall score: [0.86414029 0.85195475].
Test Precision score: [0.85177826 0.86430441].


In [None]:
# Gradient-boosting baseline on the same features.
# random_state fixes the estimator's internal randomness so repeated runs give
# the same scores (42 matches the seed used for the train/test split).
gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train, y_train)
y_pred = gradient_boosting.predict(X_test)

# Per-class scores (average=None) plus overall accuracy on the test set.
f1_gradient_boosting = f1_score(y_test, y_pred, average=None)
acc_gradient_boosting = accuracy_score(y_test, y_pred)
recall_gradient_boosting = recall_score(y_test, y_pred, average=None)
precision_gradient_boosting = precision_score(y_test, y_pred, average=None)
print(f'Test F1 score: {f1_gradient_boosting}.')
print(f'Test Accuracy score: {acc_gradient_boosting}.')
print(f'Test Recall score: {recall_gradient_boosting}.')
print(f'Test Precision score: {precision_gradient_boosting}.')

Test F1 score: [0.8031044  0.82518407].
Test Accuracy score: 0.8148.
Test Recall score: [0.76133844 0.86743401].
Test Precision score: [0.84971879 0.78685869].


### Stacking

In [None]:
# Base learners for the stacked ensemble.
# The SVM is bound to `svm_clf` (not `svm`) so the imported `sklearn.svm`
# module is not shadowed; shadowing it makes any later `svm.SVC(...)` call,
# including a re-run of this cell, fail with AttributeError.
svm_clf = SVC(kernel='rbf', C=8)
# Seeded so the ensemble's scores are reproducible between runs.
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)
# Meta-learner that combines the base learners' predictions.
log_reg = LogisticRegression()

estimators = [('svm', svm_clf), ('rf', random_forest), ('gb', gradient_boosting)]
stacking = StackingClassifier(estimators=estimators, final_estimator=log_reg)
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)


In [None]:
# Test-set metrics for the stacked ensemble. With average=None the F1 / recall /
# precision functions return one score per class rather than a single aggregate.
acc_stacking = accuracy_score(y_test, y_pred)
f1_stacking, recall_stacking, precision_stacking = (
    metric(y_test, y_pred, average=None)
    for metric in (f1_score, recall_score, precision_score)
)

print(f'Test F1 score: {f1_stacking}.')
print(f'Test Accuracy score: {acc_stacking}.')
print(f'Test Recall score: {recall_stacking}.')
print(f'Test Precision score: {precision_stacking}.')

### Grid Search

In [None]:
# Hyper-parameter search for the SVM over C and kernel.
# Disabled by default: 5-fold CV over 8 parameter combinations means 40 SVC
# fits (plus the final refit) on the full training set. Set the flag to True
# to run it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    param_grid = {
        'C': [1, 4, 8, 16],
        'kernel': ['linear', 'rbf'],
    }

    svm_classifier = SVC()
    grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    # predict() uses the best estimator refit on the whole training set.
    y_pred = grid_search.predict(X_test)

    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    print("Best Parameters:", best_params)
    print("Best Estimator:", best_estimator)

    # Reuse the predictions above instead of predicting a second time.
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)
    f1 = f1_score(y_test, y_pred, average=None)
    print(f'Test F1 score: {f1}.')