In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier

In [2]:
# Load the movie-review dataset from CSV and preview its first rows.
csv_path = 'datasets/movie.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
# Class balance check: number of rows per value of the 'label' column.
df.loc[:, 'label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

In [4]:
# Split the frame into one subset per label value (0 and 1) and compare sizes.
negative = df.loc[df['label'].eq(0)]
positive = df.loc[df['label'].eq(1)]
(negative.shape, positive.shape)

((20019, 2), (19981, 2))

In [10]:
# Hold out a test set: review text is the feature, 'label' is the target.
# test_size=0.25 is train_test_split's default, spelled out here for clarity;
# random_state=0 keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.25,
    random_state=0,
)
(X_train.shape, X_test.shape)

((30000,), (10000,))

In [None]:
# Naive Bayes classification.
# Pipeline stages: token counts -> TF-IDF weighting -> SMOTE oversampling
# (imblearn's Pipeline runs samplers during fit only, not at predict time)
# -> multinomial Naive Bayes with smoothing alpha=0.1.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=12)),
    ('mnb', MultinomialNB(alpha=0.1))
])

# Fit on the training reviews, then predict labels for the held-out reviews.
textclassifier.fit(X_train, y_train)
nb_pred = textclassifier.predict(X_test)

# Weighted averages combine the per-class scores in proportion to class support.
print('Naive Bayes classifier:')
print('Accuracy:', accuracy_score(y_test, nb_pred))
print('F1 score:', f1_score(y_test, nb_pred, average='weighted'))
print('Precision:', precision_score(y_test, nb_pred, average='weighted'))
print('Recall:', recall_score(y_test, nb_pred, average='weighted'))

# Naive Bayes confusion matrix.
# confusion_matrix returns true labels on the rows and predictions on the
# columns; the heatmap draws rows on the y-axis, so label the axes to match.
nb_cm = confusion_matrix(y_test, nb_pred)
sns.heatmap(nb_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Naive Bayes Classifier Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Gradient Boosting classification.
# Pipeline stages: token counts -> TF-IDF weighting -> SMOTE oversampling
# -> gradient-boosted trees.
# NOTE(review): the final step is still named 'mnb' although it holds a
# GradientBoostingClassifier; the name is kept so that any existing
# named_steps['mnb'] lookups keep working -- consider renaming it.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=12)),
    # Seed the booster so a fresh Restart & Run All reproduces the same model;
    # the data split and SMOTE are already seeded.
    ('mnb', GradientBoostingClassifier(random_state=12))
])

# Fit on the training reviews, then predict labels for the held-out reviews.
textclassifier.fit(X_train, y_train)

gb_pred = textclassifier.predict(X_test)

# Weighted averages combine the per-class scores in proportion to class support.
print('Gradient Boosting classifier:')
print('Accuracy:', accuracy_score(y_test, gb_pred))
print('F1 score:', f1_score(y_test, gb_pred, average='weighted'))
print('Precision:', precision_score(y_test, gb_pred, average='weighted'))
print('Recall:', recall_score(y_test, gb_pred, average='weighted'))

# Gradient Boosting confusion matrix.
# confusion_matrix returns true labels on the rows and predictions on the
# columns; the heatmap draws rows on the y-axis, so label the axes to match.
gb_cm = confusion_matrix(y_test, gb_pred)
sns.heatmap(gb_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Gradient Boosting Classifier Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# AdaBoost classification.
# Pipeline stages: token counts -> TF-IDF weighting -> SMOTE oversampling
# -> AdaBoost ensemble.
# NOTE(review): the final step is still named 'mnb' although it holds an
# AdaBoostClassifier; the name is kept so that any existing
# named_steps['mnb'] lookups keep working -- consider renaming it.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=12)),
    # Seed the ensemble so a fresh Restart & Run All reproduces the same model;
    # the data split and SMOTE are already seeded.
    ('mnb', AdaBoostClassifier(random_state=12))
])

# Fit on the training reviews, then predict labels for the held-out reviews.
textclassifier.fit(X_train, y_train)

ac_pred = textclassifier.predict(X_test)

# Weighted averages combine the per-class scores in proportion to class support.
print('Adaboost classifier:')
print('Accuracy:', accuracy_score(y_test, ac_pred))
print('F1 score:', f1_score(y_test, ac_pred, average='weighted'))
print('Precision:', precision_score(y_test, ac_pred, average='weighted'))
print('Recall:', recall_score(y_test, ac_pred, average='weighted'))

# AdaBoost confusion matrix.
# confusion_matrix returns true labels on the rows and predictions on the
# columns; the heatmap draws rows on the y-axis, so label the axes to match.
ac_cm = confusion_matrix(y_test, ac_pred)
sns.heatmap(ac_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Adaboost Classifier Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()