In [10]:
import statistics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_validate, train_test_split

from sklearn.naive_bayes import MultinomialNB

In [None]:
# Sample-size experiment: evaluate the text classifier when it only has
# TRAIN_SIZE rows to learn from. Each run appends one entry to `result`,
# so runs with different sizes can be compared side by side.
result = []

TRAIN_SIZE = 20000  # rows used for fitting and for cross-validation
N_FOLDS = 10        # number of cross-validation folds

# Load dataset
df = pd.read_csv('datasets/movie.csv')

# Keep TRAIN_SIZE rows for training; the remainder becomes the hold-out set.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], random_state=42, train_size=TRAIN_SIZE
)

# Build the model: bag-of-words counts -> TF-IDF weighting -> SMOTE
# oversampling -> multinomial naive Bayes. The pipeline comes from imblearn
# because it is the variant that accepts a sampler step such as SMOTE.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=12)),
    ('nb', MultinomialNB())
])

# Fit on the training subset so `textclassifier` itself is left trained.
# cross_validate below works on clones and does not rely on this fit.
textclassifier.fit(X_train, y_train)

# Metrics to collect for every fold
scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']

# N_FOLDS-fold cross-validation on the TRAIN_SIZE-row subset.
# `cv` is passed explicitly because scikit-learn's default is 5 folds, and
# the subset (not the full frame) is used so the scores really describe the
# dataset size recorded below.
scores = cross_validate(textclassifier, X_train, y_train, scoring=scoring, cv=N_FOLDS)

# Store the cross-validation scores together with the dataset size they belong to
result.append({
    'dataset': TRAIN_SIZE,
    'scores': scores
})

In [None]:
# Per-fold macro-averaged precision of the first recorded run.
_first_run = result[0]
precision_macro = _first_run['scores']['test_precision_macro']

# Show the raw per-fold values, then their mean and sample standard deviation.
print('Precision (macro):', precision_macro)
for _caption, _summarise in (
    ('Precision Mean:', statistics.mean),
    ('Precision Stdev:', statistics.stdev),
):
    print(_caption, _summarise(precision_macro))

In [None]:
# Per-fold macro-averaged recall of the first recorded run.
_first_run = result[0]
recall_macro = _first_run['scores']['test_recall_macro']

# Show the raw per-fold values, then their mean and sample standard deviation.
print('Recall (macro):', recall_macro)
for _caption, _summarise in (
    ('Recall Mean:', statistics.mean),
    ('Recall Stdev:', statistics.stdev),
):
    print(_caption, _summarise(recall_macro))

In [None]:
# Per-fold macro-averaged F1 score of the first recorded run.
_first_run = result[0]
f1_macro = _first_run['scores']['test_f1_macro']

# Show the raw per-fold values, then their mean and sample standard deviation.
print('f1 (macro):', f1_macro)
for _caption, _summarise in (
    ('f1 Mean:', statistics.mean),
    ('f1 Stdev:', statistics.stdev),
):
    print(_caption, _summarise(f1_macro))

In [None]:
# Per-fold accuracy of the first recorded run.
accuracy = result[0]['scores']['test_accuracy']

# Show the raw per-fold values, then their mean and sample standard deviation.
# The summary labels previously read "f1" (copied from the F1 cell) although
# the numbers are accuracy statistics.
print('Accuracy:', accuracy)
print('Accuracy Mean:', statistics.mean(accuracy))
print('Accuracy Stdev:', statistics.stdev(accuracy))