In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from module import MultinomialNBClassifier, RandomizedSearchCV, CrossValidation

## Data Preproessing

In [2]:
data_frame = pd.read_csv('cleaned/full_cleaned_data.csv')

In [3]:
def ubah(x):
    if x == 'neutral':
        return 1
    if x == 'negative':
        return 0
    if x == 'positive':
        return 2

In [4]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [5]:
# Step 2: Convert text to numerical features
countvector = CountVectorizer()
X = countvector.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X
y = labels.apply(ubah)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [6]:
X_train.shape, y_train.shape

((204, 1555), (204,))

In [7]:
X_train_np = X_train.values if hasattr(X_train, 'values') else X_train
y_train_np = y_train.values if hasattr(y_train, 'values') else y_train

## Model And Training Model

In [8]:
alpha_values = [0.1, 0.5, 1.0, 1.5, 2.0]
iter = 5

In [9]:
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

In [10]:
best_score, best_param = RandomizedSearchCV(X_train_np, y_train_np, iter, k_folds, alpha_values)

Alpha: 0.1, Average CV Accuracy: 0.436829268292683
Alpha: 0.5, Average CV Accuracy: 0.5103658536585366
Alpha: 1.0, Average CV Accuracy: 0.5298780487804878
Alpha: 1.5, Average CV Accuracy: 0.5151219512195122
Alpha: 2.0, Average CV Accuracy: 0.5247560975609756


In [11]:
classifier = MultinomialNBClassifier(alpha=best_param['alpha'])
classifier.fit(X_train, y_train)
classifier.show_process(X_test)


Instance 1:
Features: [0 0 0 ... 0 0 0]
  Class 'negative': Score = -29.123458976476382
  Class 'neutral': Score = -27.519614405467077
  Class 'positive': Score = -31.34420741616199
Predicted class: neutral

Instance 2:
Features: [0 0 0 ... 0 0 0]
  Class 'negative': Score = -98.04576275159276
  Class 'neutral': Score = -98.31898994188715
  Class 'positive': Score = -99.1867968495037
Predicted class: negative

Instance 3:
Features: [0 0 0 ... 0 0 0]
  Class 'negative': Score = -52.78656657540894
  Class 'neutral': Score = -50.7682890067784
  Class 'positive': Score = -54.54995003675853
Predicted class: neutral

Instance 4:
Features: [0 0 0 ... 0 0 0]
  Class 'negative': Score = -55.7442697748581
  Class 'neutral': Score = -54.858747963425046
  Class 'positive': Score = -52.965283303412974
Predicted class: positive

Instance 5:
Features: [0 0 0 ... 0 0 0]
  Class 'negative': Score = -44.166455849986015
  Class 'neutral': Score = -41.52366315683568
  Class 'positive': Score = -43.023055

In [12]:
print('Best Score: ', best_score)
print('Best Alpha: ', best_param['alpha'])

Best Score:  0.5298780487804878
Best Alpha:  1.0


In [13]:
classifier.info()

class priors: {'negative': 0.3872549019607843, 'neutral': 0.47058823529411764, 'positive': 0.14215686274509803}
feature probabilities: {'negative': array([0.00036088, 0.00072176, 0.00036088, ..., 0.00072176, 0.00036088,
       0.00072176]), 'neutral': array([0.00108225, 0.00036075, 0.00036075, ..., 0.0007215 , 0.00036075,
       0.00036075]), 'positive': array([0.00046019, 0.00046019, 0.00046019, ..., 0.00046019, 0.00184077,
       0.00046019])}


## Testing and Overview

In [14]:
# Step 5: Make predictions and evaluate the model
y_pred = classifier.predict(X_test)
y_pred.shape

(52,)

In [15]:
y_test = y_test.to_numpy().reshape((52,))

In [16]:
y_test.shape, y_pred.shape

((52,), (52,))

In [17]:
result_table = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test})
result_table

Unnamed: 0,y_pred,y_test
0,neutral,neutral
1,negative,neutral
2,neutral,neutral
3,positive,neutral
4,neutral,positive
5,neutral,neutral
6,neutral,neutral
7,negative,negative
8,neutral,negative
9,neutral,negative


In [18]:
report = classification_report(y_pred, y_test)

In [19]:
print(report)

              precision    recall  f1-score   support

    negative       0.57      0.53      0.55        15
     neutral       0.77      0.70      0.73        33
    positive       0.25      0.50      0.33         4

    accuracy                           0.63        52
   macro avg       0.53      0.58      0.54        52
weighted avg       0.67      0.63      0.65        52

