In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score

In [97]:
class MultinomialNBClassifier:
    def __init__(self, alpha):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_features = X.shape[1]

        # Calculate class priors and feature likelihoods
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            # Count the occurrences of each feature in the class
            feature_counts = X_c.sum(axis=0)
            total_count = feature_counts.sum()

            # Calculate likelihood with Laplace smoothing
            self.feature_probs[c] = (feature_counts + self.alpha) / (total_count + self.alpha * n_features)

    def _calculate_posterior(self, x):
        posteriors = {}
        for c in self.classes:
            # Log of the prior for the class
            prior = np.log(self.class_priors[c])
            # Sum log-likelihoods of features
            likelihood = np.sum(x * np.log(self.feature_probs[c]))
            posteriors[c] = prior + likelihood
        return posteriors

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Select class with highest posterior
            y_pred.append(max(posteriors, key=posteriors.get))
        return np.array(y_pred)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        # Return the accuracy score
        return np.mean(y_pred == y)
    
    def info(self):
        print(f'class priors: {self.class_priors}')
        print(f'feature probabilities: {self.feature_probs}')

    def show_process(self, X):
        for i, x in enumerate(X):
            print(f"\nInstance {i + 1}:")
            print(f"Features: {x}")
            posteriors = self._calculate_posterior(x)
            
            # Display scores for each class
            for c in self.classes:
                print(f"  Class '{c}': Score = {posteriors[c]}")
            
            # Determine and display the predicted class
            predicted_class = max(posteriors, key=posteriors.get)
            print(f"Predicted class: {predicted_class}")

In [98]:
def CrossValidation(X_train_np, y_train_np, k_folds, classifier):
    scores = []
    for train_index, val_index in k_folds.split(X_train_np, y_train_np):
        X_train_fold, X_val_fold = X_train_np[train_index], X_train_np[val_index]
        y_train_fold, y_val_fold = y_train_np[train_index], y_train_np[val_index]

        classifier.fit(X_train_fold, y_train_fold)

        accuracy = classifier.score(X_val_fold, y_val_fold)
        scores.append(accuracy)
    return scores

In [99]:
def RandomizedSearchCV(X_train_np, y_train_np, iter, k_folds, alpha_values):
    best_params = None
    best_score = 0

    for i in range(iter):
        fold_scores = []

        for trainidx, validx in k_folds.split(X_train_np, y_train_np):
            X_train_fold, X_val_fold = X_train_np[trainidx], X_train_np[validx]
            y_train_fold, y_val_fold = y_train_np[trainidx], y_train_np[validx]

            model = MultinomialNBClassifier(alpha = alpha_values[i])
            model.fit(X_train_fold, y_train_fold)

            y_val_pred = model.predict(X_val_fold)
            fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
            fold_scores.append(fold_accuracy)
        
        avg_score = np.mean(fold_scores)
        print(f"Alpha: {alpha_values[i]}, Average CV Accuracy: {avg_score}")

        if avg_score > best_score:
            best_score = avg_score
            best_params = {'alpha': alpha_values[i]}

    return best_score, best_params

In [100]:
alpha_values = [0.1, 0.5, 1.0, 1.5, 2.0]
iter = 5

In [101]:
data_frame = pd.read_csv('full_cleaned_data.csv')
data_frame = data_frame

In [102]:
for entry in data_frame:
    print(entry)

text
label


In [103]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [104]:
labels

0       neutral
1      negative
2      positive
3      negative
4      negative
         ...   
251     neutral
252     neutral
253     neutral
254     neutral
255     neutral
Name: label, Length: 256, dtype: object

In [105]:
def ubah(x):
    if x == 'neutral':
        return 1
    if x == 'negative':
        return 0
    if x == 'positive':
        return 2

In [106]:
labels = labels.apply(ubah)

In [107]:
labels

0      1
1      0
2      2
3      0
4      0
      ..
251    1
252    1
253    1
254    1
255    1
Name: label, Length: 256, dtype: int64

In [108]:
# Step 2: Convert text to numerical features
countvector = CountVectorizer()
X = countvector.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [109]:
X_train_np = X_train.values if hasattr(X_train, 'values') else X_train
y_train_np = y_train.values if hasattr(y_train, 'values') else y_train

In [110]:
X_train.shape, y_train.shape

((204, 1555), (204,))

In [111]:
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

In [112]:
best_score, best_param = RandomizedSearchCV(X_train_np, y_train_np, iter, k_folds, alpha_values)

Alpha: 0.1, Average CV Accuracy: 0.436829268292683
Alpha: 0.5, Average CV Accuracy: 0.5103658536585366
Alpha: 1.0, Average CV Accuracy: 0.5298780487804878
Alpha: 1.5, Average CV Accuracy: 0.5151219512195122
Alpha: 2.0, Average CV Accuracy: 0.5247560975609756


In [122]:
print('Best Score: ', best_score)
print('Best Alpha: ', best_param['alpha'])

Best Score:  0.5298780487804878
Best Alpha:  1.0


In [114]:
classifier = MultinomialNBClassifier(alpha=best_param['alpha'])
classifier.fit(X_train, y_train)
classifier.show_process(X_test)


Instance 1:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -29.123458976476382
  Class '1': Score = -27.519614405467077
  Class '2': Score = -31.34420741616199
Predicted class: 1

Instance 2:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -98.04576275159276
  Class '1': Score = -98.31898994188715
  Class '2': Score = -99.1867968495037
Predicted class: 0

Instance 3:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -52.78656657540894
  Class '1': Score = -50.7682890067784
  Class '2': Score = -54.54995003675853
Predicted class: 1

Instance 4:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -55.7442697748581
  Class '1': Score = -54.858747963425046
  Class '2': Score = -52.965283303412974
Predicted class: 2

Instance 5:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -44.166455849986015
  Class '1': Score = -41.52366315683568
  Class '2': Score = -43.023055922560914
Predicted class: 1

Instance 6:
Features: [0 0 0 ... 0 0 0]
  Class '0': Score = -72.99636079154033
  Class '1': Sco

In [115]:
classifier.info()

class priors: {0: 0.3872549019607843, 1: 0.47058823529411764, 2: 0.14215686274509803}
feature probabilities: {0: array([0.00036088, 0.00072176, 0.00036088, ..., 0.00072176, 0.00036088,
       0.00072176]), 1: array([0.00108225, 0.00036075, 0.00036075, ..., 0.0007215 , 0.00036075,
       0.00036075]), 2: array([0.00046019, 0.00046019, 0.00046019, ..., 0.00046019, 0.00184077,
       0.00046019])}


In [116]:
# Step 5: Make predictions and evaluate the model
y_pred = classifier.predict(X_test)
y_pred.shape

(52,)

In [117]:
y_test = y_test.to_numpy().reshape((52,))

In [118]:
y_test.shape, y_pred.shape

((52,), (52,))

In [119]:
result_table = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test})
result_table

Unnamed: 0,y_pred,y_test
0,1,1
1,0,1
2,1,1
3,2,1
4,1,2
5,1,1
6,1,1
7,0,0
8,1,0
9,1,0


In [120]:
report = classification_report(y_pred, y_test)

In [121]:
print(report)

              precision    recall  f1-score   support

           0       0.57      0.53      0.55        15
           1       0.77      0.70      0.73        33
           2       0.25      0.50      0.33         4

    accuracy                           0.63        52
   macro avg       0.53      0.58      0.54        52
weighted avg       0.67      0.63      0.65        52

