In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

In [106]:
class MultinomialNBClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_features = X.shape[1]

        # Calculate class priors and feature likelihoods
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            # Count the occurrences of each feature in the class
            feature_counts = X_c.sum(axis=0)
            total_count = feature_counts.sum()

            # Calculate likelihood with Laplace smoothing
            self.feature_probs[c] = (feature_counts + self.alpha) / (total_count + self.alpha * n_features)

    def _calculate_posterior(self, x):
        posteriors = {}
        for c in self.classes:
            # Log of the prior for the class
            prior = np.log(self.class_priors[c])
            # Sum log-likelihoods of features
            likelihood = np.sum(x * np.log(self.feature_probs[c]))
            posteriors[c] = prior + likelihood
        return posteriors

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Select class with highest posterior
            y_pred.append(max(posteriors, key=posteriors.get))
        return np.array(y_pred)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        # Return the accuracy score
        return np.mean(y_pred == y)

In [107]:
data_frame = pd.read_csv('data/full_cleaned_data.csv')
data_frame = data_frame

In [108]:
for entry in data_frame:
    print(entry)

text
label


In [109]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [110]:
labels

0      positif
1      negatif
2      negatif
3       netral
4      positif
        ...   
372     netral
373     netral
374    negatif
375     netral
376     netral
Name: label, Length: 377, dtype: object

In [111]:
def ubah(x):
    if x == 'netral':
        return 1
    if x == 'negatif':
        return 0
    if x == 'positif':
        return 2

In [112]:
labels = labels.apply(ubah)

In [113]:
labels.value_counts()

label
1    151
0    124
2    102
Name: count, dtype: int64

In [114]:
# Step 2: Convert text to numerical features
countvector = CountVectorizer()
X = countvector.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [115]:
X_train_np = X_train.values if hasattr(X_train, 'values') else X_train
y_train_np = y_train.values if hasattr(y_train, 'values') else y_train

In [116]:
X_train.shape, y_train.shape

((301, 2149), (301,))

In [117]:
y_train

137    0
101    0
354    1
196    1
222    2
      ..
71     0
106    2
270    1
348    0
102    0
Name: label, Length: 301, dtype: int64

In [118]:
# Step 4: Train a Naive Bayes classifier
classifier = MultinomialNBClassifier()
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

In [119]:
for train_index, val_index in k_folds.split(X_train_np, y_train_np):
    X_train_fold, X_val_fold = X_train_np[train_index], X_train_np[val_index]
    y_train_fold, y_val_fold = y_train_np[train_index], y_train_np[val_index]

    classifier.fit(X_train_fold, y_train_fold)

    accuracy = classifier.score(X_val_fold, y_val_fold)
    scores.append(accuracy)

In [120]:
average_accuracy = np.mean(scores)
print('Cross Validation Scores: ', scores)
print('Average accuracy: ', average_accuracy)

Cross Validation Scores:  [0.5245901639344263, 0.4, 0.5333333333333333, 0.45, 0.48333333333333334]
Average accuracy:  0.4782513661202185


In [121]:
print("Cross Validation Score: ", scores)
print("Average CV Score: ", np.mean(scores))
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Score:  [0.5245901639344263, 0.4, 0.5333333333333333, 0.45, 0.48333333333333334]
Average CV Score:  0.4782513661202185
Number of CV Scores used in Average:  5


In [122]:
classifier.fit(X_train, y_train)

In [123]:
# Step 5: Make predictions and evaluate the model
y_pred = classifier.predict(X_test)
y_pred.shape

(76,)

In [124]:
y_test = y_test.to_numpy().reshape((76,))

In [125]:
y_test.shape, y_pred.shape

((76,), (76,))

In [126]:
print(pd.DataFrame({'y_pred': y_pred, 'y_test': y_test}))

    y_pred  y_test
0        0       0
1        0       1
2        0       1
3        2       0
4        0       0
..     ...     ...
71       0       2
72       0       1
73       1       0
74       1       1
75       1       1

[76 rows x 2 columns]


In [127]:
report = classification_report(y_pred, y_test)

In [128]:
print(report)

              precision    recall  f1-score   support

           0       0.62      0.52      0.57        29
           1       0.50      0.47      0.48        34
           2       0.20      0.31      0.24        13

    accuracy                           0.46        76
   macro avg       0.44      0.43      0.43        76
weighted avg       0.50      0.46      0.47        76

