In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [116]:
class MultinomialNBClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_features = X.shape[1]

        # Calculate class priors and feature likelihoods
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            # Count the occurrences of each feature in the class
            feature_counts = X_c.sum(axis=0)
            total_count = feature_counts.sum()

            # Calculate likelihood with Laplace smoothing
            self.feature_probs[c] = (feature_counts + self.alpha) / (total_count + self.alpha * n_features)

    def _calculate_posterior(self, x):
        posteriors = {}
        for c in self.classes:
            # Log of the prior for the class
            prior = np.log(self.class_priors[c])
            # Sum log-likelihoods of features
            likelihood = np.sum(x * np.log(self.feature_probs[c]))
            posteriors[c] = prior + likelihood
        return posteriors

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Select class with highest posterior
            y_pred.append(max(posteriors, key=posteriors.get))
        return np.array(y_pred)

In [117]:
data_frame = pd.read_csv('full_cleaned_data.csv')
data_frame = data_frame

In [118]:
for entry in data_frame:
    print(entry)

text
label


In [None]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [None]:
def ubah(x):
    if x == 'neutral':
        return 1
    if x == 'negative':
        return 0
    if x == 'positive':
        return 2

In [None]:
# Step 2: Convert text to numerical features
vectorizer = TfidfTransformer()
X = vectorizer.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X

labels = []

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

ValueError: could not convert string to float: 'effek kampanye pake sound horeg fufufafa biarin aja turun kelas amin'

In [None]:
X_train.shape, y_train.shape

((204, 1555), (204,))

In [108]:
# Step 4: Train a Naive Bayes classifier
classifier = MultinomialNBClassifier()
classifier.fit(X_train, y_train)

In [109]:
# Step 5: Make predictions and evaluate the model
y_pred = classifier.predict(X_test)

In [110]:
y_test.to_numpy()

array(['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral',
       'neutral', 'negative', 'negative', 'negative', 'neutral',
       'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral',
       'negative', 'negative', 'neutral', 'positive', 'neutral',
       'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative',
       'neutral', 'neutral', 'positive', 'negative', 'negative',
       'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral',
       'positive', 'negative', 'negative', 'neutral', 'negative',
       'neutral', 'positive', 'neutral', 'negative', 'positive',
       'neutral', 'positive', 'neutral'], dtype=object)

In [111]:
print(pd.DataFrame({'y_pred': y_pred, 'y_test': y_test}))

       y_pred    y_test
228   neutral   neutral
6    negative   neutral
79    neutral   neutral
206  positive   neutral
117   neutral  positive
185   neutral   neutral
242   neutral   neutral
167  negative  negative
9     neutral  negative
30    neutral  negative
180  negative   neutral
222   neutral   neutral
230   neutral  positive
217   neutral   neutral
136   neutral   neutral
68    neutral   neutral
199   neutral   neutral
15   positive  negative
96   negative  negative
24    neutral   neutral
235  positive  positive
19    neutral   neutral
120  negative   neutral
152   neutral   neutral
33    neutral   neutral
124  negative   neutral
253   neutral   neutral
208  negative  negative
10    neutral   neutral
164   neutral   neutral
184   neutral  positive
97   negative  negative
148   neutral  negative
190  negative  negative
223   neutral   neutral
25    neutral   neutral
86   negative   neutral
18    neutral  negative
75    neutral   neutral
137   neutral  positive
196  negative  n

In [112]:
report = classification_report(y_pred, y_test)

In [113]:
print(report)

              precision    recall  f1-score   support

    negative       0.57      0.53      0.55        15
     neutral       0.77      0.70      0.73        33
    positive       0.25      0.50      0.33         4

    accuracy                           0.63        52
   macro avg       0.53      0.58      0.54        52
weighted avg       0.67      0.63      0.65        52

