In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

In [16]:
class MultinomialNBClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_features = X.shape[1]

        # Calculate class priors and feature likelihoods
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            # Count the occurrences of each feature in the class
            feature_counts = X_c.sum(axis=0)
            total_count = feature_counts.sum()

            # Calculate likelihood with Laplace smoothing
            self.feature_probs[c] = (feature_counts + self.alpha) / (total_count + self.alpha * n_features)

    def _calculate_posterior(self, x):
        posteriors = {}
        for c in self.classes:
            # Log of the prior for the class
            prior = np.log(self.class_priors[c])
            # Sum log-likelihoods of features
            likelihood = np.sum(x * np.log(self.feature_probs[c]))
            posteriors[c] = prior + likelihood
        return posteriors

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Select class with highest posterior
            y_pred.append(max(posteriors, key=posteriors.get))
        return np.array(y_pred)
    def score(self, X, y):
        # Predict labels for the input data
        y_pred = self.predict(X)
        # Calculate and return the accuracy
        return accuracy_score(y, y_pred)

In [17]:
data_frame = pd.read_csv('full_cleaned_data.csv')
data_frame = data_frame

In [18]:
for entry in data_frame:
    print(entry)

text
label


In [19]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [20]:
labels

0       neutral
1      negative
2      positive
3      negative
4      negative
         ...   
251     neutral
252     neutral
253     neutral
254     neutral
255     neutral
Name: label, Length: 256, dtype: object

In [21]:
def ubah(x):
    if x == 'neutral':
        return 1
    if x == 'negative':
        return 0
    if x == 'positive':
        return 2

In [22]:
labels = labels.apply(ubah)

In [23]:
labels

0      1
1      0
2      2
3      0
4      0
      ..
251    1
252    1
253    1
254    1
255    1
Name: label, Length: 256, dtype: int64

In [26]:
# Step 2: Convert text to numerical features
countvector = CountVectorizer()
X = countvector.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X

# Step 3: Split the dataset into training and testing sets
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = MultinomialNBClassifier()

best_score = 0  # Initialize variable to track the best score
best_train_index, best_test_index = None, None  # To store the best fold indices

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    # Fit the model on training data
    model.fit(X_train, y_train)
    
    # Evaluate on test data
    score = model.score(X_test, y_test)
    print("Score for current fold:", score)

    # Update best_score and store indices if the current score is higher
    if score > best_score:
        best_score = score
        best_train_index, best_test_index = train_index, test_index

# Re-fit the model on the best fold
X_train_best, X_test_best = X[best_train_index], X[best_test_index]
y_train_best, y_test_best = labels[best_train_index], labels[best_test_index]
model.fit(X_train_best, y_train_best)

print("Best score across all folds:", best_score)
print("Final evaluation on the best fold:", model.score(X_test_best, y_test_best))

Score for current fold: 0.6346153846153846
Score for current fold: 0.6470588235294118
Score for current fold: 0.5686274509803921
Score for current fold: 0.49019607843137253
Score for current fold: 0.5098039215686274
Best score across all folds: 0.6470588235294118
Final evaluation on the best fold: 0.6470588235294118


In [36]:
X_train_best.shape, y_train_best.shape

((205, 1555), (205,))

In [None]:
# Step 4: Train a Naive Bayes classifier
model = MultinomialNBClassifier()
model.fit(X_train, y_train)

In [29]:
# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test)
y_pred.shape

(51,)

In [31]:
y_test = y_test.to_numpy().reshape((51,))

In [32]:
y_test.shape, y_pred.shape

((51,), (51,))

In [33]:
print(pd.DataFrame({'y_pred': y_pred, 'y_test': y_test}))

    y_pred  y_test
0        0       0
1        1       1
2        1       1
3        0       0
4        1       1
5        1       1
6        0       0
7        1       1
8        0       0
9        0       0
10       1       1
11       1       1
12       1       1
13       2       2
14       0       0
15       0       0
16       1       1
17       0       0
18       0       0
19       0       0
20       1       0
21       0       0
22       2       2
23       1       1
24       0       0
25       0       0
26       1       1
27       0       0
28       0       0
29       1       1
30       0       0
31       2       2
32       1       1
33       0       0
34       1       1
35       1       1
36       1       1
37       0       0
38       2       2
39       2       2
40       2       2
41       1       1
42       1       1
43       1       1
44       0       0
45       2       2
46       0       0
47       0       0
48       1       1
49       1       1
50       1       1


In [34]:
report = classification_report(y_pred, y_test)

In [35]:
print(report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.96      0.98        23
           2       1.00      1.00      1.00         7

    accuracy                           0.98        51
   macro avg       0.98      0.99      0.98        51
weighted avg       0.98      0.98      0.98        51

