In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

In [27]:
class MultinomialNBClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_features = X.shape[1]

        # Calculate class priors and feature likelihoods
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            # Count the occurrences of each feature in the class
            feature_counts = X_c.sum(axis=0)
            total_count = feature_counts.sum()

            # Calculate likelihood with Laplace smoothing
            self.feature_probs[c] = (feature_counts + self.alpha) / (total_count + self.alpha * n_features)

    def _calculate_posterior(self, x):
        posteriors = {}
        for c in self.classes:
            # Log of the prior for the class
            prior = np.log(self.class_priors[c])
            # Sum log-likelihoods of features
            likelihood = np.sum(x * np.log(self.feature_probs[c]))
            posteriors[c] = prior + likelihood
        return posteriors

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Select class with highest posterior
            y_pred.append(max(posteriors, key=posteriors.get))
        return np.array(y_pred)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        # Return the accuracy score
        return np.mean(y_pred == y)

In [28]:
data_frame = pd.read_csv('full_cleaned_data.csv')
data_frame = data_frame

In [29]:
for entry in data_frame:
    print(entry)

text
label


In [30]:
# Extract texts and labels
texts = data_frame['text']
labels = data_frame['label']

In [31]:
labels

0       neutral
1      negative
2      positive
3      negative
4      negative
         ...   
251     neutral
252     neutral
253     neutral
254     neutral
255     neutral
Name: label, Length: 256, dtype: object

In [32]:
def ubah(x):
    if x == 'neutral':
        return 1
    if x == 'negative':
        return 0
    if x == 'positive':
        return 2

In [33]:
labels = labels.apply(ubah)

In [34]:
labels

0      1
1      0
2      2
3      0
4      0
      ..
251    1
252    1
253    1
254    1
255    1
Name: label, Length: 256, dtype: int64

In [35]:
# Step 2: Convert text to numerical features
countvector = CountVectorizer()
X = countvector.fit_transform(texts)
X = X.toarray() if hasattr(X, "toarray") else X

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [36]:
X_train_np = X_train.values if hasattr(X_train, 'values') else X_train
y_train_np = y_train.values if hasattr(y_train, 'values') else y_train

In [37]:
X_train.shape, y_train.shape

((204, 1555), (204,))

In [38]:
y_train

143    1
84     1
55     1
220    1
104    1
      ..
106    0
14     1
92     1
179    1
102    0
Name: label, Length: 204, dtype: int64

In [39]:
# Step 4: Train a Naive Bayes classifier
classifier = MultinomialNBClassifier()
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

In [41]:
for train_index, val_index in k_folds.split(X_train_np, y_train_np):
    X_train_fold, X_val_fold = X_train_np[train_index], X_train_np[val_index]
    y_train_fold, y_val_fold = y_train_np[train_index], y_train_np[val_index]

    classifier.fit(X_train_fold, y_train_fold)

    accuracy = classifier.score(X_val_fold, y_val_fold)
    scores.append(accuracy)

In [42]:
average_accuracy = np.mean(scores)
print('Cross Validation Scores: ', scores)
print('Average accuracy: ', average_accuracy)

Cross Validation Scores:  [0.4878048780487805, 0.4146341463414634, 0.7317073170731707, 0.3902439024390244, 0.625]
Average accuracy:  0.5298780487804878


In [43]:
print("Cross Validation Score: ", scores)
print("Average CV Score: ", np.mean(scores))
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Score:  [0.4878048780487805, 0.4146341463414634, 0.7317073170731707, 0.3902439024390244, 0.625]
Average CV Score:  0.5298780487804878
Number of CV Scores used in Average:  5


In [44]:
classifier.fit(X_train, y_train)

In [45]:
# Step 5: Make predictions and evaluate the model
y_pred = classifier.predict(X_test)
y_pred.shape

(52,)

In [46]:
y_test = y_test.to_numpy().reshape((52,))

In [47]:
y_test.shape, y_pred.shape

((52,), (52,))

In [48]:
print(pd.DataFrame({'y_pred': y_pred, 'y_test': y_test}))

    y_pred  y_test
0        1       1
1        0       1
2        1       1
3        2       1
4        1       2
5        1       1
6        1       1
7        0       0
8        1       0
9        1       0
10       0       1
11       1       1
12       1       2
13       1       1
14       1       1
15       1       1
16       1       1
17       2       0
18       0       0
19       1       1
20       2       2
21       1       1
22       0       1
23       1       1
24       1       1
25       0       1
26       1       1
27       0       0
28       1       1
29       1       1
30       1       2
31       0       0
32       1       0
33       0       0
34       1       1
35       1       1
36       0       1
37       1       0
38       1       1
39       1       2
40       0       0
41       0       0
42       1       1
43       0       0
44       1       1
45       2       2
46       1       1
47       1       0
48       0       2
49       1       1
50       1       2
51       0  

In [49]:
report = classification_report(y_pred, y_test)

In [50]:
print(report)

              precision    recall  f1-score   support

           0       0.57      0.53      0.55        15
           1       0.77      0.70      0.73        33
           2       0.25      0.50      0.33         4

    accuracy                           0.63        52
   macro avg       0.53      0.58      0.54        52
weighted avg       0.67      0.63      0.65        52

