In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.shared.json_tools import load_json_long
from paths import DATA_DIR

data = load_json_long(DATA_DIR / "test/03-04-January_1000.json")

In [21]:
import numpy as np

X = [i["content"] for i in data]
y = [i["target"] for i in data]

# Convert to NumPy arrays if not already
X = np.array(X)
targets = np.array(y)

# Get indices of each class
pos_indices = np.where(targets == 1)[0]
neg_indices = np.where(targets == 0)[0]

# Balance: choose the smaller class count
min_class_count = min(len(pos_indices), len(neg_indices))

# Randomly sample from both classes
np.random.seed(42)  # for reproducibility
pos_sample = np.random.choice(pos_indices, min_class_count, replace=False)
neg_sample = np.random.choice(neg_indices, min_class_count, replace=False)

# Combine samples and shuffle
balanced_indices = np.concatenate([pos_sample, neg_sample])
np.random.shuffle(balanced_indices)

# Apply to data
X = X[balanced_indices]
y = targets[balanced_indices]

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
len(y)

110

In [24]:
sum(y)

np.int64(55)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier

In [26]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    print(confusion_matrix(y_test, y_pred))

In [27]:
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.6363636363636364
Precision: 0.6153846153846154
Recall: 0.7272727272727273
F1 Score: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.55      0.60        11
           1       0.62      0.73      0.67        11

    accuracy                           0.64        22
   macro avg       0.64      0.64      0.63        22
weighted avg       0.64      0.64      0.63        22

[[6 5]
 [3 8]]


In [28]:
model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.6818181818181818
Precision: 0.6666666666666666
Recall: 0.7272727272727273
F1 Score: 0.6956521739130435

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.64      0.67        11
           1       0.67      0.73      0.70        11

    accuracy                           0.68        22
   macro avg       0.68      0.68      0.68        22
weighted avg       0.68      0.68      0.68        22

[[7 4]
 [3 8]]


In [29]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7727272727272727
Precision: 0.8
Recall: 0.7272727272727273
F1 Score: 0.7619047619047619

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.80      0.73      0.76        11

    accuracy                           0.77        22
   macro avg       0.78      0.77      0.77        22
weighted avg       0.78      0.77      0.77        22

[[9 2]
 [3 8]]


In [30]:
model = BernoulliNB()
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7272727272727273
Precision: 0.7272727272727273
Recall: 0.7272727272727273
F1 Score: 0.7272727272727273

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.73      0.73        11
           1       0.73      0.73      0.73        11

    accuracy                           0.73        22
   macro avg       0.73      0.73      0.73        22
weighted avg       0.73      0.73      0.73        22

[[8 3]
 [3 8]]


In [31]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8181818181818182
Precision: 0.8181818181818182
Recall: 0.8181818181818182
F1 Score: 0.8181818181818182

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           1       0.82      0.82      0.82        11

    accuracy                           0.82        22
   macro avg       0.82      0.82      0.82        22
weighted avg       0.82      0.82      0.82        22

[[9 2]
 [2 9]]
