In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from src.shared.json_tools import load_json_long
from paths import DATA_DIR

np.random.seed(42)  # for reproducibility
X = []
y = []

for file in (DATA_DIR / "selected").glob("*.json"):
    if len(X) >= 1000:
        break
    data = load_json_long(file)
    targets = [int(i["target"] >= 0.5) for i in data]
    features = [i["content"] for i in data]
    
    features = np.array(features)
    targets = np.array(targets)
    
    pos_indices = np.where(targets == 1)[0]
    neg_indices = np.where(targets == 0)[0]
    
    min_class_count = min(len(pos_indices), len(neg_indices))
    pos_sample = np.random.choice(pos_indices, min_class_count, replace=False)
    neg_sample = np.random.choice(neg_indices, min_class_count, replace=False)
    
    balanced_indices = np.concatenate([pos_sample, neg_sample])
    np.random.shuffle(balanced_indices)
    
    features_balanced = features[balanced_indices]
    y_balanced = targets[balanced_indices]
    
    X = X + features_balanced.tolist()
    y = y + y_balanced.tolist()

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
len(y)

3784

In [5]:
sum(y)

1892

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier

In [7]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    print(confusion_matrix(y_test, y_pred))

In [8]:
from sklearn.dummy import DummyClassifier

model = DummyClassifier(strategy="stratified")
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.48348745046235136
Precision: 0.5094850948509485
Recall: 0.4723618090452261
F1 Score: 0.49022164276401564

Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.50      0.48       359
           1       0.51      0.47      0.49       398

    accuracy                           0.48       757
   macro avg       0.48      0.48      0.48       757
weighted avg       0.49      0.48      0.48       757

[[178 181]
 [210 188]]


In [9]:
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7463672391017173
Precision: 0.7299107142857143
Recall: 0.821608040201005
F1 Score: 0.7730496453900709

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.66      0.71       359
           1       0.73      0.82      0.77       398

    accuracy                           0.75       757
   macro avg       0.75      0.74      0.74       757
weighted avg       0.75      0.75      0.74       757

[[238 121]
 [ 71 327]]


In [10]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.8428005284015853
Precision: 0.8236658932714617
Recall: 0.8919597989949749
F1 Score: 0.856453558504222

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83       359
           1       0.82      0.89      0.86       398

    accuracy                           0.84       757
   macro avg       0.85      0.84      0.84       757
weighted avg       0.84      0.84      0.84       757

[[283  76]
 [ 43 355]]


In [11]:
model = BernoulliNB()
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7080581241743725
Precision: 0.672514619883041
Recall: 0.8668341708542714
F1 Score: 0.7574094401756312

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.53      0.63       359
           1       0.67      0.87      0.76       398

    accuracy                           0.71       757
   macro avg       0.73      0.70      0.70       757
weighted avg       0.72      0.71      0.70       757

[[191 168]
 [ 53 345]]


In [12]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8441215323645971
Precision: 0.8167420814479638
Recall: 0.907035175879397
F1 Score: 0.8595238095238096

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       359
           1       0.82      0.91      0.86       398

    accuracy                           0.84       757
   macro avg       0.85      0.84      0.84       757
weighted avg       0.85      0.84      0.84       757

[[278  81]
 [ 37 361]]


In [13]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7239101717305152
Precision: 0.6893787575150301
Recall: 0.864321608040201
F1 Score: 0.7670011148272018

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.57      0.66       359
           1       0.69      0.86      0.77       398

    accuracy                           0.72       757
   macro avg       0.74      0.72      0.71       757
weighted avg       0.74      0.72      0.72       757

[[204 155]
 [ 54 344]]


In [14]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis(reg_param=0.1)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.7173051519154557
Precision: 0.6818181818181818
Recall: 0.8668341708542714
F1 Score: 0.7632743362831859

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.55      0.65       359
           1       0.68      0.87      0.76       398

    accuracy                           0.72       757
   macro avg       0.74      0.71      0.71       757
weighted avg       0.73      0.72      0.71       757

[[198 161]
 [ 53 345]]


In [15]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.6499339498018494
Precision: 0.7239057239057239
Recall: 0.5402010050251256
F1 Score: 0.6187050359712231

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.77      0.68       359
           1       0.72      0.54      0.62       398

    accuracy                           0.65       757
   macro avg       0.66      0.66      0.65       757
weighted avg       0.67      0.65      0.65       757

[[277  82]
 [183 215]]


In [16]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(verbose=0, random_state=42)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 0.8256274768824307
Precision: 0.7891304347826087
Recall: 0.9120603015075377
F1 Score: 0.8461538461538461

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.73      0.80       359
           1       0.79      0.91      0.85       398

    accuracy                           0.83       757
   macro avg       0.84      0.82      0.82       757
weighted avg       0.83      0.83      0.82       757

[[262  97]
 [ 35 363]]


# BEST IS XGBOOST

In [17]:
def evaluate(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    
    print(confusion_matrix(y_true, y_pred))

In [18]:
from xgboost import XGBClassifier
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

X_train_gpu = torch.from_numpy(np.array(X_train)).to(device)
y_train_gpu = torch.from_numpy(np.array(y_train)).to(device)

X_test_gpu = torch.from_numpy(np.array(X_test)).to(device)

model = XGBClassifier(
    device="cuda",  # ensure both training and prediction use GPU
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train_gpu, y_train_gpu)

Using device: cuda


In [19]:
y_pred = model.predict(X_test_gpu)

In [20]:
evaluate(y_test, y_pred)

Accuracy: 0.8388375165125496
Precision: 0.8108108108108109
Recall: 0.9045226130653267
F1 Score: 0.8551068883610451

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       359
           1       0.81      0.90      0.86       398

    accuracy                           0.84       757
   macro avg       0.84      0.84      0.84       757
weighted avg       0.84      0.84      0.84       757

[[275  84]
 [ 38 360]]


In [21]:
from xgboost import XGBClassifier
import torch
import numpy as np
from itertools import product
from sklearn.metrics import accuracy_score

# Assume X_train, y_train, X_test, y_test are already defined or split here
# X_train, X_test, y_train, y_test = train_test_split(...)

# Ensure you're on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Optional: Conversion to GPU tensors (not used directly by XGBoost)
X_train_gpu = torch.from_numpy(np.array(X_train)).to(device)
y_train_gpu = torch.from_numpy(np.array(y_train)).to(device)
X_test_gpu = torch.from_numpy(np.array(X_test)).to(device)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.5],
    "reg_alpha": [0, 0.1, 1],   # L1 regularization
    "reg_lambda": [1, 1.5, 3],  # L2 regularization
    "min_child_weight": [1, 3, 5]
}


keys = param_grid.keys()
values = param_grid.values()

all_combinations = [dict(zip(keys, v)) for v in product(*values)]

best_score = 0
best_params = None
results = []

for param in all_combinations:
    model = XGBClassifier(
        device="cuda",
        eval_metric="logloss",
        random_state=42,
        **param
    )
    
    model.fit(X_train_gpu, y_train_gpu)
    preds = model.predict(X_test_gpu)
    acc = accuracy_score(y_test, preds)

    result = {
        "params": param,
        "accuracy": acc
    }
    results.append(result)

    if acc > best_score:
        best_score = acc
        best_params = param

print("\nBest Parameters:")
print(best_params)
print(f"Best Accuracy: {best_score:.4f}")


Using device: cuda


KeyboardInterrupt: 

In [22]:
best_score

0.8599735799207398

In [23]:
best_params

{'n_estimators': 100,
 'max_depth': 10,
 'learning_rate': 0.1,
 'subsample': 0.6,
 'colsample_bytree': 1.0,
 'gamma': 0,
 'reg_alpha': 0.1,
 'reg_lambda': 3,
 'min_child_weight': 1}

In [24]:
result

{'params': {'n_estimators': 100,
  'max_depth': 10,
  'learning_rate': 0.1,
  'subsample': 0.8,
  'colsample_bytree': 0.6,
  'gamma': 0.5,
  'reg_alpha': 0.1,
  'reg_lambda': 1.5,
  'min_child_weight': 3},
 'accuracy': 0.8401585204755614}

Best Parameters:
{'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 1, 'reg_lambda': 1, 'min_child_weight': 1}
Best Accuracy: 0.8534
