In [126]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from time import time

In [127]:
x_tr_resample = pd.read_csv('../../Data/clean/X_train_smote.csv')
X_test = pd.read_csv('../../Data/clean/X_test.csv')
y_tr_resample = np.loadtxt("../../Data/clean/y_train_smote.csv", delimiter=",")
y_test = np.loadtxt("../../Data/clean/y_test.csv", delimiter=",")

In [129]:
skf_grid = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [130]:
param_grid = {
    # Variance smoothing with wider range
    'var_smoothing': np.concatenate([
        np.logspace(-15, -12, 10),  # Ultra-low range
        np.logspace(-12, -9, 15),   # Low range
        np.logspace(-9, -6, 15),    # Medium range
        np.logspace(-6, -3, 10)     # High range
    ]),
    
    # Class priors with more diverse combinations
    'priors': [
        None,  # Let the model learn from data
        [0.05, 0.95], [0.95, 0.05],  # Highly imbalanced
        [0.1, 0.9], [0.15, 0.85], [0.2, 0.8], [0.25, 0.75],
        [0.3, 0.7], [0.35, 0.65],  # Moderately imbalanced
        [0.4, 0.6], [0.45, 0.55],  # Slightly imbalanced
        [0.5, 0.5],  # Balanced
        [0.55, 0.45], [0.6, 0.4],  # Reverse slight imbalance
        [0.65, 0.35], [0.7, 0.3],  # Reverse moderate imbalance
        [0.75, 0.25], [0.8, 0.2], [0.85, 0.15],  # Reverse high imbalance
        [0.9, 0.1], [0.95, 0.05]
    ],
}

In [None]:
gnb = GaussianNB()

In [131]:
grid_search = GridSearchCV(
    estimator=gnb,
    param_grid=param_grid,
    cv=skf_grid,
    n_jobs=-1,
    verbose=2,
    scoring={
        'accuracy': 'accuracy',
        'precision_weighted': 'precision_weighted',
        'recall_weighted': 'recall_weighted',
        'f1_weighted': 'f1_weighted',
        'roc_auc': 'roc_auc'
    },
    refit='f1_weighted',
    return_train_score=True
)

In [132]:
start_time = time()
grid_search.fit(x_tr_resample, y_tr_resample)
training_time = time() - start_time

Fitting 10 folds for each of 1050 candidates, totalling 10500 fits


In [139]:
best_params = grid_search.best_params_

In [None]:
best_model = GaussianNB(**best_params)

In [142]:
skf_eval = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
recalls = []
precisions = []
f1_scores = []
fold_predictions = []

for fold, (train_idx, val_idx) in enumerate(skf_eval.split(x_tr_resample, y_tr_resample)):
    # Split data
    X_train_fold = x_tr_resample.iloc[train_idx]
    y_train_fold = y_tr_resample[train_idx]
    X_val_fold = x_tr_resample.iloc[val_idx]
    y_val_fold = y_tr_resample[val_idx]
    
    # Train and predict
    best_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = best_model.predict(X_val_fold)
    
    # Store metrics
    accuracies.append(accuracy_score(y_val_fold, y_pred_fold))
    recalls.append(recall_score(y_val_fold, y_pred_fold, average='weighted'))
    precisions.append(precision_score(y_val_fold, y_pred_fold, average='weighted'))
    f1_scores.append(f1_score(y_val_fold, y_pred_fold, average='weighted'))
    fold_predictions.append((y_val_fold, y_pred_fold))

In [143]:
best_model.fit(x_tr_resample, y_tr_resample)
y_pred_test = best_model.predict(X_test)

In [144]:
print("Gaussian Naive Bayes Results:")
print("-" * 50)
print(f"Training Time: {training_time:.2f} seconds")
print("Best Parameters:", best_params)
print("\nCross-validation Results (10-fold):")
print(f"Accuracy    : {np.mean(accuracies)*100:.2f}% (+/- {np.std(accuracies)*100:.2f}%)")
print(f"Recall      : {np.mean(recalls)*100:.2f}% (+/- {np.std(recalls)*100:.2f}%)")
print(f"Precision   : {np.mean(precisions)*100:.2f}% (+/- {np.std(precisions)*100:.2f}%)")
print(f"F1-Score    : {np.mean(f1_scores)*100:.2f}% (+/- {np.std(f1_scores)*100:.2f}%)")

print("\nTest Set Results:")
print(f"Accuracy    : {accuracy_score(y_test, y_pred_test)*100:.2f}%")
print(f"Recall      : {recall_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"Precision   : {precision_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"F1-Score    : {f1_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print("-" * 50)

Gaussian Naive Bayes Results:
--------------------------------------------------
Training Time: 70.67 seconds
Best Parameters: {'priors': [0.95, 0.05], 'var_smoothing': 1e-15}

Cross-validation Results (10-fold):
Accuracy    : 95.54% (+/- 0.53%)
Recall      : 95.54% (+/- 0.53%)
Precision   : 95.59% (+/- 0.52%)
F1-Score    : 95.54% (+/- 0.53%)

Test Set Results:
Accuracy    : 96.34%
Recall      : 96.34%
Precision   : 96.43%
F1-Score    : 96.37%
--------------------------------------------------
