In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [7]:
x_tr_resample = pd.read_csv('../../Data/clean/X_train_smote.csv')
X_test = pd.read_csv('../../Data/clean/X_test.csv')
y_tr_resample = np.loadtxt("../../Data/clean/y_train_smote.csv", delimiter=",")
y_test = np.loadtxt("../../Data/clean/y_test.csv", delimiter=",")

In [8]:
skf_grid = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [9]:
lgbm_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 5, 10],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 50, 100],
    'min_child_samples': [20, 30, 50]
}

In [10]:
lgbm = LGBMClassifier(random_state=42, force_col_wise=True)

In [11]:
lgbm_grid = GridSearchCV(lgbm, lgbm_params, cv=skf_grid, n_jobs=-1, verbose=0)

In [12]:
lgbm_grid.fit(x_tr_resample, y_tr_resample)

[LightGBM] [Info] Number of positive: 6120, number of negative: 6120
[LightGBM] [Info] Total Bins 1411
[LightGBM] [Info] Number of data points in the train set: 12240, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [None]:
best_params = lgbm_grid.best_params_

In [13]:
best_model = LGBMClassifier(**best_params, random_state=42, force_col_wise=True)

In [None]:
skf_eval = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
recalls = []
precisions = []
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf_eval.split(x_tr_resample, y_tr_resample)):
    # Split data
    X_train_fold = x_tr_resample.iloc[train_idx]
    y_train_fold = y_tr_resample[train_idx]
    X_val_fold = x_tr_resample.iloc[val_idx]
    y_val_fold = y_tr_resample[val_idx]
    
    # Train model
    best_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred_fold = best_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracies.append(accuracy_score(y_val_fold, y_pred_fold))
    recalls.append(recall_score(y_val_fold, y_pred_fold, average='weighted'))
    precisions.append(precision_score(y_val_fold, y_pred_fold, average='weighted'))
    f1_scores.append(f1_score(y_val_fold, y_pred_fold, average='weighted'))

[LightGBM] [Info] Number of positive: 5508, number of negative: 5508
[LightGBM] [Info] Total Bins 1381
[LightGBM] [Info] Number of data points in the train set: 11016, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5508, number of negative: 5508
[LightGBM] [Info] Total Bins 1387
[LightGBM] [Info] Number of data points in the train set: 11016, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5508, number of negative: 5508
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 11016, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5508, number of negative: 5508
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 11016, number of 

In [17]:
best_model.fit(x_tr_resample, y_tr_resample)
y_pred_test = best_model.predict(X_test)

[LightGBM] [Info] Number of positive: 6120, number of negative: 6120
[LightGBM] [Info] Total Bins 1411
[LightGBM] [Info] Number of data points in the train set: 12240, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [18]:
print("LightGBM Results:")
print("-" * 50)
print("Best Parameters:", best_params)
print("\nCross-validation Results (10-fold):")
print(f"Accuracy    : {np.mean(accuracies)*100:.2f}% (+/- {np.std(accuracies)*100:.2f}%)")
print(f"Recall      : {np.mean(recalls)*100:.2f}% (+/- {np.std(recalls)*100:.2f}%)")
print(f"Precision   : {np.mean(precisions)*100:.2f}% (+/- {np.std(precisions)*100:.2f}%)")
print(f"F1-Score    : {np.mean(f1_scores)*100:.2f}% (+/- {np.std(f1_scores)*100:.2f}%)")

print("\nTest Set Results:")
print(f"Accuracy    : {accuracy_score(y_test, y_pred_test)*100:.2f}%")
print(f"Recall      : {recall_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"Precision   : {precision_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"F1-Score    : {f1_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print("-" * 50)


LightGBM Results:
--------------------------------------------------
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'min_child_samples': 30, 'n_estimators': 200, 'num_leaves': 50}

Cross-validation Results (10-fold):
Accuracy    : 99.40% (+/- 0.19%)
Recall      : 99.40% (+/- 0.19%)
Precision   : 99.40% (+/- 0.19%)
F1-Score    : 99.40% (+/- 0.19%)

Test Set Results:
Accuracy    : 99.19%
Recall      : 99.19%
Precision   : 99.19%
F1-Score    : 99.19%
--------------------------------------------------
