In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
# import warnings
# warnings.filterwarnings("ignore")

# print(lgb.create_tree_dumper.__doc__) 

In [59]:
x_tr_resample = pd.read_csv('../../Data/clean/X_train_smote.csv')
X_test = pd.read_csv('../../Data/clean/X_test.csv')
y_tr_resample = np.loadtxt("../../Data/clean/y_train_smote.csv", delimiter=",")
y_test = np.loadtxt("../../Data/clean/y_test.csv", delimiter=",")

In [60]:
skf_grid = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [61]:
# lgbm_params = {
#     'n_estimators': [300],                 # Fix karena sudah optimal
#     'max_depth': [8],                      # Fix karena sudah optimal
#     'learning_rate': [0.1],                # Fix karena sudah optimal
#     'num_leaves': [31],                    # Fix karena sudah optimal
#     'min_child_samples': [20],             # Fix karena sudah optimal
#     'subsample': [1.0],                    # Fix karena sudah optimal
#     'colsample_bytree': [0.7, 0.8, 0.9],   # Hanya ini yang kita cari
#     'device_type': ['gpu']
# }

lgbm_params = {
    'n_estimators': [300],                 # Fix karena sudah optimal
    'max_depth': [8],                      # Fix karena sudah optimal
    'learning_rate': [0.1],                # Fix karena sudah optimal
    'num_leaves': [15, 31, 63],            # Variasi jumlah daun
    'min_child_samples': [10, 20],         # Variasi ukuran anak minimum
    'subsample': [0.8, 0.9, 1.0],          # Variasi untuk subsample
    'colsample_bytree': [0.7, 0.8, 0.9],   # Variasi untuk colsample_bytree
    'device_type': ['gpu']                 # Tetap gunakan GPU
}



In [62]:
lgbm = LGBMClassifier(random_state=42, force_col_wise=True, verbose=-1, metric='auc')

In [63]:
lgbm_grid = GridSearchCV(lgbm, lgbm_params, cv=skf_grid, n_jobs=-1, verbose=0, scoring='accuracy')

In [64]:
lgbm_grid.fit(x_tr_resample, y_tr_resample)

In [65]:
best_params = lgbm_grid.best_params_

In [66]:
best_model = LGBMClassifier(**best_params, random_state=42, force_col_wise=True)

In [67]:
skf_eval = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
recalls = []
precisions = []
f1_scores = []
roc_aucs = []

for fold, (train_idx, val_idx) in enumerate(skf_eval.split(x_tr_resample, y_tr_resample)):
    # Split data
    X_train_fold = x_tr_resample.iloc[train_idx]
    y_train_fold = y_tr_resample[train_idx]
    X_val_fold = x_tr_resample.iloc[val_idx]
    y_val_fold = y_tr_resample[val_idx]
    
    # Train model
    best_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred_fold = best_model.predict(X_val_fold)
    y_pred_proba_fold = best_model.predict_proba(X_val_fold)[:, 1]
    
    # Calculate metrics
    accuracies.append(accuracy_score(y_val_fold, y_pred_fold))
    recalls.append(recall_score(y_val_fold, y_pred_fold, average='weighted'))
    precisions.append(precision_score(y_val_fold, y_pred_fold, average='weighted'))
    f1_scores.append(f1_score(y_val_fold, y_pred_fold, average='weighted'))
    roc_aucs.append(roc_auc_score(y_val_fold, y_pred_proba_fold))

In [68]:
best_model.fit(
    X_train_fold, 
    y_train_fold,
    eval_set=[(X_val_fold, y_val_fold)]
)
y_pred_test = best_model.predict(X_test)
y_pred_test_proba = best_model.predict_proba(X_test)[:, 1]

y_train_pred = best_model.predict(x_tr_resample)

In [69]:
print("LightGBM Results:")
print("-" * 50)
print("Best Parameters:", best_params)
print("\nCross-validation Results (10-fold):")
print(f"Accuracy    : {np.mean(accuracies)*100:.2f}% (+/- {np.std(accuracies)*100:.2f}%)")
print(f"Recall      : {np.mean(recalls)*100:.2f}% (+/- {np.std(recalls)*100:.2f}%)")
print(f"Precision   : {np.mean(precisions)*100:.2f}% (+/- {np.std(precisions)*100:.2f}%)")
print(f"F1-Score    : {np.mean(f1_scores)*100:.2f}% (+/- {np.std(f1_scores)*100:.2f}%)")

print("\nTrain Set Results:")
print(f"Accuracy    : {accuracy_score(y_tr_resample, y_train_pred)*100:.2f}%")
print(f"Recall      : {recall_score(y_tr_resample, y_train_pred, average='weighted')*100:.2f}%")
print(f"Precision   : {precision_score(y_tr_resample, y_train_pred, average='weighted')*100:.2f}%")
print(f"F1-Score    : {f1_score(y_tr_resample, y_train_pred, average='weighted')*100:.2f}%")

print("\nTest Set Results:")
print(f"Accuracy    : {accuracy_score(y_test, y_pred_test)*100:.2f}%")
print(f"Recall      : {recall_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"Precision   : {precision_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"F1-Score    : {f1_score(y_test, y_pred_test, average='weighted')*100:.2f}%")
print(f"ROC-AUC     : {roc_auc_score(y_test, y_pred_test_proba)*100:.2f}%")
print("-" * 50)


LightGBM Results:
--------------------------------------------------
Best Parameters: {'colsample_bytree': 0.8, 'device_type': 'gpu', 'learning_rate': 0.1, 'max_depth': 8, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 31, 'subsample': 0.8}

Cross-validation Results (10-fold):
Accuracy    : 99.43% (+/- 0.17%)
Recall      : 99.43% (+/- 0.17%)
Precision   : 99.43% (+/- 0.17%)
F1-Score    : 99.43% (+/- 0.17%)

Train Set Results:
Accuracy    : 99.95%
Recall      : 99.95%
Precision   : 99.95%
F1-Score    : 99.95%

Test Set Results:
Accuracy    : 99.14%
Recall      : 99.14%
Precision   : 99.13%
F1-Score    : 99.13%
ROC-AUC     : 99.92%
--------------------------------------------------
