In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb

# Load dataset
df = pd.read_csv("creditcard.csv")  # Change the path if necessary

# Handling class imbalance
X = df.drop("Class", axis=1)
y = df["Class"]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply SMOTE for oversampling fraud cases
sm = SMOTE(sampling_strategy=0.2, random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Model training function
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict_proba(X_fold_val)[:, 1]
        auc_scores.append(roc_auc_score(y_fold_val, y_pred))

    final_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, final_pred)

    return np.mean(auc_scores), test_auc

# Define models
lightgbm_model = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=5, num_leaves=20, random_state=42)
xgboost_model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)

# Evaluate models
lightgbm_auc, lightgbm_test_auc = train_evaluate_model(lightgbm_model, X_train, y_train, X_test, y_test)
xgboost_auc, xgboost_test_auc = train_evaluate_model(xgboost_model, X_train, y_train, X_test, y_test)

# Compare results
print(f"LightGBM - Cross Validation AUC: {lightgbm_auc:.4f}, Test AUC: {lightgbm_test_auc:.4f}")
print(f"XGBoost - Cross Validation AUC: {xgboost_auc:.4f}, Test AUC: {xgboost_test_auc:.4f}")

if lightgbm_test_auc > xgboost_test_auc:
    print("LightGBM is the better performing model for your dataset!")
else:
    print("XGBoost is the better performing model for your dataset!")

[LightGBM] [Info] Number of positive: 36392, number of negative: 181960
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 218352, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Number of positive: 36392, number of negative: 181961
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 218353, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166666 -> initscore=-1.609443
[LightGBM] [Info] Start training from score -1.609443
[LightGB

In [5]:
!pip install optuna



In [7]:
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 10.0),
        "tree_method": "hist",
        "random_state": 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_fold_train, y_fold_train)

        y_pred = model.predict_proba(X_fold_val)[:, 1]
        auc_scores.append(roc_auc_score(y_fold_val, y_pred))

    return np.mean(auc_scores)

# Run optimization
study = optuna.create_study(direction="maximize",pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=10, n_jobs=4)

# Best parameters
best_params = study.best_params
print("Best hyperparameters found:", best_params)

# Train with best parameters
optimized_xgb = xgb.XGBClassifier(**best_params)
optimized_xgb.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = optimized_xgb.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_pred)

print(f"Optimized XGBoost Test AUC: {test_auc:.4f}")


[I 2025-06-07 21:36:09,985] A new study created in memory with name: no-name-c46643ad-4989-4e09-8a0a-e3124575e4f7
[I 2025-06-07 21:40:44,111] Trial 0 finished with value: 0.9999866199678555 and parameters: {'n_estimators': 336, 'learning_rate': 0.16260541270392093, 'max_depth': 4, 'min_child_weight': 9, 'gamma': 0.9194544940339705, 'subsample': 0.8786853874641749, 'colsample_bytree': 0.5168560035258449, 'scale_pos_weight': 9.87716406012022}. Best is trial 0 with value: 0.9999866199678555.
[I 2025-06-07 21:42:10,030] Trial 3 finished with value: 0.9999903491802445 and parameters: {'n_estimators': 455, 'learning_rate': 0.16646434773107632, 'max_depth': 4, 'min_child_weight': 3, 'gamma': 0.46784095694736394, 'subsample': 0.6052075468703377, 'colsample_bytree': 0.9633283300975456, 'scale_pos_weight': 5.810324762272433}. Best is trial 3 with value: 0.9999903491802445.
[I 2025-06-07 22:10:59,682] Trial 2 finished with value: 0.9999876087069772 and parameters: {'n_estimators': 321, 'learning_

Best hyperparameters found: {'n_estimators': 308, 'learning_rate': 0.13159978349323284, 'max_depth': 9, 'min_child_weight': 6, 'gamma': 0.13450040016036358, 'subsample': 0.9308347288768503, 'colsample_bytree': 0.8146521731009645, 'scale_pos_weight': 4.725115508465569}
Optimized XGBoost Test AUC: 0.9804


In [9]:
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Define Optuna objective function
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 250, 400),  # Smaller search range for efficiency
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.15),  # Avoid extreme values
        "max_depth": trial.suggest_int("max_depth", 7, 10),  # Test deeper trees for fraud detection
        "min_child_weight": trial.suggest_int("min_child_weight", 4, 8),
        "gamma": trial.suggest_float("gamma", 0.1, 0.5),  # Moderate regularization
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),  # Limit to high values for stability
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.9),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 4.0, 8.0),  # Fine-tuning imbalance handling
        "tree_method": "hist",  # Optimized for speed
        "random_state": 42,
    }

    # Stratified k-fold validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        model = xgb.XGBClassifier(**params, early_stopping_rounds=30)
        model.fit(X_fold_train, y_fold_train, eval_set=[(X_fold_val, y_fold_val)], verbose=False)

        y_pred = model.predict_proba(X_fold_val)[:, 1]
        auc_scores.append(roc_auc_score(y_fold_val, y_pred))

    return np.mean(auc_scores)

# Run Optuna tuning with pruning
study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=15, n_jobs=4)

# Best parameters
best_params = study.best_params
print("Optimized hyperparameters:", best_params)

# Train final optimized model
optimized_xgb = xgb.XGBClassifier(**best_params)
optimized_xgb.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = optimized_xgb.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_pred)

print(f"Final Optimized XGBoost Test AUC: {test_auc:.4f}")

[I 2025-06-07 22:58:40,949] A new study created in memory with name: no-name-9e5cad83-a3c3-4931-a584-5c59aa07225f
[I 2025-06-07 23:47:31,330] Trial 2 finished with value: 0.99999069880091 and parameters: {'n_estimators': 304, 'learning_rate': 0.12726072469401534, 'max_depth': 8, 'min_child_weight': 5, 'gamma': 0.3397176248268663, 'subsample': 0.9046360975443829, 'colsample_bytree': 0.8187897810713769, 'scale_pos_weight': 4.472811379047009}. Best is trial 2 with value: 0.99999069880091.
[I 2025-06-07 23:47:37,098] Trial 1 finished with value: 0.9999904122333476 and parameters: {'n_estimators': 351, 'learning_rate': 0.1401322464202462, 'max_depth': 8, 'min_child_weight': 4, 'gamma': 0.32500252467661284, 'subsample': 0.8223641657579351, 'colsample_bytree': 0.8656222288345152, 'scale_pos_weight': 4.65751002668649}. Best is trial 2 with value: 0.99999069880091.
[I 2025-06-07 23:48:25,092] Trial 0 finished with value: 0.9999910660458088 and parameters: {'n_estimators': 309, 'learning_rate': 

Optimized hyperparameters: {'n_estimators': 315, 'learning_rate': 0.1293347399717354, 'max_depth': 8, 'min_child_weight': 6, 'gamma': 0.1620729320650899, 'subsample': 0.9341307015017964, 'colsample_bytree': 0.7729095319340881, 'scale_pos_weight': 5.766386479576636}
Final Optimized XGBoost Test AUC: 0.9818
