In [1]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("HIGGS_short.csv")

# Separate the binary target column from the feature columns.
y = df["label"]
X = df.drop(columns=["label"])

# Stratified 70 / 15 / 15 split: hold out 30% first, then cut that
# hold-out in half to obtain the validation and test partitions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# LightGBM datasets shared by the tuning trials and the final fit.
# The validation set is tied to the training set via `reference` so that it
# is binned with the training set's feature bin mappers (the alignment the
# LightGBM docs ask for) regardless of where it ends up being constructed.
train_data = lgb.Dataset(X_train, label=y_train)
val_data   = lgb.Dataset(X_val, label=y_val, reference=train_data)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def objective(trial):
    """Optuna objective: train one LightGBM booster and score it on validation.

    Draws a hyperparameter set from ``trial``, trains on the module-level
    ``train_data`` for up to 3000 boosting rounds with early stopping
    (100 rounds without improvement on ``val_data``), and returns the
    ROC-AUC of the resulting predictions on ``(X_val, y_val)``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the tunable hyperparameters.

    Returns
    -------
    float
        Validation ROC-AUC; the study is expected to maximise it.
    """
    # Settings that are identical for every trial.
    fixed_params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        # `train_data` is built once and reused while `min_data_in_leaf`
        # changes between trials; LightGBM's default feature pre-filtering
        # depends on `min_data_in_leaf`, so it has to be switched off.
        "feature_pre_filter": False,
    }

    # Search space (suggest calls kept in a fixed order).
    tuned_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.20),
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
    }

    params = {**fixed_params, **tuned_params, "verbose": -1}

    stop_when_stalled = lgb.early_stopping(stopping_rounds=100)
    booster = lgb.train(
        params,
        train_data,
        num_boost_round=3000,
        valid_sets=[val_data],
        callbacks=[stop_when_stalled],
    )

    # Score the fitted booster on the held-out validation rows.
    val_scores = booster.predict(X_val)
    return roc_auc_score(y_val, val_scores)

In [3]:
# Seed the sampler explicitly so the sequence of suggested hyperparameters is
# repeatable across kernel restarts (the data splits are already seeded with 42).
# TPESampler is what Optuna uses by default for single-objective studies, so
# only the seeding changes, not the search algorithm.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=5)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

[I 2025-11-25 15:31:14,231] A new study created in memory with name: no-name-5a25983e-52f9-422b-9850-53f24f8230dd


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's auc: 0.850303


[I 2025-11-25 15:53:39,818] Trial 0 finished with value: 0.8503027030435572 and parameters: {'learning_rate': 0.08772078223795904, 'num_leaves': 492, 'max_depth': 8, 'min_data_in_leaf': 131, 'feature_fraction': 0.6206535679789964, 'bagging_fraction': 0.7990773117694543, 'bagging_freq': 6, 'lambda_l1': 1.7208458042098496, 'lambda_l2': 7.878662260515488}. Best is trial 0 with value: 0.8503027030435572.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's auc: 0.829619


[I 2025-11-25 16:09:19,107] Trial 1 finished with value: 0.8296185083353139 and parameters: {'learning_rate': 0.06098920232795587, 'num_leaves': 133, 'max_depth': 4, 'min_data_in_leaf': 135, 'feature_fraction': 0.9237993932949609, 'bagging_fraction': 0.9123818379845794, 'bagging_freq': 1, 'lambda_l1': 3.754036629318257, 'lambda_l2': 8.3871531692608}. Best is trial 0 with value: 0.8503027030435572.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2998]	valid_0's auc: 0.775295


[I 2025-11-25 16:17:26,050] Trial 2 finished with value: 0.7752953739282303 and parameters: {'learning_rate': 0.1472857945205505, 'num_leaves': 149, 'max_depth': 1, 'min_data_in_leaf': 51, 'feature_fraction': 0.6231916290393269, 'bagging_fraction': 0.5472274593235451, 'bagging_freq': 10, 'lambda_l1': 3.0543124618482196, 'lambda_l2': 2.575037963881095}. Best is trial 0 with value: 0.8503027030435572.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's auc: 0.825029


[I 2025-11-25 16:28:00,388] Trial 3 finished with value: 0.8250290450370199 and parameters: {'learning_rate': 0.11010227533008995, 'num_leaves': 283, 'max_depth': 3, 'min_data_in_leaf': 159, 'feature_fraction': 0.6572956308796586, 'bagging_fraction': 0.7900960779728381, 'bagging_freq': 6, 'lambda_l1': 0.5990017999162534, 'lambda_l2': 5.712795362890325}. Best is trial 0 with value: 0.8503027030435572.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's auc: 0.84161


[I 2025-11-25 16:44:23,628] Trial 4 finished with value: 0.8416099512224868 and parameters: {'learning_rate': 0.13978333783012709, 'num_leaves': 398, 'max_depth': 5, 'min_data_in_leaf': 150, 'feature_fraction': 0.7624121765934546, 'bagging_fraction': 0.5320806209363824, 'bagging_freq': 6, 'lambda_l1': 7.3849960508032355, 'lambda_l2': 6.319484867767358}. Best is trial 0 with value: 0.8503027030435572.


Best AUC: 0.8503027030435572
Best params: {'learning_rate': 0.08772078223795904, 'num_leaves': 492, 'max_depth': 8, 'min_data_in_leaf': 131, 'feature_fraction': 0.6206535679789964, 'bagging_fraction': 0.7990773117694543, 'bagging_freq': 6, 'lambda_l1': 1.7208458042098496, 'lambda_l2': 7.878662260515488}


In [4]:
# Final fit: the best trial's hyperparameters combined with the same fixed
# settings that were used inside the tuning objective.
best_params = {
    **study.best_params,
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,
    "feature_pre_filter": False,
}

# Larger round budget than during tuning (5000), still early-stopped on the
# validation set after 100 rounds without improvement.
final_stopper = lgb.early_stopping(stopping_rounds=100)
final_model = lgb.train(
    best_params,
    train_data,
    num_boost_round=5000,
    valid_sets=[val_data],
    callbacks=[final_stopper],
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's auc: 0.852139


In [5]:
# Score the held-out test split with the final booster.
preds_proba = final_model.predict(X_test)

# Hard class labels: positive when the predicted score exceeds 0.5.
is_positive = preds_proba > 0.5
preds = is_positive.astype(int)

# Ranking metrics use the raw scores; accuracy uses the thresholded labels.
auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

# Report, with every metric rounded to five decimals.
print("\n================= TUNED LGBM RESULTS =================")
for label, value in (("ROC-AUC:", auc), ("PR-AUC:", pr_auc), ("Accuracy:", acc)):
    print(label, round(value, 5))
print("Best Params:", best_params)
print("======================================================")


ROC-AUC: 0.85187
PR-AUC: 0.86425
Accuracy: 0.7676
Best Params: {'learning_rate': 0.08772078223795904, 'num_leaves': 492, 'max_depth': 8, 'min_data_in_leaf': 131, 'feature_fraction': 0.6206535679789964, 'bagging_fraction': 0.7990773117694543, 'bagging_freq': 6, 'lambda_l1': 1.7208458042098496, 'lambda_l2': 7.878662260515488, 'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'verbose': -1, 'feature_pre_filter': False}


In [7]:
# Persist the tuned booster in LightGBM's text format.
# Create the output directory first so a fresh checkout (without an existing
# "Models" folder) does not fail when the file is written.
model_path = Path("Models/lgbm_tuned.txt")
model_path.parent.mkdir(parents=True, exist_ok=True)

# Trailing semicolon suppresses the Booster repr that save_model returns.
final_model.save_model(str(model_path));

<lightgbm.basic.Booster at 0x262753d65d0>