In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import xgboost as xgb
import optuna

# Load data
df = pd.read_csv("HIGGS_short.csv")

y = df["label"]
X = df.drop(columns=["label"])

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

train_d = xgb.DMatrix(X_train, label=y_train)
val_d   = xgb.DMatrix(X_val,   label=y_val)
test_d  = xgb.DMatrix(X_test,  label=y_test)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def objective(trial):

    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",        # for CPU speed
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 10.0),
        "lambda": trial.suggest_float("lambda", 0.0, 10.0),
        "alpha": trial.suggest_float("alpha", 0.0, 10.0),
    }

    evals = [(val_d, "valid")]

    model = xgb.train(
        params,
        train_d,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=100,
        verbose_eval=False
    )

    preds = model.predict(val_d)
    auc = roc_auc_score(y_val, preds)

    return auc

In [3]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

[I 2025-11-25 17:26:19,623] A new study created in memory with name: no-name-af5a0e29-b8d2-4b02-9bcc-9bef1f19014c
[I 2025-11-25 17:45:45,498] Trial 0 finished with value: 0.8464788139219784 and parameters: {'learning_rate': 0.06469823573663598, 'max_depth': 7, 'min_child_weight': 10, 'subsample': 0.7614541918386395, 'colsample_bytree': 0.75878723850793, 'colsample_bylevel': 0.6784724052962197, 'gamma': 1.2492658191990247, 'lambda': 8.76910854814445, 'alpha': 2.164769112665621}. Best is trial 0 with value: 0.8464788139219784.
[I 2025-11-25 18:00:49,035] Trial 1 finished with value: 0.8389532365714192 and parameters: {'learning_rate': 0.1857329686436827, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.5065851195203814, 'colsample_bytree': 0.7252068675922894, 'colsample_bylevel': 0.8774822368250362, 'gamma': 8.622113006492956, 'lambda': 5.067315771643592, 'alpha': 1.541421714709691}. Best is trial 0 with value: 0.8464788139219784.
[I 2025-11-25 18:15:56,135] Trial 2 finished with va

Best AUC: 0.8464788139219784
Best Params: {'learning_rate': 0.06469823573663598, 'max_depth': 7, 'min_child_weight': 10, 'subsample': 0.7614541918386395, 'colsample_bytree': 0.75878723850793, 'colsample_bylevel': 0.6784724052962197, 'gamma': 1.2492658191990247, 'lambda': 8.76910854814445, 'alpha': 2.164769112665621}


In [4]:
best_params = study.best_params
best_params["objective"] = "binary:logistic"
best_params["eval_metric"] = "auc"
best_params["tree_method"] = "hist"

final_model = xgb.train(
    best_params,
    train_d,
    num_boost_round=5000,
    evals=[(val_d, "valid")],
    early_stopping_rounds=100,
    verbose_eval=100
)

[0]	valid-auc:0.72612
[100]	valid-auc:0.81259
[200]	valid-auc:0.82035
[300]	valid-auc:0.82435
[400]	valid-auc:0.82746
[500]	valid-auc:0.82992
[600]	valid-auc:0.83179
[700]	valid-auc:0.83345
[800]	valid-auc:0.83499
[900]	valid-auc:0.83615
[1000]	valid-auc:0.83715
[1100]	valid-auc:0.83810
[1200]	valid-auc:0.83903
[1300]	valid-auc:0.83981
[1400]	valid-auc:0.84056
[1500]	valid-auc:0.84127
[1600]	valid-auc:0.84183
[1700]	valid-auc:0.84234
[1800]	valid-auc:0.84279
[1900]	valid-auc:0.84323
[2000]	valid-auc:0.84361
[2100]	valid-auc:0.84403
[2200]	valid-auc:0.84437
[2300]	valid-auc:0.84469
[2400]	valid-auc:0.84501
[2500]	valid-auc:0.84528
[2600]	valid-auc:0.84557
[2700]	valid-auc:0.84583
[2800]	valid-auc:0.84606
[2900]	valid-auc:0.84628
[3000]	valid-auc:0.84648
[3100]	valid-auc:0.84668
[3200]	valid-auc:0.84684
[3300]	valid-auc:0.84701
[3400]	valid-auc:0.84718
[3500]	valid-auc:0.84738
[3600]	valid-auc:0.84753
[3700]	valid-auc:0.84770
[3800]	valid-auc:0.84784
[3900]	valid-auc:0.84798
[4000]	valid

In [5]:
preds_proba = final_model.predict(test_d)
preds = (preds_proba > 0.5).astype(int)

auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= TUNED XGBOOST RESULTS =================")
print("ROC-AUC:", round(auc, 5))
print("PR-AUC:", round(pr_auc, 5))
print("Accuracy:", round(acc, 5))
print("Best Params:", best_params)
print("Best Iteration:", final_model.best_iteration)
print("==========================================================")


ROC-AUC: 0.84888
PR-AUC: 0.86128
Accuracy: 0.76466
Best Params: {'learning_rate': 0.06469823573663598, 'max_depth': 7, 'min_child_weight': 10, 'subsample': 0.7614541918386395, 'colsample_bytree': 0.75878723850793, 'colsample_bylevel': 0.6784724052962197, 'gamma': 1.2492658191990247, 'lambda': 8.76910854814445, 'alpha': 2.164769112665621, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'tree_method': 'hist'}
Best Iteration: 4999


In [6]:
final_model.save_model("Models/xgboost_tuned.json")