In [None]:
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the dataset

# Read the CSV from the working directory into a single frame.
df = pd.read_csv("HIGGS_short.csv")

# Features are every column except "label"; "label" itself is the target.
X = df.drop("label", axis=1)
y = df.loc[:, "label"]

In [None]:
# 2. Train/Val/Test Split (70/15/15)

# Stage one: keep 70% for training and hold back 30%, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage two: cut the held-back 30% in half -> 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Wrap each split in a DMatrix for the native xgboost training API.
train_d, val_d, test_d = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
# 3. Optuna Search Definition

def objective(trial):
    """Train one XGBoost candidate and return its validation ROC-AUC.

    Hyperparameters are sampled from ``trial``; the booster is fit on the
    notebook-level ``train_d`` and early-stopped on ``val_d`` (both defined
    in the split cell), and scored against ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        ROC-AUC on the validation split, computed with only the trees up to
        and including the best early-stopping iteration.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",        # for CPU speed
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 10.0),
        "lambda": trial.suggest_float("lambda", 0.0, 10.0),
        "alpha": trial.suggest_float("alpha", 0.0, 10.0),
    }

    evals = [(val_d, "valid")]

    model = xgb.train(
        params,
        train_d,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # xgb.train returns the booster from the LAST round, and Booster.predict
    # uses every tree by default. Restrict prediction to the best iteration so
    # the score reflects the early-stopped model rather than one carrying the
    # extra non-improving rounds.
    best_rounds = model.best_iteration + 1
    preds = model.predict(val_d, iteration_range=(0, best_rounds))
    auc = roc_auc_score(y_val, preds)

    return auc

In [None]:
# 4. Run Optuna Search

# create_study() falls back to an unseeded TPESampler when no sampler is
# given, so each kernel restart would explore different hyperparameters.
# Seeding the same sampler type keeps the search reproducible.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=5)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

In [None]:
# 5. Train Model based on best Optuna Parameters

# Tuned values come first; the fixed (non-searched) settings are appended.
best_params = {
    **study.best_params,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "hist",
}

# Refit on the training split, early-stopping on the validation AUC and
# logging the validation metric every 100 rounds.
final_model = xgb.train(
    params=best_params,
    dtrain=train_d,
    num_boost_round=5000,
    early_stopping_rounds=100,
    evals=[(val_d, "valid")],
    verbose_eval=100,
)

In [None]:
# 6. Evaluation

# The native Booster.predict uses all trained trees by default, including the
# rounds added after the best validation score. Limit prediction to the best
# early-stopping iteration so the reported metrics describe the same model
# that "Best Iteration" below refers to.
best_rounds = final_model.best_iteration + 1
preds_proba = final_model.predict(test_d, iteration_range=(0, best_rounds))

# Hard class labels at the 0.5 probability threshold (used for accuracy only).
preds = (preds_proba > 0.5).astype(int)

auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= TUNED XGBOOST RESULTS =================")
print("ROC-AUC:", round(auc, 5))
print("PR-AUC:", round(pr_auc, 5))
print("Accuracy:", round(acc, 5))
print("Best Params:", best_params)
print("Best Iteration:", final_model.best_iteration)
print("==========================================================")

In [None]:
# 7. Save Model

# Make sure the output folder exists before writing; otherwise a missing
# "Models" directory would make the save fail at the very end of the run.
model_path = Path("Models/xgboost_tuned.json")
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the booster is saved as trained, i.e. presumably including the
# rounds after best_iteration - confirm whether consumers should truncate.
final_model.save_model(str(model_path))