In [None]:
from pathlib import Path

import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the dataset

# Read the whole CSV into one frame; the path is relative to the notebook's
# working directory.
df = pd.read_csv("HIGGS_short.csv")

# Split the frame column-wise: every column except "label" is a feature,
# and the "label" column itself is the prediction target.
X = df.drop("label", axis="columns")
y = df["label"]

In [None]:
# 2. Train/Val/Test Split (70/15/15)

# Stage 1: keep 70% of the rows for training and park the other 30% in a
# temporary hold-out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the hold-out in half (15% + 15% of the full data) to get the
# validation and test sets, again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Report the resulting shapes as a quick sanity check.
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# 3. Convert to DMatrix

# Wrap each (features, labels) pair in an xgb.DMatrix for the native
# training API. The generator keeps its loop variables out of the
# notebook namespace.
train_d, val_d, test_d = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
# 4. Untuned XGBoost Baseline Hyperparameters

# Baseline configuration passed to xgb.train.
# NOTE: the former `"predictor": "auto"` entry is intentionally omitted.
# "auto" was the default on XGBoost 1.x, and the parameter was removed in
# XGBoost 2.0, where passing it only produces an "unused parameter" warning.
params = {
    "objective": "binary:logistic",  # binary classification, outputs probabilities
    "eval_metric": "auc",            # metric reported on the eval sets during training
    "tree_method": "hist",           # histogram-based tree construction
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,                # row subsampling ratio
    "colsample_bytree": 0.8,         # column subsampling ratio per tree
    "min_child_weight": 1,
}

In [None]:
# 5. Train Model

# Datasets scored after each boosting round. Per the XGBoost docs, early
# stopping monitors the LAST entry, so the validation set must stay last.
evals = [(train_d, "train"), (val_d, "valid")]

# Boost for up to 1000 rounds, stopping once the validation metric has not
# improved for 100 consecutive rounds; progress is logged every 100 rounds.
model = xgb.train(
    params=params,
    dtrain=train_d,
    evals=evals,
    num_boost_round=1000,
    early_stopping_rounds=100,
    verbose_eval=100,
)

In [None]:
# 6. Evaluate on Test Set

# Predict with only the trees up to (and including) the early-stopped best
# iteration. On XGBoost < 2.0, Booster.predict uses every boosted tree by
# default, including the rounds trained after the best one, so the metrics
# would not match the "Best iteration" reported below. Pinning the range
# keeps results consistent across versions (iteration_range needs >= 1.4).
best_rounds = model.best_iteration + 1  # best_iteration is 0-based
preds_proba = model.predict(test_d, iteration_range=(0, best_rounds))

# Hard class labels at the conventional 0.5 probability threshold.
preds = (preds_proba > 0.5).astype(int)

# Ranking metrics use the probabilities; accuracy uses the thresholded labels.
auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= RESULTS =================")
print("ROC-AUC:", round(auc, 5))
print("PR-AUC:", round(pr_auc, 5))
print("Accuracy:", round(acc, 5))
print("Best iteration:", model.best_iteration)
print("===========================================\n")

In [None]:
# 7. Save Model

# Create the output directory first: save_model fails if it is missing,
# which would break a Restart & Run All on a fresh checkout.
model_path = "Models/xgboost_untuned.json"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# The .json extension selects XGBoost's JSON model format.
model.save_model(model_path)