In [None]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the dataset
#
# Read the CSV into one frame, then separate the "label" column (the target)
# from all remaining columns (the features).

df = pd.read_csv("HIGGS_short.csv")

# Features & Target -- both are derived from `df` without modifying it.
X = df.drop(columns=["label"])
y = df["label"]

In [None]:
# 2. Train/Val/Test Split (70/15/15)
#
# Pass 1 holds out 30% of the rows; pass 2 cuts that holdout in half to get
# the validation and test sets. Both passes stratify on the label and use the
# same fixed random_state.

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# 3. LightGBM Dataset Formatting
#
# Wrap the train and validation splits in lgb.Dataset objects; the validation
# set is created with the training set as its `reference`.

train_data = lgb.Dataset(data=X_train, label=y_train)
val_data = lgb.Dataset(
    data=X_val,
    label=y_val,
    reference=train_data,
)

In [None]:
# 4. LightGBM Hyperparameters (Industry Baseline)
#
# Built with dict(...) keyword syntax; the resulting mapping has exactly the
# same keys, values and insertion order as the equivalent dict literal.

params = dict(
    # --- task / evaluation ---
    objective="binary",
    boosting_type="gbdt",
    metric="auc",
    # --- step size and leaf count ---
    learning_rate=0.05,
    num_leaves=64,
    # --- column / row subsampling ---
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    # --- depth and leaf-size settings ---
    max_depth=-1,
    min_data_in_leaf=50,
    # --- logging verbosity ---
    verbose=-1,
)

In [None]:
# 5. Train the Model
#
# Boost for at most 1000 rounds while evaluating on both the training and the
# validation set. The early-stopping callback is configured with 100 stopping
# rounds, and the evaluation log is emitted every 100 rounds.

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_names=["train", "valid"],
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
# 6. Evaluate on Test Set
#
# Score the held-out test rows, turn the scores into hard 0/1 predictions with
# a 0.5 cut-off, and report three metrics rounded to five decimals.

preds_proba = model.predict(X_test)
preds = (preds_proba > 0.5).astype(int)

# Ranking metrics use the raw scores; accuracy uses the thresholded labels.
auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= RESULTS =================")
for metric_label, metric_value in (
    ("ROC-AUC:", auc),
    ("PR-AUC:", pr_auc),
    ("Accuracy:", acc),
):
    print(metric_label, round(metric_value, 5))
print("===========================================\n")

In [None]:
# 7. Feature Importance (top 20)
#
# Pair each feature column with the model's importance score and list the 20
# highest-scoring ones, largest first.

feat_names = X.columns
# "split" is LightGBM's default importance type; spelled out here for clarity.
importances = model.feature_importance(importance_type="split")

# argsort is ascending, so reverse it and keep the first 20 positions.
sorted_idx = np.argsort(importances)[::-1][:20]

print("Top 20 Features:")
for idx in sorted_idx:
    print(f"{feat_names[idx]}: {importances[idx]}")

In [None]:
# 8. Save Model
#
# save_model writes directly to the given path and does not create missing
# parent folders, so make sure "Models/" exists first. Without this, a fresh
# checkout would fail here only after the whole training run has finished.

model_path = Path("Models/lgbm_untuned.txt")
model_path.parent.mkdir(parents=True, exist_ok=True)  # no-op if it already exists
model.save_model(str(model_path))