In [None]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the dataset
# The CSV is read from the notebook's working directory.
df = pd.read_csv("HIGGS_short.csv")

# Separate the target column from the feature matrix.
# X is an independent copy, so adding engineered columns later leaves df untouched.
y = df.loc[:, "label"]
X = df.drop("label", axis="columns").copy()

In [None]:
# 2. Functions for calculating new features

def delta_phi(phi1, phi2):
    """Return the absolute azimuthal separation of two angles, folded into [0, pi].

    Parameters
    ----------
    phi1, phi2 : scalar, numpy array or pandas Series
        Azimuthal angles in radians (the fold is done at ``np.pi``). They may
        lie in any range; the difference is reduced modulo 2*pi first, so
        unwrapped angles (e.g. 0 and 3*pi) are handled as well.

    Returns
    -------
    numpy.ndarray
        Element-wise separation in [0, pi]. ``np.where`` always yields a
        plain ndarray, so a pandas index on the inputs is not carried over.
    """
    two_pi = 2 * np.pi
    # Reduce to [0, 2*pi) first: without this, |phi1 - phi2| > 2*pi would make
    # the fold below return a negative (or still too large) value.
    dphi = np.abs(phi1 - phi2) % two_pi
    # Take the shorter way around the circle.
    return np.where(dphi > np.pi, two_pi - dphi, dphi)

def delta_r(eta1, phi1, eta2, phi2):
    """Return the angular distance sqrt(d_eta**2 + d_phi**2) between two objects.

    ``d_eta`` is the plain difference ``eta1 - eta2``; ``d_phi`` is the
    separation computed by ``delta_phi(phi1, phi2)``. Inputs may be scalars
    or element-wise compatible arrays/Series.
    """
    d_eta = eta1 - eta2
    d_phi = delta_phi(phi1, phi2)
    return np.sqrt(d_eta**2 + d_phi**2)

def transverse_mass(pt_lep, met, dphi):
    """Return sqrt(2 * pt_lep * met * (1 - cos(dphi))).

    Parameters
    ----------
    pt_lep : scalar or array-like
        Lepton transverse momentum.
    met : scalar or array-like
        Missing transverse energy magnitude.
    dphi : scalar or array-like
        Azimuthal separation between the lepton and the missing energy,
        in radians (it is passed straight to ``np.cos``).
    """
    mt_squared = 2 * pt_lep * met * (1 - np.cos(dphi))
    return np.sqrt(mt_squared)

In [None]:
# 3. Define 7 new physics features
#
# Columns are added to X in a fixed order; each assignment overwrites the
# column if the cell is re-run, so the cell is idempotent.
# NOTE(review): delta_phi folds at pi, so the phi columns are presumably raw
# radians - confirm the CSV values have not been rescaled.

# Angular distance between the two leading jets.
X["deltaR_j1_j2"] = delta_r(
    eta1=X[" jet 1 eta"],
    phi1=X[" jet 1 phi"],
    eta2=X[" jet 2 eta"],
    phi2=X[" jet 2 phi"],
)

# Angular distance between the lepton and jet 1.
X["deltaR_lep_j1"] = delta_r(
    eta1=X[" lepton  eta"],
    phi1=X[" lepton  phi"],
    eta2=X[" jet 1 eta"],
    phi2=X[" jet 1 phi"],
)

# Angular distance between the lepton and jet 2.
X["deltaR_lep_j2"] = delta_r(
    eta1=X[" lepton  eta"],
    phi1=X[" lepton  phi"],
    eta2=X[" jet 2 eta"],
    phi2=X[" jet 2 phi"],
)

# HT: sum of the four jet pT columns (added left to right).
X["HT"] = X[" jet 1 pt"].add(X[" jet 2 pt"]).add(X[" jet 3 pt"]).add(X[" jet 4 pt"])

# ST: HT plus lepton pT plus missing energy magnitude.
X["ST"] = X["HT"].add(X[" lepton  pT"]).add(X[" missing energy magnitude"])

# Azimuthal separation between the lepton and the missing energy.
X["dphi_lep_met"] = delta_phi(
    phi1=X[" lepton  phi"],
    phi2=X[" missing energy phi"],
)

# MT_W: transverse mass built from the lepton and the missing energy.
X["MT_W"] = transverse_mass(
    pt_lep=X[" lepton  pT"],
    met=X[" missing energy magnitude"],
    dphi=X["dphi_lep_met"],
)

In [None]:
# 4. Train/Val/Test Split (70/15/15)

# Stage 1: keep 70% for training, hold out 30%, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# LightGBM dataset wrappers, shared by the tuning trials and the final fit.
train_data = lgb.Dataset(data=X_train, label=y_train)
val_data = lgb.Dataset(data=X_val, label=y_val)

In [None]:
# 5. Optuna Search Definition

def objective(trial):
    """Optuna objective: train one LightGBM booster and return its validation ROC-AUC.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyperparameter suggestions for this run.

    Returns
    -------
    float
        ``roc_auc_score`` of the booster's predictions on ``X_val`` against
        ``y_val``. The study is expected to maximize this value.

    Notes
    -----
    Reads the module-level ``train_data``, ``val_data``, ``X_val`` and
    ``y_val``. Training runs for up to 5000 rounds and stops early after
    100 rounds without improvement on the validation set.
    """
    # Settings that are identical for every trial.
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        # The same Dataset object is reused across trials while
        # min_data_in_leaf varies, so feature pre-filtering must stay off.
        "feature_pre_filter": False,
    }

    # Searched hyperparameters; suggestions are requested in this order.
    params.update(
        {
            # Tree shape / learning speed
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.20),
            "num_leaves": trial.suggest_int("num_leaves", 16, 512),
            "max_depth": trial.suggest_int("max_depth", -1, 12),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
            # Row / column subsampling
            "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
            # Regularization
            "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
            "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
        }
    )
    params["verbose"] = -1

    booster = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    val_scores = booster.predict(X_val)
    return roc_auc_score(y_val, val_scores)

In [None]:
# 6. Run Optuna Search

# Seed the sampler so a fresh "Restart & Run All" proposes the same
# hyperparameters again (42 matches the random_state used for the splits).
# TPESampler is what create_study falls back to when no sampler is given,
# so only the random stream is pinned here, not the search algorithm.
# NOTE(review): run-to-run identical results also require the objective itself
# to be deterministic and the trials to run sequentially (the default).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

In [None]:
# 7. Train Model based on best Optuna Parameters

# Start from the tuned values and append the fixed settings that the
# objective also used (key order: tuned parameters first, then the constants).
best_params = {
    **study.best_params,
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,
    "feature_pre_filter": False,
}

# Same training budget and early-stopping rule as in the search.
final_model = lgb.train(
    params=best_params,
    train_set=train_data,
    num_boost_round=5000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

In [None]:
# 8. Evaluation

# Scores on the held-out test split, plus hard labels cut at 0.5.
preds_proba = final_model.predict(X_test)
preds = np.greater(preds_proba, 0.5).astype(int)

# Ranking metrics use the raw scores; accuracy uses the thresholded labels.
auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= PHYSICS LGBM RESULTS =================")
for label, score in (("ROC-AUC:", auc), ("PR-AUC:", pr_auc), ("Accuracy:", acc)):
    print(label, round(score, 5))
print("Best Params:", best_params)
print("========================================================")

In [None]:
# 9. Save Model

MODEL_PATH = "Models/lgbm_physics.txt"

# The notebook never creates the output folder; make sure it exists so that
# saving does not fail on a fresh checkout. exist_ok keeps re-runs harmless.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

final_model.save_model(MODEL_PATH)