In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import lightgbm as lgb
import optuna

df = pd.read_csv("HIGGS_short.csv")

y = df["label"]
X = df.drop(columns=["label"]).copy()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np

def delta_phi(phi1, phi2):
    dphi = np.abs(phi1 - phi2)
    return np.where(dphi > np.pi, 2*np.pi - dphi, dphi)

def delta_r(eta1, phi1, eta2, phi2):
    return np.sqrt((eta1 - eta2)**2 + (delta_phi(phi1, phi2))**2)

def transverse_mass(pt_lep, met, dphi):
    return np.sqrt(2 * pt_lep * met * (1 - np.cos(dphi)))

In [3]:
# ΔR(jet1, jet2)
X["deltaR_j1_j2"] = delta_r(
    X[" jet 1 eta"], X[" jet 1 phi"], 
    X[" jet 2 eta"], X[" jet 2 phi"]
)

# ΔR(lepton, jet1)
X["deltaR_lep_j1"] = delta_r(
    X[" lepton  eta"], X[" lepton  phi"],
    X[" jet 1 eta"], X[" jet 1 phi"]
)

# ΔR(lepton, jet2)
X["deltaR_lep_j2"] = delta_r(
    X[" lepton  eta"], X[" lepton  phi"],
    X[" jet 2 eta"], X[" jet 2 phi"]
)

# HT = sum of jet pT
X["HT"] = (
    X[" jet 1 pt"] +
    X[" jet 2 pt"] +
    X[" jet 3 pt"] +
    X[" jet 4 pt"]
)

# ST = HT + lepton pT + MET
X["ST"] = X["HT"] + X[" lepton  pT"] + X[" missing energy magnitude"]

# Δphi(lepton, MET)
X["dphi_lep_met"] = delta_phi(
    X[" lepton  phi"],
    X[" missing energy phi"]
)

# MT_W (W transverse mass)
X["MT_W"] = transverse_mass(
    X[" lepton  pT"],
    X[" missing energy magnitude"],
    X["dphi_lep_met"]
)

In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

train_data = lgb.Dataset(X_train, label=y_train)
val_data   = lgb.Dataset(X_val,   label=y_val)

In [5]:
def objective(trial):

    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        
        # REQUIRED FIX
        "feature_pre_filter": False,

        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.20),
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),

        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),

        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),

        "verbose": -1,
    }

    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    preds = model.predict(X_val)
    auc = roc_auc_score(y_val, preds)
    return auc

In [6]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

[I 2025-11-25 19:17:14,483] A new study created in memory with name: no-name-3cbfe213-1a36-4d6c-ba3a-4369126c3a8f


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's auc: 0.778029


[I 2025-11-25 19:28:46,227] Trial 0 finished with value: 0.7780294830195021 and parameters: {'learning_rate': 0.07634021029960143, 'num_leaves': 181, 'max_depth': 1, 'min_data_in_leaf': 152, 'feature_fraction': 0.6562517067772623, 'bagging_fraction': 0.7792543322471425, 'bagging_freq': 6, 'lambda_l1': 5.257538823470352, 'lambda_l2': 7.184964432598548}. Best is trial 0 with value: 0.7780294830195021.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4995]	valid_0's auc: 0.778754


[I 2025-11-25 19:40:45,771] Trial 1 finished with value: 0.7787543766139833 and parameters: {'learning_rate': 0.14916476253737912, 'num_leaves': 177, 'max_depth': 1, 'min_data_in_leaf': 125, 'feature_fraction': 0.6634286261205482, 'bagging_fraction': 0.6825849140490049, 'bagging_freq': 7, 'lambda_l1': 9.117144334100523, 'lambda_l2': 1.6894060220787344}. Best is trial 1 with value: 0.7787543766139833.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's auc: 0.82174


[I 2025-11-25 19:55:07,691] Trial 2 finished with value: 0.8217400029544881 and parameters: {'learning_rate': 0.1417676660524938, 'num_leaves': 265, 'max_depth': 2, 'min_data_in_leaf': 105, 'feature_fraction': 0.8320890691146678, 'bagging_fraction': 0.7426496658626047, 'bagging_freq': 2, 'lambda_l1': 5.725628300761631, 'lambda_l2': 6.249369661548751}. Best is trial 2 with value: 0.8217400029544881.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4990]	valid_0's auc: 0.778829


[I 2025-11-25 20:09:29,549] Trial 3 finished with value: 0.7788291552502108 and parameters: {'learning_rate': 0.19399523430146706, 'num_leaves': 190, 'max_depth': 1, 'min_data_in_leaf': 144, 'feature_fraction': 0.9908636891745078, 'bagging_fraction': 0.7655304717934288, 'bagging_freq': 1, 'lambda_l1': 2.7806001928885324, 'lambda_l2': 5.6914739040356235}. Best is trial 2 with value: 0.8217400029544881.


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's auc: 0.838233


[I 2025-11-25 20:30:21,673] Trial 4 finished with value: 0.8382332760560074 and parameters: {'learning_rate': 0.1745079573321527, 'num_leaves': 401, 'max_depth': 3, 'min_data_in_leaf': 172, 'feature_fraction': 0.932162538513332, 'bagging_fraction': 0.6439182719434021, 'bagging_freq': 4, 'lambda_l1': 6.4399230062565955, 'lambda_l2': 5.143909448435361}. Best is trial 4 with value: 0.8382332760560074.


Best AUC: 0.8382332760560074
Best Params: {'learning_rate': 0.1745079573321527, 'num_leaves': 401, 'max_depth': 3, 'min_data_in_leaf': 172, 'feature_fraction': 0.932162538513332, 'bagging_fraction': 0.6439182719434021, 'bagging_freq': 4, 'lambda_l1': 6.4399230062565955, 'lambda_l2': 5.143909448435361}


In [7]:
best_params = study.best_params
best_params["objective"] = "binary"
best_params["metric"] = "auc"
best_params["boosting_type"] = "gbdt"
best_params["verbose"] = -1
best_params["feature_pre_filter"] = False

final_model = lgb.train(
    best_params,
    train_data,
    num_boost_round=5000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's auc: 0.838233


In [8]:
preds_proba = final_model.predict(X_test)
preds = (preds_proba > 0.5).astype(int)

auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= PHYSICS LGBM RESULTS =================")
print("ROC-AUC:", round(auc, 5))
print("PR-AUC:", round(pr_auc, 5))
print("Accuracy:", round(acc, 5))
print("Best Params:", best_params)
print("========================================================")


ROC-AUC: 0.83785
PR-AUC: 0.85069
Accuracy: 0.75463
Best Params: {'learning_rate': 0.1745079573321527, 'num_leaves': 401, 'max_depth': 3, 'min_data_in_leaf': 172, 'feature_fraction': 0.932162538513332, 'bagging_fraction': 0.6439182719434021, 'bagging_freq': 4, 'lambda_l1': 6.4399230062565955, 'lambda_l2': 5.143909448435361, 'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'verbose': -1, 'feature_pre_filter': False}


In [9]:
final_model.save_model("Models/lgbm_physics.txt")

<lightgbm.basic.Booster at 0x1da04632cc0>