In [None]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the dataset

df = pd.read_csv("HIGGS_short.csv")

# Split the frame into the feature matrix (everything except "label")
# and the target vector (the "label" column itself).
X, y = df.drop(columns=["label"]), df["label"]

In [None]:
# 2. Train/Val/Test Split (70/15/15)

# First cut: hold out 30% of the rows, stratified on the label, leaving 70%
# for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)

# Second cut: halve the held-out 30% into validation and test (15% each),
# again stratified so all three splits keep the same class balance.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# 3. Feature Importance generated by LightGBM

# Hard-coded importance score per feature, listed from highest to lowest.
# The keys are later used verbatim to select DataFrame columns, so the leading
# space (and the double space in the two "lepton" names) must stay exactly as
# written here.
feature_importances = dict([
    (" m_bb", 6374),
    (" lepton  pT", 4739),
    (" m_wwbb", 4497),
    (" jet 1 pt", 4345),
    (" m_jlv", 4162),
    (" m_wbb", 4140),
    (" m_jj", 3798),
    (" m_jjj", 3770),
    (" jet 2 pt", 3221),
    (" jet 1 eta", 2925),
    (" missing energy magnitude", 2797),
    (" lepton  eta", 2763),
    (" jet 3 pt", 2228),
    (" jet 2 eta", 2113),
    (" m_lv", 2020),
    (" jet 4 pt", 1768),
    (" jet 3 eta", 1576),
    (" jet 1 b-tag", 1433),
    (" jet 4 eta", 1327),
    (" jet 2 b-tag", 668),
])

In [None]:
# 4. Function: Get Top-K features

def get_top_k_features(feature_importances, K):
    """Return the names of the K most important features, best first.

    Parameters
    ----------
    feature_importances : mapping
        Maps feature name -> importance score (anything exposing ``.items()``).
    K : int or None
        How many features to keep. ``None`` keeps every feature, and a value
        larger than the number of features also returns all of them.

    Returns
    -------
    list
        Feature names ordered by descending importance. Features with equal
        scores keep the mapping's iteration order because ``sorted`` is stable.

    Raises
    ------
    ValueError
        If ``K`` is negative. A negative value would be interpreted as a
        negative slice bound and silently drop features from the *end* of the
        ranking rather than select the top ones.
    """
    if K is not None and K < 0:
        raise ValueError(f"K must be a non-negative integer, got {K}")

    ranked = sorted(
        feature_importances.items(), key=lambda item: item[1], reverse=True
    )
    return [name for name, _ in ranked[:K]]

In [None]:
# 5. Train model (LightGBM / XGBoost)

def train_model(model_type, X_train, y_train, X_val, y_val):
    """Fit a gradient-boosted binary classifier with early stopping.

    Parameters
    ----------
    model_type : str
        ``"lgbm"`` trains with LightGBM, ``"xgb"`` trains with XGBoost.
    X_train, y_train
        Training features and labels.
    X_val, y_val
        Validation features and labels. AUC on this set is the monitored
        metric; training stops after 50 rounds without improvement
        (at most 2000 boosting rounds).

    Returns
    -------
    The booster returned by ``lgb.train`` or ``xgb.train``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"lgbm"`` nor ``"xgb"``.
    """
    if model_type == "lgbm":
        lgbm_params = dict(
            objective="binary",
            metric="auc",
            learning_rate=0.05,
            num_leaves=64,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
        )
        return lgb.train(
            lgbm_params,
            lgb.Dataset(X_train, label=y_train),
            num_boost_round=2000,
            valid_sets=[lgb.Dataset(X_val, label=y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=50)],
        )

    if model_type == "xgb":
        xgb_params = dict(
            objective="binary:logistic",
            eval_metric="auc",
            eta=0.05,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.9,
        )
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        return xgb.train(
            xgb_params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, 'val')],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

    # Anything else is a caller error rather than something to guess at.
    raise ValueError("model_type must be 'lgbm' or 'xgb'")

In [None]:
# 6. Evaluation

def evaluate(model, model_type, X_test, y_test):
    """Score a trained booster on held-out data.

    Parameters
    ----------
    model
        Booster returned by ``train_model`` (LightGBM or XGBoost).
    model_type : str
        ``"lgbm"`` or ``"xgb"``; selects how predictions are obtained.
    X_test, y_test
        Held-out features and true binary labels.

    Returns
    -------
    dict
        ``{"AUC", "PR-AUC", "Accuracy"}``, each rounded to 5 decimals.
        Accuracy thresholds the predicted probability at 0.5.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"lgbm"`` nor ``"xgb"`` (same contract
        as ``train_model``), instead of silently treating it as XGBoost.
    """
    if model_type == "lgbm":
        preds_proba = model.predict(X_test)
    elif model_type == "xgb":
        dtest = xgb.DMatrix(X_test)
        # xgb.train with early stopping returns the booster from the *last*
        # round, and (per the XGBoost docs) the native Booster.predict uses
        # every tree unless iteration_range is given. Restrict prediction to
        # the best iteration so the non-improving rounds trained while waiting
        # for early stopping do not affect the reported metrics.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is None:
            # No early-stopping info on this booster: use the full model.
            preds_proba = model.predict(dtest)
        else:
            preds_proba = model.predict(
                dtest, iteration_range=(0, int(best_iteration) + 1)
            )
    else:
        raise ValueError("model_type must be 'lgbm' or 'xgb'")

    # Hard class labels for accuracy; the ranking metrics use raw probabilities.
    preds = (preds_proba > 0.5).astype(int)

    return {
        "AUC": round(roc_auc_score(y_test, preds_proba), 5),
        "PR-AUC": round(average_precision_score(y_test, preds_proba), 5),
        "Accuracy": round(accuracy_score(y_test, preds), 5)
    }

In [None]:
# 7. Run TOP-K experiment

K = 20  # number of top-ranked features to keep (20 = every listed feature; lower to prune)
model_type = "xgb"  # or "lgbm"

top_k_features = get_top_k_features(feature_importances, K)

# Restrict all three splits to the same selected columns.
X_train_k, X_val_k, X_test_k = (
    split[top_k_features] for split in (X_train, X_val, X_test)
)

# Fit on train (early stopping on val), then score once on the untouched test set.
model = train_model(model_type, X_train_k, y_train, X_val_k, y_val)
results = evaluate(model, model_type, X_test_k, y_test)

print(f"TOP-{K} RESULTS:", results)

In [None]:
# 8. Save Model

# Make sure the output directory exists first, so a fresh checkout does not
# fail on the very last cell after the (expensive) training has finished.
os.makedirs("Models", exist_ok=True)
model.save_model(f"Models/{model_type}_TOP{K}Features.txt")