In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

def train_stacking_model(X, y, event_types, event_weights, run_id):
    print(f"\n🟢 Training Stacking Model {run_id}...\n")

    X_train, X_test, y_train, y_test, event_train, event_test = train_test_split(
        X, y, event_types, test_size=0.2, # random split everytime
    )

    print("Fitting LGBM-RF")
    lgbm_rf_model.fit(X_train, y_train)
    print("Fitting LGBM")
    lgbm_model.fit(X_train, y_train)
    print("Fitting XGBoost")
    xgb_model.fit(X_train, y_train)
    print("Fitting MLP")
    mlp_model.fit(X_train, y_train)
    print("Models refitted on the new dataset")

    meta_xgb = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        colsample_bytree=0.8,
        subsample=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )

    base_learners = {
        "LGBM_RF": lgbm_rf_model,
        "LGBM": lgbm_model,
        "XGBoost": xgb_model,
        "MLP": mlp_model
    }

    stacking_model = StackingClassifier(
        estimators=[(name, model) for name, model in base_learners.items()],
        final_estimator=meta_xgb,
        cv="prefit",
        n_jobs=-1, 
        passthrough=False 
    )

    stacking_model.fit(X_train, y_train)
    y_pred = stacking_model.predict(X_test)
    
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    
    # calculate best signal significance
    y_scores = stacking_model.predict_proba(X_test)[:, 1]

    best_threshold = 0
    best_significance = 0

    for threshold in np.linspace(0.05, 0.95, 50):  
        y_pred = (y_scores >= threshold).astype(int)  

        signal = sum(
            event_weights[event_type] * np.sum((y_pred[event_test == event_type] == 1) & (y_test[event_test == event_type] == 1))
            for event_type in event_weights.keys()
        )
        background = sum(
            event_weights[event_type] * np.sum((y_pred[event_test == event_type] == 1) & (y_test[event_test == event_type] == 0))
            for event_type in event_weights.keys()
        )

        if signal + background > 0:
            signal_significance = signal / np.sqrt(signal + background)
            
            if signal_significance > best_significance:
                best_significance = signal_significance
                best_threshold = threshold

    print(f"Best Threshold: {best_threshold:.3f}")
    print(f"Best Weighted Signal Significance): {best_significance:.4f}")

    # compute weighted background for chosen threshold
    y_pred = (y_scores >= best_threshold).astype(int)  

    surviving_background = {event: 0 for event in event_weights.keys()}
    total_weighted_FP = 0  

    for event_type in event_weights.keys():
        weight = event_weights[event_type] # get event weight
        event_mask = (event_test.values == event_type) 
        
        FP_count = np.sum((y_pred[event_mask] == 1) & (y_test.values[event_mask] == 0))  # false positives
        
        weighted_FP = FP_count * weight  
        surviving_background[event_type] = weighted_FP
        total_weighted_FP += weighted_FP  

    print(f"Total Weighted Surviving Background Events: {total_weighted_FP:.4f}")

    print("Weighted False Positives Per Background Type:")
    for event_type, fp_weight in surviving_background.items():
        print(f"  {event_type}: {fp_weight:.4f}")

    return best_significance, total_weighted_FP

# TRIALS
signal_significance_scores = []
weighted_background_scores = []

for run_id in range(20):
    significance, weighted_background = train_stacking_model(X, y, event_types, event_weights, run_id)
    signal_significance_scores.append(significance)
    weighted_background_scores.append(weighted_background)

mean_signal_significance = np.mean(signal_significance_scores)
variance_signal_significance = np.var(signal_significance_scores)

# Print final results
print("\nFinal Results Across 5 Models:")
print(f"Mean Signal Significance: {mean_signal_significance:.4f}")
print(f"Variance of Signal Significance: {variance_signal_significance:.4f}")
print(f"Mean Weighted Background: {mean_weighted_background:.4f}")
print(f"Variance of Weighted Background: {variance_weighted_background:.4f}")



🟢 Training Stacking Model 0...

Fitting LGBM-RF
[LightGBM] [Info] Number of positive: 184203, number of negative: 672981
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15703
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214893 -> initscore=-1.295679
[LightGBM] [Info] Start training from score -1.295679
Fitting LGBM
[LightGBM] [Info] Number of positive: 184203, number of negative: 672981
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 15703
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Inf

Parameters: { "use_label_encoder" } are not used.



✅ Training Complete
✅ Accuracy: 0.8842
✅ Recall: 0.6540
✅ F1 Score: 0.7093
✅ Best Threshold: 0.950
✅ Best Weighted Signal Significance (S/√(S+B)): 7.8194
✅ Total Weighted Surviving Background Events: 71.8304
✅ Weighted False Positives Per Background Type:
  HH: 0.0000
  ZZ: 0.9868
  ZH: 0.2516
  WW: 0.0000
  tt: 10.0600
  qqX: 0.0000
  qqqqX: 0.0000
  qqHX: 0.5950
  qq: 5.4095
  pebb: 22.6080
  pebbqq: 4.5660
  peqqH: 25.3585
  pett: 1.9950

🟢 Training Stacking Model 1...

Fitting LGBM-RF
[LightGBM] [Info] Number of positive: 184359, number of negative: 672825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15705
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215075 -> initscore=-1.294600
[LightGBM] [Info] Start training from score -1.294600
Fitting 

Parameters: { "use_label_encoder" } are not used.



✅ Training Complete
✅ Accuracy: 0.8839
✅ Recall: 0.6554
✅ F1 Score: 0.7085
✅ Best Threshold: 0.932
✅ Best Weighted Signal Significance (S/√(S+B)): 8.1484
✅ Total Weighted Surviving Background Events: 85.5965
✅ Weighted False Positives Per Background Type:
  HH: 0.0000
  ZZ: 0.9868
  ZH: 0.4313
  WW: 2.5745
  tt: 17.6050
  qqX: 0.2174
  qqqqX: 0.4000
  qqHX: 0.7200
  qq: 7.6780
  pebb: 15.0720
  pebbqq: 7.6100
  peqqH: 30.3065
  pett: 1.9950

🟢 Training Stacking Model 2...

Fitting LGBM-RF
[LightGBM] [Info] Number of positive: 184287, number of negative: 672897
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.154338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15707
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214991 -> initscore=-1.295098
[LightGBM] [Info] Start training from score -1.295098
Fitting 

Parameters: { "use_label_encoder" } are not used.



✅ Training Complete
✅ Accuracy: 0.8847
✅ Recall: 0.6575
✅ F1 Score: 0.7110
✅ Best Threshold: 0.950
✅ Best Weighted Signal Significance (S/√(S+B)): 7.9495
✅ Total Weighted Surviving Background Events: 66.2589
✅ Weighted False Positives Per Background Type:
  HH: 0.0000
  ZZ: 0.0000
  ZH: 0.3354
  WW: 0.0000
  tt: 17.6050
  qqX: 0.0000
  qqqqX: 0.2000
  qqHX: 0.5900
  qq: 5.7585
  pebb: 18.8400
  pebbqq: 2.2830
  peqqH: 19.7920
  pett: 0.8550

🟢 Training Stacking Model 3...

Fitting LGBM-RF
[LightGBM] [Info] Number of positive: 184283, number of negative: 672901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.155602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15710
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214987 -> initscore=-1.295126
[LightGBM] [Info] Start training from score -1.295126
Fitting 

Parameters: { "use_label_encoder" } are not used.



✅ Training Complete
✅ Accuracy: 0.8838
✅ Recall: 0.6551
✅ F1 Score: 0.7087
✅ Best Threshold: 0.950
✅ Best Weighted Signal Significance (S/√(S+B)): 8.4433
✅ Total Weighted Surviving Background Events: 46.6617
✅ Weighted False Positives Per Background Type:
  HH: 0.0000
  ZZ: 1.9737
  ZH: 0.2276
  WW: 2.5745
  tt: 5.0300
  qqX: 0.2174
  qqqqX: 0.2000
  qqHX: 0.5300
  qq: 6.6310
  pebb: 7.5360
  pebbqq: 2.2830
  peqqH: 19.1735
  pett: 0.2850

🟢 Training Stacking Model 4...

Fitting LGBM-RF
[LightGBM] [Info] Number of positive: 184524, number of negative: 672660
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15704
[LightGBM] [Info] Number of data points in the train set: 857184, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215268 -> initscore=-1.293460
[LightGBM] [Info] Start training from score -1.293460
Fitting LG

Parameters: { "use_label_encoder" } are not used.



✅ Training Complete
✅ Accuracy: 0.8849
✅ Recall: 0.6586
✅ F1 Score: 0.7106
✅ Best Threshold: 0.932
✅ Best Weighted Signal Significance (S/√(S+B)): 8.0222
✅ Total Weighted Surviving Background Events: 92.2778
✅ Weighted False Positives Per Background Type:
  HH: 0.0000
  ZZ: 0.9868
  ZH: 0.5271
  WW: 0.0000
  tt: 35.2100
  qqX: 0.2174
  qqqqX: 0.4000
  qqHX: 0.7550
  qq: 9.0740
  pebb: 3.7680
  pebbqq: 3.8050
  peqqH: 35.2545
  pett: 2.2800

Final Results Across 5 Models:
Mean Signal Significance: 8.0765
Variance of Signal Significance: 0.0450
Mean Weighted Background: 72.5251
Variance of Weighted Background: 253.9393
