In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm  # Use notebook version for Jupyter
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from datetime import datetime

In [7]:
# Load the raw signal/background CSVs, label them, and build the per-dataset
# feature (X) and label (y) containers used by the following cells.

# Define the folder containing the raw dataset files
raw_data_folder = "raw dataset"  # Update with your actual folder path

# Collect every CSV in the folder. Sorting keeps the file order (and therefore
# the background indices used by later cells) stable between runs.
all_files = sorted(f for f in os.listdir(raw_data_folder) if f.endswith(".csv"))

# The signal file is the first CSV whose name contains "signal" (case-insensitive).
signal_candidates = [f for f in all_files if "signal" in f.lower()]
if not signal_candidates:
    raise FileNotFoundError(
        f"No CSV file with 'signal' in its name was found in {raw_data_folder!r}"
    )
signal_file = signal_candidates[0]

# Background files follow the "B<type>.csv" naming scheme. Matching on the
# leading "B" (rather than a "B" anywhere in the name) agrees with the
# `bg_file[1:]` type extraction below and can never pick up the signal file.
background_files = [f for f in all_files if f.startswith("B") and f != signal_file]
if not background_files:
    raise FileNotFoundError(
        f"No background CSV files (names starting with 'B') were found in {raw_data_folder!r}"
    )

# Load the signal dataset, drop 'nParticles' if it exists, and add a label column
signal_df = pd.read_csv(os.path.join(raw_data_folder, signal_file)).drop(
    columns=["nParticles"], errors="ignore"
)
signal_df["label"] = 1  # Assign label 1 for signal events

# Per-background containers; all three lists share the same index order
background_dfs = []
background_labels = []  # Store filenames for indexing reference
background_types = []  # Store background type

# Define weights for each background type.
# NOTE(review): the key "ttbar" does not match the type string "tt" that is
# derived from "Btt.csv" below; confirm how these weights are looked up.
background_weights = {
    "HH": 0.0015552 * 1.155,
    "qq": 0.0349,
    "ttbar": 0.503,
    "ZZ": 0.17088 * 1.155,
    "WW": 0.5149,
    "qqX": 0.04347826,
    "qqqqX": 0.04,
    "qqHX": 0.001,
    "ZH": 0.00207445 * 1.155,
    "pebb": 0.7536,
    "pebbqq": 0.1522,
    "peqqH": 0.1237,
    "pett": 0.0570,
}

# Apply reweighting factor for the test set: every weight is scaled by
# 1 / test_size (presumably because only that fraction of events is tested).
test_size = 0.25
reweight_factor = 1 / test_size  # = 4.0
background_weights = {k: v * reweight_factor for k, v in background_weights.items()}

for bg_file in background_files:
    # Drop 'nParticles' if it exists
    bg_df = pd.read_csv(os.path.join(raw_data_folder, bg_file)).drop(
        columns=["nParticles"], errors="ignore"
    )

    bg_df["label"] = 0  # Assign label 0 for background events

    # Extract background type from filename (remove leading "B" and ".csv")
    bg_type = bg_file[1:].replace(".csv", "")
    bg_df["background_type"] = bg_type  # Store background type

    background_dfs.append(bg_df)
    background_labels.append(bg_file)  # Store file name for reference
    background_types.append(bg_type)

# Separate features from labels; bookkeeping columns must not reach the model
X_signal = signal_df.drop(columns=["label"], errors="ignore")
y_signal = signal_df["label"]

X_backgrounds = [
    bg.drop(columns=["label", "background_type"], errors="ignore")
    for bg in background_dfs
]
y_backgrounds = [bg["label"] for bg in background_dfs]

In [8]:
# Split the signal and every background dataset into train / validation / test
# parts with a fixed seed, and persist each part so later cells can reload them.

# Create a directory to store the split datasets
split_dir = 'split_datasets'
os.makedirs(split_dir, exist_ok=True)

val_size = 0.2   # Fraction of the non-test events held out for validation
split_seed = 42  # Same seed for every split so the partitions are repeatable


def _dump_splits(name, parts):
    """Write every entry of ``parts`` to ``<split_dir>/<key>_<name>.pkl``.

    ``parts`` maps a prefix such as ``"X_train"`` to the object to persist;
    files are written in the dict's insertion order.
    """
    for key, obj in parts.items():
        joblib.dump(obj, f'{split_dir}/{key}_{name}.pkl')


# First, create a consistent three-way split for the signal dataset:
# carve off the test set, then split the remainder into train and validation.
X_train_val_signal, X_test_signal, y_train_val_signal, y_test_signal = train_test_split(
    X_signal, y_signal, test_size=test_size, random_state=split_seed, stratify=y_signal
)

X_train_signal, X_val_signal, y_train_signal, y_val_signal = train_test_split(
    X_train_val_signal, y_train_val_signal, test_size=val_size,
    random_state=split_seed, stratify=y_train_val_signal
)

# Save signal splits
_dump_splits('signal', {
    'X_train': X_train_signal,
    'X_val': X_val_signal,
    'X_test': X_test_signal,
    'y_train': y_train_signal,
    'y_val': y_val_signal,
    'y_test': y_test_signal,
})

# Per-event background type for each split, in the order the backgrounds are processed
background_types_train = []
background_types_val = []
background_types_test = []

# Split and save each background dataset
for i, bg_label in enumerate(background_labels):
    clean_name = bg_label.replace('.csv', '').replace(' ', '_')
    # Background type from the filename (strip leading "B" and ".csv")
    bg_type = bg_label[1:].replace(".csv", "")

    X_bg = X_backgrounds[i]
    y_bg = y_backgrounds[i]

    X_train_val_bg, X_test_bg, y_train_val_bg, y_test_bg = train_test_split(
        X_bg, y_bg, test_size=test_size, random_state=split_seed, stratify=y_bg
    )

    X_train_bg, X_val_bg, y_train_bg, y_val_bg = train_test_split(
        X_train_val_bg, y_train_val_bg, test_size=val_size,
        random_state=split_seed, stratify=y_train_val_bg
    )

    # The three parts must partition the dataset exactly
    assert len(X_train_bg) + len(X_val_bg) + len(X_test_bg) == len(X_bg)

    # Record one type entry per event in each split
    background_types_train.extend([bg_type] * len(X_train_bg))
    background_types_val.extend([bg_type] * len(X_val_bg))
    background_types_test.extend([bg_type] * len(X_test_bg))

    # Save background splits
    _dump_splits(clean_name, {
        'X_train': X_train_bg,
        'X_val': X_val_bg,
        'X_test': X_test_bg,
        'y_train': y_train_bg,
        'y_val': y_val_bg,
        'y_test': y_test_bg,
    })

    print(f"Saved splits for background {i}: {bg_label}")

# Save the per-event background types for the train, validation and test sets
joblib.dump(background_types_train, f'{split_dir}/background_types_train.pkl')
joblib.dump(background_types_val, f'{split_dir}/background_types_val.pkl')
joblib.dump(background_types_test, f'{split_dir}/background_types_test.pkl')

print("All dataset splits have been saved!")

Saved splits for background 0: BWW.csv
Saved splits for background 1: BZH.csv
Saved splits for background 2: BZZ.csv
Saved splits for background 3: Bpebb.csv
Saved splits for background 4: Bpebbqq.csv
Saved splits for background 5: BpeqqH.csv
Saved splits for background 6: Bpett.csv
Saved splits for background 7: Bqq.csv
Saved splits for background 8: BqqHX.csv
Saved splits for background 9: BqqX.csv
Saved splits for background 10: BqqqqX.csv
Saved splits for background 11: Btt.csv
All dataset splits have been saved!


In [4]:
# Train simple BDT models: one AdaBoost-boosted decision tree classifier per
# background dataset, each trained on the signal training split combined with
# that background's training split.

# Store trained models (index i corresponds to background_labels[i])
trained_models = []
# Retained (empty) for backward compatibility; splits are now loaded from disk
train_test_splits = []

# Hyperparameters (also used to build the saved-model filenames)
depth = 2   # Max depth of each base decision tree
n = 100     # Number of boosting rounds
lr = 0.1    # Boosting learning rate

# Fixed seed so that re-running this cell reproduces the same models
bdt_seed = 42

# Derive the model count from the data instead of hard-coding it
n_models = len(background_labels)

print("\nStarting BDT Training...\n")

# Load signal training data
X_train_signal = joblib.load('split_datasets/X_train_signal.pkl')
y_train_signal = joblib.load('split_datasets/y_train_signal.pkl')

# Initialize progress bar correctly in Jupyter
with tqdm(total=n_models, desc="Training Progress", unit="model", leave=True) as pbar:
    for i, bg_label in enumerate(background_labels):
        start_time = time.time()  # Track time for each model

        # Clean filename for loading (must match the name used when saving the splits)
        clean_name = bg_label.replace('.csv', '').replace(' ', '_')

        # Load background training data
        X_train_bg = joblib.load(f'split_datasets/X_train_{clean_name}.pkl')
        y_train_bg = joblib.load(f'split_datasets/y_train_{clean_name}.pkl')

        # Combine signal + one background dataset for training
        X_train_combined = pd.concat([X_train_signal, X_train_bg])
        y_train_combined = np.concatenate([y_train_signal, y_train_bg])

        # Train a Boosted Decision Tree (BDT)
        bdt = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=depth),
            n_estimators=n,
            learning_rate=lr,
            algorithm="SAMME",
            random_state=bdt_seed,
        )

        bdt.fit(X_train_combined, y_train_combined)

        # Store trained model
        trained_models.append(bdt)

        # Use `tqdm.write()` instead of `print()` so the progress bar is not broken
        elapsed_time = time.time() - start_time
        tqdm.write(f"✔ Model {i+1}/{n_models} trained on {bg_label} (Time: {elapsed_time:.2f} sec)")

        # Update progress bar
        pbar.update(1)

tqdm.write("\n✅ Training Complete! All models are ready.\n")


Starting BDT Training...


Starting BDT Training...



Training Progress:   0%|          | 0/12 [00:00<?, ?model/s]



✔ Model 1/12 trained on BWW.csv (Time: 166.74 sec)




✔ Model 2/12 trained on BZH.csv (Time: 257.60 sec)




✔ Model 3/12 trained on BZZ.csv (Time: 2967.88 sec)




✔ Model 4/12 trained on Bpebb.csv (Time: 216.72 sec)




✔ Model 5/12 trained on Bpebbqq.csv (Time: 233.46 sec)




✔ Model 6/12 trained on BpeqqH.csv (Time: 252.57 sec)




✔ Model 7/12 trained on Bpett.csv (Time: 181.97 sec)




✔ Model 8/12 trained on Bqq.csv (Time: 288.71 sec)




✔ Model 9/12 trained on BqqHX.csv (Time: 317.38 sec)




✔ Model 10/12 trained on BqqX.csv (Time: 178.74 sec)




✔ Model 11/12 trained on BqqqqX.csv (Time: 165.68 sec)




✔ Model 12/12 trained on Btt.csv (Time: 207.63 sec)

✅ Training Complete! All models are ready.



In [5]:
# Persist every trained BDT under `models/`, one joblib file per model.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)  # No-op when the folder is already there

# The filename records the 1-based model number plus the hyperparameters
# (tree depth, number of estimators, learning rate) the model was trained with.
for i in range(len(trained_models)):
    filepath = os.path.join(
        model_dir,
        f"bdt_model_bg{i+1}_depth{depth}_n{n}_lr{lr}.joblib",
    )
    joblib.dump(trained_models[i], filepath)
    print(f"Model {i+1} saved to {filepath}")

Model 1 saved to models/bdt_model_bg1_depth2_n100_lr0.1.joblib
Model 2 saved to models/bdt_model_bg2_depth2_n100_lr0.1.joblib
Model 3 saved to models/bdt_model_bg3_depth2_n100_lr0.1.joblib
Model 4 saved to models/bdt_model_bg4_depth2_n100_lr0.1.joblib
Model 5 saved to models/bdt_model_bg5_depth2_n100_lr0.1.joblib
Model 6 saved to models/bdt_model_bg6_depth2_n100_lr0.1.joblib
Model 7 saved to models/bdt_model_bg7_depth2_n100_lr0.1.joblib
Model 8 saved to models/bdt_model_bg8_depth2_n100_lr0.1.joblib
Model 9 saved to models/bdt_model_bg9_depth2_n100_lr0.1.joblib
Model 10 saved to models/bdt_model_bg10_depth2_n100_lr0.1.joblib
Model 11 saved to models/bdt_model_bg11_depth2_n100_lr0.1.joblib
Model 12 saved to models/bdt_model_bg12_depth2_n100_lr0.1.joblib


In [6]:
# Evaluate every trained BDT on every held-out test set and tabulate the mean
# predicted signal probability (rows: models, columns: test datasets).

# Load signal test data
X_test_signal = joblib.load('split_datasets/X_test_signal.pkl')
y_test_signal = joblib.load('split_datasets/y_test_signal.pkl')

# Load background test datasets (same order as background_labels)
X_test_backgrounds = []
y_test_backgrounds = []

for bg_file in background_labels:
    clean_name = bg_file.replace('.csv', '').replace(' ', '_')
    X_test_bg = joblib.load(f'split_datasets/X_test_{clean_name}.pkl')
    y_test_bg = joblib.load(f'split_datasets/y_test_{clean_name}.pkl')

    X_test_backgrounds.append(X_test_bg)
    y_test_backgrounds.append(y_test_bg)

# Column order of the result matrix: signal first, then each background.
# Built once here rather than on every pass of the model loop.
eval_datasets = [X_test_signal] + X_test_backgrounds

# Size the result matrix from the actual number of models and datasets
# (12 x 13 for the 12 backgrounds in this run) instead of hard-coding it.
n_models = len(trained_models)
output_matrix = np.zeros((n_models, len(eval_datasets)))

# Evaluate each trained BDT model on the test datasets
for model_idx, model in enumerate(trained_models):
    for dataset_idx, dataset in enumerate(eval_datasets):
        # Get predicted probability of being signal: column 1 is P(class=1)
        predictions = model.predict_proba(dataset)[:, 1]

        # Store the average probability of being signal on **test dataset only**
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)

# Create DataFrame for visualization
datasets = ["Signal Test"] + [f"Background {i+1} Test" for i in range(len(X_test_backgrounds))]
model_labels = [f"Model {i+1}" for i in range(n_models)]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)

df_results


Unnamed: 0,Signal Test,Background 1 Test,Background 2 Test,Background 3 Test,Background 4 Test,Background 5 Test,Background 6 Test,Background 7 Test,Background 8 Test,Background 9 Test,Background 10 Test,Background 11 Test,Background 12 Test
Model 1,0.830092,0.522488,0.789417,0.729303,0.664949,0.790988,0.826804,0.805323,0.557029,0.816805,0.657025,0.786037,0.784022
Model 2,0.716942,0.450917,0.478177,0.438582,0.447671,0.554803,0.67414,0.661642,0.491773,0.666135,0.455183,0.580045,0.737439
Model 3,0.758447,0.473946,0.618781,0.428758,0.433737,0.549142,0.658349,0.672656,0.511647,0.649224,0.432573,0.581949,0.805927
Model 4,0.783884,0.583464,0.718744,0.618569,0.31666,0.603033,0.659444,0.663665,0.55706,0.686363,0.346506,0.655702,0.795954
Model 5,0.757006,0.58936,0.672788,0.572893,0.458409,0.407339,0.473991,0.570513,0.6165,0.588306,0.524924,0.570369,0.757428
Model 6,0.731884,0.596802,0.669612,0.595648,0.454242,0.397495,0.410166,0.524707,0.631354,0.560646,0.535347,0.572344,0.729097
Model 7,0.789843,0.743163,0.775685,0.766907,0.740535,0.733954,0.734542,0.562559,0.761066,0.730211,0.760102,0.720166,0.635722
Model 8,0.753819,0.358388,0.700176,0.624582,0.443739,0.716683,0.758758,0.701672,0.293732,0.737114,0.424522,0.700906,0.455015
Model 9,0.678968,0.608082,0.654303,0.559328,0.447581,0.376136,0.408135,0.501965,0.634333,0.405804,0.458886,0.422025,0.778306
Model 10,0.818564,0.636414,0.764232,0.687371,0.397175,0.723322,0.775027,0.75913,0.607725,0.762132,0.392722,0.725604,0.84621
