In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm  # Use notebook version for Jupyter
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from datetime import datetime

In [16]:
# Load the raw signal and background CSV files, label them (signal = 1,
# background = 0) and split every sample into features (X) and labels (y).

# Folder containing the raw dataset files
raw_data_folder = "raw dataset"  # Update with your actual folder path

# All CSV files in the folder, sorted so that background indices are stable
all_files = sorted(f for f in os.listdir(raw_data_folder) if f.endswith(".csv"))

# Identify the signal file (assumes "signal" appears in its filename)
signal_files = [f for f in all_files if "signal" in f.lower()]
if not signal_files:
    # Fail with an explicit message instead of a bare IndexError
    raise FileNotFoundError(
        f"No CSV file with 'signal' in its name was found in {raw_data_folder!r}"
    )
signal_file = signal_files[0]

# Identify background files (assumes "B" in filename means background).
# Signal files are excluded explicitly: the match is case-insensitive, so a
# signal file whose name merely contains the letter "b" would otherwise be
# loaded a second time as a background with label 0.
background_files = [
    f for f in all_files if "B" in f.upper() and f not in signal_files
]

# Load the signal dataset and add a label column
signal_df = pd.read_csv(os.path.join(raw_data_folder, signal_file))
signal_df["label"] = 1  # Assign label 1 for signal events

# Load background datasets and add a label column
background_dfs = []
background_labels = []  # Store filenames for indexing reference

for idx, bg_file in enumerate(background_files):
    bg_df = pd.read_csv(os.path.join(raw_data_folder, bg_file))
    bg_df["label"] = 0  # Assign label 0 for background events
    background_dfs.append(bg_df)
    background_labels.append(bg_file)  # Store file name for reference
    print(f"Background {idx}: {bg_file}")  # Print index and file name

# Extract features (X) and labels (y).
# Select the label by name rather than by position so the split stays correct
# even if a CSV already contains a "label" column that is not the last one.
X_signal = signal_df.drop(columns="label")  # Features for signal
y_signal = signal_df["label"]               # Labels for signal

X_backgrounds = [bg.drop(columns="label") for bg in background_dfs]  # Features for each background dataset
y_backgrounds = [bg["label"] for bg in background_dfs]  # Labels

Background 0: BWW.csv
Background 1: BZH.csv
Background 2: BZZ.csv
Background 3: Bpebb.csv
Background 4: Bpebbqq.csv
Background 5: BpeqqH.csv
Background 6: Bpett.csv
Background 7: Bqq.csv
Background 8: BqqHX.csv
Background 9: BqqX.csv
Background 10: BqqqqX.csv
Background 11: Btt.csv


In [None]:
# Train simple BDT models: one AdaBoost classifier per background sample,
# each separating the signal from that single background.

# Trained models and their held-out test sets; position i corresponds to
# X_backgrounds[i] / background_labels[i]
trained_models = []
train_test_splits = []

# Hyperparameters
depth = 2   # max depth of each decision tree
n = 100     # number of boosting rounds
lr = 0.1    # AdaBoost learning rate
seed = 42   # shared seed for the train/test split and the classifier

# Derive the number of models from the data that was actually loaded instead
# of hard-coding it, so the loop cannot run past the available backgrounds.
n_backgrounds = len(X_backgrounds)

print("\nStarting BDT Training...\n")

# Initialize progress bar correctly in Jupyter
with tqdm(total=n_backgrounds, desc="Training Progress", unit="model", leave=True) as pbar:
    for i in range(n_backgrounds):
        start_time = time.time()  # Track time for each model

        # Combine signal + one background dataset
        X_combined = pd.concat([X_signal, X_backgrounds[i]])
        y_combined = np.concatenate([y_signal, y_backgrounds[i]])

        # Split into train (75%) and test (25%)
        X_train, X_test, y_train, y_test = train_test_split(
            X_combined, y_combined, test_size=0.25, random_state=seed
        )

        # Train a Boosted Decision Tree (BDT).
        # random_state makes repeated runs reproducible.
        # NOTE(review): the `algorithm` argument appears to be deprecated in
        # recent scikit-learn releases — confirm against the installed version.
        bdt = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=depth),
            n_estimators=n,
            learning_rate=lr,
            algorithm="SAMME",
            random_state=seed,
        )

        bdt.fit(X_train, y_train)

        # Store trained model and test data
        trained_models.append(bdt)
        train_test_splits.append((X_test, y_test))

        # Use `tqdm.write()` instead of `print()` so the progress bar is not broken up
        elapsed_time = time.time() - start_time
        tqdm.write(f"✔ Model {i+1}/{n_backgrounds} trained on {background_labels[i]} (Time: {elapsed_time:.2f} sec)")

        # Update progress bar
        pbar.update(1)

tqdm.write("\n✅ Training Complete! All models are ready.\n")


Starting BDT Training...



Training Progress:   0%|          | 0/12 [00:00<?, ?model/s]



✔ Model 1/12 trained on BWW.csv (Time: 212.68 sec)




✔ Model 2/12 trained on BZH.csv (Time: 335.61 sec)




✔ Model 3/12 trained on BZZ.csv (Time: 281.50 sec)




✔ Model 4/12 trained on Bpebb.csv (Time: 280.03 sec)




✔ Model 5/12 trained on Bpebbqq.csv (Time: 302.71 sec)




✔ Model 6/12 trained on BpeqqH.csv (Time: 318.09 sec)




✔ Model 7/12 trained on Bpett.csv (Time: 227.41 sec)




✔ Model 8/12 trained on Bqq.csv (Time: 354.41 sec)




✔ Model 9/12 trained on BqqHX.csv (Time: 399.32 sec)




✔ Model 10/12 trained on BqqX.csv (Time: 227.25 sec)




✔ Model 11/12 trained on BqqqqX.csv (Time: 216.57 sec)




✔ Model 12/12 trained on Btt.csv (Time: 217.96 sec)

✅ Training Complete! All models are ready.



In [None]:
# Persist every trained model to disk. The filename records the 1-based
# model/background number together with the training hyperparameters.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)  # no-op when the folder already exists

# The hyperparameter part of the name is the same for every model
hyperparam_tag = f"depth{depth}_n{n}_lr{lr}"

for model_number, fitted_model in enumerate(trained_models, start=1):
    filepath = os.path.join(
        model_dir, f"bdt_model_bg{model_number}_{hyperparam_tag}.joblib"
    )
    joblib.dump(fitted_model, filepath)
    print(f"Model {model_number} saved to {filepath}")

Model 1 saved to models/bdt_model_bg1_depth2_n100_lr0.1.joblib
Model 2 saved to models/bdt_model_bg2_depth2_n100_lr0.1.joblib
Model 3 saved to models/bdt_model_bg3_depth2_n100_lr0.1.joblib
Model 4 saved to models/bdt_model_bg4_depth2_n100_lr0.1.joblib
Model 5 saved to models/bdt_model_bg5_depth2_n100_lr0.1.joblib
Model 6 saved to models/bdt_model_bg6_depth2_n100_lr0.1.joblib
Model 7 saved to models/bdt_model_bg7_depth2_n100_lr0.1.joblib
Model 8 saved to models/bdt_model_bg8_depth2_n100_lr0.1.joblib
Model 9 saved to models/bdt_model_bg9_depth2_n100_lr0.1.joblib
Model 10 saved to models/bdt_model_bg10_depth2_n100_lr0.1.joblib
Model 11 saved to models/bdt_model_bg11_depth2_n100_lr0.1.joblib
Model 12 saved to models/bdt_model_bg12_depth2_n100_lr0.1.joblib


In [None]:
# Optional: reload previously saved models from `model_dir` instead of retraining.
# This cell is disabled; uncomment the lines below to collect the models into
# `loaded_models`. The filename pattern must match the one used when saving.
# loaded_models = []
# for i in range(12):  # Assuming 12 models
#     filename = f"bdt_model_bg{i+1}_depth2_n100_lr0.1.joblib"
#     filepath = os.path.join(model_dir, filename)
    
#     if os.path.exists(filepath):  # Check if file exists before loading
#         model = joblib.load(filepath)
#         loaded_models.append(model)
#         print(f"Loaded Model {i+1} from {filepath}")
#     else:
#         print(f"Model {i+1} not found, you may need to train it first.")

In [8]:
# Mean predicted signal probability of every trained model on every dataset.
# Rows: one per model in `trained_models`.
# Columns: the signal sample first, then each background sample in order.

# Build the list of datasets once; it does not change between models
eval_datasets = [X_signal] + X_backgrounds

# Size the result matrix from the data instead of hard-coding 12 x 13
output_matrix = np.zeros((len(trained_models), len(eval_datasets)))

# Evaluate each trained model
for model_idx, model in enumerate(trained_models):
    for dataset_idx, dataset in enumerate(eval_datasets):
        # Column 1 of predict_proba is taken as the signal (label 1) probability
        predictions = model.predict_proba(dataset)[:, 1]
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)  # Average probability

In [11]:
# Create a labelled DataFrame of the evaluation results for display.
# Label counts are taken from the matrix itself so the row/column labels always
# match its shape (first column is the signal sample, the rest are backgrounds).
n_result_rows, n_result_cols = output_matrix.shape

datasets = ["Signal"] + [f"Background {i+1}" for i in range(n_result_cols - 1)]
model_labels = [f"Model {i+1}" for i in range(n_result_rows)]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)
df_results

Unnamed: 0,Signal,Background 1,Background 2,Background 3,Background 4,Background 5,Background 6,Background 7,Background 8,Background 9,Background 10,Background 11,Background 12
Model 1,0.82933,0.518853,0.78973,0.729123,0.660186,0.792251,0.826352,0.805004,0.546488,0.8161,0.65374,0.785044,0.741435
Model 2,0.718104,0.453543,0.479341,0.439948,0.45378,0.558133,0.676278,0.668601,0.496444,0.668854,0.461575,0.572301,0.734033
Model 3,0.760446,0.469451,0.617597,0.427545,0.422012,0.549956,0.657868,0.675237,0.511196,0.649354,0.423028,0.575309,0.801717
Model 4,0.782636,0.577541,0.71986,0.620379,0.314716,0.597573,0.654531,0.662603,0.550508,0.685059,0.351786,0.650289,0.794562
Model 5,0.75604,0.588866,0.671874,0.575086,0.466459,0.407967,0.476452,0.574316,0.616441,0.586483,0.530589,0.565403,0.758752
Model 6,0.733044,0.60368,0.673059,0.600643,0.461829,0.398319,0.409299,0.529636,0.637337,0.560735,0.544431,0.568625,0.735133
Model 7,0.787894,0.742779,0.772597,0.76471,0.740871,0.731651,0.730455,0.554853,0.760857,0.731797,0.76129,0.715189,0.644955
Model 8,0.752996,0.358355,0.697628,0.62382,0.441038,0.715498,0.758033,0.701991,0.291595,0.73704,0.424869,0.698345,0.455589
Model 9,0.680289,0.611787,0.655951,0.560006,0.446233,0.372373,0.402835,0.50295,0.634751,0.403931,0.458518,0.41525,0.775237
Model 10,0.820191,0.637893,0.762925,0.687892,0.395469,0.726147,0.778457,0.763381,0.613091,0.765423,0.393905,0.721822,0.845053


In [None]:
# Load trained models from disk into `trained_models`.

model_dir = "models"
expected_models = 12                # number of saved models to look for
model_tag = "depth2_n100_lr0.1"     # Ensure this matches your saved format

trained_models = []
missing_models = []  # 1-based numbers of the models whose file was not found
for i in range(expected_models):
    filename = f"bdt_model_bg{i+1}_{model_tag}.joblib"
    filepath = os.path.join(model_dir, filename)

    if os.path.exists(filepath):
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code — only load model files that you created or otherwise trust.
        model = joblib.load(filepath)
        trained_models.append(model)
        print(f"Loaded Model {i+1} from {filepath}")
    else:
        missing_models.append(i + 1)
        print(f"Model {i+1} not found, you may need to train it first.")

# A skipped file shifts every later model one position up in the list, so
# list positions would silently stop matching the background/model numbering.
if missing_models:
    print(
        f"WARNING: {len(missing_models)} model(s) missing {missing_models}; "
        "positions in `trained_models` no longer match the model numbers."
    )