In [5]:
# Import Libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm  # Use notebook version for Jupyter
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import xgboost as xgb
from datetime import datetime

In [None]:
# Define the folder containing the raw dataset files
raw_data_folder = "raw dataset"  # Update with your actual folder path

# Get all file names in the folder
all_files = sorted([f for f in os.listdir(raw_data_folder) if f.endswith(".csv")])

# Identify signal and background files
signal_file = [f for f in all_files if "signal" in f.lower()][0]  # Assumes "signal" is in filename
background_files = sorted([f for f in all_files if "B" in f.upper()])  # Assumes "B" in filename means background

# Load the signal dataset and add a label column
signal_df = pd.read_csv(os.path.join(raw_data_folder, signal_file))
signal_df["label"] = 1  # Assign label 1 for signal events

# Load background datasets and add a label column
background_dfs = []
background_labels = []  # Store filenames for indexing reference

for idx, bg_file in enumerate(background_files):
    bg_df = pd.read_csv(os.path.join(raw_data_folder, bg_file))
    bg_df["label"] = 0  # Assign label 0 for background events
    background_dfs.append(bg_df)
    background_labels.append(bg_file)  # Store file name for reference
    print(f"Background {idx}: {bg_file}")  # Print index and file name

# Extract features (X) and labels (y)
X_signal = signal_df.iloc[:, :-1]  # Features for signal
y_signal = signal_df.iloc[:, -1]   # Labels for signal

X_backgrounds = [bg.iloc[:, :-1] for bg in background_dfs]  # Features for each background dataset
y_backgrounds = [bg.iloc[:, -1] for bg in background_dfs]  # Labels

Background 0: BWW.csv
Background 1: BZH.csv
Background 2: BZZ.csv
Background 3: Bpebb.csv
Background 4: Bpebbqq.csv
Background 5: BpeqqH.csv
Background 6: Bpett.csv
Background 7: Bqq.csv
Background 8: BqqHX.csv
Background 9: BqqX.csv
Background 10: BqqqqX.csv
Background 11: Btt.csv


In [7]:
depth = 2
n = 100
lr = 0.1

# Store trained models and test sets
trained_xgb_models = []
train_test_splits = []

print("\nStarting XGBoost Training...\n")

# Initialize tqdm progress bar
with tqdm(total=12, desc="Training Progress", unit="model", leave=True) as pbar:
    for i in range(12):
        start_time = time.time()  # Track time for each model

        # Combine signal + one background dataset
        X_combined = np.vstack([X_signal, X_backgrounds[i]])
        y_combined = np.concatenate([y_signal, y_backgrounds[i]])

        # Split into train (75%) and test (25%)
        X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.25, random_state=42)

        # Convert to XGBoost DMatrix (optimized for speed)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # Define XGBoost parameters
        xgb_params = {
            "objective": "binary:logistic",  # Binary classification
            "eval_metric": "logloss",  # Log-loss for binary classification
            "max_depth": depth,  # Similar to BDT depth
            "learning_rate": lr,  # Step size
            "n_estimators": n,  # Number of boosting rounds
            "tree_method": "hist",  # Optimized for speed
        }

        # Train XGBoost model
        xgb_model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

        # Store trained model and test data
        trained_xgb_models.append(xgb_model)
        train_test_splits.append((X_test, y_test))

        # Print progress without interfering with tqdm
        elapsed_time = time.time() - start_time
        tqdm.write(f"Model {i+1}/12 trained on {background_labels[i]} (Time: {elapsed_time:.2f} sec)")

        # Update progress bar
        pbar.update(1)

print("\nTraining Complete! All models are ready.\n")


Starting XGBoost Training...



Training Progress:   0%|          | 0/12 [00:00<?, ?model/s]

Parameters: { "n_estimators" } are not used.



Model 1/12 trained on BWW.csv (Time: 0.79 sec)


Parameters: { "n_estimators" } are not used.



Model 2/12 trained on BZH.csv (Time: 1.14 sec)


Parameters: { "n_estimators" } are not used.



Model 3/12 trained on BZZ.csv (Time: 1.09 sec)


Parameters: { "n_estimators" } are not used.



Model 4/12 trained on Bpebb.csv (Time: 0.97 sec)


Parameters: { "n_estimators" } are not used.



Model 5/12 trained on Bpebbqq.csv (Time: 1.00 sec)


Parameters: { "n_estimators" } are not used.



Model 6/12 trained on BpeqqH.csv (Time: 1.05 sec)


Parameters: { "n_estimators" } are not used.



Model 7/12 trained on Bpett.csv (Time: 0.76 sec)


Parameters: { "n_estimators" } are not used.



Model 8/12 trained on Bqq.csv (Time: 1.13 sec)


Parameters: { "n_estimators" } are not used.



Model 9/12 trained on BqqHX.csv (Time: 1.20 sec)


Parameters: { "n_estimators" } are not used.



Model 10/12 trained on BqqX.csv (Time: 0.73 sec)
Model 11/12 trained on BqqqqX.csv (Time: 0.72 sec)


Parameters: { "n_estimators" } are not used.



Model 12/12 trained on Btt.csv (Time: 0.68 sec)

Training Complete! All models are ready.



In [8]:
# Define the folder to save models
model_dir = "xgb_models"
os.makedirs(model_dir, exist_ok=True)  # Create folder if it doesn't exist

# Save each trained XGBoost model
for i, model in enumerate(trained_xgb_models):
    filename = f"xgb_model_bg{i+1}_depth{depth}_n{n}_lr{lr}.model"
    filepath = os.path.join(model_dir, filename)
    model.save_model(filepath)
    print(f"Model {i+1} saved to {filepath}")

Model 1 saved to xgb_models/xgb_model_bg1_depth2_n100_lr0.1.model
Model 2 saved to xgb_models/xgb_model_bg2_depth2_n100_lr0.1.model
Model 3 saved to xgb_models/xgb_model_bg3_depth2_n100_lr0.1.model
Model 4 saved to xgb_models/xgb_model_bg4_depth2_n100_lr0.1.model
Model 5 saved to xgb_models/xgb_model_bg5_depth2_n100_lr0.1.model
Model 6 saved to xgb_models/xgb_model_bg6_depth2_n100_lr0.1.model
Model 7 saved to xgb_models/xgb_model_bg7_depth2_n100_lr0.1.model
Model 8 saved to xgb_models/xgb_model_bg8_depth2_n100_lr0.1.model
Model 9 saved to xgb_models/xgb_model_bg9_depth2_n100_lr0.1.model
Model 10 saved to xgb_models/xgb_model_bg10_depth2_n100_lr0.1.model
Model 11 saved to xgb_models/xgb_model_bg11_depth2_n100_lr0.1.model
Model 12 saved to xgb_models/xgb_model_bg12_depth2_n100_lr0.1.model




In [None]:
# Load trained XGBoost models (uncomment)
# loaded_xgb_models = []
# for i in range(12):
#     filename = f"xgb_model_bg{i+1}_depth2_n100_lr0.1.model"
#     filepath = os.path.join(model_dir, filename)
    
#     if os.path.exists(filepath):
#         model = xgb.Booster()
#         model.load_model(filepath)
#         loaded_xgb_models.append(model)
#         print(f"Loaded Model {i+1} from {filepath}")
#     else:
#         print(f"Model {i+1} not found, you may need to train it first.")

In [10]:
# Initialize a 12x13 matrix to store results
output_matrix = np.zeros((12, 13))

# Evaluate each trained XGBoost model
for model_idx, model in enumerate(trained_xgb_models):
    for dataset_idx, dataset in enumerate([X_signal] + X_backgrounds):  
        # Convert dataset to XGBoost DMatrix (necessary for prediction)
        dmatrix = xgb.DMatrix(dataset)
        
        # Get predicted probability (XGBoost automatically returns probabilities for binary classification)
        predictions = model.predict(dmatrix)
        
        # Store the average probability of being signal
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)  # Mean probability

In [12]:
# Create DataFrame for visualization
datasets = ["Signal"] + [f"Background {i+1}" for i in range(12)]
model_labels = [f"Model {i+1}" for i in range(12)]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)

# Display table
df_results

Unnamed: 0,Signal,Background 1,Background 2,Background 3,Background 4,Background 5,Background 6,Background 7,Background 8,Background 9,Background 10,Background 11,Background 12
Model 1,0.993649,0.473465,0.976086,0.924206,0.813605,0.974714,0.991889,0.979553,0.561648,0.988394,0.805734,0.968984,0.951971
Model 2,0.804245,0.388103,0.385314,0.325866,0.345268,0.516611,0.690861,0.65853,0.467735,0.690669,0.366423,0.537095,0.826497
Model 3,0.907747,0.398684,0.688099,0.316064,0.311271,0.541892,0.757652,0.750392,0.50555,0.755028,0.317166,0.606262,0.948601
Model 4,0.959195,0.719086,0.89048,0.722945,0.145923,0.661975,0.77462,0.782171,0.667691,0.843504,0.222054,0.786753,0.976075
Model 5,0.906918,0.68417,0.802721,0.63187,0.308784,0.245318,0.373711,0.570032,0.723295,0.694635,0.529048,0.664372,0.910781
Model 6,0.87792,0.744561,0.815153,0.696199,0.321517,0.240395,0.275199,0.489328,0.782762,0.649883,0.546951,0.664872,0.867263
Model 7,0.973228,0.920616,0.95462,0.94935,0.873963,0.922836,0.922949,0.48832,0.939087,0.894678,0.91564,0.854072,0.705936
Model 8,0.928231,0.18736,0.86103,0.740113,0.428797,0.892375,0.943419,0.869361,0.118601,0.919175,0.394982,0.862769,0.258159
Model 9,0.740843,0.625889,0.676677,0.519521,0.302819,0.2935,0.338638,0.45192,0.677558,0.331556,0.330082,0.336535,0.828503
Model 10,0.979883,0.795984,0.932743,0.810026,0.253542,0.868784,0.940528,0.904915,0.737079,0.928834,0.255895,0.869282,0.993078
