In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm  # Use notebook version for Jupyter
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import xgboost as xgb
from datetime import datetime

In [2]:
# Define the folder containing the raw dataset files
raw_data_folder = "raw dataset"  # Update with your actual folder path

# Get all file names in the folder
all_files = sorted([f for f in os.listdir(raw_data_folder) if f.endswith(".csv")])

# Identify signal and background files
signal_file = [f for f in all_files if "signal" in f.lower()][0]  # Assumes "signal" is in filename
background_files = sorted([f for f in all_files if "B" in f.upper()])  # Assumes "B" in filename means background

# # Load the signal dataset and add a label column
# signal_df = pd.read_csv(os.path.join(raw_data_folder, signal_file))
# signal_df["label"] = 1  # Assign label 1 for signal events

# # Load background datasets and add a label column
# background_dfs = []
background_labels = []  # Store filenames for indexing reference

for idx, bg_file in enumerate(background_files):
    bg_df = pd.read_csv(os.path.join(raw_data_folder, bg_file))
    # bg_df["label"] = 0  # Assign label 0 for background events
    # background_dfs.append(bg_df)
    background_labels.append(bg_file)  # Store file name for reference
    # print(f"Background {idx}: {bg_file}")  # Print index and file name

# # Extract features (X) and labels (y)
# X_signal = signal_df.iloc[:, :-1]  # Features for signal
# y_signal = signal_df.iloc[:, -1]   # Labels for signal

# X_backgrounds = [bg.iloc[:, :-1] for bg in background_dfs]  # Features for each background dataset
# y_backgrounds = [bg.iloc[:, -1] for bg in background_dfs]  # Labels

In [3]:
depth = 2
n = 100
lr = 0.1

# Store trained models and test sets
trained_xgb_models = []
train_test_splits = []

# Load signal training data
X_train_signal = joblib.load('split_datasets/X_train_signal.pkl')
y_train_signal = joblib.load('split_datasets/y_train_signal.pkl')

print("\nStarting XGBoost Training...\n")

# Initialize tqdm progress bar
with tqdm(total=12, desc="Training Progress", unit="model", leave=True) as pbar:
    for i in range(12):
        start_time = time.time()  # Track time for each model

        # Clean filename for loading
        clean_name = background_labels[i].replace('.csv', '').replace(' ', '_')
        
        # Load background training data
        X_train_bg = joblib.load(f'split_datasets/X_train_{clean_name}.pkl')
        y_train_bg = joblib.load(f'split_datasets/y_train_{clean_name}.pkl')
        # X_test_bg = joblib.load(f'split_datasets/X_test_{clean_name}.pkl')
        # y_test_bg = joblib.load(f'split_datasets/y_test_{clean_name}.pkl')

        # Combine signal + one background dataset
        X_train_combined = pd.concat([X_train_signal, X_train_bg])
        y_train_combined = np.concatenate([y_train_signal, y_train_bg])

        # Split into train (75%) and test (25%)
        # X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.25, random_state=42)

        # Convert to XGBoost DMatrix (optimized for speed)
        dtrain = xgb.DMatrix(X_train_combined, label=y_train_combined)
        # dtest = xgb.DMatrix(X_test_, label=y_test)

        # Define XGBoost parameters
        xgb_params = {
            "objective": "binary:logistic",  # Binary classification
            "eval_metric": "logloss",  # Log-loss for binary classification
            "max_depth": depth,  # Similar to BDT depth
            "learning_rate": lr,  # Step size
            "n_estimators": n,  # Number of boosting rounds
            "tree_method": "hist",  # Optimized for speed
        }

        # Train XGBoost model
        xgb_model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

        # Store trained model and test data
        trained_xgb_models.append(xgb_model)
        # train_test_splits.append((X_test, y_test))

        # Print progress without interfering with tqdm
        elapsed_time = time.time() - start_time
        tqdm.write(f"Model {i+1}/12 trained on {background_labels[i]} (Time: {elapsed_time:.2f} sec)")

        # Update progress bar
        pbar.update(1)

print("\nTraining Complete! All models are ready.\n")


Starting XGBoost Training...



Training Progress:   0%|          | 0/12 [00:00<?, ?model/s]

Parameters: { "n_estimators" } are not used.



Model 1/12 trained on BWW.csv (Time: 0.64 sec)


Parameters: { "n_estimators" } are not used.



Model 2/12 trained on BZH.csv (Time: 0.87 sec)


Parameters: { "n_estimators" } are not used.



Model 3/12 trained on BZZ.csv (Time: 0.70 sec)


Parameters: { "n_estimators" } are not used.



Model 4/12 trained on Bpebb.csv (Time: 0.75 sec)


Parameters: { "n_estimators" } are not used.



Model 5/12 trained on Bpebbqq.csv (Time: 0.68 sec)


Parameters: { "n_estimators" } are not used.



Model 6/12 trained on BpeqqH.csv (Time: 0.87 sec)


Parameters: { "n_estimators" } are not used.



Model 7/12 trained on Bpett.csv (Time: 0.72 sec)


Parameters: { "n_estimators" } are not used.



Model 8/12 trained on Bqq.csv (Time: 0.96 sec)


Parameters: { "n_estimators" } are not used.



Model 9/12 trained on BqqHX.csv (Time: 0.86 sec)


Parameters: { "n_estimators" } are not used.



Model 10/12 trained on BqqX.csv (Time: 0.68 sec)


Parameters: { "n_estimators" } are not used.



Model 11/12 trained on BqqqqX.csv (Time: 0.65 sec)


Parameters: { "n_estimators" } are not used.



Model 12/12 trained on Btt.csv (Time: 0.60 sec)

Training Complete! All models are ready.



In [4]:
# Define the folder to save models
model_dir = "xgb_models"
os.makedirs(model_dir, exist_ok=True)  # Create folder if it doesn't exist

# Save each trained XGBoost model
for i, model in enumerate(trained_xgb_models):
    filename = f"xgb_model_bg{i+1}_depth{depth}_n{n}_lr{lr}.model"
    filepath = os.path.join(model_dir, filename)
    model.save_model(filepath)
    print(f"Model {i+1} saved to {filepath}")

Model 1 saved to xgb_models/xgb_model_bg1_depth2_n100_lr0.1.model
Model 2 saved to xgb_models/xgb_model_bg2_depth2_n100_lr0.1.model
Model 3 saved to xgb_models/xgb_model_bg3_depth2_n100_lr0.1.model
Model 4 saved to xgb_models/xgb_model_bg4_depth2_n100_lr0.1.model
Model 5 saved to xgb_models/xgb_model_bg5_depth2_n100_lr0.1.model
Model 6 saved to xgb_models/xgb_model_bg6_depth2_n100_lr0.1.model
Model 7 saved to xgb_models/xgb_model_bg7_depth2_n100_lr0.1.model
Model 8 saved to xgb_models/xgb_model_bg8_depth2_n100_lr0.1.model
Model 9 saved to xgb_models/xgb_model_bg9_depth2_n100_lr0.1.model
Model 10 saved to xgb_models/xgb_model_bg10_depth2_n100_lr0.1.model
Model 11 saved to xgb_models/xgb_model_bg11_depth2_n100_lr0.1.model
Model 12 saved to xgb_models/xgb_model_bg12_depth2_n100_lr0.1.model




In [5]:
# Load signal test data
X_test_signal = joblib.load('split_datasets/X_test_signal.pkl')
y_test_signal = joblib.load('split_datasets/y_test_signal.pkl')

# Load background test datasets
X_test_backgrounds = []
y_test_backgrounds = []

for bg_file in background_labels:
    clean_name = bg_file.replace('.csv', '').replace(' ', '_')
    X_test_bg = joblib.load(f'split_datasets/X_test_{clean_name}.pkl')
    y_test_bg = joblib.load(f'split_datasets/y_test_{clean_name}.pkl')
    
    X_test_backgrounds.append(X_test_bg)
    y_test_backgrounds.append(y_test_bg)

# Initialize a 12x13 matrix to store results
output_matrix = np.zeros((12, 13))

# Evaluate each trained XGBoost model on the test datasets
for model_idx, model in enumerate(trained_xgb_models):
    for dataset_idx, dataset in enumerate([X_test_signal] + X_test_backgrounds):  
        # Convert dataset to XGBoost DMatrix (necessary for prediction)
        dmatrix = xgb.DMatrix(dataset)
        
        # Get predicted probability (XGBoost automatically returns probabilities for binary classification)
        predictions = model.predict(dmatrix)
        
        # Store the average probability of being signal on **test dataset only**
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)  # Mean probability

# Create DataFrame for visualization
datasets = ["Signal Test"] + [f"Background {i+1} Test" for i in range(12)]
model_labels = [f"Model {i+1}" for i in range(12)]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)

df_results

Unnamed: 0,Signal Test,Background 1 Test,Background 2 Test,Background 3 Test,Background 4 Test,Background 5 Test,Background 6 Test,Background 7 Test,Background 8 Test,Background 9 Test,Background 10 Test,Background 11 Test,Background 12 Test
Model 1,0.993409,0.486408,0.975552,0.914625,0.819303,0.975878,0.992067,0.976934,0.575091,0.988096,0.809345,0.967928,0.966737
Model 2,0.802685,0.39411,0.387245,0.326103,0.341408,0.51737,0.692131,0.656223,0.466731,0.69039,0.359923,0.555711,0.826447
Model 3,0.906257,0.411589,0.686774,0.315371,0.308418,0.537459,0.757065,0.746431,0.506697,0.752536,0.313584,0.617498,0.953453
Model 4,0.957887,0.715824,0.887123,0.712968,0.147485,0.659394,0.776503,0.780099,0.657704,0.839312,0.212282,0.784424,0.975413
Model 5,0.906087,0.68855,0.800367,0.624372,0.312735,0.248639,0.376321,0.571715,0.726837,0.688594,0.522142,0.659355,0.912911
Model 6,0.876353,0.736722,0.809523,0.686953,0.323358,0.244592,0.278215,0.489057,0.783427,0.639192,0.5399,0.651393,0.867412
Model 7,0.972122,0.922036,0.951148,0.946626,0.885136,0.928268,0.928325,0.506853,0.940202,0.890921,0.916063,0.865407,0.687261
Model 8,0.926772,0.18255,0.861391,0.734916,0.423933,0.888514,0.940293,0.862564,0.118182,0.915685,0.385749,0.858085,0.261355
Model 9,0.740123,0.624049,0.679108,0.519131,0.307153,0.28947,0.336098,0.453179,0.678544,0.332121,0.330982,0.344116,0.837742
Model 10,0.979376,0.805793,0.933545,0.808098,0.25192,0.862323,0.937322,0.895942,0.743812,0.924177,0.251315,0.874349,0.992957
