# Import Libraries

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm  # Use notebook version for Jupyter
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import xgboost as xgb
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns
from deap import base, creator, tools, algorithms
from sklearn.metrics import log_loss
import joblib
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from joblib import Parallel, delayed

# Load Datasets

In [None]:
# Set seeds for reproducibility.
# Every random number generator this notebook relies on is seeded here:
# the NN section uses PyTorch, which keeps its own generator.
SEED = 9999
random.seed(SEED)  # Python's `random` module (used to draw GA individuals)
np.random.seed(SEED)  # NumPy's global generator
torch.manual_seed(SEED)  # PyTorch generator: layer initialisation and DataLoader shuffling

In [None]:
# Define the folder containing the raw dataset files
raw_data_folder = "raw dataset"  # Update with your actual folder path

# Collect every CSV in the folder, sorted so the file order is deterministic
all_files = sorted(f for f in os.listdir(raw_data_folder) if f.endswith(".csv"))

# The signal file is the first CSV whose name contains "signal" (case-insensitive)
signal_candidates = [f for f in all_files if "signal" in f.lower()]
if not signal_candidates:
    raise FileNotFoundError(
        f"No CSV file containing 'signal' in its name was found in '{raw_data_folder}'"
    )
signal_file = signal_candidates[0]

# Background files follow the "B<type>.csv" naming scheme: the type is obtained
# below by stripping the first character, so only files that *start* with "B"
# qualify. Matching "B" anywhere in the name would also pick up unrelated files
# (including the signal file, should its name contain a "b").
background_files = [
    f for f in all_files
    if f.upper().startswith("B") and f != signal_file
]

# Load the signal dataset, discard the optional 'nParticles' column and label it
signal_df = pd.read_csv(os.path.join(raw_data_folder, signal_file))
signal_df = signal_df.drop(columns=["nParticles"], errors="ignore")
signal_df["label"] = 1  # Assign label 1 for signal events

# Per-background containers, all filled in the same (sorted) file order
background_dfs = []
background_labels = []  # File names, used later to build the pickle paths
background_types = []  # Background type parsed from each file name

# Per-event weight of each sample type; "HH" is looked up later as the signal weight
background_weights = {
    "HH": 0.0015552 * 1.155,
    "qq": 0.0349,
    "tt": 0.503,
    "ZZ": 0.17088 * 1.155,
    "WW": 0.5149,
    "qqX": 0.04347826,
    "qqqqX": 0.04,
    "qqHX": 0.001,
    "ZH": 0.00207445 * 1.155,
    "pebb": 0.7536,
    "pebbqq": 0.1522,
    "peqqH": 0.1237,
    "pett": 0.0570,
}

# Scale the weights by 1 / test_size (= 4.0) to compensate for the fact that
# only a `test_size` fraction of each sample ends up in the test split.
# NOTE(review): the GA section scores the validation split with these same
# weights, and that split is a different fraction of the data; confirm that
# the test-set factor is intended there too.
test_size = 0.25
reweight_factor = 1 / test_size
background_weights = {k: v * reweight_factor for k, v in background_weights.items()}

for bg_file in background_files:
    bg_df = pd.read_csv(os.path.join(raw_data_folder, bg_file))
    bg_df = bg_df.drop(columns=["nParticles"], errors="ignore")

    bg_df["label"] = 0  # Assign label 0 for background events

    # Background type = file name without the leading "B" and the ".csv" suffix
    bg_type = bg_file[1:].replace(".csv", "")
    bg_df["background_type"] = bg_type

    background_dfs.append(bg_df)
    background_labels.append(bg_file)
    background_types.append(bg_type)

# Separate features from labels; the bookkeeping columns are not features
X_signal = signal_df.drop(columns=["label"], errors="ignore")
y_signal = signal_df["label"]

X_backgrounds = [bg.drop(columns=["label", "background_type"], errors="ignore") for bg in background_dfs]
y_backgrounds = [bg["label"] for bg in background_dfs]

# Split Datasets and Save

In [None]:
# Make sure the output directory for the pickled splits exists
os.makedirs('split_datasets', exist_ok=True)

# Signal: hold out the test fraction first, then carve the validation set
# (20% of what is left) out of the remainder.
X_train_val_signal, X_test_signal, y_train_val_signal, y_test_signal = train_test_split(
    X_signal, y_signal, test_size=test_size, random_state=SEED, stratify=y_signal
)
X_train_signal, X_val_signal, y_train_signal, y_val_signal = train_test_split(
    X_train_val_signal,
    y_train_val_signal,
    test_size=0.2,
    random_state=SEED,
    stratify=y_train_val_signal,
)

# Persist the six signal pieces
for piece, path in (
    (X_train_signal, 'split_datasets/X_train_signal.pkl'),
    (X_val_signal, 'split_datasets/X_val_signal.pkl'),
    (X_test_signal, 'split_datasets/X_test_signal.pkl'),
    (y_train_signal, 'split_datasets/y_train_signal.pkl'),
    (y_val_signal, 'split_datasets/y_val_signal.pkl'),
    (y_test_signal, 'split_datasets/y_test_signal.pkl'),
):
    joblib.dump(piece, path)

# One background-type string per background event, for each of the three splits
background_types_train, background_types_val, background_types_test = [], [], []

# Apply the same two-stage split to every background sample and save the result
for i, bg_label in enumerate(background_labels):
    clean_name = bg_label.replace('.csv', '').replace(' ', '_')
    X_bg, y_bg = X_backgrounds[i], y_backgrounds[i]

    X_train_val_bg, X_test_bg, y_train_val_bg, y_test_bg = train_test_split(
        X_bg, y_bg, test_size=test_size, random_state=SEED, stratify=y_bg
    )
    X_train_bg, X_val_bg, y_train_bg, y_val_bg = train_test_split(
        X_train_val_bg,
        y_train_val_bg,
        test_size=0.2,
        random_state=SEED,
        stratify=y_train_val_bg,
    )

    # Tag every event of each split with the type parsed from the file name
    type_name = background_labels[i][1:].replace(".csv", "")
    background_types_train += [type_name] * len(X_train_bg)
    background_types_val += [type_name] * len(X_val_bg)
    background_types_test += [type_name] * len(X_test_bg)

    # Persist the six pieces of this background
    for piece, path in (
        (X_train_bg, f'split_datasets/X_train_{clean_name}.pkl'),
        (X_val_bg, f'split_datasets/X_val_{clean_name}.pkl'),
        (X_test_bg, f'split_datasets/X_test_{clean_name}.pkl'),
        (y_train_bg, f'split_datasets/y_train_{clean_name}.pkl'),
        (y_val_bg, f'split_datasets/y_val_{clean_name}.pkl'),
        (y_test_bg, f'split_datasets/y_test_{clean_name}.pkl'),
    ):
        joblib.dump(piece, path)

    print(f"Saved splits for background {i}: {bg_label}")

# Persist the per-event background types of all three splits
for type_list, path in (
    (background_types_train, 'split_datasets/background_types_train.pkl'),
    (background_types_val, 'split_datasets/background_types_val.pkl'),
    (background_types_test, 'split_datasets/background_types_test.pkl'),
):
    joblib.dump(type_list, path)

print("All dataset splits have been saved!")

In [None]:
# # Convert list to numpy array
# background_array = np.array(background_types_test)

# # Get unique background types and their counts
# unique_types, counts = np.unique(background_array, return_counts=True)

# # Print each background type and its count
# for bg_type, count in zip(unique_types, counts):
#     print(f"{bg_type}: {count}")

# Train Simple BDT Models

## Train BDT Models

In [None]:
# Signal training split, shared by every per-background BDT
X_train_signal, y_train_signal = (
    joblib.load(path)
    for path in (
        'split_datasets/X_train_signal.pkl',
        'split_datasets/y_train_signal.pkl',
    )
)

# AdaBoost hyperparameters: base-tree depth, number of estimators, learning rate
depth, n, lr = 2, 100, 0.1

print("\nStarting Parallel BDT Training...\n")

def train_model(i, background_label):
    """Train one AdaBoost BDT that separates signal from a single background.

    Parameters:
        i (int): Zero-based index of the model, used only in the status message.
        background_label (str): Background file name; it is converted to the
            pickle name under ``split_datasets/`` that holds the training split.

    Returns:
        tuple: ``(fitted AdaBoostClassifier, status message string)``.

    Reads the module-level ``X_train_signal``, ``y_train_signal``, ``depth``,
    ``n``, ``lr`` and ``SEED``.
    """
    start_time = time.time()

    # Load background training data
    clean_name = background_label.replace('.csv', '').replace(' ', '_')
    X_train_bg = joblib.load(f'split_datasets/X_train_{clean_name}.pkl')
    y_train_bg = joblib.load(f'split_datasets/y_train_{clean_name}.pkl')

    # Combine signal + one background dataset for training
    X_train_combined = pd.concat([X_train_signal, X_train_bg])
    y_train_combined = np.concatenate([y_train_signal, y_train_bg])

    # Train a Boosted Decision Tree (BDT).
    # random_state is passed explicitly: this function runs in joblib worker
    # processes, so relying on a generator seeded in the notebook process
    # would not make the fit reproducible.
    bdt = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=depth),
        n_estimators=n,
        learning_rate=lr,
        algorithm="SAMME",
        random_state=SEED,
    )
    bdt.fit(X_train_combined, y_train_combined)

    elapsed_time = time.time() - start_time
    return bdt, f"✔ Model {i+1}/12 trained on {background_label} (Time: {elapsed_time:.2f} sec)"

# ** Train models in parallel **
# One model per background file; the count is taken from `background_labels`
# rather than hard-coded, so a missing or extra file no longer raises an
# IndexError or gets silently ignored here.
if not background_labels:
    raise RuntimeError("No background datasets available: nothing to train on.")

num_jobs = max(1, min(len(background_labels), joblib.cpu_count()))  # Limit to available CPU cores
results = Parallel(n_jobs=num_jobs)(
    delayed(train_model)(i, label) for i, label in enumerate(background_labels)
)

# Unpack trained models and messages (results keep the submission order)
trained_models, messages = zip(*results)

# Print training messages
for msg in messages:
    tqdm.write(msg)

tqdm.write("\n Parallel Training Complete! All models are ready.\n")

## Save BDT Models

In [None]:
# Output directory for the serialized BDTs (created on first use)
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# Persist every BDT; the file name records the 1-based background index
# together with the hyperparameters the model was trained with.
for model_number, model in enumerate(trained_models, start=1):
    filename = f"bdt_model_bg{model_number}_depth{depth}_n{n}_lr{lr}.joblib"
    filepath = os.path.join(model_dir, filename)
    joblib.dump(model, filepath)
    print(f"Model {model_number} saved to {filepath}")

## Check BDT Performance on Test Dataset

In [None]:
# Load signal test data
X_test_signal = joblib.load('split_datasets/X_test_signal.pkl')
y_test_signal = joblib.load('split_datasets/y_test_signal.pkl')

# Load background test datasets (same order as `background_labels`)
X_test_backgrounds = []
y_test_backgrounds = []

for bg_file in background_labels:
    clean_name = bg_file.replace('.csv', '').replace(' ', '_')
    X_test_backgrounds.append(joblib.load(f'split_datasets/X_test_{clean_name}.pkl'))
    y_test_backgrounds.append(joblib.load(f'split_datasets/y_test_{clean_name}.pkl'))

# Column 0 is the signal test set, columns 1.. are the background test sets.
# The list is built once instead of once per model.
test_datasets = [X_test_signal] + X_test_backgrounds

# One row per model, one column per test dataset. The shape follows the data
# rather than a hard-coded 12x13, so it stays valid if the number of
# backgrounds changes.
output_matrix = np.zeros((len(trained_models), len(test_datasets)))

# Evaluate each trained BDT model on the test datasets
for model_idx, model in enumerate(trained_models):
    for dataset_idx, dataset in enumerate(test_datasets):
        # Mean predicted P(class=1), i.e. the average signal probability
        predictions = model.predict_proba(dataset)[:, 1]
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)

# Create DataFrame for visualization
datasets = ["Signal Test"] + [f"Background {i+1} Test" for i in range(len(X_test_backgrounds))]
model_labels = [f"Model {i+1}" for i in range(len(trained_models))]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)

# Define a function to apply conditional formatting
def highlight_matrix(df):
    """Build the CSS table used to style the model-vs-dataset results.

    Parameters:
        df (pd.DataFrame): Table to style; only its index, columns and shape
            are used.

    Returns:
        pd.DataFrame: Same index/columns as ``df``. Every cell of the first
        column holds a yellow background rule, every cell at position
        (row i, column i + 1) holds a light-blue one, and all other cells
        hold the empty string.
    """
    yellow = "background-color: yellow"
    light_blue = "background-color: lightblue"

    css = pd.DataFrame("", index=df.index, columns=df.columns)

    # First column: response of every model to the first dataset
    css.iloc[:, 0] = yellow

    # Shifted diagonal: row i paired with column i + 1
    n_rows, n_cols = df.shape
    n_diagonal = min(n_rows, n_cols - 1)
    for row, col in zip(range(n_diagonal), range(1, n_diagonal + 1)):
        css.iat[row, col] = light_blue

    return css

# Apply the styling function to df_results.
# axis=None makes Styler.apply pass the whole DataFrame to highlight_matrix
# in one call, which must return a same-shaped DataFrame of CSS strings.
styled_df = df_results.style.apply(highlight_matrix, axis=None)

# Bare expression as the last statement of the cell, so the notebook renders it
styled_df


# Train XGB Models

## Train XGB Models

In [None]:
# Hyperparameters: tree depth, number of boosting rounds, learning rate
depth = 2
n = 100
lr = 0.1

# Store trained models
trained_xgb_models = []
train_test_splits = []  # Never populated in this cell; kept so existing references still resolve

# Load signal training data
X_train_signal = joblib.load('split_datasets/X_train_signal.pkl')
y_train_signal = joblib.load('split_datasets/y_train_signal.pkl')

print("\nStarting XGBoost Training...\n")

# Booster parameters are the same for every model, so they are built once.
# "n_estimators" is deliberately absent: it belongs to the scikit-learn
# wrapper, and the native xgb.train() API takes the number of trees through
# num_boost_round instead.
xgb_params = {
    "objective": "binary:logistic",  # Binary classification
    "eval_metric": "logloss",  # Log-loss for binary classification
    "max_depth": depth,  # Similar to BDT depth
    "learning_rate": lr,  # Step size
    "tree_method": "hist",  # Histogram-based tree construction
}

n_models = len(background_labels)  # One model per background file

# Initialize tqdm progress bar
with tqdm(total=n_models, desc="Training Progress", unit="model", leave=True) as pbar:
    for i, bg_label in enumerate(background_labels):
        start_time = time.time()  # Track time for each model

        # Load background training data
        clean_name = bg_label.replace('.csv', '').replace(' ', '_')
        X_train_bg = joblib.load(f'split_datasets/X_train_{clean_name}.pkl')
        y_train_bg = joblib.load(f'split_datasets/y_train_{clean_name}.pkl')

        # Combine signal + one background dataset
        X_train_combined = pd.concat([X_train_signal, X_train_bg])
        y_train_combined = np.concatenate([y_train_signal, y_train_bg])

        # Convert to XGBoost DMatrix
        dtrain = xgb.DMatrix(X_train_combined, label=y_train_combined)

        # Train for `n` boosting rounds so the hyperparameter above (which is
        # also written into the saved file name) is the one actually used
        xgb_model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=n)

        trained_xgb_models.append(xgb_model)

        # Print progress without interfering with tqdm
        elapsed_time = time.time() - start_time
        tqdm.write(f"Model {i+1}/{n_models} trained on {bg_label} (Time: {elapsed_time:.2f} sec)")

        # Update progress bar
        pbar.update(1)

print("\nTraining Complete! All models are ready.\n")

## Save XGB Models

In [None]:
# Output directory for the serialized XGBoost boosters (created on first use)
model_dir = "xgb_models"
os.makedirs(model_dir, exist_ok=True)

# Persist every booster; the file name records the 1-based background index
# together with the hyperparameters in effect.
for model_number, model in enumerate(trained_xgb_models, start=1):
    filename = f"xgb_model_bg{model_number}_depth{depth}_n{n}_lr{lr}.model"
    filepath = os.path.join(model_dir, filename)
    model.save_model(filepath)
    print(f"Model {model_number} saved to {filepath}")

## Check XGB Performance on Test Dataset

In [None]:
# Load signal test data
X_test_signal = joblib.load('split_datasets/X_test_signal.pkl')
y_test_signal = joblib.load('split_datasets/y_test_signal.pkl')

# Load background test datasets (same order as `background_labels`)
X_test_backgrounds = []
y_test_backgrounds = []

for bg_file in background_labels:
    clean_name = bg_file.replace('.csv', '').replace(' ', '_')
    X_test_backgrounds.append(joblib.load(f'split_datasets/X_test_{clean_name}.pkl'))
    y_test_backgrounds.append(joblib.load(f'split_datasets/y_test_{clean_name}.pkl'))

# Convert every test dataset to a DMatrix exactly once; each booster then
# reuses the same objects instead of rebuilding them per model.
test_dmatrices = [xgb.DMatrix(dataset) for dataset in [X_test_signal] + X_test_backgrounds]

# One row per model, one column per test dataset (column 0 = signal).
# The shape follows the data rather than a hard-coded 12x13.
output_matrix = np.zeros((len(trained_xgb_models), len(test_dmatrices)))

# Evaluate each trained XGBoost model on the test datasets
for model_idx, model in enumerate(trained_xgb_models):
    for dataset_idx, dmatrix in enumerate(test_dmatrices):
        # With the binary:logistic objective, predict() returns probabilities
        predictions = model.predict(dmatrix)
        output_matrix[model_idx, dataset_idx] = np.mean(predictions)  # Mean probability

# Create DataFrame for visualization
datasets = ["Signal Test"] + [f"Background {i+1} Test" for i in range(len(X_test_backgrounds))]
model_labels = [f"Model {i+1}" for i in range(len(trained_xgb_models))]

df_results = pd.DataFrame(output_matrix, index=model_labels, columns=datasets)

# Apply the styling function to df_results
styled_df = df_results.style.apply(highlight_matrix, axis=None)

# Display the styled DataFrame
styled_df

# Train NN Model

## Load Trained BDT and XGB Models

In [None]:
# Load the saved BDT models (numbered 1..12); a missing file is reported and skipped
bdt_models = []
for model_number in range(1, 13):
    filename = f"bdt_model_bg{model_number}_depth2_n100_lr0.1.joblib"
    filepath = os.path.join("models", filename)

    if not os.path.exists(filepath):
        print(f"Model {model_number} not found, you may need to train it first.")
        continue

    model = joblib.load(filepath)
    bdt_models.append(model)
    print(f"Loaded BDT Model {model_number} from {filepath}")

# Load the saved XGBoost boosters the same way
xgb_models = []
for model_number in range(1, 13):
    filename = f"xgb_model_bg{model_number}_depth2_n100_lr0.1.model"
    filepath = os.path.join("xgb_models", filename)

    if not os.path.exists(filepath):
        print(f"Model {model_number} not found, you may need to train it first.")
        continue

    # A Booster has to be created empty and then filled from the file
    model = xgb.Booster()
    model.load_model(filepath)
    xgb_models.append(model)
    print(f"Loaded XGBoost Model {model_number} from {filepath}")

## Load Validation and Test Dataset

In [None]:
# Signal: validation and test splits
X_val_signal = joblib.load('split_datasets/X_val_signal.pkl')
y_val_signal = joblib.load('split_datasets/y_val_signal.pkl')
X_test_signal = joblib.load('split_datasets/X_test_signal.pkl')
y_test_signal = joblib.load('split_datasets/y_test_signal.pkl')

# Backgrounds: one entry per file in `background_labels`, in that order
X_val_backgrounds, y_val_backgrounds = [], []
X_test_backgrounds, y_test_backgrounds = [], []

for bg_file in background_labels:
    clean_name = bg_file.replace('.csv', '').replace(' ', '_')

    # Validation pieces
    X_val_bg = joblib.load(f'split_datasets/X_val_{clean_name}.pkl')
    y_val_bg = joblib.load(f'split_datasets/y_val_{clean_name}.pkl')
    X_val_backgrounds.append(X_val_bg)
    y_val_backgrounds.append(y_val_bg)

    # Test pieces
    X_test_bg = joblib.load(f'split_datasets/X_test_{clean_name}.pkl')
    y_test_bg = joblib.load(f'split_datasets/y_test_{clean_name}.pkl')
    X_test_backgrounds.append(X_test_bg)
    y_test_backgrounds.append(y_test_bg)

## Apply BDT / XGB on Validation and Test Dataset

In [None]:
# Accumulators for the NN training set: one block of model outputs (and the
# matching labels) is appended per validation dataset
X_nn_bdt_train = []
X_nn_xgb_train = []
y_nn_train = []

# Function to extract model outputs
def get_model_outputs(models, dataset_X):
    """Return each model's signal score for every row of ``dataset_X``.

    Parameters:
        models (iterable): Trained models. ``xgb.Booster`` instances are
            evaluated with ``predict`` on a DMatrix; anything else is assumed
            to offer a scikit-learn style ``predict_proba``.
        dataset_X: Feature table to score.

    Returns:
        list: One array of scores per model, in the order of ``models``.
    """
    dmatrix = None  # Built on first use and shared by all Booster models
    outputs = []
    for model in models:
        if isinstance(model, xgb.Booster):  # XGBoost models
            if dmatrix is None:
                # The conversion depends only on the data, so do it once
                # rather than once per model
                dmatrix = xgb.DMatrix(dataset_X)
            outputs.append(model.predict(dmatrix))
        else:  # BDT models (Scikit-Learn)
            outputs.append(model.predict_proba(dataset_X)[:, 1])  # Probability of being signal
    return outputs

# Validation datasets (signal first, then each background) become the NN
# training set: the features are the per-model outputs, one column per model.
validation_pairs = zip(
    [X_val_signal] + X_val_backgrounds,
    [y_val_signal] + y_val_backgrounds,
)
for X_val, y_val in validation_pairs:
    X_nn_bdt_train.append(np.column_stack(get_model_outputs(bdt_models, X_val)))
    X_nn_xgb_train.append(np.column_stack(get_model_outputs(xgb_models, X_val)))
    y_nn_train.append(y_val.to_numpy())

# Stack the per-dataset blocks into single arrays
X_nn_bdt_train = np.vstack(X_nn_bdt_train)
X_nn_xgb_train = np.vstack(X_nn_xgb_train)
y_nn_train = np.concatenate(y_nn_train)


# Test datasets go through the same transformation; they are used later to
# choose the cutoff on the NN output.
X_nn_bdt_test, X_nn_xgb_test = [], []
y_nn_test = []

test_pairs = zip(
    [X_test_signal] + X_test_backgrounds,
    [y_test_signal] + y_test_backgrounds,
)
for X_test, y_test in test_pairs:
    X_nn_bdt_test.append(np.column_stack(get_model_outputs(bdt_models, X_test)))
    X_nn_xgb_test.append(np.column_stack(get_model_outputs(xgb_models, X_test)))
    y_nn_test.append(y_test.to_numpy())

# Stack the per-dataset blocks into single arrays
X_nn_bdt_test = np.vstack(X_nn_bdt_test)
X_nn_xgb_test = np.vstack(X_nn_xgb_test)
y_nn_test = np.concatenate(y_nn_test)

## Define and Train Simple NN Model

In [None]:
# Feature matrices as float32 tensors; labels additionally reshaped to a
# column so they line up with the network's (batch, 1) output
X_bdt_train_tensor = torch.tensor(X_nn_bdt_train, dtype=torch.float32)
X_xgb_train_tensor = torch.tensor(X_nn_xgb_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_nn_train, dtype=torch.float32).view(-1, 1)

X_bdt_test_tensor = torch.tensor(X_nn_bdt_test, dtype=torch.float32)
X_xgb_test_tensor = torch.tensor(X_nn_xgb_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_nn_test, dtype=torch.float32).view(-1, 1)

# Shuffled mini-batch loaders, one per feature family (both share the labels)
bdt_train_dataset = TensorDataset(X_bdt_train_tensor, y_train_tensor)
xgb_train_dataset = TensorDataset(X_xgb_train_tensor, y_train_tensor)
bdt_train_loader = DataLoader(bdt_train_dataset, batch_size=32, shuffle=True)
xgb_train_loader = DataLoader(xgb_train_dataset, batch_size=32, shuffle=True)

# Define a simple NN model
class SimpleNN(nn.Module):
    """Fully connected binary classifier: input_dim -> 32 -> 16 -> 1.

    The two hidden layers use ReLU; the single output unit goes through a
    sigmoid, so the forward pass returns values strictly between 0 and 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in forward order
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 1) tensor of scores."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

def _train_nn(model, loader, optimizer, loss_fn, n_epochs):
    """Train `model` for `n_epochs` passes over `loader`; return the mean batch loss per epoch."""
    history = []
    for _ in range(n_epochs):
        total_loss = 0
        num_batches = 0

        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(batch_X), batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        history.append(total_loss / num_batches)
    return history


def _plot_loss_curve(history, model_name):
    """Plot the per-epoch training loss stored in `history` for the model called `model_name`."""
    plt.plot(range(1, len(history) + 1), history, label=f"{model_name} Training Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(f"Training Loss Curve for {model_name}")
    plt.legend()
    plt.show()


# Initialize models
nn_bdt = SimpleNN(12)  # 12 BDT features
nn_xgb = SimpleNN(12)  # 12 XGB features

criterion = nn.BCELoss()  # The network already ends in a sigmoid, hence BCELoss
optimizer_bdt = optim.Adam(nn_bdt.parameters(), lr=0.001)
optimizer_xgb = optim.Adam(nn_xgb.parameters(), lr=0.001)

# Train NN_BDT and track loss
print("Training NN_BDT...")
loss_history_bdt = _train_nn(nn_bdt, bdt_train_loader, optimizer_bdt, criterion, 30)
_plot_loss_curve(loss_history_bdt, "NN_BDT")

# Train NN_XGB and plot *its own* loss history; the BDT history was plotted
# here before, under the NN_XGB label
loss_history_xgb = _train_nn(nn_xgb, xgb_train_loader, optimizer_xgb, criterion, 30)
_plot_loss_curve(loss_history_xgb, "NN_XGB")

# Optimize NN Cutoff for Signal Significance

## Get NN Outputs on Test Dataset

In [None]:
# Score the test set with both networks. No gradients are needed here, so the
# forward passes run under no_grad and the results can go straight to NumPy.
with torch.no_grad():
    nn_bdt_output = nn_bdt(X_bdt_test_tensor).numpy().flatten()
    nn_xgb_output = nn_xgb(X_xgb_test_tensor).numpy().flatten()

## Load Background Type Info

In [None]:
# Per-event background type lists for the train, validation and test splits
background_types_train, background_types_val, background_types_test = (
    joblib.load(path)
    for path in (
        'split_datasets/background_types_train.pkl',
        'split_datasets/background_types_val.pkl',
        'split_datasets/background_types_test.pkl',
    )
)

## Optimize Cutoff for Signal Significance

In [None]:

def compute_significance(threshold, predictions, true_labels, background_types):
    """
    Weighted signal significance S / sqrt(S + B) for a single score cutoff.

    Events with ``predictions[i] >= threshold`` are selected. Selected signal
    events are counted and scaled by the "HH" entry of ``background_weights``
    (1 if that entry is absent); selected background events contribute the
    weight of their own type.

    Parameters:
        threshold (float): Cutoff applied to the scores.
        predictions (array): Predicted score per event.
        true_labels (array): 1 for signal, 0 for background, aligned with
            ``predictions``.
        background_types (list): Type string of every background event, in the
            order in which the background events appear in ``predictions``.

    Returns:
        significance (float): Weighted S / sqrt(S + B); 0 when nothing is selected.
        signal_count (int): Number of selected signal events (unweighted).
        background_counts (dict): Selected-event count per background type.
    """
    signal_weight = background_weights.get("HH", 1)
    background_counts = dict.fromkeys(background_weights, 0)
    signal_count = 0
    background_sum = 0  # Weighted number of selected background events
    bg_cursor = 0  # Position of the next background event in background_types

    for i, score in enumerate(predictions):
        label = true_labels[i]
        selected = score >= threshold

        if selected and label == 1:
            signal_count += 1
        elif selected or label == 0:
            # A background event: tally it only if it passed the cutoff, but
            # always advance the cursor so the type list stays aligned.
            if selected:
                bg_type = background_types[bg_cursor]
                if bg_type in background_counts:
                    background_counts[bg_type] += 1
                    background_sum += background_weights.get(bg_type, 0)
            bg_cursor += 1

    weighted_signal = signal_count * signal_weight
    total = weighted_signal + background_sum

    # Guard against dividing by zero when no event is selected
    significance = weighted_signal / np.sqrt(total) if total > 0 else 0

    return significance, signal_count, background_counts


def compute_asimov_significance(threshold, predictions, true_labels):
    """Unweighted Asimov significance of the events selected by ``threshold``.

    With S and B the numbers of signal (label 1) and background (label 0)
    events whose score is ``>= threshold``, this returns

        Z = sqrt(2 * ((S + B) * ln(1 + S / B) - S))

    which approaches S / sqrt(B) for S << B.

    Parameters:
        threshold (float): Cutoff applied to the scores.
        predictions (np.ndarray): Predicted score per event.
        true_labels (np.ndarray): 1 for signal, 0 for background.

    Returns:
        float: The significance. When no background event is selected the
        formula is undefined, and ``sqrt(2 * S)`` is returned as a fallback.
    """
    selected = predictions >= threshold
    S = np.sum(selected & (true_labels == 1))
    B = np.sum(selected & (true_labels == 0))

    if B == 0:
        return np.sqrt(2 * S)  # Fallback: ln(1 + S / B) is undefined for B = 0

    return np.sqrt(2 * ((S + B) * np.log(1 + S / B) - S))

# Candidate cutoffs: 0.01, 0.02, ..., 0.99
thresholds = np.linspace(0.01, 0.99, 99)

# Scores to scan, keyed by the first-level model family that feeds the NN
nn_scores = {"bdt": nn_bdt_output, "xgb": nn_xgb_output}

# Best result found so far per family; a cutoff only replaces it when its
# significance is strictly larger
best = {
    tag: {
        "threshold": 0,
        "significance": 0,
        "signal": 0,
        "background": {bg_type: 0 for bg_type in background_weights.keys()},
    }
    for tag in nn_scores
}

for threshold in thresholds:
    for tag, scores in nn_scores.items():
        significance, n_signal, n_background = compute_significance(
            threshold, scores, y_nn_test, background_types_test
        )
        if significance > best[tag]["significance"]:
            best[tag] = {
                "threshold": threshold,
                "significance": significance,
                "signal": n_signal,
                "background": n_background,
            }

# Expose the results under the names used by the rest of the notebook
best_threshold_bdt = best["bdt"]["threshold"]
best_significance_bdt = best["bdt"]["significance"]
best_signal_count_bdt = best["bdt"]["signal"]
best_background_count_bdt = best["bdt"]["background"]

best_threshold_xgb = best["xgb"]["threshold"]
best_significance_xgb = best["xgb"]["significance"]
best_signal_count_xgb = best["xgb"]["signal"]
best_background_count_xgb = best["xgb"]["background"]

print(f"NN_BDT: Best Threshold = {best_threshold_bdt:.2f}, Max Significance = {best_significance_bdt:.2f}")
print(f"NN_BDT: Signal Counts = {best_signal_count_bdt}, Background Counts = {best_background_count_bdt}")
print(f"NN_XGB: Best Threshold = {best_threshold_xgb:.2f}, Max Significance = {best_significance_xgb:.2f}")
print(f"NN_XGB: Signal Counts = {best_signal_count_xgb}, Background Counts = {best_background_count_xgb}")

## Plot Results

In [None]:
# One figure per network: score distributions of signal (red) and background
# (blue) test events, with the chosen cutoff drawn as a dashed vertical line.
for scores, cutoff, plot_title in (
    (nn_bdt_output, best_threshold_bdt, "NN_BDT Output Probability Distribution"),
    (nn_xgb_output, best_threshold_xgb, "NN_XGB Output Probability Distribution"),
):
    plt.figure(figsize=(10, 5))

    signal_scores = scores[y_nn_test == 1]
    background_scores = scores[y_nn_test == 0]
    sns.histplot(signal_scores, bins=50, color='red', label="Signal", kde=True)
    sns.histplot(background_scores, bins=50, color='blue', label="Background", kde=True)

    plt.axvline(
        x=cutoff,
        color='black',
        linestyle='--',
        linewidth=2,
        label=f"Best Threshold: {cutoff:.2f}",
    )

    plt.title(plot_title)
    plt.xlabel("Predicted Probability")
    plt.ylabel("Count")
    plt.legend()
    plt.show()

# Train GA Model

## Train GA Model

In [None]:
# Define the optimization problem: a single objective (significance) to be
# maximized, and individuals that are plain lists carrying that fitness.
# The classes are created only if they do not exist yet, so re-running this
# cell does not overwrite them.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximizing significance
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Weight applied to selected signal events ("HH" entry of the weight table)
weightHH = background_weights.get("HH")

# Function to create an individual (12 cutoffs for 12 BDT/XGB models)
def create_individual():
    """Return a new GA individual: 12 cutoffs, each drawn uniformly from [0, 1]."""
    genes = []
    for _ in range(12):
        genes.append(random.uniform(0, 1))
    return creator.Individual(genes)

# Function to compute signal significance
def compute_significance(thresholds, predictions, true_labels, background_types):
    """Weighted S / sqrt(S + B) for a set of per-model cutoffs.

    An event is selected only if every column of ``predictions`` is at or
    above the corresponding entry of ``thresholds``.

    NOTE: this definition replaces the single-cutoff function of the same
    name defined earlier in the notebook; the signature and return value differ.

    Parameters:
        thresholds (np.ndarray): One cutoff per column of ``predictions``.
        predictions (np.ndarray): 2-D array of scores, one row per event.
        true_labels (array): 1 for signal, 0 for background, aligned with
            the rows of ``predictions``.
        background_types (list): Type string of every background event, in the
            order in which the background events appear in ``predictions``.

    Returns:
        float: The significance; 0 when no event is selected.
    """
    # An event passes only when it clears the cutoff of every model
    passes_all = np.all(predictions >= thresholds.reshape(1, -1), axis=1)

    signal_count = np.sum(passes_all & (true_labels == 1))
    weighted_signal = signal_count * weightHH

    # Weighted yield per background type
    background_sums = dict.fromkeys(background_weights, 0)
    bg_cursor = 0  # Position of the next background event in background_types
    for i in range(len(predictions)):
        if true_labels[i] != 0:
            continue  # Only background events consume an entry of background_types
        if passes_all[i]:
            bg_type = background_types[bg_cursor]
            if bg_type in background_weights:
                background_sums[bg_type] += background_weights[bg_type]
        bg_cursor += 1

    background_sum = sum(background_sums.values())
    total = weighted_signal + background_sum

    # Guard against dividing by zero when no event is selected
    if total > 0:
        return weighted_signal / np.sqrt(total)
    return 0


def evaluate(individual, predictions, labels, background_types):
    """DEAP fitness function: significance reached with the individual's cutoffs.

    Parameters:
        individual: Sequence of cutoffs (12 are expected).
        predictions, labels, background_types: Passed on unchanged to
            ``compute_significance``.

    Returns:
        tuple: One-element tuple ``(significance,)``, the form DEAP expects
        for a single-objective fitness.
    """
    thresholds = np.array(individual)

    # Anything that is not a flat vector of 12 values is reported and then
    # replaced by 12 repetitions of the individual
    if thresholds.shape != (12,):
        print(f"🚨 Error: thresholds has shape {thresholds.shape}, expected (12,)")
        thresholds = np.array([individual] * 12)  # Fix if needed

    return (compute_significance(thresholds, predictions, labels, background_types),)


# Create GA toolbox
def create_ga_toolbox(predictions, labels, background_types):
    """Assemble a DEAP toolbox whose fitness is evaluated on the given data.

    Args:
        predictions: Score matrix bound to ``evaluate`` as ``predictions``.
        labels: Labels bound to ``evaluate`` as ``labels``.
        background_types: Background types bound to ``evaluate`` as
            ``background_types``.

    Returns:
        A ``base.Toolbox`` with ``individual``, ``population``, ``evaluate``,
        ``mate``, ``mutate`` and ``select`` registered.
    """
    ga_toolbox = base.Toolbox()

    # Individuals, and populations built by repeating the individual factory.
    ga_toolbox.register("individual", create_individual)
    ga_toolbox.register("population", tools.initRepeat, list, ga_toolbox.individual)

    # Fitness: evaluate() with the dataset pre-bound as keyword arguments.
    bound_data = dict(predictions=predictions, labels=labels, background_types=background_types)
    ga_toolbox.register("evaluate", evaluate, **bound_data)

    # Variation and selection operators with their fixed settings.
    operators = (
        ("mate", tools.cxBlend, {"alpha": 0.5}),
        ("mutate", tools.mutGaussian, {"mu": 0, "sigma": 0.1, "indpb": 0.2}),
        ("select", tools.selTournament, {"tournsize": 3}),
    )
    for alias, operator, settings in operators:
        ga_toolbox.register(alias, operator, **settings)

    return ga_toolbox

# Train GA function
def train_ga(toolbox, pop, ngen=300, cxpb=0.5, mutpb=0.2, conv_steps=20, conv_crit=0.001):
    """Evolve ``pop`` one generation at a time with stagnation-based early stopping.

    NOTE: a later ``train_ga`` definition in this file (adding a mutation
    schedule and spread control) rebinds this name before it is called, so
    this version is not the one that actually runs.

    Args:
        toolbox: DEAP toolbox providing evaluate/mate/mutate/select.
        pop: Population to evolve.
        ngen: Maximum number of generations.
        cxpb: Crossover probability passed to ``eaSimple``.
        mutpb: Mutation probability passed to ``eaSimple``.
        conv_steps: Stop after this many consecutive generations without a
            sufficient improvement.
        conv_crit: Minimum gain over the reference fitness that counts as an
            improvement.

    Returns:
        The best individual recorded in the hall of fame.
    """
    champions = tools.HallOfFame(1)
    reference_fitness = None
    stalled_generations = 0

    for generation in range(ngen):
        # Advance exactly one generation; the hall of fame accumulates across calls.
        algorithms.eaSimple(pop, toolbox, cxpb, mutpb, 1, stats=None, halloffame=champions, verbose=False)
        fitness_now = champions[0].fitness.values[0]

        # The stagnation value shown is the count before this generation's update.
        print(f"Generation {generation+1}: Significance = {fitness_now:.4f}, Stagnation = {stalled_generations}")

        improved = reference_fitness is None or fitness_now > reference_fitness + conv_crit
        if improved:
            reference_fitness = fitness_now
        stalled_generations = 0 if improved else stalled_generations + 1

        if stalled_generations >= conv_steps:
            print("Stopping early due to convergence.")
            break

    return champions[0]

# Create GA toolboxes for BDT and XGB
# NOTE(review): predictions and labels are the `*_train` arrays, while the
# background types come from `background_types_val`. compute_significance looks
# up background_types by the running index of label-0 rows, so confirm these
# describe the same events in the same order.
toolbox_bdt = create_ga_toolbox(X_nn_bdt_train, y_nn_train, background_types_val)
toolbox_xgb = create_ga_toolbox(X_nn_xgb_train, y_nn_train, background_types_val)

# Initialize populations
# Individuals are drawn from the shared `random` generator, so the creation
# order (BDT first, then XGB) determines which starting cutoffs each one gets.
pop_bdt = toolbox_bdt.population(n=100)
pop_xgb = toolbox_xgb.population(n=100)

# Implement spread control
def spread_control(toolbox, pop, best_fitness, scsteps, scrate, scfactor):
    """Re-register the Gaussian mutation with a sigma adjusted to recent progress.

    Counts the individuals whose fitness exceeds ``best_fitness`` and compares
    that count with ``scrate``:

    * fewer than ``scrate``: sigma becomes ``0.1 / scfactor``;
    * more than ``scrate``: sigma becomes ``0.1 * scfactor``;
    * exactly ``scrate``: the toolbox is left untouched.

    Args:
        toolbox: DEAP toolbox whose ``mutate`` alias is replaced.
        pop: Evaluated population (each individual needs valid fitness values).
        best_fitness: Reference fitness to beat.
        scsteps: Accepted for interface compatibility; not used here.
        scrate: Target number of improving individuals.
        scfactor: Scaling factor applied to the base sigma of 0.1.
    """
    n_better = sum(1 for candidate in pop if candidate.fitness.values[0] > best_fitness)

    if n_better < scrate:
        adjusted_sigma = 0.1 / scfactor
    elif n_better > scrate:
        adjusted_sigma = 0.1 * scfactor
    else:
        return

    toolbox.unregister("mutate")
    toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=adjusted_sigma, indpb=0.2)

# Train GA function with dynamic mutation rate and spread control
def train_ga(toolbox, pop, ngen=300, cxpb=0.5, mutpb=0.2, conv_steps=20, conv_crit=0.001, scsteps=10, scrate=5, scfactor=0.95):
    """Evolve ``pop`` with a mutation schedule, spread control and early stopping.

    The first 10 generations mutate with sigma 0.2; generation 10 switches to
    the base sigma of 0.1. From then on ``spread_control`` may re-register the
    mutation operator after each generation, and that setting is kept for the
    next generation. Previously sigma was reset at the top of every
    generation, which discarded whatever ``spread_control`` had just
    registered, so spread control never influenced the search.

    Args:
        toolbox: DEAP toolbox providing evaluate/mate/mutate/select; its
            ``mutate`` alias is re-registered by this function.
        pop: Population to evolve (updated in place by ``eaSimple``).
        ngen: Maximum number of generations.
        cxpb: Crossover probability passed to ``eaSimple``.
        mutpb: Mutation probability passed to ``eaSimple``.
        conv_steps: Stop after this many consecutive generations without a
            sufficient improvement.
        conv_crit: Minimum gain over the best tracked fitness that counts as
            an improvement.
        scsteps: Forwarded to ``spread_control``.
        scrate: Forwarded to ``spread_control``.
        scfactor: Forwarded to ``spread_control``.

    Returns:
        The best individual recorded in the hall of fame.
    """
    warmup_generations = 10
    hof = tools.HallOfFame(1)
    best_fitness = None
    stagnation_count = 0

    for gen in range(ngen):
        # Set sigma only at phase boundaries so later spread-control
        # adjustments persist. `register` replaces an existing alias, so no
        # prior `unregister` is needed.
        if gen == 0:
            toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.2, indpb=0.2)
        elif gen == warmup_generations:
            toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.1, indpb=0.2)

        # Advance exactly one generation; the hall of fame accumulates across calls.
        algorithms.eaSimple(pop, toolbox, cxpb, mutpb, 1, stats=None, halloffame=hof, verbose=False)
        current_fitness = hof[0].fitness.values[0]

        # Print signal significance for this generation
        print(f"Generation {gen+1}: Significance = {current_fitness:.4f}, Stagnation = {stagnation_count}")

        if best_fitness is None or current_fitness > best_fitness + conv_crit:
            best_fitness = current_fitness
            stagnation_count = 0
        else:
            stagnation_count += 1

        if stagnation_count >= conv_steps:
            print("Stopping early due to convergence.")
            break

        # During warm-up the fixed sigma of 0.2 must stay in force, so spread
        # control only starts once the base sigma has been installed.
        if gen >= warmup_generations:
            spread_control(toolbox, pop, best_fitness, scsteps, scrate, scfactor)

    return hof[0]

# Run GA training for BDT and XGB
# Each call returns the best individual found (hall-of-fame entry), i.e. a list
# of per-model cutoffs.
best_w_bdt = train_ga(toolbox_bdt, pop_bdt)
best_w_xgb = train_ga(toolbox_xgb, pop_xgb)

# Convert the best individuals to NumPy arrays. Despite the `w_` prefix these
# are score cutoffs (12 values each), not weights.
w_bdt = np.array(best_w_bdt)
w_xgb = np.array(best_w_xgb)

## Test GA Model on Test Set

In [None]:

# Apply the optimized cutoffs (one per model) to the test set: an event is
# selected (1) only when every one of its scores is at or above its cutoff.
y_pred_bdt_final = (X_nn_bdt_test >= w_bdt[np.newaxis, :]).all(axis=1).astype(int)
y_pred_xgb_final = (X_nn_xgb_test >= w_xgb[np.newaxis, :]).all(axis=1).astype(int)

# Significance reached on the test set with those cutoffs
significance_bdt = compute_significance(
    thresholds=w_bdt,
    predictions=X_nn_bdt_test,
    true_labels=y_nn_test,
    background_types=background_types_test,
)
significance_xgb = compute_significance(
    thresholds=w_xgb,
    predictions=X_nn_xgb_test,
    true_labels=y_nn_test,
    background_types=background_types_test,
)

# Report cutoffs first, then the resulting significances
print(f"Optimized Cutoffs for BDT: {w_bdt}")
print(f"Optimized Cutoffs for XGB: {w_xgb}")
print(f"Test Significance (BDT): {significance_bdt:.4f}")
print(f"Test Significance (XGB): {significance_xgb:.4f}")

In [None]:
# Apply optimized cutoffs to test set
final_preds_bdt = np.all((X_nn_bdt_test >= w_bdt[None, :]), axis=1).astype(int)
final_preds_xgb = np.all((X_nn_xgb_test >= w_xgb[None, :]), axis=1).astype(int)

# Count signals
signal_count_bdt = np.sum((final_preds_bdt == 1) & (y_nn_test == 1))
signal_count_xgb = np.sum((final_preds_xgb == 1) & (y_nn_test == 1))


def _count_passing_backgrounds(final_preds, labels, background_types, known_types):
    """Count, per background type, the background events classified as signal.

    Args:
        final_preds: Per-event selection flags (1 = passed all cutoffs).
        labels: Per-event labels, positionally aligned with ``final_preds``
            (0 = background).
        background_types: Type of each background event, ordered like the
            label-0 entries of ``labels``.
        known_types: Iterable of type names to report; every one of them gets
            an entry (0 when nothing passed). Other types are ignored.

    Returns:
        Dict mapping each known type to its number of passing background events.
    """
    final_preds = np.asarray(final_preds)
    labels = np.asarray(labels)

    # Running index (among backgrounds only) of each background that passed.
    passed_positions = np.flatnonzero(final_preds[labels == 0] == 1)
    passed_types = np.asarray(background_types)[passed_positions]

    counts = {bg: 0 for bg in known_types}
    type_names, type_counts = np.unique(passed_types, return_counts=True)
    for bg_type, n_passed in zip(type_names.tolist(), type_counts.tolist()):
        if bg_type in counts:
            counts[bg_type] += n_passed
    return counts


# Count backgrounds that passed, per type, for each model family
background_counts_bdt = _count_passing_backgrounds(
    final_preds_bdt, y_nn_test, background_types_test, background_weights
)
background_counts_xgb = _count_passing_backgrounds(
    final_preds_xgb, y_nn_test, background_types_test, background_weights
)

# Print final counts
print(f"Signals Passing Cutoff (BDT): {signal_count_bdt}")
print(f"Background Counts Passing Cutoff (BDT): {background_counts_bdt}")
print(f"Signals Passing Cutoff (XGB): {signal_count_xgb}")
print(f"Background Counts Passing Cutoff (XGB): {background_counts_xgb}")