In [None]:
import uproot
import matplotlib.pyplot as plt
import numpy as np
from itertools import product

In [None]:
import hepkit
from hepkit import Var

In [None]:
from b2bH_vlq import get_variable_group_names, get_variables_by_group
# Display the variable group names returned by b2bH_vlq (cell output).
get_variable_group_names()

In [None]:
# Merge the variables of every group into a single lookup table.
# dict.update is used, so a key that appears in a later group replaces
# the entry collected from an earlier one.
all_vars = {}
for group_name in get_variable_group_names():
    all_vars.update(get_variables_by_group(group_name))
# Last expression of the cell: show the collected keys.
all_vars.keys()

In [None]:
# Disabled experiment: each line below would assign np.log to the
# `expression` attribute of a pT-like variable (and HT).
# NOTE(review): presumably this makes hepkit apply a log transform to those
# inputs — confirm against hepkit's Var before re-enabling.
#all_vars["photon1_pt"].expression = np.log
#all_vars["photon2_pt"].expression = np.log
#all_vars["diphoton_pt"].expression = np.log
#all_vars["bjet_pt"].expression = np.log
#all_vars["forward_jet_pt"].expression = np.log
#all_vars["HT"].expression = np.log
#all_vars["VLQ_pt"].expression = np.log

In [None]:
from hepkit.histograms import hist1d_from_var, plot_hist1d_comparison, multi_hist1d_comparison
from hepkit.plotting import set_cms_style, create_multiplot_layout
# Apply hepkit's CMS plotting style with the grid option enabled.
# NOTE(review): presumably this changes global matplotlib settings for all
# later figures — confirm in hepkit.plotting.
set_cms_style(grid=True)

### Load signal and background files

In [None]:
# Open the "BbH_tree" object of the signal and background ROOT files
# (uproot's "file.root:object" path syntax).
# NOTE(review): the files are not closed explicitly in the cells shown.
t_sig = uproot.open("../data/BDT_tree_M1000_14TeV.root:BbH_tree")
t_bkg = uproot.open("../data/BKG_tree_tHq_14TeV.root:BbH_tree")

In [None]:
# Read the trees into pandas DataFrames (no branch filter is passed,
# so every branch is requested).
df_sig = t_sig.arrays(library="pd")
df_bkg = t_bkg.arrays(library="pd")

- Convert the multiplicity columns, originally stored as floats, to integers
- Create unique ids based on the event number (`NEvts` column)

In [None]:
# Convert multiplicity columns to integers
df_sig = df_sig.astype({
    'n_bjet': 'int32',
    'n_jets': 'int32',
    'n_fjet': 'int32',
},)
df_bkg = df_bkg.astype({
    'n_bjet': 'int32',
    'n_jets': 'int32',
    'n_fjet': 'int32',
},)

In [None]:
# Shift the event numbers by a per-sample offset (signal +1e6,
# background +2e6) so they can serve as ids across both samples.
# NOTE(review): NEvts is overwritten in place, so running this cell a second
# time without reloading the frames adds the offset again.
# NOTE(review): the ids only stay distinct between samples if the raw event
# numbers are below 1e6 — TODO confirm.
df_sig["NEvts"] = df_sig["NEvts"] + int(1e6)
df_bkg["NEvts"] = df_bkg["NEvts"] + int(2e6)

In [None]:
# Display the column labels of the background frame (cell output).
df_bkg.columns

## BDT training

We'll use a CatBoost classifier for signal/background classification.

Select as input features the kinematic variables $p_{T}$ and $\eta$ of the VLQ candidate and its decay products.
Also include $H_{T}$ and the $\Delta R$ between the b-jet and the Higgs candidate.

In [None]:
# Select the BDT input features from `all_vars`: one "<candidate>_<observable>"
# entry per (candidate, observable) pair, plus HT and deltaR_bjet_Higgs.
cands = ["photon1", "photon2", "diphoton", "bjet", "VLQ"]
obs = ["pt", "eta"]

mva_vars = {}
# The loop targets deliberately differ from the list names: reusing `obs` as
# the loop variable would leave `obs` bound to the string "eta" after the
# cell, silently replacing the list for any later cell that reads it.
for cand_name, obs_name in product(cands, obs):
    key = f"{cand_name}_{obs_name}"
    mva_vars[key] = all_vars[key]
mva_vars["HT"] = all_vars["HT"]
mva_vars["deltaR_bjet_Higgs"] = all_vars["deltaR_bjet_Higgs"]
# Last expression of the cell: show the selected feature names.
mva_vars.keys()

In [None]:
# Fill one 1D histogram per input feature for each sample, keyed by the
# feature name.
mva_names = list(mva_vars)
sig_mva_hist, bkg_mva_hist = {}, {}
for name, var in mva_vars.items():
    sig_mva_hist[name] = hist1d_from_var(var, df_sig)
    bkg_mva_hist[name] = hist1d_from_var(var, df_bkg)

In [None]:
from sklearn.model_selection import train_test_split
from hepkit.classification.preprocessing import prepare_training_data, split_train_test_by_unique_id

In [None]:
from hepkit.classification.visualization import (
    plot_signal_background_comparison,
    plot_train_test_response,
    plot_signal_efficiency_vs_background_rejection,
    plot_roc_auc,
    plot_shap_summary
)

In [None]:
# Compare the signal and background histograms of the input features,
# with subplot titles switched off.
# NOTE(review): presumably one subplot per feature — confirm in hepkit.
fig, axes = plot_signal_background_comparison(
    sig_mva_hist, bkg_mva_hist, subplot_titles=False
)
plt.tight_layout()

In [None]:
# Build the training table (Xy) and the matching ids from the signal and
# background frames, using NEvts as the id column.
# NOTE(review): the same variable collection is passed twice — presumably
# once for signal and once for background; confirm against
# prepare_training_data's signature.
Xy, ids = prepare_training_data(
    df_sig, df_bkg, mva_vars.values(), mva_vars.values(), id_columns=["NEvts"]
)

In [None]:
# Hold out 20% of the data as a test set, splitting on the NEvts id.
# NOTE(review): no seed is passed here — confirm whether the helper splits
# deterministically.
# NOTE(review): test_set is not used by any later cell shown here.
train_set, test_set = split_train_test_by_unique_id(
    Xy,
    ids,
    test_ratio=0.2,
    id_columns=["NEvts"],
)

In [None]:
# Define the training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    train_set.drop("label", axis=1), train_set["label"], test_size=0.2, random_state=42
)

Train the model

In [None]:
from hepkit.classification.catboost_utils import (
    create_catboost_pools, 
    cross_val_catboost_model,
    get_catboost_shap_values,
    plot_catboost_learning_curve,
    train_catboost_model,
)

In [None]:
# CatBoost hyper-parameters shared by the cross-validation and the final fit.
# NOTE(review): od_wait is set without od_type; per the CatBoost docs its
# effect depends on the overfitting-detector type — confirm that early
# stopping is actually active with these settings.
cb_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    custom_metric=["Accuracy", "Precision"],  # "Recall", "F1" left out
    iterations=2500,
    random_state=42,
    learning_rate=0.05,
    depth=2,
    # rsm=0.5,
    # reg_lambda=1,
    od_wait=100,
)

In [None]:
# 5-fold cross-validation of the CatBoost configuration on the training
# sample, with plotting enabled and stratification disabled.
cv_data = cross_val_catboost_model(
    cb_params,
    X=train_X,
    y=train_y,
    nfolds=5,
    plot=True,
    stratified=False,
)

In [None]:
# Find the minimum of the mean test Logloss over the CV folds and the
# position at which it occurs.
# NOTE(review): best_iter is only printed; it is not passed to the training
# call in the cells shown.
mean_test_logloss = cv_data["test-Logloss-mean"]
best_iter = mean_test_logloss.values.argmin()
best_value = mean_test_logloss.min()
print(best_iter, best_value)

In [None]:
# Fit the final model with cb_params on the training sample; the validation
# sample is passed alongside it and plotting is enabled.
# NOTE(review): presumably val_X/val_y serve as CatBoost's eval set — confirm
# in hepkit.classification.catboost_utils.
model = train_catboost_model(
    cb_params, train_X, train_y, val_X, val_y, plot=True
)

In [None]:
# Plot the learning curve of the fitted model.
plot_catboost_learning_curve(model)

In [None]:
# Predicted labels (predict) and class scores (predict_proba) for the
# training and validation samples. Later cells use column 1 of the
# predict_proba output as the classifier score.
train_predict = model.predict(train_X)
train_proba = model.predict_proba(train_X) # per-class scores, training sample
val_predict = model.predict(val_X)
val_proba = model.predict_proba(val_X) # per-class scores, validation sample

Check for overfitting

In [None]:
# Compare the model response on the training and validation samples,
# with a logarithmic y axis (log_y=True).
plot_train_test_response(
    model,
    train_X, train_y, val_X, val_y, log_y=True
)

In [None]:
# Signal efficiency vs. background rejection, using column 1 of
# predict_proba as the score.
# The training-sample curve alone is optimistically biased, so the same curve
# is also drawn for the held-out validation sample for comparison.
plot_signal_efficiency_vs_background_rejection(train_y, train_proba[:, 1])
plot_signal_efficiency_vs_background_rejection(val_y, val_proba[:, 1])

In [None]:
# ROC curve / AUC, using column 1 of predict_proba as the score.
# An AUC measured only on the training sample cannot reveal overfitting, so
# the held-out validation sample is evaluated as well.
plot_roc_auc(train_y, train_proba[:, 1])
plot_roc_auc(val_y, val_proba[:, 1])

In [None]:
# Build the CatBoost pools and compute SHAP values on the training pool.
# NOTE(review): only the training data is passed to create_catboost_pools and
# val_pool is not used in the cells shown — confirm the helper's signature
# and whether val_X/val_y should be passed too.
train_pool, val_pool = create_catboost_pools(train_X, train_y)
shap_values = get_catboost_shap_values(model, train_pool)
# Last expression of the cell: show the shape of the SHAP array.
shap_values.shape

In [None]:
# Summary plot of the SHAP values against the training features.
plot_shap_summary(shap_values, train_X)