In [1]:
import os
from dotenv import load_dotenv
from mlflow import MlflowClient

# Load environment variables from the .env file so that the MLflow settings
# read below (and by MLflow itself) are available via os.environ.
load_dotenv()

# Verify the environment variables are loaded. The tracking URI is printed
# verbatim; the token is only reported as set / not set (masked with 20
# asterisks) so the secret never appears in the notebook output.
print(f"MLFLOW_TRACKING_URI: {os.getenv('MLFLOW_TRACKING_URI')}")
print(f"MLFLOW_TRACKING_TOKEN: {'*' * 20 if os.getenv('MLFLOW_TRACKING_TOKEN') else 'Not set'}")

MLFLOW_TRACKING_URI: https://gitlab.cern.ch/api/v4/projects/222677/ml/mlflow/
MLFLOW_TRACKING_TOKEN: ********************


In [2]:
# Low-level MLflow client created with default arguments.
# NOTE(review): presumably it picks up the tracking URI/token from the
# environment loaded above - confirm. In the visible part of the notebook it
# is only referenced by the commented-out model-registry cell that follows.
client = MlflowClient()

In [3]:
# model_name = "b2bH-vlq-xgboost-model"
# description = "XGBoost model for b2bH to vlq analysis"
# model = client.create_registered_model(model_name, description=description)
# exp = client.get_experiment_by_name(f"[model]{model_name}")
# run = client.create_run(experiment_id=exp.experiment_id)

In [4]:
import mlflow

# Name of the MLflow experiment that collects all runs of this notebook.
EXPERIMENT_NAME = "xgboost-tHq-bkg"

try:
    experiment_id = mlflow.create_experiment(name=EXPERIMENT_NAME)
except mlflow.exceptions.RestException:
    # The usual cause is that the experiment already exists: look it up by name.
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        # Creation failed for some other reason (authentication, network,
        # server error, ...). Re-raise the original REST error instead of
        # crashing below with an unrelated AttributeError on None.
        raise
    experiment_id = experiment.experiment_id

# Make this experiment the active one; as the last expression of the cell the
# returned Experiment object is also displayed.
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='not_implemented', creation_time=None, experiment_id='4', last_update_time=None, lifecycle_stage='active', name='xgboost-tHq-bkg', tags={}>

In [5]:
import uproot

# Open the signal and background ROOT files and fetch the object named
# "BbH_tree" from each (uproot "<file path>:<object name>" form).
# Paths are relative to the notebook's working directory.
t_sig = uproot.open("../data/BDT_tree_M800_14TeV.root:BbH_tree")
t_bkg = uproot.open("../data/BKG_tree_tHq_14TeV.root:BbH_tree")

In [6]:
# Read every branch of both trees into pandas DataFrames (signal first, then background).
df_sig, df_bkg = (tree.arrays(library="pd") for tree in (t_sig, t_bkg))

In [7]:
# Keep a handle on the "evt_weight" column of each sample; these are later
# handed to prepare_training_data as the signal / background weights.
w_sig, w_bkg = df_sig["evt_weight"], df_bkg["evt_weight"]

In [8]:
# Multiplicity (counting) columns are cast to 32-bit integers.
# The mapping is defined once and applied to both samples so that the signal
# and background frames cannot drift apart.
multiplicity_dtypes = {column: 'int32' for column in ('n_bjet', 'n_jets', 'n_fjet')}
df_sig = df_sig.astype(multiplicity_dtypes)
df_bkg = df_bkg.astype(multiplicity_dtypes)

Select as input features the kinematic variables $p_{T}$ and $\eta$ of the VLQ candidate and its decay products.
Also include $H_{T}$ and the $\Delta R$ between the b-jet and Higgs candidates.

In [9]:
import matplotlib.pyplot as plt
from itertools import product
from sklearn.model_selection import train_test_split

from b2bH_vlq import get_variable_group_names, get_variables_by_group
from hepkit.classification.visualization import (
    plot_signal_background_comparison,
    plot_train_test_response,
    plot_signal_efficiency_vs_background_rejection,
    plot_roc_auc,
    # plot_shap_summary
)
from hepkit.classification.preprocessing import prepare_training_data, split_train_test_by_unique_id
from hepkit.histograms import hist1d_from_var


In [10]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [11]:
# Merge the variables of every group into one flat lookup table.
# Groups are processed in the order returned by get_variable_group_names();
# on a key collision the later group overwrites the earlier one.
all_vars = {}
for group_name in get_variable_group_names():
    group_vars = get_variables_by_group(group_name)
    all_vars.update(group_vars)
all_vars.keys();

In [12]:
# Objects whose kinematics enter the classifier, and the observables taken from each.
cands = ["photon1", "photon2", "diphoton", "bjet", "VLQ"]
obs = ["pt", "eta"]

# One "<candidate>_<observable>" entry per combination, in product() order.
# The loop target is deliberately NOT called `obs`: using the list's own name
# as the loop variable rebinds it to the last element ("eta"), so running the
# cell a second time would iterate over the characters of that string and fail
# with a KeyError.
mva_vars = {}
for cand, observable in product(cands, obs):
    key = f"{cand}_{observable}"
    mva_vars[key] = all_vars[key]

# Event-level inputs added on top of the per-object kinematics.
mva_vars["HT"] = all_vars["HT"]
mva_vars["deltaR_bjet_Higgs"] = all_vars["deltaR_bjet_Higgs"]
mva_vars.keys();

In [13]:
# Build one histogram per input variable for each sample, keyed by variable name.
mva_names = list(mva_vars)
sig_mva_hist, bkg_mva_hist = {}, {}
for name, var in mva_vars.items():
    sig_mva_hist[name] = hist1d_from_var(var, df_sig)
    bkg_mva_hist[name] = hist1d_from_var(var, df_bkg)

In [14]:
# Assemble the training table from the signal and background frames.
# The same variable definitions (mva_vars.values()) are passed twice.
# NOTE(review): presumably the first is the signal variable list and the
# second the background one - confirm against prepare_training_data.
# The "evt_weight" columns extracted earlier are supplied as per-sample
# weights, and "NEvts" is given as the id column; the returned `ids` is
# handed to split_train_test_by_unique_id in the next cell.
Xy, ids = prepare_training_data(
    df_sig,
    df_bkg,
    mva_vars.values(),
    mva_vars.values(),
    sig_weights=w_sig,
    bkg_weights=w_bkg,
    id_columns=["NEvts"]
)

In [15]:
# Hold out 20% of the data as a test set, split on the "NEvts" id column.
train_set, test_set = split_train_test_by_unique_id(
    Xy, ids, test_ratio=0.2, id_columns=["NEvts"]
)

# Carve a validation set (20%, fixed seed) out of the remaining training data.
train_features = train_set.drop(columns="label")
train_labels = train_set["label"]
train_X, val_X, train_y, val_y = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# The "weights" column must not be seen by the classifier as a feature:
# remove it from both feature frames and keep it separately.
train_weights = train_X.pop("weights")
val_weights = val_X.pop("weights")

In [16]:
import xgboost as xgb

In [17]:
# Disable autologging due to GitLab MLflow compatibility issues
# mlflow.xgboost.autolog()

In [18]:
def _log_current_figure(filename):
    """Save the current matplotlib figure to ``filename``, log it to the
    active MLflow run as an artifact, and close the figure."""
    plt.tight_layout()
    plt.savefig(filename)
    mlflow.log_artifact(filename)
    plt.close()


with mlflow.start_run(run_name="run9"):
    # --- Input variable distributions -------------------------------------
    fig, axes = plot_signal_background_comparison(
        sig_mva_hist, bkg_mva_hist, subplot_titles=False
    )
    _log_current_figure("mva_variable_distributions.png")

    # --- Training ---------------------------------------------------------
    params = {
        "n_estimators": 2000,
        "max_depth": 2,
        "learning_rate": 0.05,
        "random_state": 42,
        "early_stopping_rounds": 100,
        "eval_metric": ["auc", "logloss"],
    }

    mlflow.log_params(params)

    model_xgb = xgb.XGBClassifier(**params)

    # NOTE(review): train_weights / val_weights are popped from the feature
    # frames in an earlier cell but are not passed to fit() or to the metrics
    # below, so training and evaluation are unweighted. Confirm this is intended.
    # eval_set order matters: "validation_0" is the training set and
    # "validation_1" the validation set in evals_result().
    model_xgb.fit(train_X, train_y, eval_set=[(train_X, train_y), (val_X, val_y)], verbose=False)

    # Per-iteration training history (autologging is disabled, so log it by hand).
    evals_result = model_xgb.evals_result()
    train_history = evals_result["validation_0"]
    val_history = evals_result["validation_1"]
    for epoch, (train_logloss, val_logloss, train_auc, val_auc) in enumerate(
        zip(
            train_history["logloss"],
            val_history["logloss"],
            train_history["auc"],
            val_history["auc"],
        )
    ):
        mlflow.log_metrics(
            {
                "train_logloss": train_logloss,
                "val_logloss": val_logloss,
                "train_auc": train_auc,
                "val_auc": val_auc,
            },
            step=epoch,
        )

    # --- Evaluation -------------------------------------------------------
    train_predict_xgb = model_xgb.predict(train_X)
    train_proba_xgb = model_xgb.predict_proba(train_X)  # these are the scores
    val_predict_xgb = model_xgb.predict(val_X)
    val_proba_xgb = model_xgb.predict_proba(val_X)  # these are the scores

    final_metrics = {
        "accuracy": accuracy_score(val_y, val_predict_xgb),
        "roc_auc": roc_auc_score(val_y, val_proba_xgb[:, 1]),
    }

    mlflow.log_metrics(final_metrics)

    plot_train_test_response(
        model_xgb,
        train_X, train_y, val_X, val_y, log_y=False
    )
    _log_current_figure("train_test_response.png")

    # Performance curve on the held-out validation sample, consistent with the
    # "roc_auc" metric logged above. Scores of the training sample would give
    # an optimistically biased curve.
    plot_signal_efficiency_vs_background_rejection(val_y, val_proba_xgb[:, 1])
    _log_current_figure("signal_efficiency_vs_background_rejection.png")

    # Save model locally as pickle file and log as artifact
    import joblib
    model_filename = "xgboost_model.pkl"
    joblib.dump(model_xgb, model_filename)
    mlflow.log_artifact(model_filename)

    

🏃 View run run9 at: https://gitlab.cern.ch/api/v4/projects/222677/ml/mlflow/#/experiments/4/runs/49233ce4-5148-4137-825e-2c1f31f48b7c
🧪 View experiment at: https://gitlab.cern.ch/api/v4/projects/222677/ml/mlflow/#/experiments/4
