In [None]:
!pip install --user --upgrade comet_ml

In [None]:
from comet_ml import Experiment

import os
from functools import partial

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from xgboost import XGBClassifier

In [None]:
from helper.plotting import plot_roc, plot_score_vs_pt, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd

In [None]:
# Notebook-wide display settings: larger plot fonts and wide DataFrame previews.
plt.rcParams.update({'font.size': 16})
pd.set_option('display.max_columns', 200)

# Load data from CSV

In [None]:
# Maximum number of rows read from each per-flavour CSV file.
nrows_b, nrows_udsg = 200_000, 200_000

In [None]:
# Read at most nrows_b b-jet rows, tag them with their flavour, then pass the
# frame through the project's float64 -> float32 conversion helper.
df_b = convert_float64_to_float32(
    pd.read_csv('datasets/iter2/bjets_10-150GeV_base.csv', nrows=nrows_b).assign(flavour='b')
)
# df_b.describe()
# df_b['Jet_Pt'].describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])

In [None]:
# Read at most nrows_udsg light-flavour (udsg) rows, tag them with their flavour,
# then pass the frame through the project's float64 -> float32 conversion helper.
df_udsg = convert_float64_to_float32(
    pd.read_csv('datasets/iter2/udsgjets_10-150GeV_base.csv', nrows=nrows_udsg).assign(flavour='udsg')
)
# df_udsg.describe()

In [None]:
# Remember the per-flavour sample sizes, stack both samples into one frame and
# release the per-flavour frames to free memory.
n_b_jets, n_udsg_jets = len(df_b), len(df_udsg)
df = pd.concat([df_b, df_udsg])
del df_b, df_udsg

__\>\> Select columns HERE (before logging data info) <<__

In [None]:
# df = df[[col for col in df.columns if 'Jet_SecVtx_' not in col]]
# Number of track / secondary-vertex slots kept per jet.
n_tracks, n_sv = 10, 3


def _constituent_index(col):
    """Return the integer stored in the third '_'-separated field of a column name."""
    return int(col.split('_')[2])


def filter_tracks(col):
    """Accept 'Jet_Track' columns whose slot index is below n_tracks."""
    return 'Jet_Track' in col and _constituent_index(col) < n_tracks


def filter_sv(col):
    """Accept 'Jet_SecVtx' columns whose slot index is below n_sv."""
    return 'Jet_SecVtx' in col and _constituent_index(col) < n_sv


def filter_jet(col):
    """Accept every column that is neither a track nor a secondary-vertex column."""
    return 'Jet_Track' not in col and 'Jet_SecVtx' not in col


def filter_cols(col):
    """Accept a column if any of the three filters above accepts it."""
    return filter_tracks(col) or filter_sv(col) or filter_jet(col)
# Keep only the columns accepted by filter_cols (original column order preserved).
df = df[[col for col in df.columns if filter_cols(col)]]

In [None]:
# def subtract_phi(phi1, phi2):
#     diff = phi1-phi2
#     if abs(diff) <= np.pi: return diff
#     elif diff > np.pi: return diff - 2*np.pi
#     elif diff < -np.pi: return diff + 2*np.pi

# def subtract_eta(eta1, eta2):
#     diff = eta1-eta2
#     return diff
    
# # for col in df.columns:
# #     print(col)
# #     if '_Phi__' in col:
# #         df[col.replace('_Phi_', '_DeltaPhi_')] = df[[col, 'Jet_Phi']].apply(lambda row: subtract_phi(row[col], row['Jet_Phi']), axis=1)
# #         df = df.drop(col, axis=1)
# #     if  '_Eta__' in col:
# #         df[col.replace('_Eta_', '_DeltaEta_')] = df[[col, 'Jet_Eta']].apply(lambda row: subtract_eta(row[col], row['Jet_Eta']), axis=1)
# #         df = df.drop(col, axis=1)
# #     if '_Pt__' in col:
# #         df[col.replace('_Pt_', '_PtFrac_')] = df[[col, 'Jet_Pt']].apply(lambda row: row[col] / row['Jet_Pt'], axis=1)
# #         df = df.drop(col, axis=1)     
        
# # for i_part in range(n_tracks):
# #     print(i_part)
# #     df[f'Jet_Track_{i_part}_DeltaR__sortby__IPdNsigma__desc']   = df[[f'Jet_Track_{i_part}_DeltaPhi__sortby__IPdNsigma__desc', f'Jet_Track_{i_part}_DeltaEta__sortby__IPdNsigma__desc']].apply(lambda row: np.sqrt(row[f'Jet_Track_{i_part}_DeltaPhi__sortby__IPdNsigma__desc']**2 + row[f'Jet_Track_{i_part}_DeltaEta__sortby__IPdNsigma__desc']**2), axis=1 )
    

# # for col in df.columns:
# #     if 'IPd__' in col or 'IPz__' in col:
# #         df = df.drop(col, axis=1)
    
# for i_part in range(n_tracks):
#     print(i_part)
#     df[f'Jet_Track_{i_part}_IPdNsigma__sortby__IPdNsigma__desc']   = df.apply(lambda row: row[f'Jet_Track_{i_part}_IPd__sortby__IPdNsigma__desc']/row[f'Jet_Track_{i_part}_CovIPd__sortby__IPdNsigma__desc'], axis=1 )
#     df[f'Jet_Track_{i_part}_IPzNsigma__sortby__IPdNsigma__desc']   = df.apply(lambda row: row[f'Jet_Track_{i_part}_IPz__sortby__IPdNsigma__desc']/row[f'Jet_Track_{i_part}_CovIPz__sortby__IPdNsigma__desc'], axis=1 )
#     df[f'Jet_Track_{i_part}_IP3dNsigma__sortby__IPdNsigma__desc']   = df.apply(lambda row: np.sqrt(row[f'Jet_Track_{i_part}_IPdNsigma__sortby__IPdNsigma__desc']**2 + row[f'Jet_Track_{i_part}_IPzNsigma__sortby__IPdNsigma__desc']**2), axis=1 )
    
    
    
# feats_descr = 'add Nsigma of IPd/IPz/IP3d '

In [None]:
# Display the column labels currently held in df.
df.columns

In [None]:
# Print a compact summary of df (verbose=False) with deep memory-usage accounting.
df.info(verbose=False, memory_usage='deep')

# Prepare dataset (DataFrame -> X & y)

## Split data using `stratify` with `flavour` and `Jet_Pt`

In [None]:
# Jet-pT bin edges. The outermost edges sit 1e-6 outside the observed range so
# that every jet lies between the first and the last edge.
ptbins = np.array([df['Jet_Pt'].min()-1e-6] + [20, 30, 40, 50, 60, 70, 80, 90, 100] + [df['Jet_Pt'].max()+1e-6])

# pT-bin label of each jet = number of edges at or below its pT, as a string.
# A single broadcast comparison replaces a Python-level call per row.
n_edges_below = (df['Jet_Pt'].to_numpy()[:, None] >= ptbins).sum(axis=1)
pt_bin_arr = pd.Series(n_edges_below, index=df.index, name='Jet_Pt').astype(str)

# Stratification key: flavour name immediately followed by the pT-bin label.
flavour_ptbin = df['flavour'] + pt_bin_arr

In [None]:
# Binary target: b jets are the positive class (1), udsg jets the negative class (0).
y = df['flavour'].map({'b':1, 'udsg':0})

# Features are every column except the label. 'ptbin' is dropped only when it is
# present: the pT-bin labels are kept in the separate flavour_ptbin Series, so
# insisting on the column would raise KeyError.
X = df.drop(columns=['flavour', 'ptbin'], errors='ignore')

# 80/20 split stratified on the combined flavour + pT-bin key; the key itself is
# split alongside so it stays aligned with the train/test rows.
X_train, X_test, y_train, y_test, flavour_ptbin_train, flavour_ptbin_test = train_test_split(
    X, y, flavour_ptbin, test_size=0.2, stratify=flavour_ptbin, random_state=122
)

## Validate stratification

In [None]:
def plot_pt_spectra(pt_train, pt_test, bins=np.linspace(10,150,29), color='k', label='', ax=None):
    """Overlay the train and test jet-pT histograms on a log-scaled y axis.

    Parameters
    ----------
    pt_train, pt_test : array-like
        Jet pT values of the training and the test sample.
    bins : array-like
        Histogram bin edges; defaults to 29 evenly spaced edges from 10 to 150.
    color : str
        Colour shared by both step histograms.
    label : str
        Suffix appended to the 'train ' / 'test ' legend entries.
    ax : Axes-like, optional
        Axes to draw on. When omitted, a new 10x7 figure is created.

    Returns
    -------
    The axes object that was drawn on.
    """
    # Test against None explicitly: relying on truthiness would discard any
    # caller-supplied axes object that happens to evaluate as falsy.
    if ax is None:
        _, ax = plt.subplots(figsize=(10,7))

    # Settings shared by both histograms; density=False keeps raw counts.
    hist_style = dict(bins=bins, histtype='step', lw=2, density=False, color=color)
    ax.hist(pt_test, label='test '+label, linestyle='-', **hist_style)
    ax.hist(pt_train, label='train '+label, linestyle='--', **hist_style)

    ax.semilogy()
    ax.legend()
    ax.grid(linestyle=':')
    ax.set_xlabel('jet $p_T^{reco}$ [GeV/c]')
    ax.set_ylabel('counts')
    return ax
    
# Overlay train/test pT spectra for each class on one set of axes:
# b jets (label 1) in red first, then udsg jets (label 0) in blue.
ax = None
for class_id, class_label, class_color in ((1, 'b', 'r'), (0, 'udsg', 'b')):
    ax = plot_pt_spectra(
        X_train['Jet_Pt'][y_train==class_id],
        X_test['Jet_Pt'][y_test==class_id],
        label=class_label, color=class_color, ax=ax,
    )
# plt.savefig('pt_spect.png')

## Scale X

In [None]:
# Learn the scaling from the training split only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Create experiment and log data info

In [None]:
# Close the experiment left over from a previous run of this cell, if any, so
# that re-running the cell does not leave an open experiment behind.
try:
    exp.end()
except NameError:
    # First run in this kernel: no experiment exists yet.
    pass
except Exception as err:
    # Closing is best-effort, but the failure is reported instead of hidden.
    print(f'Could not end the previous experiment: {err!r}')

exp = Experiment(
                 auto_output_logging='simple',
                 log_env_gpu=False, log_env_cpu=False,
                 project_name="default-setting-adjusting", workspace="phd")

In [None]:
exp.add_tags(['b-vs-light'])

# Human-readable dataset summary: jet counts per flavour and in-memory size of df.
df_size_mb = df.memory_usage(deep=True).sum()/1024/1024
n_jets_str = f'there is:\n\t{n_b_jets} b jets\n\t{n_udsg_jets} udsg jets'
dataset_info = n_jets_str + f'\ndataframe size = {df_size_mb} MB'
print(dataset_info)
exp.log_dataset_info(dataset_info)
exp.log_dataset_hash(df)

# Sample sizes and feature count, logged as free-form experiment attributes.
for other_key, other_value in (
    ('n_jets_b', n_b_jets),
    ('n_jets_light', n_udsg_jets),
    ('n_columns', X.shape[1]),
):
    exp.log_other(other_key, other_value)

# Column-selection settings, logged as experiment parameters.
for param_key, param_value in (('n_tracks', n_tracks), ('n_sv', n_sv)):
    exp.log_parameter(param_key, param_value)

# Train model

In [None]:
# Counts completed evaluation rounds; advanced by xgb_callback after each
# evaluation of the test set.
training_iter = 0

def xgb_callback(y_pred, dtrain, mistag_rates=[0.1, 0.01, 0.001], make_plots=False):
    """Custom evaluation function handed to XGBClassifier.fit through eval_metric.

    Parameters
    ----------
    y_pred : array-like
        Model scores for the rows of `dtrain`.
    dtrain : object exposing get_label()
        Evaluation set; its labels are the ground truth.
    mistag_rates : list of float
        Mistag rates at which the b efficiency (signal_eff) is evaluated.
    make_plots : bool
        When true, also draw and log tagging-efficiency plots on selected rounds.

    Returns
    -------
    list of (name, value) tuples: one 'bEff@mistag_<rate>' entry per mistag
    rate followed by 'ROC_AUC'.

    Raises
    ------
    ValueError
        If a metric name contains a space or a colon.

    Side effects (only when make_plots is true)
    -------------------------------------------
    On rounds 0, 1, 3 and every 20th round, the train curve is drawn on a new
    plot and the test curve is added on the current axes, after which the
    figure is logged to the experiment. The global `training_iter` is
    incremented each time the evaluated set is recognised as the test set.
    """
    global training_iter
    y_true = dtrain.get_label()

    metrics = [
        (f'bEff@mistag_{rate:.0e}', signal_eff(y_true, y_pred, rate))
        for rate in mistag_rates
    ]
    metrics.append(('ROC_AUC', roc_auc_score(y_true, y_pred)))
    for met_name, _ in metrics:
        if ' ' in met_name or ':' in met_name:
            raise ValueError('Metric names cannot contain space nor colon(:)')

    if not make_plots:
        return metrics

    # The evaluated set is taken to be the test set when its labels match the
    # global y_test in both length and content.
    is_testset = len(y_true) == len(y_test) and all(y_true == y_test)

    plot_this_round = training_iter % 20 == 0 or training_iter in (0, 1, 3)
    if plot_this_round:
        if is_testset:
            # Add the test curve to the axes the train curve was drawn on,
            # then log the combined figure.
            ax = plot_tagging_eff(y_true, y_pred, label='test', color='r', ax=plt.gca())
            ax.set_ylim(1e-4, 2)
            exp.log_figure(f'plot_iter{training_iter:04}')
        else:
            ax = plot_tagging_eff(y_true, y_pred, label='train', color='b')

    if is_testset:
        training_iter += 1
    return metrics


# Class counts for the positive-class weight. Series.sum() counts in vectorised
# code; Python's built-in sum() would iterate over every element of y.
n_negative, n_positive = (y==0).sum(), (y==1).sum()

params = dict(n_estimators=100, learning_rate=0.1,
              max_depth=5, tree_method='exact',
              subsample=0.8, colsample_bytree=0.8, colsample_bynode=0.8,
              gamma=1, reg_lambda=1,
              # Ratio of negative to positive jets, computed on the full sample.
              scale_pos_weight=(n_negative/n_positive), random_state=123,
             )

exp.add_tag('XGB')
exp.log_parameters(params, prefix='manual')
clf = XGBClassifier(**params)
# Both splits are evaluated after every boosting round: 'validation_0' is the
# training set and 'validation_1' the test set, scored with the custom callback.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=partial(xgb_callback, make_plots=True), verbose=10)
# exp.send_notification(title='COMETML - test done', status='training finished', additional_data='No need of additional data')

## Eval training

In [None]:
# Plot and log one learning curve per metric recorded during training.
eval_res = clf.evals_result()
first_eval_set = eval_res['validation_0']
for metric in first_eval_set:
    ax = plot_xgb_learning_curve(eval_res, metric)
    exp.log_figure(f'{metric}_vs_ntrees')

## Save model

In [None]:
# Hand the trained classifier, the feature column labels and the active experiment
# to the project's save helper under the name 'xgb_model'.
# NOTE(review): where the helper writes and what it logs is not visible here.
save_model(clf, X.columns, exp, 'xgb_model')

# Report performance

## Inference

In [None]:
# Probability of class 1 (b jet) for every jet in each split.
y_train_proba, y_test_proba = (
    clf.predict_proba(features)[:,1] for features in (X_train, X_test)
)

# Hard labels from the classifier's own predict().
y_train_pred, y_test_pred = (
    clf.predict(features) for features in (X_train, X_test)
)

# Threshold chosen on the training split by the project helper, then applied
# unchanged to both splits.
opt_thresh = get_optimal_threshold(y_train, y_train_proba, 0.03)
y_train_pred_opt, y_test_pred_opt = (
    (proba > opt_thresh).astype('int') for proba in (y_train_proba, y_test_proba)
)

## Scores distribution

In [None]:
# Top panel: score distributions; bottom panel: signal-significance curves.
fig, axes = plt.subplots(nrows=2, figsize=(12,7), gridspec_kw={'height_ratios': [2,1]})
score_ax, signif_ax = axes

plot_score_distr(y_train, y_train_proba, linestyle=':', ax=score_ax)
plot_score_distr(y_test , y_test_proba , linestyle='-', ax=score_ax, lw=2)

# One training-set curve per third-argument value (0.01, 0.03, 0.04).
for b_frac, curve_color, curve_label in (
    (0.01, 'cyan'   , 'b frac. = 1%'),
    (0.03, 'lime'   , 'b frac. = 3%'),
    (0.04, 'magenta', 'b frac. = 4%'),
):
    plot_signal_significance(y_train, y_train_proba, b_frac, linestyle=':', color=curve_color, label=curve_label, ax=signif_ax)
# plot_signal_significance(y_test , y_test_proba , 0.01,    linestyle='-', color='lime'   , lw=2, label='b frac. = 1%', ax=axes[1])
# plot_signal_significance(y_test , y_test_proba , 0.04,    linestyle='-', color='magenta', lw=2, label='b frac. = 4%', ax=axes[1])

# Mark the chosen threshold across the current y range of the top panel.
y_low, y_high = score_ax.get_ylim()
score_ax.vlines(opt_thresh, y_low, y_high, color='lime', lw=2, linestyle=':')
exp.log_figure('score_and_significance_vs_threshold')

# Second logged view: zoom both panels onto the highest scores.
xmax = max(max(y_train_proba), max(y_test_proba))
zoom_left, zoom_right = xmax-0.2, xmax+0.01
for panel in (score_ax, signif_ax):
    panel.set_xlim(zoom_left, zoom_right)
signif_ax.set_ylim(0.95,1)
exp.log_figure('score_and_significance_vs_threshold_zoom')

## ROC - log AUC scores and plot vs pT

In [None]:
# Draw the train ROC curve first, then the test curve on the returned axes.
ax = plot_roc(
    y_test, y_test_proba, label='test' , color='r',
    ax=plot_roc(y_train, y_train_proba, label='train', color='b'),
)
exp.log_figure('roc_curve')

In [None]:
# Log the ROC AUC of each split (test first, then train).
for metric_name, labels, scores in (
    ('roc_auc_test', y_test, y_test_proba),
    ('roc_auc_train', y_train, y_train_proba),
):
    exp.log_metric(metric_name, roc_auc_score(labels, scores))

In [None]:
# ROC AUC per pT bin for train and test, drawn on shared axes.
auc_score = (roc_auc_score, 'ROC AUC')
ax = plot_score_vs_pt(
    y_train, y_train_pred, y_train_proba, flavour_ptbin_train, ptbins,
    score=auc_score, label='train', marker='o', color='b',
)
ax = plot_score_vs_pt(
    y_test, y_test_pred, y_test_proba, flavour_ptbin_test , ptbins,
    score=auc_score, label='test' , marker='^', color='r', ax=ax,
)
exp.log_figure('roc_auc_vs_pt');

## Mistagging rate vs. _b_-tagging efficiency

In [None]:
# Tagging-efficiency curves: the test curve (red) is drawn first, then the train
# curve (blue) is added on the axes returned by the first call; the figure is then logged.
ax = plot_tagging_eff(y_test, y_test_proba, label='$b$ vs $udsg$ test', color='r')
plot_tagging_eff(y_train, y_train_proba, label='$b$ vs $udsg$ train', color='b', ax=ax)
exp.log_figure('tagging_eff')

In [None]:
# Log the test-set b efficiency at each fixed mistag rate.
mistag_rates = [0.1, 0.01, 0.001]
for mistag_rate in mistag_rates:
    metric_name = f'bEff@mistag_{mistag_rate:.0e}'
    eff = signal_eff(y_test, y_test_proba, mistag_rate)
    exp.log_metric(metric_name, eff)

## Confusion matrices

In [None]:
printmd('__TRAIN__')
# Two side-by-side panels: raw counts on the left, normalized on the right,
# both using the threshold-based training predictions.
fig, axes = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout()
fig.subplots_adjust(wspace=0.5)
for panel, panel_title, normalized in zip(
    axes, ('train, unnormalized', 'train, normalized'), (False, True)
):
    plot_confusion_matrix(y_train, y_train_pred_opt, ['udsg', 'b'], title=panel_title, normalize=normalized, ax=panel)
exp.log_figure('confusion_matrix_train')

In [None]:
printmd('__TEST__')
# Two side-by-side panels: raw counts on the left, normalized on the right,
# both using the threshold-based test predictions.
fig, axes = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout()
fig.subplots_adjust(wspace=0.5)
for panel, panel_title, normalized in zip(
    axes, ('test, unnormalized', 'test, normalized'), (False, True)
):
    plot_confusion_matrix(y_test, y_test_pred_opt, ['udsg', 'b'], title=panel_title, normalize=normalized, ax=panel)
exp.log_figure('confusion_matrix_test')

## Model explainers

In [None]:
# Print the features from most to least important; importances are rounded to
# 4 decimals before sorting, and ties are ordered by feature name (descending).
ranked_features = sorted(
    ((round(importance, 4), name) for importance, name in zip(clf.feature_importances_, X.columns)),
    reverse=True,
)
for feat_imp, feat_name in ranked_features:
    print(feat_name, feat_imp)

In [None]:
# Close the experiment now that all metrics and figures have been logged.
exp.end()