In [None]:
from comet_ml import Experiment

In [None]:
# sklearn 0.22 needed for permutation importance
import sys
sys.path.insert(0, "/eos/user/s/sbysiak/.local/lib/python3.7/site-packages/")
import sklearn
sklearn.__version__

In [None]:
import os
from functools import partial

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

In [None]:
from helper.plotting import plot_roc, plot_score_vs_pt, plot_score_vs_col, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance, plot_eff_vs_threshold, plot_pdp
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd
from helper.interpret import feature_importance_report

In [None]:
# Notebook-wide display settings: larger matplotlib font, and show up to 200 DataFrame columns.
plt.rcParams['font.size']=16
pd.options.display.max_columns = 200

# Load data from csv

In [None]:
# [c for c in df.columns if 'Track_0_' in c]

In [None]:
# Free a DataFrame left over from a previous run of the notebook, if there is one.
# The only expected failure is that `df` does not exist yet, so only NameError is
# silenced (a bare `except:` would also hide KeyboardInterrupt and real errors).
try:
    del df
except NameError:
    pass

In [None]:
def assign_weight(target_pt, real_pt):
    weight_dict = {}
    pt_bins = np.arange(target_pt.min(), target_pt.max()+5, 5)
    for low,high in zip(pt_bins[:-1], pt_bins[1:]):
        n_target = np.sum( (target_pt > low) & (target_pt < high))
        n_real = np.sum((real_pt > low) & (real_pt < high))
        for pt in range(int(low), int(high)):
            weight_dict[pt] = n_target / n_real
    return [weight_dict[int(pt)] for pt in real_pt]

In [None]:
# Number of jets of each flavour to use (b: rows read from file; c/udsg: rows kept after sub-sampling).
nrows_b    = 200000
nrows_c    = 100000
nrows_udsg = 100000
# Tag identifying the dataset variant; it is interpolated into the csv file names below.
custom_descr = 'Tr-sortbyIPdNsigmaAbs-noCuts_SV-sortbyLxyNsigma-noCuts'
# custom_descr = 'Tr-sortbyPt-cuts-IPdLT02_SV-sortbyDispersion-noCuts'

In [None]:
# Read the first `nrows_b` b jets and label them; this sample defines the target pT spectrum
# used below to sub-sample the c and udsg jets.
df_b = pd.read_csv(f'datasets/iter2/bjets_10-150GeV_{custom_descr}.csv', nrows=nrows_b)
# df_b = df_b.sample(n=int(nrows_b/2), weights=df_b['Jet_Pt'].apply(lambda pt: 1 if pt < 50 else 0.1))
df_b['flavour'] = 'b'
# presumably downcasts float64 columns to float32 to save memory -- see helper.utils
df_b = convert_float64_to_float32(df_b)
# df_b.describe()
# df_b['Jet_Pt'].describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])

In [None]:
# Read 5x more c jets than needed so the pT-matched sub-sampling below has enough candidates.
df_c = pd.read_csv(f'datasets/iter2/cjets_10-150GeV_{custom_descr}.csv', nrows=nrows_c*5)
df_c['flavour'] = 'c'
df_c = convert_float64_to_float32(df_c)

# Sub-sample the c jets so that their pT spectrum follows the b-jet one.
# A fixed seed keeps the selected jets (and hence the logged dataset hash) reproducible.
weights = assign_weight(df_b['Jet_Pt'], df_c['Jet_Pt'])
df_c = df_c.sample(weights=weights, n=nrows_c, replace=False, random_state=122)

# Overlay the normalised pT spectra of b and sub-sampled c jets to check the matching.
ax = df_b['Jet_Pt'].hist(bins=100, alpha=0.5, density=1)
df_c['Jet_Pt'].hist(bins=100, alpha=0.5, density=1, ax=ax);

In [None]:
# Read 5x more udsg jets than needed so the pT-matched sub-sampling below has enough candidates.
df_udsg = pd.read_csv(f'datasets/iter2/udsgjets_10-150GeV_{custom_descr}.csv', nrows=nrows_udsg*5)
df_udsg['flavour'] = 'udsg'
df_udsg = convert_float64_to_float32(df_udsg)
# df_udsg.describe()

# Sub-sample the udsg jets so that their pT spectrum follows the b-jet one.
# A fixed seed keeps the selected jets (and hence the logged dataset hash) reproducible.
weights = assign_weight(df_b['Jet_Pt'], df_udsg['Jet_Pt'])
df_udsg = df_udsg.sample(weights=weights, n=nrows_udsg, replace=False, random_state=122)

# Overlay the normalised pT spectra of b and sub-sampled udsg jets to check the matching.
ax = df_b['Jet_Pt'].hist(bins=100, alpha=0.5, density=1)
df_udsg['Jet_Pt'].hist(bins=100, alpha=0.5, density=1, ax=ax);

In [None]:
# Merge the three flavour samples into one frame. The original row indices are kept,
# so the index of `df` is not unique.
df = pd.concat([df_b, df_c, df_udsg])
# Remember the per-flavour counts for experiment logging before the inputs are deleted.
n_b_jets, n_c_jets, n_udsg_jets = len(df_b), len(df_c), len(df_udsg)
# Release the per-flavour frames to save memory.
del df_b
del df_c
del df_udsg

Preserve the pT-bin labels here, in case pT itself is not used as a training feature:

In [None]:
# pT bin edges: fixed edges every 10 GeV between 20 and 100, plus outermost edges placed
# just outside the observed pT range so that every jet falls inside a bin.
ptbins = np.array([df['Jet_Pt'].min()-1e-6] + [20, 30, 40, 50, 60, 70, 80, 90, 100] + [df['Jet_Pt'].max()+1e-6])
# Bin number of each jet = number of edges <= pT. This is the vectorised equivalent of
# `sum(pt >= ptbins)` per jet; the former row-wise `apply` that computed `flavour_ptbin`
# was removed because its result was immediately overwritten below.
pt_bin_arr = pd.Series((df['Jet_Pt'].values[:, None] >= ptbins).sum(axis=1).astype(str), index=df.index)
# Stratification label: flavour followed by the pT bin number, e.g. 'b3'.
flavour_ptbin = df['flavour'] + pt_bin_arr

__\>\> Select columns HERE (before logging data info) <<__

In [None]:
[c for c in df.columns if 'Track_0_' in c]

In [None]:
# Column selection: keep the first `n_tracks` tracks and `n_sv` secondary vertices per jet,
# all jet-level columns and the 'flavour' label; drop every 'DerivCorr' column.
# df = df[[col for col in df.columns if 'Jet_SecVtx_' not in col]]
n_tracks, n_sv = 10,3
# Track columns: the third '_'-separated token of the name is parsed as the track index;
# columns containing 'PID' are excluded.
filter_tracks = lambda col: ('Jet_Track'  in col and int(col.split('_')[2]) < n_tracks 
                             and 'PID' not in col 
#                              and '_Pt_' not in col
#                              and col.split('_')[3] in ['Pt', 'Phi', 'Eta', 'IPdSigma', 'IPzSigma'] + ['IPdNsigmaAbs', 'IPzNsigmaAbs']
                            )
# Secondary-vertex columns: the third '_'-separated token is parsed as the vertex index.
filter_sv     = lambda col: ('Jet_SecVtx' in col and int(col.split('_')[2]) < n_sv)
# Jet-level columns: everything that is neither a track nor a secondary-vertex column.
filter_jet    = lambda col: ('Jet_Track'  not in col and 'Jet_SecVtx' not in col 
#                              and col != 'Jet_Pt'
                            )
# Final predicate combining the three groups above with the label column.
filter_cols   = lambda col: ((filter_tracks(col) or filter_sv(col) or filter_jet(col) or col == 'flavour')
                            and 'DerivCorr' not in col
#                             and 'Nsigma__sortby' in col or 'flavour' in col or 'ptbin' in col
                            )
# NOTE: `df` is overwritten here, so re-running this cell with different settings
# requires re-running the loading cells first.
df = df[[col for col in df.columns if filter_cols(col)]]

# remove features correlated with pt
# corr = df.corr()
# corr_thresh = 0.20
# pt_correlated = corr['Jet_Pt'][abs(corr['Jet_Pt']) > corr_thresh]
# df = df[[col for col in df.columns if col not in pt_correlated.index.to_list()]]
# print(f'removed pt-correlated features (threshold = {corr_thresh:.2f}):\n{pt_correlated}')

In [None]:
df.info(verbose=False, memory_usage='deep')

# Prepare dataset (DataFrame -> X & y)

## Split data using `stratify` with `flavour` and `Jet_Pt`

In [None]:
df_orig = df.copy()

In [None]:
# Binary target: b jets are the signal (1), c and udsg jets the background (0).
y = df['flavour'].map({'b':1, 'c':0, 'udsg':0})
# Feature matrix: everything except the label. 'ptbin' is dropped only when it exists:
# the pT-bin labels are kept in the separate `flavour_ptbin` Series, so the column is
# normally absent and a strict drop would raise KeyError.
X = df.drop(['flavour', 'ptbin'], axis=1, errors='ignore')

In [None]:
# 80/20 split, stratified on the combined flavour + pT-bin label so both sets share the
# same flavour composition in every pT bin; the labels are split alongside for later plots.
X_train, X_test, y_train, y_test, flavour_ptbin_train, flavour_ptbin_test = train_test_split(X, y, flavour_ptbin, test_size=0.2, stratify=flavour_ptbin, random_state=122)

## Validate stratification

In [None]:
def plot_pt_spectra(pt_train, pt_test, bins=np.linspace(10,150,29), color='k', label='', ax=None):
    """Draw the train and test pT spectra as step histograms on a log-y axis.

    Parameters
    ----------
    pt_train, pt_test : array-like
        pT values of the training and test samples.
    bins : array-like
        Histogram bin edges.
    color : str
        Colour shared by both histograms.
    label : str
        Text appended to the 'train ' / 'test ' legend entries.
    ax : matplotlib axes, optional
        Axes to draw on; a new 10x7 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if not ax:
        _, ax = plt.subplots(figsize=(10,7))

    # Raw counts (density=0) so that the absolute sample sizes stay visible.
    shared_style = dict(bins=bins, histtype='step', lw=2, density=0, color=color)
    # The test spectrum is drawn first (solid), then the train spectrum (dashed).
    for sample, prefix, linestyle in ((pt_test, 'test ', '-'), (pt_train, 'train ', '--')):
        ax.hist(sample, label=prefix + label, linestyle=linestyle, **shared_style)

    ax.semilogy()
    ax.legend()
    ax.grid(linestyle=':')
    ax.set_xlabel('jet $p_T^{reco}$ [GeV/c]')
    ax.set_ylabel('counts')
    return ax
    
# b & c together    
# Compare train/test spectra for signal (b, red) and background (c+udsg, blue) on the same axes;
# skipped when Jet_Pt was removed from the feature set.
if 'Jet_Pt' in X_train.columns:
    ax = plot_pt_spectra(X_train['Jet_Pt'][y_train==1], X_test['Jet_Pt'][y_test==1], label='b', color='r')
    ax = plot_pt_spectra(X_train['Jet_Pt'][y_train==0], X_test['Jet_Pt'][y_test==0], label='c+udsg', color='b', ax=ax)

# b & c separately
# ax = plot_pt_spectra(X_train['Jet_Pt'][np.array(['b' in fp for fp in  flavour_ptbin_train])], X_test['Jet_Pt'][np.array(['b' in fp for fp in  flavour_ptbin_test])], label='b', color='r')
# ax = plot_pt_spectra(X_train['Jet_Pt'][np.array(['c' in fp for fp in  flavour_ptbin_train])], X_test['Jet_Pt'][np.array(['c' in fp for fp in  flavour_ptbin_test])], label='c', color='orange', ax=ax)
# ax = plot_pt_spectra(X_train['Jet_Pt'][np.array(['udsg' in fp for fp in  flavour_ptbin_train])], X_test['Jet_Pt'][np.array(['udsg' in fp for fp in  flavour_ptbin_test])], label='udsg', color='b', ax=ax)

# plt.savefig('pt_spect.png')

## Scale X

In [None]:
# Fit the scaler on the training set only and apply the same transform to the test set.
# NOTE: X_train / X_test change from DataFrames to plain numpy arrays here, so this cell
# is not idempotent; column names must be taken from `X.columns` from now on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

## Create experiment and log data info

In [None]:
# End the experiment left over from a previous run of this cell, if there is one.
try:
    exp.end()
except NameError:
    # First run in this kernel: there is no previous experiment to end.
    pass
except Exception as err:
    # Stay best-effort (a failed clean-up must not block a new experiment),
    # but make the failure visible instead of silently swallowing it.
    print(f'could not end previous experiment: {err!r}')
exp = Experiment(
                 auto_output_logging='simple',
                 log_env_gpu=False, log_env_cpu=False,
                 project_name="test", workspace="phd")

In [None]:
# Tag the experiment with the task type and with whether the background classes are balanced.
exp.add_tags(['b-vs-rest'])
if nrows_c == nrows_udsg: exp.add_tag('N_c = N_udsg')
else: exp.add_tag('N_c =/= N_udsg')

# Human-readable dataset summary: jet counts per flavour and in-memory size of `df`.
n_jets_str = f'there is:\n\t{n_b_jets} b jets\n\t{n_c_jets} c jets\n\t{n_udsg_jets} udsg jets'
dataset_info = n_jets_str + f'\ndataframe size = {df.memory_usage(deep=True).sum()/1024/1024:.1f} MB'
print(dataset_info)
print(df.columns.to_list())
exp.log_dataset_info(dataset_info)
# The hash is computed on `df` after column selection (the 'flavour' label is still included).
exp.log_dataset_hash(df)
exp.log_other('n_jets_b', n_b_jets)
exp.log_other('n_jets_c', n_c_jets)
exp.log_other('n_jets_udsg', n_udsg_jets)
exp.log_other('n_jets_rest', n_c_jets+n_udsg_jets)
# Number of training features (label excluded).
exp.log_other('n_columns', X.shape[1])

exp.log_parameter('n_tracks', n_tracks)
exp.log_parameter('n_sv', n_sv)

In [None]:
# Free-text description of this run: dataset variant and the track / secondary-vertex limits.
exp.add_tag('full-info')
exp.log_other('descr', f'{custom_descr}, adjusted pT, n_tr={n_tracks}, n_sv={n_sv}')

# Train model

In [None]:
training_iter = 0

def xgb_callback(y_pred, dtrain, mistag_rates=[0.1, 0.01, 0.001], make_plots=False):
    """Custom evaluation metric, passed to ``XGBClassifier.fit`` through ``eval_metric``.

    Computes the signal efficiency at each requested mistag rate plus the ROC AUC and,
    optionally, plots and logs the tagging-efficiency curve at selected boosting rounds.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the evaluated set.
    dtrain : object exposing ``get_label()``
        Evaluated dataset; its labels are used as ground truth.
    mistag_rates : list of float
        Mistag rates at which the signal efficiency is evaluated.
    make_plots : bool
        When True, draw the train and test curves and log the figure to the experiment.

    Returns
    -------
    list of (str, float)
        ``(metric_name, value)`` pairs.

    Raises
    ------
    ValueError
        If a metric name contains a space or a colon.

    Notes
    -----
    Relies on the module-level ``training_iter`` counter and on ``y_test`` / ``exp``.
    The test set is recognised by comparing the labels with ``y_test``; the counter is
    advanced once per call on the test set.
    """
    global training_iter
    labels = dtrain.get_label()

    metrics = [(f'bEff@mistag_{rate:.0e}', signal_eff(labels, y_pred, rate)) for rate in mistag_rates]
    metrics.append(('ROC_AUC', roc_auc_score(labels, y_pred)))
    for met_name, _ in metrics:
        if ' ' in met_name or ':' in met_name:
            raise ValueError('Metric names cannot contain space nor colon(:)')

    if not make_plots:
        return metrics

    # The length check comes first so that arrays of different size are never compared.
    is_testset = len(labels) == len(y_test) and all(labels == y_test)

    # Plot at rounds 0, 1, 3 and then at every 30th round.
    plot_now = training_iter % 30 == 0 or training_iter in (1, 3)
    if plot_now and not is_testset:
        # Train curve: plot_tagging_eff is called without an `ax` here.
        ax = plot_tagging_eff(labels, y_pred, label='train', color='b')
    elif plot_now:
        # Test curve: drawn on the current axes (plt.gca()), then the figure is logged.
        ax = plot_tagging_eff(labels, y_pred, label='test', color='r', ax=plt.gca())
        ax.set_ylim(1e-4, 2)
        exp.log_figure(f'plot_iter{training_iter:04}')

    if is_testset:
        training_iter += 1
    return metrics


# XGBoost hyper-parameters. `scale_pos_weight` is the background/signal ratio computed
# on the full label vector `y`, to compensate for the class imbalance.
params = dict(n_estimators=100, learning_rate=0.2, 
              max_depth=5, tree_method='hist', 
              gamma=10, reg_lambda=0,
              subsample=0.8, colsample_bytree=0.8, colsample_bynode=0.8,
              scale_pos_weight=(sum(y==0)/sum(y==1)), random_state=123,
             )
    
exp.add_tag('XGB')
# Parameters are logged under two prefixes; 'manual' is kept only for older experiments.
exp.log_parameters(params, prefix='manual')  # backward compatibility
exp.log_parameters(params, prefix='man')
clf = XGBClassifier(**params)
# Both train and test sets are evaluated after every round with the custom metric
# (train first, then test -- xgb_callback relies on this order when plotting).
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=partial(xgb_callback, make_plots=True), verbose=10)
# exp.send_notification(title='COMETML - test done', status='training finished', additional_data='No need of additional data')

## Eval training

In [None]:
# Plot and log one learning curve per metric recorded during training.
# The metric names are read from the first eval set ('validation_0').
eval_res = clf.evals_result()
recorded_metrics = eval_res['validation_0']
for metric in recorded_metrics:
    print(metric)
    ax = plot_xgb_learning_curve(eval_res, metric)
    exp.log_figure(f'{metric}_vs_ntrees')

## Save model

In [None]:
save_model(clf, X.columns, scaler, exp, 'xgb_model')

# Report performance

## Inference

In [None]:
# Probability of the positive (b) class for every jet.
y_train_proba = clf.predict_proba(X_train)[:,1]
y_test_proba = clf.predict_proba(X_test)[:,1]

# Hard predictions from the classifier's own `predict`.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Threshold chosen on the training set only, then applied unchanged to both sets.
# NOTE(review): 0.04 presumably is the assumed b-jet fraction (cf. the 'b frac.' plots below) -- confirm in helper.utils.
opt_thresh = get_optimal_threshold(y_train, y_train_proba, 0.04)
y_train_pred_opt = (y_train_proba > opt_thresh).astype('int')
y_test_pred_opt  = (y_test_proba  > opt_thresh).astype('int')

## Scores distribution

In [None]:
# Upper panel: score distributions (train dotted, test solid).
# Lower panel: signal significance vs threshold on the train set for three assumed b fractions.
fig,axes = plt.subplots(nrows=2, figsize=(12,7), gridspec_kw={'height_ratios': [2,1]})
plot_score_distr(y_train, y_train_proba, linestyle=':', ax=axes[0])
plot_score_distr(y_test , y_test_proba , linestyle='-', ax=axes[0], lw=2)
plot_signal_significance(y_train, y_train_proba, 0.02,    linestyle=':', color='cyan'   ,  label='b frac. = 2%', ax=axes[1])
plot_signal_significance(y_train, y_train_proba, 0.04,    linestyle=':', color='lime'   ,  label='b frac. = 4%', ax=axes[1])
plot_signal_significance(y_train, y_train_proba, 0.08,    linestyle=':', color='magenta',  label='b frac. = 8%', ax=axes[1])
# plot_signal_significance(y_test , y_test_proba , 0.01,    linestyle='-', color='lime'   , lw=2, label='b frac. = 1%', ax=axes[1])
# plot_signal_significance(y_test , y_test_proba , 0.04,    linestyle='-', color='magenta', lw=2, label='b frac. = 4%', ax=axes[1])
# Mark the chosen threshold (same colour as the 4% curve it was derived for).
axes[0].vlines(opt_thresh, *axes[0].get_ylim(), color='lime', lw=2, linestyle=':')
exp.log_figure('score_and_significance_vs_threshold')

# Zoom both panels onto the highest-score region and log the figure a second time.
xmax = max(max(y_train_proba), max(y_test_proba))
axes[0].set_xlim(xmax-0.2, xmax+0.01)
axes[1].set_xlim(xmax-0.2, xmax+0.01)
axes[1].set_ylim(0.95,1)
exp.log_figure('score_and_significance_vs_threshold_zoom')

## eff vs threshold

In [None]:
# The same test-set figure is logged three times: full range, zoomed to thresholds 0.6-1,
# and zoomed with a logarithmic y axis.
ax = plot_eff_vs_threshold(y_test, y_test_proba)
exp.log_figure('eff_vs_threshold')
ax.set_xlim(0.6,1)
exp.log_figure('eff_vs_threshold_zoom')
ax.set_yscale('log')
exp.log_figure('eff_vs_threshold_logy')

## ROC - log AUC scores and plot vs pT

In [None]:
# ROC curves of train (blue) and test (red) on shared axes.
ax = plot_roc(y_train, y_train_proba, label='train', color='b');
ax = plot_roc(y_test, y_test_proba, label='test' , color='r', ax=ax);
exp.log_figure('roc_curve')

In [None]:
# Log the overall ROC AUC of both sets; the gap between them indicates over-training.
exp.log_metric('roc_auc_test', roc_auc_score(y_test, y_test_proba))
exp.log_metric('roc_auc_train', roc_auc_score(y_train, y_train_proba))

In [None]:
# ROC AUC in pT bins (binning given by the flavour+pT-bin labels and `ptbins`), train vs test.
ax = plot_score_vs_pt(y_train, y_train_pred, y_train_proba, flavour_ptbin_train, ptbins, score=(roc_auc_score, 'ROC AUC'), label='train', marker='o', color='b')
ax = plot_score_vs_pt(y_test, y_test_pred, y_test_proba, flavour_ptbin_test , ptbins, score=(roc_auc_score, 'ROC AUC'), label='test' , marker='^', color='r', ax=ax)
exp.log_figure('roc_auc_vs_pt');

In [None]:
def aver_pred(y_true, y_score):
    """Return the mean of the predicted scores.

    The ``(y_true, y_score)`` signature mirrors that of a metric function so this can be
    passed wherever a score callable is expected; ``y_true`` is not used.
    """
    mean_score = np.average(y_score)
    return mean_score

# Average predicted score in pT bins, train vs test. Both curves use the same score label
# ('aver. score'), matching the logged figure name and the label used elsewhere in the notebook.
ax = plot_score_vs_pt(y_train, y_train_pred, y_train_proba, flavour_ptbin_train, ptbins, score=(aver_pred, 'aver. score'), label='train', marker='o', color='b')
ax = plot_score_vs_pt(y_test, y_test_pred, y_test_proba, flavour_ptbin_test , ptbins, score=(aver_pred, 'aver. score'), label='test' , marker='^', color='r', ax=ax)
exp.log_figure('aver_score_vs_pt');

## mistagging rate VS tagging efficiency

In [None]:
# Mistagging rate vs b-tagging efficiency for test (red) and train (blue) on shared axes.
ax = plot_tagging_eff(y_test, y_test_proba, label='$b$ vs $c+udsg$ test', color='r')
plot_tagging_eff(y_train, y_train_proba, label='$b$ vs $c+udsg$ train', color='b', ax=ax)
exp.log_figure('tagging_eff')

In [None]:
# Log the test-set tagging efficiency at three fixed mistag rates (metric names: tagEff@mistag_1e-01, ...).
mistag_rates = [0.1, 0.01, 0.001]
for mistag_rate in mistag_rates:
    eff = signal_eff(y_test, y_test_proba, mistag_rate)
    exp.log_metric(f'tagEff@mistag_{mistag_rate:.0e}', eff)

## Confusion matrices

In [None]:
# Train-set confusion matrices at the optimised threshold: raw counts (left) and normalised (right).
printmd('__TRAIN__')
fig, axes = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout()
fig.subplots_adjust(wspace=0.5)
plot_confusion_matrix(y_train, y_train_pred_opt, ['c+udsg', 'b'], title='train, unnormalized', normalize=False, ax=axes[0])
plot_confusion_matrix(y_train, y_train_pred_opt, ['c+udsg', 'b'], title='train, normalized'  , normalize=True , ax=axes[1])
exp.log_figure('confusion_matrix_train')

In [None]:
# Test-set confusion matrices at the optimised threshold: raw counts (left) and normalised (right).
printmd('__TEST__')
fig, axes = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout()
fig.subplots_adjust(wspace=0.5)
plot_confusion_matrix(y_test, y_test_pred_opt, ['c+udsg', 'b'], title='test, unnormalized', normalize=False, ax=axes[0])
plot_confusion_matrix(y_test, y_test_pred_opt, ['c+udsg', 'b'], title='test, normalized'  , normalize=True , ax=axes[1])
exp.log_figure('confusion_matrix_test')

## Performance vs _feature_

In [None]:
# Development aid: re-import the helper package so that edits to its source are picked up
# without restarting the kernel, then rebind the two plotting functions used below.
# NOTE(review): this is a mid-notebook import and looks like leftover debugging --
# consider `%autoreload 2` in the configuration cell instead.
from importlib import reload
import helper
reload(helper)
reload(helper.plotting)
reload(helper.plotting.performance_plots)
plot_score_vs_col = helper.plotting.plot_score_vs_col
plot_pdp = helper.plotting.plot_pdp

In [None]:
def aver_pred(y_true, y_score):
    """Return the mean predicted score; `y_true` is accepted only to match the metric signature."""
    return np.average(y_score)


# Undo the scaling once, instead of re-transforming the full matrices for every
# (feature, score) combination inside the loops.
X_train_unscaled = scaler.inverse_transform(X_train)
X_test_unscaled = scaler.inverse_transform(X_test)

# For each jet-level feature: (feature name, bins for the score curve, bins for the distribution plot).
for feature,bins,bins_distplot in [
                     ('Jet_Pt', (10,20,30,40,50,60,80,100), 100), 
                     ('Jet_Phi', 18*3, 10), 
                     ('Jet_Eta', 20, 100), 
                     ('Jet_NumTracks', np.arange(0,30,2), np.arange(0,30,1)),
                      ]:
    if feature not in X.columns: continue
    # The column position must come from X (the layout of the training matrix), not from df,
    # which still contains the 'flavour' label column.
    feature_idx = X.columns.get_loc(feature)
    for score in [(roc_auc_score, 'ROC AUC'), 
                  (partial(signal_eff, mistag_rate_thresh=1e-2), 'signal eff for mistag=1e-2'),
                  (aver_pred, 'aver. score')
                    ]:
        ax=plot_score_vs_col(y_train, y_train_proba, 
                      vals=X_train_unscaled[:, feature_idx], 
                      bins=bins, bins_distplot=bins_distplot,
                      score=score, color='b', label='train',
                      show_distplot=True,                          
                      show_errorbars=True,
                     )
        plot_score_vs_col(y_test, y_test_proba, 
                          vals=X_test_unscaled[:, feature_idx],
                          bins=bins, bins_distplot=bins_distplot,
                          score=score, color='r', marker='^', label='test', 
                          xlabel=feature,
                          show_distplot=True,
                          show_errorbars=True,
                          ax=ax
                         )
        # Figure name: score label with spaces removed and '=' / '.' replaced by '-', plus the feature name without 'Jet'.
        exp.log_figure(f"{score[1].replace(' ', '').replace('=','-').replace('.', '-')}_vs_{feature.replace('Jet', '')}");



# Interpretability

## Feature importance

**XGB's weight**   
  = how many times feature was used

In [None]:
# NOTE(review): the report is labelled "XGB's weight", but what `feature_importances_` contains
# depends on the classifier's `importance_type` setting, which is not set in `params` --
# confirm that it really is the 'weight' (split count) importance.
feature_importance_report(clf.feature_importances_, X.columns, importance_type='XGB\'s weight')

**XGB's total gain**

In [None]:
# Total-gain importance straight from the booster. The keys are parsed as 'f<N>' (model trained
# on a plain array), so the leading character is stripped and N is used as a position in X.columns.
imp_dict = clf.get_booster().get_score(importance_type='total_gain')
feature_positions = [int(key[1:]) for key in imp_dict]
imp = imp_dict.values()
names = X.columns[feature_positions]
feature_importance_report(imp, names, importance_type='XGB\'s total_gain')

**Permutation importance**  
remember to use scaled input data  
it is also quite time-consuming

In [None]:
# Permutation importance on the first 10000 (already scaled) training jets, to keep the
# run time manageable. The shuffling is random, so a fixed seed is passed to make the
# ranking -- and the feature choice for the partial dependence plots below -- reproducible.
imp = permutation_importance(clf, X_train[:10000], y_train[:10000], random_state=123)['importances_mean']
feature_importance_report(imp, X.columns, importance_type='permutation imp.')
# Keep the result under dedicated names for the partial dependence plots.
perm_imp = imp
perm_imp_feats = X.columns

## Partial dependence plots
for 5 features with highest permutation importance

In [None]:
# Positions of the 5 features with the highest permutation importance, most important first.
most_imp_idx = np.argsort(perm_imp)[::-1][:5]
features = X.columns[most_imp_idx]

# Options shared by every partial dependence plot; only the first 30000 training jets are used.
pdp_options = dict(
    scaler=scaler,
    column_names=X.columns,
    query='',
    show_deciles=True,
    show_distplot=True,
    y=y_train[:30000],
    pardep_kws=dict(percentiles=(0.1,0.9)),
)
for feat in features:
    ax = plot_pdp(clf, X_train[:30000], feat, **pdp_options)
    exp.log_figure(f"pdp_{feat}")



## Model explainers

In [None]:
# Attach the source of all cells executed in this session to the experiment
# (`In` is IPython's input history), joined with '#' separator lines.
code = '\n#\n'.join(In)
exp.set_code(code=code)
# Empty the input history -- presumably so that a later run in the same kernel logs only its own cells.
In.clear()
exp.end()