In [None]:
from comet_ml import Experiment
import comet_ml
import pickle

import os
from functools import partial

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from xgboost import XGBClassifier

In [None]:
from helper.plotting import plot_roc, plot_score_vs_pt, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd

In [None]:
# Notebook-wide display settings: 16 pt default font for matplotlib figures
# and up to 200 columns shown when a DataFrame is rendered.
plt.rcParams.update({'font.size': 16})
pd.set_option('display.max_columns', 200)

# Load data from CSV

In [None]:
# Number of jets to read from each flavour's CSV file.
nrows_b = nrows_c = nrows_udsg = 200_000

# Controls how many leading data rows the loading cells skip in every file.
skiprows = 200_000

In [None]:
# Read `nrows_b` b-jets, starting after the first `skiprows` data rows.
# File line 0 is the header, so the data rows to skip are lines
# 1..skiprows inclusive. `range(1, skiprows)` stopped one line short and
# therefore let data row number `skiprows` into the sample.
df_b = pd.read_csv(
    'datasets/iter2/bjets_10-150GeV_base.csv',
    nrows=nrows_b,
    skiprows=range(1, skiprows + 1),
)
df_b['flavour'] = 'b'
# Project helper from helper.utils; presumably downcasts float64 columns
# to float32 (not visible here, confirm in helper/utils.py).
df_b = convert_float64_to_float32(df_b)

In [None]:
# Read `nrows_c` c-jets, starting after the first `skiprows` data rows.
# File line 0 is the header, so the data rows to skip are lines
# 1..skiprows inclusive. `range(1, skiprows)` stopped one line short and
# therefore let data row number `skiprows` into the sample.
df_c = pd.read_csv(
    'datasets/iter2/cjets_10-150GeV_base.csv',
    nrows=nrows_c,
    skiprows=range(1, skiprows + 1),
)
df_c['flavour'] = 'c'
# Project helper from helper.utils; presumably downcasts float64 columns
# to float32 (not visible here, confirm in helper/utils.py).
df_c = convert_float64_to_float32(df_c)

In [None]:
# Read `nrows_udsg` light-flavour/gluon jets, starting after the first
# `skiprows` data rows. File line 0 is the header, so the data rows to skip
# are lines 1..skiprows inclusive. `range(1, skiprows)` stopped one line
# short and therefore let data row number `skiprows` into the sample.
df_udsg = pd.read_csv(
    'datasets/iter2/udsgjets_10-150GeV_base.csv',
    nrows=nrows_udsg,
    skiprows=range(1, skiprows + 1),
)
df_udsg['flavour'] = 'udsg'
# Project helper from helper.utils; presumably downcasts float64 columns
# to float32 (not visible here, confirm in helper/utils.py).
df_udsg = convert_float64_to_float32(df_udsg)

# Load models from _comet.ml_

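TODO:
- (open question) Add a helper such as `select_best_model(metric='test_roc_auc', constraints='test_metric / train_metric < 1.05')` that picks the experiment automatically instead of relying on hard-coded experiment IDs.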

In [None]:
# NOTE(review): scratch cell. It only demonstrates that isinstance() accepts a
# tuple of types (the pattern used by the type checks in get_model_from_exp).
# `x` is not referenced by any other visible cell; candidate for removal.
x = 12
isinstance(x, (int,str))

In [None]:
def get_model_from_exp(exp_id, model_type=XGBClassifier, featnames_type=(pd.Index, pd.Series, np.ndarray, list), scaler_type=StandardScaler, api=None):
    """Download a pickled model, its feature names and its scaler from a comet.ml experiment.

    The assets of the experiment's first registered model are searched by
    file name: for each of the substrings 'model', 'feat' and 'scaler' the
    first asset whose ``fileName`` contains it is downloaded and unpickled.

    Parameters
    ----------
    exp_id : str
        Experiment identifier handed to ``api.get``.
    model_type, featnames_type, scaler_type : type or tuple of types
        Expected types of the unpickled objects, checked with ``isinstance``.
    api : comet_ml.API, optional
        API client to use. When omitted a new ``comet_ml.API()`` is created
        on each call (previously one client was built once, at function
        definition time, and shared by all calls).

    Returns
    -------
    tuple
        ``(model, featnames, scaler)`` where ``featnames`` has been converted
        with ``np.array``.

    Raises
    ------
    ValueError
        If no asset file name contains one of the expected substrings.
    AssertionError
        If an unpickled object is not of the expected type.
    """
    if api is None:
        api = comet_ml.API()

    exp = api.get(exp_id)
    # Only the first model registered on the experiment is inspected.
    assets = exp.get_model_asset_list(exp.get_model_names()[0])

    def find_asset_id(tag):
        # First asset whose file name contains `tag`; ValueError (as before,
        # but with a readable message) when there is none.
        for asset in assets:
            if tag in asset['fileName']:
                return asset['assetId']
        raise ValueError(f'no asset with {tag!r} in its file name found for experiment {exp_id!r}')

    # Resolve all three ids before downloading anything so that a missing
    # asset is reported without fetching the others first.
    asset_id_model = find_asset_id('model')
    asset_id_featnames = find_asset_id('feat')
    asset_id_scaler = find_asset_id('scaler')

    # NOTE(review): pickle.loads can execute arbitrary code contained in the
    # payload; only load assets from experiments you trust.
    model = pickle.loads(exp.get_asset(asset_id_model))
    assert isinstance(model, model_type), f'unexpected model type: {type(model)!r}'

    featnames = pickle.loads(exp.get_asset(asset_id_featnames))
    # The default tuple used to contain `np.array`, which is a function, not
    # a type: isinstance() raised TypeError as soon as it reached that entry
    # (i.e. for list or ndarray feature names). `np.ndarray` is the array type.
    assert isinstance(featnames, featnames_type), f'unexpected feature-names type: {type(featnames)!r}'

    scaler = pickle.loads(exp.get_asset(asset_id_scaler))
    assert isinstance(scaler, scaler_type), f'unexpected scaler type: {type(scaler)!r}'

    return model, np.array(featnames), scaler

In [None]:
# comet.ml experiment identifiers of the two classifiers used below.
exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/bcf99db8f5a94b2184e6e13161c50bbe'
# Alternative bc-vs-udsg experiment, kept for quick switching:
# exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/61c014a2ff7c49e8bae9ec466ffaa998'
exp_id_b_vs_c     = 'phd/b-vs-c/3ce14e4e99d54283bc66eb24c98b6468' 
# Each call returns (model, feature-name array, scaler) for one experiment.
clf_bc_vs_udsg , feats_bc_vs_udsg, scaler_bc_vs_udsg = get_model_from_exp(exp_id_bc_vs_udsg)
clf_b_vs_c     , feats_b_vs_c    , scaler_b_vs_c     = get_model_from_exp(exp_id_b_vs_c)
# Sorted, de-duplicated union of the feature names used by either classifier.
feats_all = np.unique(np.hstack([feats_bc_vs_udsg, feats_b_vs_c]))

def short_exp_id(exp_id):
    """Return the first six characters of the last '/'-separated part of `exp_id`."""
    _, _, last_part = exp_id.rpartition('/')
    return last_part[:6]

# Apply models

In [None]:
def _positive_class_scores(df, clf, feats, scaler):
    """Scale the `feats` columns of `df` with `scaler` and return column 1 of `clf.predict_proba`."""
    return clf.predict_proba(scaler.transform(df[feats]))[:, 1]


def _both_classifier_scores(df):
    """Return (bc-vs-udsg score, b-vs-c score) for every row of `df`."""
    return (
        _positive_class_scores(df, clf_bc_vs_udsg, feats_bc_vs_udsg, scaler_bc_vs_udsg),
        _positive_class_scores(df, clf_b_vs_c, feats_b_vs_c, scaler_b_vs_c),
    )


# One call per flavour replaces six copy-pasted transform/predict stanzas that
# shared a scratch variable `X`; the result names are unchanged.
y_b_proba_bc_vs_udsg, y_b_proba_b_vs_c = _both_classifier_scores(df_b)
y_c_proba_bc_vs_udsg, y_c_proba_b_vs_c = _both_classifier_scores(df_c)
y_udsg_proba_bc_vs_udsg, y_udsg_proba_b_vs_c = _both_classifier_scores(df_udsg)

# Report performance

## 2D histos / scatterplots for each flavour

In [None]:
# At most `n` jets per flavour are drawn, each as a semi-transparent pixel.
n = 100000
alpha = 0.25

fig, ax = plt.subplots(figsize=(10, 8))
# Drawing order is udsg, c, b, so b-jets end up on top.
for flavour, colour, scores_bc_vs_udsg, scores_b_vs_c in (
    ('udsg', 'b',      y_udsg_proba_bc_vs_udsg, y_udsg_proba_b_vs_c),
    ('c',    'orange', y_c_proba_bc_vs_udsg,    y_c_proba_b_vs_c),
    ('b',    'r',      y_b_proba_bc_vs_udsg,    y_b_proba_b_vs_c),
):
    ax.plot(scores_bc_vs_udsg[:n], scores_b_vs_c[:n], ',', c=colour, alpha=alpha)
    # Empty proxy line: pixel markers at low alpha would be invisible in the
    # legend, and an empty line does not influence the axis limits.
    ax.plot([], [], 'o', c=colour, label=f'{flavour}-jets')
ax.set_xlabel('score bc vs udsg')
ax.set_ylabel('score b vs c')
# Without a legend the three colours could not be attributed to a flavour.
# A fixed location avoids the slow loc='best' search over all plotted points.
ax.legend(loc='upper left')
fig.savefig(f'scores_2BDTs_all-flavours_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')

In [None]:
# One 2D histogram of (bc-vs-udsg score, b-vs-c score) per flavour, each
# saved to its own PNG.
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
                                        [y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                        [y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c],
                                        ['udsg',                  'c',                  'b'],
                                    ):
    plt.figure(figsize=(7,5))
    # The colour limits go into the norm itself: passing `norm=` together with
    # `vmin`/`vmax` is deprecated since Matplotlib 3.3 and raises ValueError
    # from 3.5 on. The same fixed log scale (10..3000 counts) is used for
    # every flavour so the three plots are directly comparable.
    plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(vmin=10, vmax=3000));
    plt.colorbar()
    plt.xlabel('score bc vs udsg')
    plt.ylabel('score b vs c')
    plt.title(f'{flavour}-jets')
    plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')


## Performance plots for given thresholds
TODO