# Load modules

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from keras.layers import Input, Dense
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy.stats import percentileofscore

import pyod
from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.loci import LOCI
from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.xgbod import XGBOD

import lime
import lime.lime_tabular
import shap
shap.initjs()
from helper.pdp import compute_pdp, plot_pdp, plot_ice
from helper.utilis import stat_descr

from os import path
from time import time
from copy import deepcopy
import dill
import pickle
import warnings
warnings.filterwarnings("default", category=FutureWarning, module='pyod')
warnings.filterwarnings("default", category=FutureWarning, module='sklearn')

from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

# Load model

In [None]:
# Period identifier; interpolated into the fit-result CSV path and the
# fit-plot file names further down in this notebook.
period = 'LHC18f'

In [None]:
# Load the persisted model bundle for the selected `period`.
# The path is built from `period` so that changing the period in one place is
# enough, and the file is opened in a context manager so the handle is closed
# even if unpickling fails.
# NOTE: dill (like pickle) can execute arbitrary code while loading -- only
# open bundles that come from a trusted source.
with open(f'storage/model_{period}_IF-10_train-on-all_noSHAP.dill', 'rb') as model_file:
    model_data = dill.load(model_file)

In [None]:
# Unpack the entries of the loaded model bundle into notebook-level names.
clf            = model_data['clf']
X              = model_data['X']
y              = model_data['y']
scaler         = model_data['scaler']
col_names      = model_data['col_names']
description    = model_data['description']
fname_df_orig  = model_data['fname_df_orig']
scores         = model_data['scores']
proba_method   = model_data['proba_method']

# SHAP artefacts are optional; they are only unpacked when the bundle has them.
has_shap = 'shap_explainer' in model_data
if has_shap:
    shap_explainer = model_data['shap_explainer']
    shap_values    = model_data['shap_values']
    k_in_kmeans    = model_data['k_in_kmeans']
    # Evaluate the explainer's wrapped model once and reuse the result.
    shap_model_scores = shap_explainer.model.f(X)
    # Print the largest deviation *before* asserting so that it is visible
    # when the consistency check fails, not only when it passes.
    print((shap_model_scores - scores).max())
    assert all(shap_model_scores == scores), \
        'SHAP explainer model output differs from the stored scores'

# Per-sample bookkeeping table. The frame is built column by column so every
# column keeps its own dtype; stacking everything into one 2-D NumPy array
# would coerce all columns to a single common dtype (strings, if any column
# holds strings).
# Assumes every listed entry is a 1-D sequence of the same length -- TODO confirm.
cols = ['scores', 'period_lst', 'run_lst', 'id_lst', 'start_lst', 'y']
df_model_columns = [c.replace('_lst', '').replace('id', 'chunk') for c in cols]
df_model = pd.DataFrame(
    {new_name: np.asarray(model_data[col]) for new_name, col in zip(df_model_columns, cols)},
    columns=df_model_columns,
)

# Original (unprocessed) data the model was built from.
df_orig = pd.read_csv(fname_df_orig)

In [None]:
# Boolean masks for the two label values stored in `y`.
good_idx = y == 0
bad_idx  = y == 1

# Recompute the scores from the classifier; column 1 of `predict_proba` is used
# (presumably the outlier-class probability -- confirm against the pyod docs).
scores = clf.predict_proba(X, method=proba_method)[:,1]

# Common bin edges so that all four panels are directly comparable.
bins = np.histogram(scores, 50)[1]
fig, axes = plt.subplots(2,2, figsize=(14,10))

hist_style = dict(bins=bins, histtype='step', lw=2)
by_class = [(scores[good_idx], 'b'), (scores[bad_idx], 'r')]

# (axis, title, normalise?, log-y?, [(values, colour), ...])
panels = [
    (axes[0][0], 'normalized',                    True,  False, [(scores, 'k')]),
    (axes[0][1], 'unnormalized (log y)',          False, True,  [(scores, 'k')]),
    (axes[1][0], 'normalized by class',           True,  False, by_class),
    (axes[1][1], 'unnormalized by class (log y)', False, True,  by_class),
]

for ax, title, normalise, log_y, series in panels:
    for values, colour in series:
        ax.hist(values, density=normalise, color=colour, **hist_style)
    if log_y:
        # The former `nonposy='clip'` keyword was renamed and later removed in
        # Matplotlib; clipping non-positive values is the default behaviour.
        ax.set_yscale("log")
    ax.set_title(title);

# Join model's training data and fitting results

In [None]:
# Scratch example (disabled): outer join of two toy frames indexed on
# ('run', 'chunk') via DataFrame.join with column suffixes.
# df1 = pd.DataFrame({'run':[123,123,123,124,124,125,111], 'chunk':[1,2,3,1,2,1,1], 'dummy1':[11,12,13,14,15,16,100]})
# df2 = pd.DataFrame({'run':[123,123,123,124,124,125,222], 'chunk':[1,2,3,1,2,1,1], 'dummy2':[21,22,23,24,25,26,200]})
# # pd.concat([df1,df2], join='outer', keys=['chunk', 'run'])
# df1 = df1.set_index(['run', 'chunk'], drop=False)
# df2 = df2.set_index(['run', 'chunk'], drop=False)
# # df1
# df12 = df1.join(df2, how='outer',
#          lsuffix='', rsuffix='_2')
# df12

In [None]:
from IPython.display import display, Image

# Only fits performed with this histogram binning are considered.
nbins = 100
fits_csv = f'data_validation_V0s/fitting/fit_results_{period}.csv'
df_fits = pd.read_csv(fits_csv).query('nbins == @nbins')

# Split the fits: rows matching `fail_condition` are set aside for inspection,
# the remaining rows stay in `df_fits`.
fail_condition = 'mu < 495 or counts < 1000 or bound == True or sigma < 3'
potential_fails = df_fits.query(fail_condition)
df_fits = df_fits.query(f'not ({fail_condition})')

# Show the saved fit plot of every set-aside fit, when the image exists.
for _, row in potential_fails.iterrows():
    run = row['run']
    chunk = row['chunk']
    plot_name = f'data_validation_V0s/fitting/plots/{period}/fit_K0s_{period}_{run:.0f}_{chunk:03.0f}_nbins{nbins}.png'
    print(plot_name)
    if not path.isfile(plot_name):
        print(f'no such file: {plot_name}')
        continue
    display(Image(filename=plot_name))

print('dataframe of failed fits (failed based on `fail_condition`)')
potential_fails


In [None]:
# Index both frames on (run, chunk) -- keeping the key columns as well -- and
# keep only the (run, chunk) pairs present in both frames.
join_keys = ['run', 'chunk']
df_fits = df_fits.set_index(join_keys, drop=False)
df_model = df_model.set_index(join_keys, drop=False)
df_merged = df_model.join(
    df_fits,
    how='inner',
    lsuffix='',
    rsuffix='_fromFit',
)
print('columns in merged df: ', list(df_merged.columns))
print('merged df')
df_merged.head()

The `global_Warning` labels carried by the two dataframes should be identical, so the query below is expected to return no rows.  
`y` comes from the model dataframe and `bad` comes from the fitting dataframe.

In [None]:
# Rows where the two label columns disagree.
df_merged.loc[:, ['y', 'bad']].query('y != bad')

# V0 validation

In [None]:
def plot_V0(q_thresh):
    """Pair-plot the fit parameters, coloured by a thresholded score flag.

    The threshold value is the ``q_thresh`` quantile of ``df_merged['scores']``.
    Each row is flagged 1 when its score is at or above that value and 0
    otherwise; the flag is written to ``df_merged['scores_2']`` (the global
    frame is modified). Only rows with ``counts > 2000`` are plotted.

    Parameters
    ----------
    q_thresh : float
        Quantile of the score distribution used as the threshold.
    """
    score_col = df_merged['scores']
    thresh_val = np.quantile(score_col, q_thresh)
    print(f'threshold quantile={q_thresh:.3f} \t threshold value={thresh_val:.3f} ')

    # 1 = score at/above threshold, 0 = below; assigned positionally.
    df_merged['scores_2'] = (score_col >= thresh_val).astype('int64').values

    sns.set(font_scale=1.3)
    sns.set_style('white')

    fit_vars = ['mu', 'sigma', 'mu_err', 'counts']
    well_populated = df_merged.query('counts > 2000')
    # Pick two colours (entries 5 and -1) out of a 15-step diverging palette.
    two_tone = np.array(sns.diverging_palette(260, 15, s=99, l=40, sep=1, n=15))[[5,-1]]
    sns.pairplot(
        well_populated,
        x_vars=fit_vars,
        y_vars=fit_vars,
        hue='scores_2',
        palette=two_tone,
        aspect=1,
        height=2.5,
        diag_kws={'bw': 'silverman'},
        plot_kws={'s': 50, 'edgecolor': 'w', 'alpha': 0.7},
    )
    
# Slider that selects the threshold quantile; the plot is only redrawn when the
# slider is released (continuous_update=False).
wg_q_thresh = widgets.FloatSlider(
    value=0.95,
    min=0.7,
    max=1,
    step=0.01,
    description='threshold quantile',
    continuous_update=False,
)
ui_q = widgets.HBox([wg_q_thresh])
out = widgets.interactive_output(plot_V0, dict(q_thresh=wg_q_thresh))
display(ui_q, out)

In [None]:
def score2cat(score):
    """Map a score to a percentile category in 0..6.

    The percentile of ``score`` within ``df_merged['scores']`` is computed
    with ``percentileofscore(..., 'mean')``. The category is the position of
    the first upper edge in (50, 80, 90, 94, 96, 98) that the percentile is
    strictly below, or 6 when it is below none of them.
    """
    perc = percentileofscore(df_merged['scores'], score, 'mean')
    upper_edges = (50, 80, 90, 94, 96, 98)
    for category, upper in enumerate(upper_edges):
        if perc < upper:
            return category
    return len(upper_edges)
# Legend text for each category returned by `score2cat`; keys are the category
# numbers as strings because that is how they appear as legend texts.
score2cat_labels = {'0':'< 50', '1':'50-80', '2':'80-90', '3':'90-94', '4':'94-96', '5':'96-98', '6':'> 98'}

# Categorise every row by the percentile of its score. Only the 'scores'
# column is needed, so map over that Series instead of applying row-wise over
# the whole frame.
df_merged['scores_2'] = df_merged['scores'].map(score2cat)

# Distribution of the score percentiles themselves ...
score_percentiles = [percentileofscore(df_merged['scores'], a, 'mean') for a in df_merged['scores']]
plt.hist(score_percentiles, bins=20)
plt.xlabel('score percentiles')
# ... and, in a separate figure, of the derived categories.
plt.figure()
plt.hist(df_merged['scores_2'], bins=20)
plt.xlabel('bins of score percentiles')


fit_vars = ['mu', 'sigma', 'mu_err',  'counts']
g = sns.pairplot(df_merged,
             x_vars=fit_vars,
             y_vars=fit_vars,
             hue='scores_2',
#              palette=np.array(sns.diverging_palette(260, 15, s=99, l=40, sep=1, n=11))[-7:],
             palette='coolwarm',
             aspect=1, height=2.5,
             diag_kws=dict(bw='silverman'),
             plot_kws=dict(s=50, edgecolor='w', alpha=0.8))

# Replace the numeric category in each legend entry with its percentile range.
# Entries without a known label keep their text instead of raising KeyError.
if g._legend is not None:
    for t in g._legend.texts:
        t.set_text(score2cat_labels.get(t.get_text(), t.get_text()))