# Load modules

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from keras.layers import Input, Dense
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy.stats import percentileofscore

import pyod
from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.loci import LOCI
from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.xgbod import XGBOD

import lime
import lime.lime_tabular
import shap
# Initialise SHAP's JavaScript support once, right after importing it
# (presumably needed for the interactive SHAP plots in the notebook - confirm).
shap.initjs()
from helper.pdp import compute_pdp, plot_pdp, plot_ice

from time import time
from copy import deepcopy
import dill
import warnings
# Apply the "default" warning action to FutureWarnings attributed to
# pyod and sklearn modules.
warnings.filterwarnings("default", category=FutureWarning, module='pyod')
warnings.filterwarnings("default", category=FutureWarning, module='sklearn')

from IPython.display import Markdown, display
def printmd(string):
    """Render *string* as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)

# Prepare input

In [None]:
def stat_descr(arr, quantiles=(0, 0.25, 0.5, 0.75, 1.)):
    """Print a short text summary of a numeric sample.

    The report contains the number of values, ``mean +/- std`` and a small
    table of quantiles (header row in percent, value row underneath).

    Parameters
    ----------
    arr : array-like
        Numeric values to summarise; must support ``len``.
    quantiles : iterable of float, optional
        Quantile levels handed to ``np.quantile``. Any iterable is accepted
        (list, tuple, generator, ...).

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    # Materialise once: the levels are iterated twice and counted below, which
    # would exhaust a generator; it also keeps the (immutable) default untouched.
    quantiles = list(quantiles)

    n = len(arr)
    mu = np.mean(arr)
    std = np.std(arr)
    q_vals = np.quantile(arr, quantiles)

    def custom_format(x):
        # Values above 50 are printed without decimals; both forms are
        # centred in a 12-character column.
        if x > 50:
            return '{:^12.0f}'.format(x)
        return '{:^12.2f}'.format(x)

    q_str_perc = '|'.join('{:8.0f}%   '.format(q * 100) for q in quantiles)
    q_str_vals = '|'.join(custom_format(q) for q in q_vals)
    line = '-------------' * len(quantiles)
    print(f'{n} values\n{mu:.3f} +/- {std:.3f}\n{q_str_perc}\n{line}\n{q_str_vals}\n')
    
# x = (np.random.randn(100)+5)
# stat_descr(x)
# stat_descr(x, quantiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])

In [None]:
%%time 
# Input table and the column that is used as the good/bad label.
fname_df_orig = 'data/trending_merged_LHC18q_withGraphs.csv'
target_col = 'alias_global_Warning'
df_orig = pd.read_csv(fname_df_orig)


def rename(c):
    """Return 'bad' for the target column and the unchanged name otherwise."""
    return 'bad' if c == target_col else c


# Keep the target column, plus every column that is not 'dataType.fString'
# and whose name contains none of the substrings 'gr', 'alias', 'Unnamed'.
excluded_substrings = ('gr', 'alias', 'Unnamed')
kept_cols = [
    c for c in df_orig.columns
    if c == target_col
    or (c != 'dataType.fString'
        and not any(sub in c for sub in excluded_substrings))
]
df = df_orig[kept_cols]
df.columns = list(map(rename, df.columns))

# Boolean row masks for the two label values.
good_idx = df['bad'].eq(0)
bad_idx = df['bad'].eq(1)

In [None]:
# The variable names are taken from the column labels of the ':'-separated
# file; the last two characters of every label are cut off
# (presumably a type suffix - confirm against the file).
sens_vars = [
    label[:-2]
    for label in pd.read_csv('data/sensitive_variables_list.csv', sep=':').columns
]
sens_vars.append('bad')

# Restrict the table to the selected variables plus the label column.
df = df[sens_vars]

# Train model

In [None]:
%%time
# out_det = KNN(n_neighbors=3, method='mean')  # method, n_neighbours


# Feature table: everything except the label and the four ROC side columns.
data = df.drop(
    columns=['bad', 'oroc_A_side', 'oroc_C_side', 'iroc_A_side', 'iroc_C_side']
)
x = data.to_numpy()

# Both with_mean and with_std are switched off here, i.e. the scaler is
# configured to neither centre nor rescale the features.
# Alternative kept from the original cell:
# data_s = (data - data.mean()) / data.std()
scaler = StandardScaler(with_mean=False, with_std=False)
x_s = scaler.fit_transform(x)

# Model input (with the feature names restored) and the label column.
X = pd.DataFrame(x_s, columns=data.columns)
col_names = X.columns
y = df['bad']

# out_det = ABOD(n_neighbors=15)
# out_det = CBLOF()
# out_det = HBOS()
# out_det = IForest(100, contamination=0.04)
# out_det = KNN(n_neighbors=5)
# out_det = LOCI()
# out_det = LOF()
# out_det = LSCP()
# out_det = MCD()
# out_det = OCSVM(gamma=0.1)
# out_det = PCA(n_components=10, whiten=True)
# out_det = SOS()
# out_det = XGBOD()
# Active outlier detector: an auto-encoder fitted on the full feature table.
# Keyword arguments tried earlier and currently not passed:
#   epochs=100, l2_regularizer=0.
out_det = AutoEncoder(
    hidden_neurons=[16, 4, 16],
    dropout_rate=0.4,
    validation_size=0.2,
    verbose=2,
)
out_det.fit(X)
# scores = out_det.decision_scores_

####
# SPLIT HERE
####

# Labels used in the output file names of the figures below.
modelname = 'AE'
method = 'linear'

# `score` reads the detector through the global name `clf`.
clf = out_det


def score(X):
    """Return column 1 of ``clf.predict_proba(X, method=method)``.

    Reads the module-level names ``clf`` and ``method``.
    """
    proba = clf.predict_proba(X, method=method)
    return proba[:, 1]


scores = score(X)
stat_descr(scores, quantiles=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.])

# Bin edges computed once from all scores (30 bins) and shared by every panel.
bins = np.histogram(scores, 30)[1]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top row: all scores together (density on the left, raw counts on the right).
axes[0][0].hist(scores, bins=bins, histtype='step', lw=2, density=True, color='k')
axes[0][0].set_title('normalized')

axes[0][1].hist(scores, bins=bins, histtype='step', lw=2, density=False, color='k')
# `nonposy` was deprecated in Matplotlib 3.3 and removed in later releases;
# `nonpositive` is the replacement keyword (needs Matplotlib >= 3.3).
axes[0][1].set_yscale("log", nonpositive='clip')
axes[0][1].set_title('unnormalized (log y)')

# Bottom row: the same two views split by label (blue: bad == 0, red: bad == 1).
axes[1][0].hist(scores[good_idx], bins=bins, histtype='step', lw=2, density=True, color='b')
axes[1][0].hist(scores[bad_idx], bins=bins, histtype='step', lw=2, density=True, color='r')
axes[1][0].set_title('normalized by class')

axes[1][1].hist(scores[good_idx], bins=bins, histtype='step', lw=2, density=False, color='b')
axes[1][1].hist(scores[bad_idx], bins=bins, histtype='step', lw=2, density=False, color='r')
axes[1][1].set_yscale("log", nonpositive='clip')
# Trailing ';' keeps the notebook from echoing the returned Text object.
axes[1][1].set_title('unnormalized (log y)');

In [None]:
# Score distribution of all entries: density (left) and raw counts on a
# logarithmic y-axis (right). Reuses the shared `bins` edges.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(scores, bins=bins, histtype='step', lw=2, density=True, color='k')
axes[0].set_title('normalized')

axes[1].hist(scores, bins=bins, histtype='step', lw=2, density=False, color='k')
# `nonposy` was deprecated in Matplotlib 3.3 and removed in later releases;
# `nonpositive` is the replacement keyword (needs Matplotlib >= 3.3).
axes[1].set_yscale("log", nonpositive='clip')
axes[1].set_title('unnormalized (log y)')

fig.savefig(f'graphics/{modelname}_{method}_bw.png')

In [None]:
# Score distribution split by label (blue: bad == 0, red: bad == 1):
# per-class density (left) and raw counts on a logarithmic y-axis (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(scores[good_idx], bins=bins, histtype='step', lw=2, density=True, color='b')
axes[0].hist(scores[bad_idx], bins=bins, histtype='step', lw=2, density=True, color='r')
axes[0].set_title('normalized by class')

axes[1].hist(scores[good_idx], bins=bins, histtype='step', lw=2, density=False, color='b')
axes[1].hist(scores[bad_idx], bins=bins, histtype='step', lw=2, density=False, color='r')
# `nonposy` was deprecated in Matplotlib 3.3 and removed in later releases;
# `nonpositive` is the replacement keyword (needs Matplotlib >= 3.3).
axes[1].set_yscale("log", nonpositive='clip')
axes[1].set_title('unnormalized (log y)')

fig.savefig(f'graphics/{modelname}_{method}_color.png')

# Compute SHAP (Shapley) values

In [None]:


# Kernel explainer whose background data is a k-means summary of X.
k_in_kmeans = 10
X_summary = shap.kmeans(X, k=k_in_kmeans)
shap_explainer = shap.KernelExplainer(model=score, data=X_summary)

# Author's timing note: ~2-4 it/sec for k=5-20 in k-means.
shap_values = shap_explainer.shap_values(X)

# Save model data

In [None]:
# X_summary is already stored in shap_explainer.data
# the score function is already stored in shap_explainer.model.f

# scores are stored so that a reloaded shap_explainer can be validated against them



# Human-readable summary of the stored objects.
description = f"model={clf};scaler={scaler};shap_explainer={shap_explainer};k_in_kmeans={k_in_kmeans}"

# Everything that gets written to disk in one dictionary.
model_data = {
    # has to be read into `clf` (like in `score` function),
    # otherwise shap_explainer will be invalid
    'clf': clf,
    'X': X,
    'y': y,
    'scaler': scaler,
    'col_names': col_names,
    'fname_df_orig': fname_df_orig,
    'shap_explainer': shap_explainer,
    'shap_values': shap_values,
    'k_in_kmeans': k_in_kmeans,
    'scores': scores,
    'description': description,
}

In [None]:
# Serialise the collected model data with dill, using its highest protocol.
# NOTE(review): the file name mentions "IF-1000" and ends in ".dill.dill"
# while `modelname` above is 'AE' - confirm this is the intended target.
out_fname = 'storage/model_IF-1000_k10.dill.dill'
with open(out_fname, mode='wb') as f:
    dill.dump(model_data, f, protocol=dill.HIGHEST_PROTOCOL)