# Load modules

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from keras.layers import Input, Dense
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy.stats import percentileofscore

import pyod
from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
# from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.loci import LOCI
from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.xgbod import XGBOD

import lime
import lime.lime_tabular
import shap
shap.initjs()
from helper.pdp import compute_pdp, plot_pdp, plot_ice
from helper.utilis import stat_descr

from time import time
from copy import deepcopy
import dill
import warnings
warnings.filterwarnings("default", category=FutureWarning, module='pyod')
warnings.filterwarnings("default", category=FutureWarning, module='sklearn')

from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the notebook output, rendered as Markdown.

    The text is wrapped in an IPython ``Markdown`` object and passed to
    ``display``; nothing is returned.
    """
    rendered = Markdown(string)
    display(rendered)

# Config

In [None]:
### DATA

# Period tag; it is interpolated into the input CSV path and the output file name.
period = "LHC18f"
# True  -> the detector is fitted only on rows with bad == 0,
# False -> the detector is fitted on every row.
# train_on_good = True
train_on_good = False
# Centering and unit-variance scaling are both switched off.
scaler = StandardScaler(with_std=False, with_mean=False)


### MODEL

# Keep exactly one `out_det = ...` line active; the commented lines are
# alternative detectors kept for quick switching.
# out_det = ABOD(n_neighbors=5)
# out_det = CBLOF(n_clusters=8)
# out_det = HBOS(n_bins=10)
out_det = IForest(n_estimators=10)
# out_det = KNN(n_neighbors=100)
# out_det = LOCI(alpha=0.005, k=3)
# out_det = LOF()
# out_det = LSCP()
# out_det = MCD()
# out_det = OCSVM(gamma=1e-5)
# out_det = PCA(n_components=5, whiten=True)
# out_det = SOS()
# out_det = XGBOD()

# AutoEncoder variant: the three settings below are also read when the
# output file name is built, so uncomment them together with `out_det`.
# hidden_neurons = [64,32,16,32,64]
# dropout_rate = 0.2
# epochs = 100
# out_det = AutoEncoder(hidden_neurons=hidden_neurons, 
#                       dropout_rate=dropout_rate, 
#                       validation_size=0.2,
#                       epochs=epochs, 
#                       l2_regularizer=0.,
#                       verbose=2)


### OTHER
# Forwarded as `method=` to `out_det.predict_proba` inside `score`.
proba_method = "unify"
# Gates the SHAP cell and the SHAP entries of the saved bundle.
compute_shap = False


### OUTPUT

# One-line summary of the run configuration; it is stored in the saved bundle.
description = f"model={out_det};scaler={scaler};proba_method={proba_method};period={period}"
train_on_str = 'train-on-good' if train_on_good else 'train-on-all'
shap_str = 'withSHAP' if compute_shap else 'noSHAP'

# Derive the model part of the file name from the detector that is actually
# configured.  A fixed "IF-10" tag would label every non-AutoEncoder model as
# a 10-tree isolation forest and let different models overwrite each other.
if isinstance(out_det, AutoEncoder):
    # hidden_neurons / dropout_rate / epochs are the module-level settings of
    # the MODEL section; they must be uncommented along with the AutoEncoder.
    layers_str = '-'.join(str(hn) for hn in hidden_neurons)
    dropout_str = str(dropout_rate).replace('.', '')
    model_tag = f"AE-{layers_str}_dropout{dropout_str}_epochs{epochs}"
elif isinstance(out_det, IForest):
    # Yields "IF-10" for the default configuration, as before.
    model_tag = f"IF-{out_det.n_estimators}"
else:
    model_tag = type(out_det).__name__

out_fname = f"storage/model_{period}_{model_tag}_{train_on_str}_{shap_str}.dill"

print(out_fname)

# Prepare input

In [None]:
%%time 

# Load the merged trending table for the configured period.
fname_df_orig = f'data/trending_merged_{period}_withGraphs.csv'
df_orig = pd.read_csv(fname_df_orig)

# Column of df_orig that is used as the label; it is exposed as 'bad' below.
target_col = 'alias_global_Warning'
#----------


def rename(c):
    """Map the target column name to 'bad'; return any other name unchanged."""
    return 'bad' if c == target_col else c


# Column selection:
#   * the target column is always kept;
#   * 'dataType.fString' is dropped;
#   * any other column whose name contains 'gr', 'alias' or 'Unnamed'
#     (plain substring match) is dropped.
excluded_tokens = ('gr', 'alias', 'Unnamed')
kept_cols = [
    c for c in df_orig.columns
    if c == target_col
    or (c != 'dataType.fString' and not any(tok in c for tok in excluded_tokens))
]
df = df_orig[kept_cols]
df.columns = list(map(rename, df.columns))

# Boolean row masks for the two label values.
bad_idx = df['bad'] == 1
good_idx = df['bad'] == 0

# Per-row bookkeeping columns, captured here and stored alongside the model
# in the saved bundle.
period_lst = df['period.fString']
start_lst = df['chunkStart']
id_lst = df['chunkID']
run_lst = df['run']

In [None]:
# The variable names are taken from the column labels of the parsed file
# (the ':'-separated header), not from its rows.
sens_table = pd.read_csv('data/sensitive_variables_list.csv', sep=':')
# Each label loses its last two characters.
# NOTE(review): presumably a fixed two-character suffix in the list file -- confirm.
sens_vars = [label[:-2] for label in sens_table.columns]
sens_vars.append('bad')  # keep the label column next to the features

df = df[sens_vars]

# Train model

In [None]:
%%time


# Feature table: everything in df except the label and the four ROC-side columns.
dropped_cols = ['bad', 'oroc_A_side', 'oroc_C_side', 'iroc_A_side', 'iroc_C_side']
data = df.drop(columns=dropped_cols)

# Pass the raw values through the configured scaler, then restore the column
# names so downstream code can refer to features by name.
x = data.to_numpy()
x_s = scaler.fit_transform(x)
X = pd.DataFrame(x_s, columns=data.columns)
col_names = X.columns
y = df['bad']

# Fit on the good rows only (y == 0) or on the full sample, as configured.
# NOTE(review): the boolean mask `y == 0` is aligned to X by index; this assumes
# df still carries the default integer index from read_csv -- confirm.
training_set = X[y == 0] if train_on_good else X
out_det.fit(training_set)

# end of training

    

def score(X):
    """Return column 1 of ``out_det.predict_proba`` for the rows of ``X``.

    The module-level ``out_det`` and ``proba_method`` are looked up at call
    time.  Besides scoring the training sample, this function is also the
    model callable given to the SHAP explainer.
    """
    proba = out_det.predict_proba(X, method=proba_method)
    return proba[:, 1]

# Score every row and print summary statistics of the score distribution.
scores = score(X)
stat_descr(scores, quantiles=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.])

# Bin edges are computed once from all scores and shared by the four panels,
# so the histograms are directly comparable.
bins = np.histogram(scores, 30)[1]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
hist_style = dict(bins=bins, histtype='step', lw=2)

# Top row: all rows together (black).
axes[0][0].hist(scores, density=True, color='k', **hist_style)
axes[0][0].set_title('normalized')

axes[0][1].hist(scores, density=False, color='k', **hist_style)
# `nonpositive` replaces the `nonposy` keyword, which was deprecated in
# Matplotlib 3.3 and later removed.
axes[0][1].set_yscale("log", nonpositive='clip')
axes[0][1].set_title('unnormalized (log y)')

# Bottom row: good rows (blue) versus bad rows (red).
for panel, normalized in ((axes[1][0], True), (axes[1][1], False)):
    panel.hist(scores[good_idx], density=normalized, color='b', **hist_style)
    panel.hist(scores[bad_idx], density=normalized, color='r', **hist_style)

axes[1][0].set_title('normalized by class')
axes[1][1].set_yscale("log", nonpositive='clip')
# The trailing semicolon suppresses the notebook echo of the last expression.
axes[1][1].set_title('unnormalized (log y)');

# Compute Shapley (SHAP) values

In [None]:
if compute_shap:
    # Kernel explainer with the background data summarised by k-means centroids.
    k_in_kmeans = 10
    X_summary = shap.kmeans(X, k=k_in_kmeans)
    shap_explainer = shap.KernelExplainer(score, X_summary)
    # `l1_reg` is an option of `shap_values`; when given to the constructor it
    # ends up in its **kwargs and is not applied.
    # NOTE(review): confirm against the installed shap version.
    # Original author's timing note: ~2-4 it/sec for k=5-20 in k-means.
    shap_values = shap_explainer.shap_values(X, l1_reg='num_features(10)')

# Save model data

In [None]:
# X_summary and the score function are not saved as separate entries: per the
# original author's notes they are reachable through the explainer object
# (as `.data` and `.model.f`).
# NOTE(review): confirm against the installed shap version.
#
# `scores` is saved so that a reloaded shap_explainer can be validated against it.

model_data = {
    # LEGACY alias of out_det.  Original note: it has to be loaded back into a
    # variable named `clf`, otherwise shap_explainer will be invalid.
    # NOTE(review): `score` in this notebook refers to `out_det`, not `clf`.
    'clf': out_det,
    'out_det': out_det,
    'X': X,
    'y': y,
    'scaler': scaler,
    'col_names': col_names,
    'fname_df_orig': fname_df_orig,
    'scores': scores,
    'description': description,
    'proba_method': proba_method,
    'run_lst': run_lst,
    'id_lst': id_lst,
    'start_lst': start_lst,
    'period_lst': period_lst,
}

# The SHAP objects exist only when the SHAP cell actually ran.
if compute_shap:
    model_data.update(
        shap_explainer=shap_explainer,
        shap_values=shap_values,
        k_in_kmeans=k_in_kmeans,
    )


In [None]:
import os

# Create the target directory first: a missing folder would make `open` fail
# and throw away the fitted model (and any SHAP values) computed above.
out_dir = os.path.dirname(out_fname)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Serialise the whole bundle with dill at the highest available protocol.
with open(out_fname, 'wb') as f:
    dill.dump(model_data, f, dill.HIGHEST_PROTOCOL)