# Load modules

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import adadelta

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import mean_squared_error

from scipy.stats import percentileofscore

from livelossplot import PlotLossesKeras

from helper import ae_errors
from helper.utilis import show_qa_plots

In [None]:
from IPython.core.display import HTML
style = """
<style>
div.output_area {
    overflow-y: scroll;
}
div.output_area img {
    max-width: unset;
}
</style>
"""

def make_cell_scrollable():
    """Inject the CSS held in ``style`` so that output areas scroll vertically.

    The HTML object must be handed to ``display``: an object that is merely
    constructed inside a function body is never rendered by the notebook.
    """
    # Local import keeps the function usable regardless of cell execution order.
    from IPython.display import HTML, display
    display(HTML(style))
    
from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

# Input preparation

In [None]:
%%time 
df_orig = pd.read_csv('data/trending_merged_LHC18q_withGraphs.csv')

target_col = 'alias_global_Warning'
#----------
# Keep the target flag, plus every column whose name contains none of
# 'gr', 'alias', 'Unnamed' and which is not 'dataType.fString'.
# .copy() makes df an independent frame: columns are added to it further down
# the notebook, which would otherwise raise SettingWithCopyWarning.
df = df_orig[[c for c in df_orig.columns if
              ('gr' not in c and 'alias' not in c and 'Unnamed' not in c)
              and c != 'dataType.fString'
              or c == target_col
             ]].copy()
# The target flag is exposed under the short name 'bad'.
rename = lambda c: c if c != target_col else 'bad'
df.columns = [rename(c) for c in df.columns]

In [None]:
# Columns that are never fed to the model, regardless of their content.
nonphysical_cols = ['run', 'chunkID', 'time', 
                   'year', 'period.fString', 'pass.fString', 'runType.fString', 
                   'startTimeGRP', 'stopTimeGRP', 'duration', 
                   'iroc_A_side', 'oroc_A_side', 'iroc_C_side', 'oroc_C_side',
                   'chunkStart', 'chunkStop', 'chunkMean', 'chunkMedian', 'chunkRMS', 'chunkDuration']

# (Near-)constant columns carry no information. numeric_only=True skips the
# string columns, on which recent pandas versions raise instead of dropping them.
col_std = df.std(numeric_only=True)
no_variance_cols = col_std[col_std < 1e-6].index.tolist()
cols_exclude = nonphysical_cols + no_variance_cols

# List the columns that survive the exclusion.
for c in df.columns:
    if c not in cols_exclude:
        print(c)

In [None]:
SEED = 42  # fixed so that the noise columns and the data splits are identical on every run
rng = np.random.RandomState(SEED)

input_data = df[[c for c in df.columns if c not in cols_exclude]].drop('bad', axis=1)

# Append five columns of standard-normal noise.
for i in range(5):
    input_data[f'random{i}'] = rng.randn(len(input_data))

x = input_data.to_numpy()
y = df['bad'].to_numpy()

# Rows flagged bad are used for testing only; the rows with bad == 0 are split
# 90/10 into (train + val) / test_good, then (train + val) again 90/10.
x_test_bad = x[y == 1]
x_train_val_good, x_test_good = train_test_split(x[y == 0], test_size=0.1, random_state=SEED)
x_train, x_val = train_test_split(x_train_val_good, test_size=0.1, random_state=SEED)  # x_val are GOOD samples used to monitor overfitting

# The scaler is fitted on the training rows only and then applied to every set.
scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler.fit(x_train)
x_train     = scaler.transform(x_train)
x_val       = scaler.transform(x_val)
x_test_good = scaler.transform(x_test_good)
x_test_bad  = scaler.transform(x_test_bad)
x_all       = scaler.transform(x)

In [None]:
# Normalised tpcItsMatchHighPtA distributions for three selections of df_orig,
# drawn on a logarithmic y axis.
bins = np.linspace(0.6, 1, 30)
selections = [
    ('alias_global_Warning == 1', 'r'),
    ('alias_global_Outlier == 1', 'k'),
    ('alias_global_Warning == 0', 'blue'),
]
for selection, colour in selections:
    values = df_orig.query(selection)['tpcItsMatchHighPtA']
    plt.hist(values, histtype='step', color=colour, bins=bins, density=1)
plt.semilogy()

## InteractionRate viz. and binning

In [None]:
jitter_y = 1000
#-------------
# Query each subset once instead of three times per plotted series.
chunks_good = df.query('bad == 0')
chunks_bad = df.query('bad == 1')

plt.figure(figsize=(25,5))
# A uniform random offset in [0, jitter_y) is added to y to separate overlapping points.
plt.plot(chunks_good['chunkMean'],
         chunks_good['interactionRate'] + np.random.random(len(chunks_good))*jitter_y,
         '.', ms=2, color='b')
# The marker is given once ('x'); specifying it both in the format string and
# as marker= makes matplotlib warn about a redundant definition.
plt.plot(chunks_bad['chunkMean'],
         chunks_bad['interactionRate'] + np.random.random(len(chunks_bad))*jitter_y,
         'x', ms=8, color='r')
plt.xlabel('chunk mean time');
plt.ylabel('interactionRate');

In [None]:
# Normalised interactionRate distribution over all rows of df (50 bins).
irate_values = df['interactionRate']
plt.hist(irate_values, bins=50, histtype='step', color='b', density=1);

Division into 3/5 interaction rate bins:  
5 bins: (0-2),(2-4),(4-6),(6-7),(7-8)k  
3 bins: (0-4),(4-7),(7-8)k

In [None]:
def assign_irate_bin3(row):
    """Return the 1-based interactionRate bin of ``row`` in the 3-bin scheme.

    Bins are half-open: [.., 4000) -> 1, [4000, 7000) -> 2, [7000, ..) -> 3.
    The lower edge is inclusive so that a value sitting exactly on an edge is
    assigned to a bin instead of falling through all comparisons.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'interactionRate'``.

    Returns
    -------
    int or None
        Bin number, or None when the value is NaN (every comparison is False).
    """
    irate_val = row['interactionRate']
    if irate_val < 4000:
        return 1
    if irate_val < 7000:
        return 2
    if irate_val >= 7000:
        return 3
    return None
    
def assign_irate_bin5(row):
    """Return the 1-based interactionRate bin of ``row`` in the 5-bin scheme.

    Bins are half-open: [.., 2000) -> 1, [2000, 4000) -> 2, [4000, 6000) -> 3,
    [6000, 7000) -> 4, [7000, ..) -> 5. The lower edge is inclusive so that a
    value sitting exactly on an edge is assigned to a bin instead of falling
    through all comparisons.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'interactionRate'``.

    Returns
    -------
    int or None
        Bin number, or None when the value is NaN (every comparison is False).
    """
    irate_val = row['interactionRate']
    if irate_val < 2000:
        return 1
    if irate_val < 4000:
        return 2
    if irate_val < 6000:
        return 3
    if irate_val < 7000:
        return 4
    if irate_val >= 7000:
        return 5
    return None

# Label every row of df with its interactionRate bin in both binning schemes.
bin_assigners = {'irate_bin3': assign_irate_bin3, 'irate_bin5': assign_irate_bin5}
for bin_col, assign_bin in bin_assigners.items():
    df[bin_col] = df.apply(assign_bin, axis=1)

In [None]:
# Number of rows per interactionRate bin. The counts are computed once per
# scheme, and .get(i, 0) reports an empty bin as 0 instead of raising KeyError.
counts_bin3 = df["irate_bin3"].value_counts(sort=False)
for i in range(1,4):
    print(f'bin {i}/3: {counts_bin3.get(i, 0)}')
print()
counts_bin5 = df["irate_bin5"].value_counts(sort=False)
for i in range(1,6):
    print(f'bin {i}/5: {counts_bin5.get(i, 0)}')

In [None]:
# Index labels of the rows of df that fall into each interactionRate bin.
# 3-bin scheme:
bin3_of_row = df['irate_bin3']
irate_bin_31 = df.index[bin3_of_row == 1]
irate_bin_32 = df.index[bin3_of_row == 2]
irate_bin_33 = df.index[bin3_of_row == 3]

# 5-bin scheme:
bin5_of_row = df['irate_bin5']
irate_bin_51 = df.index[bin5_of_row == 1]
irate_bin_52 = df.index[bin5_of_row == 2]
irate_bin_53 = df.index[bin5_of_row == 3]
irate_bin_54 = df.index[bin5_of_row == 4]
irate_bin_55 = df.index[bin5_of_row == 5]

In [None]:
# irate_bin_31 = df.query('interactionRate < 4000').index   # indices of instances from interactionRate bin 1 out of 3
# irate_bin_32 = df.query('interactionRate > 4000 & interactionRate < 7000 ').index
# irate_bin_33 = df.query('interactionRate > 7000 ').index

# print(f'Counts per bin:\n1/3: {len(irate_bin_31)}\n2/3: {len(irate_bin_32)}\n3/3: {len(irate_bin_33)}')

In [None]:
# irate_bin_51 = df.query('interactionRate < 2000').index   # indices of instances from interactionRate bin 1 out of 5
# irate_bin_52 = df.query('interactionRate > 2000 & interactionRate < 4000 ').index
# irate_bin_53 = df.query('interactionRate > 4000 & interactionRate < 6000 ').index
# irate_bin_54 = df.query('interactionRate > 6000 & interactionRate < 7000 ').index
# irate_bin_55 = df.query('interactionRate > 7000 ').index

# print(f'Counts per bin:\n1/5: {len(irate_bin_51)}\n2/5: {len(irate_bin_52)}\n3/5: {len(irate_bin_53)}\n4/5: {len(irate_bin_54)}\n5/5: {len(irate_bin_55)}')

# Model training

In [None]:
# Dense autoencoder: input -> 64 -> 32 -> 16 (bottleneck) -> 32 -> 64 -> input.
input_size = x_train.shape[1]
coding_layers_sizes = [64,32]
bottleneck_size = 16

ae_input = Input(shape=(input_size,))

# Encoder: the coding layers followed by the bottleneck, all with ReLU.
encoded = ae_input
for n_units in coding_layers_sizes + [bottleneck_size]:
    encoded = Dense(n_units, activation='relu')(encoded)

# Decoder: the coding layers in reverse order (ReLU), then a linear layer
# with as many units as there are input features.
decoded = encoded
for n_units in coding_layers_sizes[::-1]:
    decoded = Dense(n_units, activation='relu')(decoded)
decoded = Dense(input_size, activation='linear')(decoded)

autoencoder = Model(ae_input, decoded)

In [None]:
# NOTE(review): this wraps the already-built tensors again, so re-running the
# cell presumably continues from the current weights rather than restarting;
# rebuild the layers first for a fresh training run — confirm.
autoencoder = Model(ae_input, decoded)
autoencoder.compile(optimizer=adadelta(lr=0.2), loss='mean_squared_error')

# The network is trained to reproduce its own input; x_val is passed as
# validation data and the losses are plotted live by the callback.
fit = autoencoder.fit(x_train, x_train, 
                epochs=20,
                batch_size=32,
                verbose=2,
                shuffle=True,
                validation_data=(x_val, x_val),
                callbacks=[PlotLossesKeras()])

In [None]:
# Training and validation loss per epoch, taken from the fit history.
loss = fit.history['loss']
val_loss = fit.history['val_loss']
epochs = fit.epoch

# Colour is set via the keyword only. Giving it in the format string as well
# ('bx--' / 'rx--') makes matplotlib warn, and 'r' contradicted color='green'.
plt.plot(epochs, loss, 'x--', label='train loss', color='blue')
plt.plot(epochs, val_loss, 'x--', label='val loss', color='green')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Print the summary of the model attached to the training history object.
fit.model.summary()

# Compute predictions and errors

In [None]:
# Reconstruct every sample set with the trained autoencoder.
(x_pred_train,
 x_pred_val,
 x_pred_test_good,
 x_pred_test_bad,
 x_pred_all) = [autoencoder.predict(samples)
                for samples in (x_train, x_val, x_test_good, x_test_bad, x_all)]

# One average reconstruction MSE per sample set.
(mse_train,
 mse_val,
 mse_test_good,
 mse_test_bad,
 mse_all) = [mean_squared_error(truth, reco)
             for truth, reco in ((x_train, x_pred_train),
                                 (x_val, x_pred_val),
                                 (x_test_good, x_pred_test_good),
                                 (x_test_bad, x_pred_test_bad),
                                 (x_all, x_pred_all))]

print(f'average MSE:\n\t all = {mse_all:.3f}\n\t {"-"*10}\n\t train = {mse_train:.3f}\n\t val = {mse_val:.3f}\n\t test_good = {mse_test_good:.3f}\n\t test_bad = {mse_test_bad:.3f}')

In [None]:
# Reconstruction MSE of every row (mean of the squared errors over the columns).
mse_distr_train     = np.mean((x_train - x_pred_train)**2, axis=1)
mse_distr_val       = np.mean((x_val - x_pred_val)**2, axis=1)
mse_distr_test_good = np.mean((x_test_good - x_pred_test_good)**2, axis=1)
mse_distr_test_bad  = np.mean((x_test_bad - x_pred_test_bad)**2, axis=1)
mse_distr_all       = np.mean((x_all - x_pred_all)**2, axis=1)

# Normalised log10(MSE) histograms on a common binning spanning the full range.
log_mse_all = np.log10(mse_distr_all)
bins = np.linspace(np.quantile(log_mse_all, 0), np.quantile(log_mse_all, 1), 20)
step_style = dict(bins=bins, density=1, lw=2, histtype='step')
plt.hist(np.log10(mse_distr_train), ls='-.', label='train', color='y', **step_style)
plt.hist(np.log10(mse_distr_test_good), label='test good', color='blue', **step_style)
plt.hist(np.log10(mse_distr_test_bad), label='test bad', color='red', **step_style)
plt.legend()
plt.xlabel('log (MSE)');

**MSE by column:**

In [None]:
# Reconstruction MSE of every column (mean of the squared errors over the rows).
mse_columns_train     = np.mean((x_train - x_pred_train)**2, axis=0)
mse_columns_val       = np.mean((x_val - x_pred_val)**2, axis=0)
mse_columns_test_good = np.mean((x_test_good - x_pred_test_good)**2, axis=0)
mse_columns_test_bad  = np.mean((x_test_bad - x_pred_test_bad)**2, axis=0)
mse_columns_all       = np.mean((x_all - x_pred_all)**2, axis=0)

# One line per column: train, test_good, test_bad and the test_bad/test_good ratio.
for i_c, c in enumerate(input_data.columns):
    train = mse_columns_train[i_c]
    test_g = mse_columns_test_good[i_c]
    test_b = mse_columns_test_bad[i_c]
    print(f'{i_c:3.0f}. {c:<30s}: {train:.3f}, \t {test_g:.3f}, {test_b:6.3f}, \t {test_b/test_g:.2f}')

# Visualization

## General

In [None]:
# Per-column squared error averaged over all rows, drawn by the project helper.
# NOTE(review): the plot layout is defined in helper.ae_errors, not visible here.
ae_errors.plot_AE_error(mse_columns_all, input_data.columns, ylabels='AE squared error\n(all instances)');

In [None]:
# Per-column errors of each sample set, keyed by the label passed as ylabel.
column_errors_by_set = {
    'train':     mse_columns_train,
    'val':       mse_columns_val,
    'test_good': mse_columns_test_good,
    'test_bad':  mse_columns_test_bad,
}
axes = ae_errors.plot_AE_error(list(column_errors_by_set.values()),
                               ylabels=list(column_errors_by_set.keys()),
                               columns=input_data.columns);
# for i in range(3): axes[i].set_ylim([0,4])

## MSE by interactionRate bins



In [None]:
def plot_histos_irate_bins(bin_to_highlight, irate_nbins, histo_nbins, qlow, qhigh):
    """Overlay the normalised log10(MSE) histograms of all interactionRate bins.

    Reads the notebook-level ``df`` (column ``irate_bin{irate_nbins}``) and
    ``mse_distr_all``; the index labels of ``df`` are used as positions into
    ``mse_distr_all``.

    Parameters
    ----------
    bin_to_highlight : int
        1-based bin number redrawn with a thick line; a value outside
        ``1..irate_nbins`` (e.g. 0) highlights nothing.
    irate_nbins : int
        Binning scheme, i.e. which ``irate_bin*`` column of ``df`` is read.
    histo_nbins : int
        Number of histogram bin edges (passed to ``np.linspace``).
    qlow, qhigh : float
        Quantiles of log10(MSE) used as lower / upper x-axis limit.
    """
    colors = ['lime', 'r', 'b', 'c', 'k']
    # Computed once; every histogram and both axis limits are derived from it.
    log_mse = np.log10(mse_distr_all)
    bins = np.linspace(np.quantile(log_mse, 0), np.quantile(log_mse, 1), histo_nbins)
    xmin = np.quantile(log_mse, qlow)
    xmax = np.quantile(log_mse, qhigh)
    bin_col = f'irate_bin{irate_nbins}'

    plt.subplots(1,1, figsize=(12,6))
    for i,color in zip(range(1,irate_nbins+1), colors):
        idx = df.query(f'{bin_col} == {i}').index
        plt.hist(log_mse[idx], bins=bins, density=1, lw=2, histtype='step', label=f'bin {i}/{irate_nbins}', color=color)

    # Redraw the selected bin with a thicker line (no label, so the legend is unchanged).
    if bin_to_highlight in range(1,irate_nbins+1):
        idx = df.query(f'{bin_col} == {bin_to_highlight}').index
        plt.hist(log_mse[idx], bins=bins, density=1, lw=6, histtype='step', color=colors[bin_to_highlight-1])

    plt.legend();
    plt.xlim(xmin, xmax )
    plt.semilogy()
    

# Controls of the interactive interactionRate-bin histogram.
quantile_slider = dict(min=0, max=1, step=0.005, continuous_update=False)

wg_bins_highlight = widgets.Dropdown(options=[0,1,2,3,4,5], description='bin to highlight')
wg_nbins = widgets.IntSlider(value=20, min=5, max=60, step=5,
                             description='n bins', continuous_update=False)
wg_qlow = widgets.FloatSlider(value=0.0, description='lower quantile', **quantile_slider)
wg_qhigh = widgets.FloatSlider(value=1, description='upper quantile', **quantile_slider)
wg_irate_nbins = widgets.RadioButtons(options=[3,5], value=5, description='interactionRate nbins')

# Layout rows.
ui = widgets.HBox([wg_bins_highlight])
ui_quantiles = widgets.HBox([wg_qlow, wg_qhigh])
ui_nbins = widgets.HBox([wg_nbins, wg_irate_nbins])

# Each widget is bound to the plotting-function argument of the same name.
controls = dict(bin_to_highlight=wg_bins_highlight,
                histo_nbins=wg_nbins,
                irate_nbins=wg_irate_nbins,
                qlow=wg_qlow,
                qhigh=wg_qhigh)
out = widgets.interactive_output(plot_histos_irate_bins, controls)

display(ui_quantiles, ui_nbins, ui, out)

## Single instance

In [None]:
LOG_MSE_FLAG = 0.5  # rows whose log10(MSE) exceeds this value get an arrow

# List log10(MSE) for every row that is not flagged bad. The flag column and the
# row errors are extracted once as arrays instead of building a full row Series
# per iteration with df.iloc; rows are addressed by position, matching x_all.
is_bad = df['bad'].to_numpy() == 1
log_mse_rows = np.log10(((x_all - x_pred_all)**2).mean(axis=1))
for index in np.flatnonzero(~is_bad):
    log_mse = log_mse_rows[index]
    arrow = '\t\t<------' if log_mse > LOG_MSE_FLAG else ''
    print(f'{index:5d}: {log_mse:7.4f} {arrow}')

In [None]:
instance_index = 145
irate_nbins = 5
#----------------------

# The same positional row, taken from the raw and from the reduced frame.
row_orig, row = df_orig.iloc[instance_index], df.iloc[instance_index]
global_warning_flag = row_orig['alias_global_Warning']
mse_instance_number = mse_distr_all[instance_index]
mse_percentile = percentileofscore(mse_distr_all, mse_instance_number)

# interactionRate of the instance and its bin in the chosen scheme.
instance_irate = row['interactionRate']
assign_bin = assign_irate_bin5 if irate_nbins == 5 else assign_irate_bin3
instance_irate_bin = assign_bin(row)

# Markdown summary; the two trailing spaces before each '\n' are kept as in the original text.
status_str = (
    f"chunk {instance_index} [ {row_orig['period.fString']} / {row_orig['run']} / chunk {row_orig['chunkID']} ]:  \n"
    f" - _globalWarning_ flag set to: **{bool(global_warning_flag)}**  \n"
    f"  MSE = **{mse_instance_number:.3f}**  \n"
    f"  log(MSE) = **{np.log10(mse_instance_number):.3f}**  \n"
    "  \n"
    f" interactionRate = {instance_irate:.1f} (bin **{instance_irate_bin}**/{irate_nbins})"
)
printmd(status_str)

In [None]:
# Squared reconstruction error of the selected instance, one value per column.
instance_residual = x_all[instance_index,:] - x_pred_all[instance_index,:]
mse_instance = instance_residual**2

# Per-column MSE over the rows that share the instance's interactionRate bin.
irate_bin_idx = df.query(f'irate_bin{irate_nbins} == {instance_irate_bin}').index
bin_residual = x_all[irate_bin_idx] - x_pred_all[irate_bin_idx]
mse_columns_irate_bin = (bin_residual**2).mean(axis=0)

# Instance errors relative to the average over all rows / over its bin.
mse_instance_relative_all       = mse_instance / mse_columns_all
mse_instance_relative_irate_bin = mse_instance / mse_columns_irate_bin

In [None]:
# plot histos
nbins = 30
log_mse_all = np.log10(mse_distr_all)
# Common binning for both panels, spanning the full log10(MSE) range.
bins = np.linspace(np.quantile(log_mse_all, 0), np.quantile(log_mse_all, 1), nbins)


def _mark_instance(ax, instance_mse):
    """Draw a downward arrow and a value label at log10(instance_mse) on ``ax``.

    Positions are fractions of the current axis ranges, so this must be called
    after the histograms have been drawn.
    """
    xrange = ax.get_xlim()[1] - ax.get_xlim()[0]
    yrange = ax.get_ylim()[1] - ax.get_ylim()[0]
    ax.arrow(np.log10(instance_mse), yrange*0.95, 0, -0.2*yrange,
             width=0.01*xrange, fc='k')
    ax.text(np.log10(instance_mse)-0.1, yrange*0.85,
            f'{instance_mse:.2f}\nlog={np.log10(instance_mse):.2f}',
            horizontalalignment='right', fontdict=dict(fontsize=14))


fig,axes = plt.subplots(1,2, figsize=(16,4))

# Left panel: the instance compared with the train / test_good / test_bad sets.
ax = axes[0]
ax.hist(np.log10(mse_distr_train), bins=bins, density=1, lw=2, ls='-.', histtype='step', label='train', color='y')
ax.hist(np.log10(mse_distr_test_good), bins=bins, density=1, lw=2, histtype='step', label='test good', color='blue')
ax.hist(np.log10(mse_distr_test_bad),  bins=bins, density=1, lw=2, histtype='step', label='test bad', color='red')
ax.legend()
ax.set_xlabel('log (MSE)')
_mark_instance(ax, mse_instance.mean())

# Right panel: the instance compared with all rows and with its own interactionRate bin.
# The bin column and label follow irate_nbins; they used to be hard-coded to the
# 5-bin scheme, which selected the wrong rows whenever irate_nbins was 3.
ax = axes[1]
bin_rows = df.query(f'irate_bin{irate_nbins} == {instance_irate_bin}').index
ax.hist(log_mse_all, bins=bins, density=1, lw=2, histtype='step', label='all bins', color='gray')
ax.hist(log_mse_all[bin_rows], bins=bins, density=1, lw=2, histtype='step', label=f'bin {instance_irate_bin}/{irate_nbins}', color='k')
ax.legend()
ax.set_xlabel('log (MSE)')
_mark_instance(ax, mse_instance.mean());

In [None]:
# Three stacked axes: absolute errors of the instance and its errors relative
# to the all-rows average and to the average of its interactionRate bin.
fig,axes = plt.subplots(3,1,figsize=(50,15))

instance_errors = [mse_instance,
                   mse_instance_relative_all,
                   mse_instance_relative_irate_bin]
instance_ylabels = [f'squared errors\ninstance={instance_index}',
                    'sq. errors relative to\naver. (column) error',
                    'sq. errors relative to\naver. in this IRate bin']

ae_errors.plot_AE_error(instance_errors,
                        ylabels=instance_ylabels,
                        columns=input_data.columns,
                        axes=axes);

In [None]:
# Pass the selected row of the raw frame to the project helper.
# NOTE(review): presumably renders the QA control plots for this chunk — confirm in helper.utilis.
show_qa_plots(df_orig.iloc[instance_index])

# TODO:

1. Compare with AE trained on both _good_ and _bad_
2. Check correlations of MSE of columns
3. Find the justification for bad chunks being rejected!  
   Then also look at the appropriate QA control plots to see what could be wrong
4. Train AE without matching eff. and use it as flags  
   search for matching eff. to other detectors
5. Overall AE reproducibility - impact of architecture/training data on scores for particular chunks
6. Permutation importance (after repro check)