In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from keras.layers import Input, Dense, Dropout
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Input preparation

In [None]:
%%time 
df_orig = pd.read_csv('data/trending_merged_LHC18q_withGraphs.csv')

target_col = 'alias_global_Warning'
#----------


def rename(c):
    """Return 'bad' for the target column and every other name unchanged."""
    return 'bad' if c == target_col else c


# A column is kept when it is the target itself, or when it is not
# 'dataType.fString' and its name contains none of the substrings below.
excluded_tokens = ('gr', 'alias', 'Unnamed')
selected_cols = [
    c for c in df_orig.columns
    if c == target_col
    or (c != 'dataType.fString' and not any(tok in c for tok in excluded_tokens))
]

# The target column is exposed under the short name 'bad' from here on.
df = df_orig[selected_cols].rename(columns=rename)

In [None]:
# Columns listed by hand as non-physical (see the variable name); they are
# left out of the correlation study in the following cell.
nonphysical_cols = ['run', 'chunkID', 'time', 
                   'year', 'period.fString', 'pass.fString', 'runType.fString', 
                   'startTimeGRP', 'stopTimeGRP', 'duration', 
                   'iroc_A_side', 'oroc_A_side', 'iroc_C_side', 'oroc_C_side',
                   'chunkStart', 'chunkStop', 'chunkMean', 'chunkMedian', 'chunkRMS', 'chunkDuration']

# Columns whose standard deviation is numerically zero carry no information.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns silently and raises instead (the '*.fString' names
# suggest such columns are present -- TODO confirm). The Series is computed
# once instead of twice.
col_std = df.std(numeric_only=True)
no_variance_cols = col_std[col_std < 1e-6].index.tolist()
cols_exclude_corr = nonphysical_cols + no_variance_cols

# List the columns that remain for the correlation study.
for c in df.columns:
    if c not in cols_exclude_corr:
        print(c)

In [None]:
import matplotlib.patches as mpatches

# Absolute-correlation cut above which one column of a pair is dropped.
# |corr| can never exceed 1, so the value 1.01 effectively disables pruning;
# lower it (e.g. to 0.95) to actually remove redundant columns.
THRESHOLD = 1.01

corr = df[[c for c in df.columns if c not in cols_exclude_corr]].corr()
abs_corr = corr.abs()  # computed once and reused below
s = abs_corr.unstack()
so = s.sort_values(kind="quicksort", ascending=False)

# Histogram of all pairwise |corr| values with an arrow marking THRESHOLD.
fig, ax = plt.subplots()
ax.hist(so, histtype='step', bins=100)
ax.set_title('histo of corr. coef.')
ylim = ax.get_ylim()
y_l = ylim[0] + ylim[1]*0.15
y_h = ylim[0] + ylim[1]*0.4
arrow = mpatches.FancyArrowPatch((THRESHOLD, y_h), (THRESHOLD, y_l),
                                 mutation_scale=25, color='red', 
                                 arrowstyle='fancy')
ax.add_patch(arrow)
ax.text(THRESHOLD-0.03, y_h, f'{THRESHOLD}');

# Greedy pruning: for every pair (c2 before c1 in column order) whose |corr|
# exceeds THRESHOLD, drop the earlier column c2, provided both are still kept.
# Positional access on a NumPy array replaces the repeated corr.abs() and
# list.index() calls that were previously made inside the double loop.
corr_cols = corr.columns.tolist()
abs_vals = abs_corr.to_numpy()
dropped = set()
for i1, c1 in enumerate(corr_cols):
    # The label column 'bad' is not a feature: it takes no part in the pruning,
    # otherwise later cells that query/drop 'bad' could fail.
    if c1 == 'bad' or c1 in dropped:
        continue
    for i2 in range(i1):
        c2 = corr_cols[i2]
        if c2 == 'bad' or c2 in dropped:
            continue
        cval = abs_vals[i2, i1]
        if cval > THRESHOLD:  # NaN correlations compare False and are kept
            dropped.add(c2)

cols_no_corr = [c for c in corr_cols if c not in dropped]
print('\n\n', cols_no_corr)
print(f'\n{len(cols_no_corr)} out of {len(corr.columns)} columns were selected')

In [None]:
# Display the column index of df restricted to the columns kept in cols_no_corr.
df[cols_no_corr].columns

In [None]:
# Good rows only (bad == 0), with the label column removed.
# NOTE(review): the next cell reassigns input_data, x, x_s, x_train and x_val,
# so the values produced here are not what the model is trained on.
good_rows = df[cols_no_corr].query('bad == 0')
input_data = good_rows.drop(columns=['bad'])

x = input_data.to_numpy()
x_s = StandardScaler().fit_transform(x)  # zero mean / unit variance per column

# x_val holds GOOD samples used to monitor overfitting.
x_train, x_val = train_test_split(x_s, test_size=0.1)


In [None]:
# Fixed seed so that the train/val/test partition is reproducible across runs.
RANDOM_STATE = 42

# Feature matrix (label column removed) and label vector.
input_data = df[cols_no_corr].drop('bad', axis=1)
x = input_data.to_numpy()
y = df['bad'].to_numpy()

# Split the raw GOOD samples first; BAD samples are used for testing only.
x_train_val_good, x_test_good = train_test_split(x[y == 0], test_size=0.1,
                                                 random_state=RANDOM_STATE)
# x_val are GOOD samples used to monitor overfitting
x_train, x_val = train_test_split(x_train_val_good, test_size=0.1,
                                  random_state=RANDOM_STATE)

# Fit the scaler on the training samples only, then apply it everywhere.
# Fitting on all rows (as was done before) leaks statistics of the bad and
# held-out samples into the preprocessing.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test_good = scaler.transform(x_test_good)
x_test_bad = scaler.transform(x[y == 1])

# Full scaled matrix, kept under its original name for any later use.
x_s = scaler.transform(x)

In [None]:
# Display the shape of the feature array x.
x.shape

In [None]:
# Architecture settings: one input/output unit per feature, hidden encoder
# widths given by coding_layers_sizes, and the width of the narrowest layer.
input_size = x_train.shape[1]
coding_layers_sizes = [32,16]
bottleneck_size = 8

# Input placeholder of the network.
ae_input = Input(shape=(input_size,))

# Encoder: ReLU layers of the configured widths followed by the bottleneck.
encoded = ae_input
for width in coding_layers_sizes + [bottleneck_size]:
    encoded = Dense(width, activation='relu')(encoded)

# Decoder: the encoder widths in reverse order, then a linear layer that
# restores the input dimensionality.
decoded = encoded
for width in coding_layers_sizes[::-1]:
    decoded = Dense(width, activation='relu')(decoded)
decoded = Dense(input_size, activation='linear')(decoded)

# Model mapping an input to its reconstruction.
autoencoder = Model(ae_input, decoded)

In [None]:
# Re-create the model from the existing graph so that re-running this cell
# builds and compiles a new Model object before training.
autoencoder = Model(inputs=ae_input, outputs=decoded)
autoencoder.compile(loss='mean_squared_error', optimizer='adadelta')

# The network is trained to reproduce its own input; x_val is passed as
# validation data and its loss is recorded per epoch.
fit = autoencoder.fit(
    x=x_train,
    y=x_train,
    validation_data=(x_val, x_val),
    epochs=50,
    batch_size=32,
    shuffle=True,
    verbose=2,
)

In [None]:
# Per-epoch training and validation loss recorded during autoencoder.fit.
loss = fit.history['loss']
val_loss = fit.history['val_loss']
epochs = fit.epoch

# The colour is given only through `color=`: combining it with a colour letter
# in the format string ('bx--', 'rx--') is redundant or contradictory and makes
# matplotlib emit a warning. The resulting colours are unchanged (blue, green).
plt.plot(epochs, loss, 'x--', label='train loss', color='blue')
plt.plot(epochs, val_loss, 'x--', label='val loss', color='green')
plt.xlabel('epoch')
plt.ylabel('loss (mean squared error)')
plt.legend()
plt.show()

In [None]:
# Print the layer summary of the model referenced by the object returned from autoencoder.fit.
fit.model.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Reconstructions of every subset, in the order train, val, test_good, test_bad.
x_pred_train, x_pred_val, x_pred_test_good, x_pred_test_bad = (
    autoencoder.predict(subset)
    for subset in (x_train, x_val, x_test_good, x_test_bad)
)

# Overall mean squared reconstruction error of each subset.
mse_train, mse_val, mse_test_good, mse_test_bad = (
    mean_squared_error(actual, reconstructed)
    for actual, reconstructed in (
        (x_train, x_pred_train),
        (x_val, x_pred_val),
        (x_test_good, x_pred_test_good),
        (x_test_bad, x_pred_test_bad),
    )
)

In [None]:
# One line per subset, three decimals each.
print(
    'MSE:'
    f'\n\t train = {mse_train:.3f}'
    f'\n\t val = {mse_val:.3f}'
    f'\n\t test_good = {mse_test_good:.3f}'
    f'\n\t test_bad = {mse_test_bad:.3f}'
)

In [None]:
# Per-sample reconstruction error: squared residuals averaged over the features.
mse_distr_train, mse_distr_val, mse_distr_test_good, mse_distr_test_bad = (
    np.mean(np.square(actual - reconstructed), axis=1)
    for actual, reconstructed in (
        (x_train, x_pred_train),
        (x_val, x_pred_val),
        (x_test_good, x_pred_test_good),
        (x_test_bad, x_pred_test_bad),
    )
)

mse_distr = np.concatenate([mse_distr_train, mse_distr_val, mse_distr_test_good, mse_distr_test_bad])

# 30 bin edges shared by all histograms, spanning the full range of log10(MSE)
# over every subset (quantile 0 and quantile 1 of the pooled values).
log_mse_all = np.log10(mse_distr)
bins = np.linspace(np.quantile(log_mse_all, 0), np.quantile(log_mse_all, 1), 30)

# Only the two test subsets are drawn, as normalised step histograms.
for sample, tag, colour in (
    (mse_distr_test_good, 'test good', 'red'),
    (mse_distr_test_bad, 'test bad', 'k'),
):
    plt.hist(np.log10(sample), bins=bins, density=1, lw=2, histtype='step', label=tag, color=colour)
plt.legend()
plt.xlabel('log (MSE)');

In [None]:
# Scratch check of str.lower() on a mixed-case name fragment.
s = 'TPCnCl'
lowered = s.lower()
print(lowered)

In [None]:
# Maps a group name to the positions (within input_data.columns) of the
# columns whose name satisfies the group's predicate.
groups_indices = {}

# (group name, predicate on the column name) pairs, in display order.
column_groups = (
    ('TPC-ncl', lambda col: 'TPC' in col),
    ('vertex', lambda col: 'vert' in col.lower()),
    ('dZ-fits', lambda col: any(tag in col for tag in ('dZA', 'dZC'))),
    ('dR-fits', lambda col: any(tag in col for tag in ('dRA', 'dRC'))),
    ('multiplicity', lambda col: 'Mult' in col),
    ('pT', lambda col: any(tag in col for tag in ('PtA', 'PtC', 'qOverPt'))
                       and 'tpcIts' not in col and 'deltaPt' not in col),
    ('delta pT', lambda col: 'deltaPt' in col),
    ('DCA', lambda col: any(tag in col for tag in ('dcar', 'dcaz'))
                        and 'dcarAP' not in col and 'dcarCP' not in col),
    ('MIP', lambda col: 'MIP' in col),
    ('TPC-ITS_match.', lambda col: 'tpcIts' in col),
    ('distr._pulls', lambda col: 'Pull' in col),
    ('work_conditions', lambda col: any(tag in col for tag in ('PTR', 'HVandPT', 'VDrift'))),
)

for group_name, group_func in column_groups:
    print()
    print(group_name, ':\n------', )

    lst = np.array([pos for pos, name in enumerate(input_data.columns) if group_func(name)])

    # Diagnostic output: do the matched positions form one contiguous run?
    is_unit_step = np.diff(lst) == 1
    print(all(is_unit_step))
    print(is_unit_step)
    print(lst)
    groups_indices[group_name] = lst


In [None]:
# Per-feature reconstruction error: squared residuals averaged over the samples.
mse_columns_train = np.mean(np.square(x_train - x_pred_train), axis=0)
mse_columns_val = np.mean(np.square(x_val - x_pred_val), axis=0)
mse_columns_test_good = np.mean(np.square(x_test_good - x_pred_test_good), axis=0)
mse_columns_test_bad = np.mean(np.square(x_test_bad - x_pred_test_bad), axis=0)

# One row per feature: index, name, train MSE, test-good MSE, test-bad MSE,
# and the ratio test-bad / test-good.
rows = zip(input_data.columns, mse_columns_train, mse_columns_test_good, mse_columns_test_bad)
for pos, (name, err_train, err_good, err_bad) in enumerate(rows):
    ratio = err_bad / err_good
    print(
        f'{pos:3.0f}. {name:<30s}: {err_train:.3f}, \t {err_good:.3f}, '
        f'{err_bad:6.3f}, \t {ratio:.2f}'
    )



In [None]:
# Display the column index of input_data (the feature names, in array order).
input_data.columns

In [None]:
%matplotlib inline


from IPython.core.display import HTML
style = """
<style>
div.output_area {
    overflow-y: scroll;
}
div.output_area img {
    max-width: unset;
}
</style>
"""
HTML(style) 



fig, axes = plt.subplots(4,1, figsize=(50,20))
plt.subplots_adjust(hspace=0.3)

for i,mse_columns in enumerate([mse_columns_train, mse_columns_val, mse_columns_test_good, mse_columns_test_bad]):
    ax = axes[i]
    ax.bar(range(mse_columns_train.shape[0]), mse_columns, width=0.5)
    ax.set_xlim([-5, len(input_data.columns)+5])
    ax.set_xticks(range(0, len(input_data.columns)))
    ax.set_xticklabels(labels=input_data.columns, rotation=90, horizontalalignment='center')
#     ax.semilogy()
    
    ymax = ax.get_ylim()[1]

    for group_name, indices in groups_indices.items():
        ax.vlines([np.min(indices)-0.5, np.max(indices)+0.5], 0, ymax, linestyles='--')
        ax.text(np.mean(indices), 0.8*ymax, group_name.replace('_', '\n'), horizontalalignment='center', fontdict=dict(fontsize=20))


In [None]:
# Display the shape of the held-out GOOD test array.
x_test_good.shape

In [None]:
# Print log10 of the feature-averaged squared reconstruction error for every
# BAD test sample and flag those above 1 (i.e. MSE > 10) with an arrow.
# The loop bound is taken from the array itself rather than hard-coded, so it
# neither overruns nor truncates when the number of bad samples changes.
for instance_index in range(x_test_bad.shape[0]):
    mse_instance = (x_test_bad[instance_index,:]-x_pred_test_bad[instance_index,:])**2 
    log_mse = np.log10(mse_instance.mean())
    arrow = '\t\t<------' if log_mse > 1 else ''
    print(f'{instance_index}: {log_mse} {arrow}')

In [None]:
# Row of x_test_bad to inspect, and its per-feature squared reconstruction error.
instance_index = 90
residual = x_test_bad[instance_index] - x_pred_test_bad[instance_index]
mse_instance = np.square(residual)

# Shared bin edges spanning the full range of log10(MSE) over all subsets.
log_mse_all = np.log10(mse_distr)
bins = np.linspace(np.quantile(log_mse_all, 0), np.quantile(log_mse_all, 1), 30)

# Normalised step histograms of the two test subsets.
for sample, tag, colour in (
    (mse_distr_test_good, 'test good', 'red'),
    (mse_distr_test_bad, 'test bad', 'k'),
):
    plt.hist(np.log10(sample), bins=bins, density=1, lw=2, histtype='step', label=tag, color=colour)
plt.legend()
plt.xlabel('log (MSE)');

# Downward arrow at the selected instance's log10(MSE), sized relative to
# the current axis ranges.
x_lo, x_hi = plt.xlim()
y_lo, y_hi = plt.ylim()
xrange = x_hi - x_lo
yrange = y_hi - y_lo
plt.arrow(np.log10(mse_instance.mean()), yrange*0.95, 0, -0.2*yrange,
          width=0.01*xrange,
          fc='k')

In [None]:
%matplotlib inline


from IPython.core.display import HTML
style = """
<style>
div.output_area {
    overflow-y: scroll;
}
div.output_area img {
    max-width: unset;
}
</style>
"""
HTML(style) 


fig, ax = plt.subplots(1,1, figsize=(50,8))
ax.bar(range(mse_columns_train.shape[0]), mse_instance, width=0.5)
# ax.set_xlim([0, len(input_data.columns)])
# tick_locs = ax.get_xticks()
ax.set_xlim([-5, len(input_data.columns)+5])
ax.set_xticks(range(0, len(input_data.columns)))
ax.set_xticklabels(labels=input_data.columns, rotation=90, horizontalalignment='center')

ymax = ax.get_ylim()[1]

for group_name, indices in groups_indices.items():
    ax.vlines([np.min(indices)-0.5, np.max(indices)+0.5], 0, ymax, linestyles='--')
    ax.text(np.mean(indices), 0.8*ymax, group_name.replace('_', '\n'), horizontalalignment='center', fontdict=dict(fontsize=20))


In [None]:
# Cross-check: MSE over all elements, with both arrays flattened into a single
# column. reshape(-1, 1) lets NumPy infer the length and replaces np.product,
# which is deprecated and no longer available in NumPy 2.0.
x_test_good_r = x_test_good.reshape(-1, 1)
x_pred_test_good_r = x_pred_test_good.reshape(-1, 1)
mean_squared_error(x_test_good_r, x_pred_test_good_r)

# TODO:

1. Train a **basic** AE (with feature scaling; training only on good time intervals?) - DONE
2. Try to visualize it - ?
3. Check the dependence: 
    - of the overall performance with and without _bad_ timeIntervals, 
    - of the performance on _bad_ and _good_ timeIntervals - DONE
__________________
4. Compare with an AE trained on both _good_ and _bad_
5. Check correlations of the per-column MSE
6. Try to visualize per-column squared errors as a ratio to the average squared error of that column (of train / test_all / test_bad)