# DESCRIPTION

Exploratory data analysis (EDA) performed on data collected during 2016 and quality evaluated manually by experts.  
Each data point (row) corresponds to one run (= up to 8-12 h of data collecting).

# LOAD

load modules and the data  
perform first steps of preprocessing

load modules:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import csv
import re

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

read data from files:
* data.csv                             -- data
* runs_2016_TPC_GOOD_Tracking.dat -- list of runs with good tracking
* runs_2016_TPC_GOOD_HadronID.dat -- list of runs with good PID
* sensitive_variables_list.csv           -- list of variables which are important in QA according to experts

In [None]:
# Every line of the CSV ends with a trailing comma, so index_col=False stops
# pandas from using the first column as the index.
df_orig = pd.read_csv('data-2016-run-level/data.csv', index_col=False)

# Remove the '/I' and '/D' suffixes and any newline characters from the headers.
cleaned_columns = []
for column in df_orig.columns:
    for token in ('/I', '/D', '\n'):
        column = column.replace(token, '')
    cleaned_columns.append(column)
df_orig.columns = cleaned_columns

In [None]:
df_orig.head()

Print sensitive variables:

In [None]:
# Only the first line of the file is used; it holds ':'-separated variable names.
with open('data-2016-run-level/sensitive_variables_list.csv') as f:
    first_line = f.readlines()[0]

# Strip the '/D' and '/I' suffixes and newline characters from each name.
sensitive_variables = []
for raw_name in first_line.split(':'):
    name = raw_name.replace('/D', '').replace('/I', '').replace('\n', '')
    sensitive_variables.append(name)

print('\n'.join(sensitive_variables))

In [None]:
# Paths of the expert-provided run lists, keyed by quality aspect:
# 'track' -> runs with good tracking, 'PID' -> runs with good PID.
good_runs_files = {'track':'data-2016-run-level/runs_2016_TPC_GOOD_Tracking.dat', 
                   'PID':'data-2016-run-level/runs_2016_TPC_GOOD_HadronID.dat'}

In [None]:
def _read_run_list(path):
    """Return the run numbers listed in ``path``.

    The file is expected to contain one run number per line, optionally
    followed by a comma. Blank lines and a missing final newline are tolerated.
    """
    runs = []
    with open(path) as f:
        for line in f:
            token = line.strip().rstrip(',').strip()
            if token:
                runs.append(int(token))
    return runs


good_trk = _read_run_list(good_runs_files['track'])
good_pid = _read_run_list(good_runs_files['PID'])

# Sets give O(1) membership tests instead of scanning the lists for every run.
good_trk_set, good_pid_set = set(good_trk), set(good_pid)

# Split the runs present in the data into four mutually exclusive groups.
trk_only, pid_only, both, none = [], [], [], []
for run in df_orig['run']:
    intrk, inpid = run in good_trk_set, run in good_pid_set
    if intrk and inpid:
        both.append(run)
    elif intrk:
        trk_only.append(run)
    elif inpid:
        pid_only.append(run)
    else:
        none.append(run)

print(f'There are N runs for which good are/is:\n'
      f'tracking only = {len(trk_only)}\n'
      f'pid only = {len(pid_only)}\n'
      f'both = {len(both)}\n'
      f'none = {len(none)}')
print()
print(f'good tracking & bad PID runs:\n {trk_only}')

There are only 8 runs with good tracking and bad PID.  
__For further analysis use only runs with consistently good/bad tracking & PID__:

In [None]:
# Keep only runs whose tracking and PID labels agree:
# "good" = good in both lists, "bad" = present in neither list.
good_runs = both
bad_runs = none
# Show the first few runs of each class together with the class size.
print(good_runs[:10], f'... -> {len(good_runs)} good runs')
print(bad_runs[:10], f'... -> {len(bad_runs)} bad runs')

In [None]:
# Keep only the expert-selected sensitive variables plus the run number.
keep_columns = set(sensitive_variables) | {'run'}
df_sens = df_orig.loc[:, df_orig.columns.isin(keep_columns)]

# Restrict to runs labelled consistently good or consistently bad. The explicit
# .copy() makes df_sens an independent frame, so adding the 'good' column below
# does not write into a slice of df_orig (SettingWithCopyWarning).
is_good = df_sens['run'].isin(good_runs)
is_bad = df_sens['run'].isin(bad_runs)
labelled = is_good | is_bad
df_sens = df_sens.loc[labelled].copy()

# Class label: 1 for good runs, 0 for bad runs.
df_sens['good'] = is_good.loc[labelled].astype(int)

# ANALYSIS OF SENSITIVE VARIABLES

## 1D analysis of sensitive variables

In [None]:
# Use the fully qualified option key: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', 50)
df_sens.describe()
# pd.reset_option('max_columns')

In [None]:
df_sens.query('good==1').describe()


In [None]:
df_sens.query('good==0').describe()

### 1D histograms

Plot 1D histograms for: the whole range (left) and the range between the selected quantiles, unnormalized (center) and normalized per class (right).  
Vertical dashed lines denote the quantiles.

In [None]:

# for var in df_sens.columns:
#     if var == 'good': continue


def plot_hist1D(var, n_bins, q_low, q_high):
    """Plot good-vs-bad histograms of one ``df_sens`` column in three panels.

    Panels: the whole value range (left), the range between the ``q_low`` and
    ``q_high`` quantiles (center), and that same limited range with each class
    normalized separately (right). Dashed vertical lines on the left panel mark
    the limited range.

    Parameters
    ----------
    var : str
        Name of the column of the module-level ``df_sens`` frame to plot.
    n_bins : int
        Number of histogram bins, passed to ``np.histogram``.
    q_low, q_high : float
        Lower and upper quantiles (fractions in [0, 1]) bounding the limited range.
    """
    quantiles = [q_low, q_high]
    var_good = df_sens.query('good==1')[var].tolist()
    var_bad = df_sens.query('good==0')[var].tolist()
    var_all = var_good + var_bad

    # plt.subplots creates the figure itself; a separate plt.figure() call would
    # only leave an empty extra figure behind.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for i, ax in enumerate(axes):
        # Panel 0 spans min..max; panels 1 and 2 span the requested quantiles.
        qs = [0, 1] if i == 0 else quantiles
        quantile_vals = np.quantile(var_all, qs)
        # Shared bin edges so that both classes are binned identically.
        _, bins = np.histogram(var_all, bins=n_bins, range=quantile_vals)
        normed = (i == 2)
        ax.hist(var_good, bins=bins, alpha=1, color='blue', linewidth=1.2,
                histtype='step', linestyle='-', density=normed)
        ax.hist(var_bad, bins=bins, alpha=1, edgecolor='red', linewidth=2,
                histtype='step', density=normed)

    # After the loop quantile_vals holds the (q_low, q_high) values; mark them
    # once on the whole-range panel.
    for qv in quantile_vals:
        axes[0].axvline(qv, linestyle='--', color='k')

    axes[0].set_title('whole range')
    axes[1].set_title(f'limited range (q={quantiles[0]*100:.0f},{quantiles[1]*100:.0f}): {quantile_vals[0]:.3f}, {quantile_vals[1]:.3f}')
    axes[2].set_title('normalized per class')

    fig.suptitle(var)

#w_nbins = interactive(lambda n: n, n=(3,100,1));
#display(w_nbins)
#print(w_nbins.kwargs)
#w_varname.observe(update_x_range, 'value')

# Controls for the interactive histogram. The 'good' label column is not offered,
# matching the non-interactive loop, which skips it as well.
wg_colname = widgets.Dropdown(description='column name',
                              options=[c for c in df_sens.columns if c != 'good'])
wg_nbins = widgets.IntSlider(description='n bins', min=3, max=100, value=20, step=1, continuous_update=False)
wg_qlow = widgets.FloatSlider(description='lower quantile', min=0, max=0.3, value=0.05, step=0.01, continuous_update=False)
wg_qhigh = widgets.FloatSlider(description='upper quantile', min=0.7, max=1, value=0.95, step=0.01, continuous_update=False)


def make_plot(name, n_bins):
    """Print which column and bin count would be plotted (debug helper, not wired to any widget)."""
    print('making plot of {} with {} bins'.format(name, n_bins))


ui = widgets.HBox([wg_colname, wg_nbins])
ui2 = widgets.HBox([wg_qlow, wg_qhigh])

# Re-run plot_hist1D whenever one of the widget values changes.
out = widgets.interactive_output(plot_hist1D, {'var': wg_colname, 'n_bins': wg_nbins, 'q_low':wg_qlow, 'q_high':wg_qhigh})
display(ui2)
display(ui, out)




#w_varname = interactive(plot_hist1D, var=df_sens.columns, n_bins=(3,100,1), quantiles=fixed([0.01, 0.99]), continous_update=False);
#w_qlow = interactive(lambda q: q, q=(0.0, 0.5, 0.01));
#w_qhigh = interactive(lambda q: q, q=(0.5, 1.0, 0.01));

#display(w_varname)

In [None]:
%matplotlib inline
quantiles = [0.05, 0.95]
n_bins = 25

# One row of three histograms per column: whole range, limited (quantile) range,
# and limited range normalized per class.
for var in df_sens.columns:
    if var == 'good':
        continue
    var_good = df_sens.query('good==1')[var].tolist()
    var_bad = df_sens.query('good==0')[var].tolist()
    var_all = var_good + var_bad

    # plt.subplots creates the figure itself; an extra plt.figure() per variable
    # would only pile up empty figures.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for i in range(3):
        # Panel 0 spans min..max; panels 1 and 2 span the chosen quantiles.
        qs = [0, 1] if not i else quantiles
        quantile_vals = np.quantile(var_all, qs)
        # Shared bin edges so that both classes are binned identically.
        _, bins = np.histogram(var_all, bins=n_bins, range=quantile_vals)
        normed = (i == 2)
        axes[i].hist(var_good, bins=bins, alpha=1, color='blue', linewidth=1.2, histtype='step', linestyle='-', density=normed)
        axes[i].hist(var_bad, bins=bins, alpha=1, edgecolor='red', linewidth=2, histtype='step', density=normed)

    # quantile_vals now holds the limited-range bounds; mark them on the left panel.
    for qv in quantile_vals:
        axes[0].axvline(qv, linestyle='--', color='k')
    axes[0].set_title('whole range')
    axes[1].set_title(f'limited range (q={quantiles[0]*100:.0f},{quantiles[1]*100:.0f}): {quantile_vals[0]:.3f}, {quantile_vals[1]:.3f}')
    axes[2].set_title('normalized')

    plt.suptitle(var)

#     sns.distplot(var_good, kde=False)
#     sns.distplot(var_bad, kde=False)

### Partial conclusions
There are not so many distinctive variables, some of most promising:
* __meanTPCnclF__ (mean no. TPC clusters findable fraction, i.e. after subtraction of points crossing installation frames etc) - bad runs have much higher variance, note that some bad runs are beyond central quantiles
* __meanTPCChi2__ and __meanTPCncl__ - same as above
* __offsetdRC__ - bad runs are shifted towards lower values and have higher variance
* __meanMult/Pos/Neg__ - bad runs are a bit wider and shifted
* __iroc/oroc_\*__ - takes only 2 values (17 or 18) but 17 appears only for bad runs (without exceptions), in around 15-35% cases
* __MIPattachSlopeA/C__, __tpcitsMatchA__, __y/lambdaPull(HighPt)__ - substantial amount of bad runs is outside of central quantiles

Also:  
__meanVertX/Y__ - (almost) all runs have the same values - they will be removed from further analysis

In [None]:
# meanVertX / meanVertY are dropped from the further analysis.
df_sens = df_sens.drop(columns=['meanVertX', 'meanVertY'])

## Correlations

In [None]:
# Global seaborn style for the plots that follow.
sns.set(style="white")
# Two-color palette used by the pair plot further down; with hue_order=[0, 1]
# there, the first color should map to bad runs (good == 0) and the second to
# good runs (good == 1) -- NOTE(review): confirm against the rendered legend.
colors = ['#F53E22', '#3E22F5']
pal = sns.color_palette(colors)

In [None]:
# ------------------------------
# --  BEAUTIFUL PANDAS TABLE  --
# ------------------------------
# cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)

# def magnify():
#     return [dict(selector="th",
#                  props=[("font-size", "7pt")]),
#             dict(selector="td",
#                  props=[('padding', "0em 0em")]),
#             dict(selector="th:hover",
#                  props=[("font-size", "12pt")]),
#             dict(selector="tr:hover td:hover",
#                  props=[('max-width', '200px'),
#                         ('font-size', '15pt')])
# ]

# corr.style.background_gradient(cmap, axis=1)\
#     .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
#     .set_caption("Hover to magify")\
#     .set_precision(2)\
#     .set_table_styles(magnify())

### Correlation matrix

In [None]:
# Pairwise correlation of all columns of df_sens, drawn as a heatmap.
corr = df_sens.corr()

fig, ax = plt.subplots(figsize=(13, 10))
labels = corr.columns
sns.heatmap(corr, xticklabels=labels, yticklabels=labels, cbar=True, cmap='RdBu')

In [None]:
threshold = 0.85

# List every unordered pair of distinct variables whose absolute correlation
# exceeds the threshold. Each pair is visited exactly once, so nothing is printed
# twice and the counter needs no halving.
print(f'Variable pairs with corr. coef > {threshold}:')
print('- -'*19)
n_above_th = 0
columns = corr.columns.tolist()
for i, c1 in enumerate(columns):
    for c2 in columns[i + 1:]:
        val = corr.loc[c2, c1]
        if abs(val) > threshold:
            print (f'{c1:20s} : {c2:20s}: {val:6.2f}\t:')
            n_above_th += 1
print('- -'*19)
print(f'There are {n_above_th} variable pairs with corr > {threshold}')

### Partial conclusions
<a id='correlated_sets'></a>

One can find several sets of highly correlated variables:  
{meanMult - meanMultPos - meanMultNeg } -- tpcMatch* -- {meanTPCnclF - meanTPCncl -- meanTPCChi2}   
{offsetdZA - offsetZC - zPull(HighPt)}  
{ptPull(HighPt) - lambdaPull(HighPt)} -- MIPattachSlopeA  
{zPull(HighPt) - offsetdZA}  
{resolutionMIP(ele)}  

## PCA

calculate PCA:

In [None]:
%matplotlib inline
n_components = 20

from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition

# Feature matrix (everything except the label and the run number) and labels.
features = df_sens.drop(['good', 'run'], axis=1)
y = df_sens['good']

# Standardize the features before PCA.
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Keep the index of df_sens: df_pca below inherits it through y, and a later
# column-wise concat of the two frames aligns rows on the index. A fresh
# RangeIndex here would pair scaled features with the wrong runs.
df_sens_scaled = pd.DataFrame(X, columns=features.columns, index=df_sens.index)

pca = decomposition.PCA(n_components=n_components)
pca.fit(X)
X = pca.transform(X)

# Projected points split by class, as plain arrays for plotting.
is_good = (y == 1).to_numpy()
is_bad = (y == 0).to_numpy()
X_good = X[is_good, :]
X_bad = X[is_bad, :]

# Label plus one column per principal component, indexed like df_sens.
data_dict = {'good': y}
for c in range(n_components):
    data_dict['PCA'+str(c)] = X[:, c]
df_pca = pd.DataFrame(data_dict, index=df_sens.index)

check variance explained with consecutive components:

In [None]:
# Cumulative fraction of variance explained by the first k+1 components.
Y = np.cumsum(pca.explained_variance_ratio_)
component_idx = np.arange(n_components)
plt.plot(component_idx, Y, color='b', marker='o', linestyle='-')
plt.xlabel('PCA components')
plt.ylabel('Explained variance')
plt.grid()

Once again correlation plot, but including PCA components:

In [None]:
%matplotlib inline

# Both frames hold the same runs in the same row order, but their indexes may
# differ (RangeIndex vs. the filtered index of df_sens). concat(axis=1) aligns on
# the index, so drop both indexes to join strictly row by row.
df_sens_plus_pca = pd.concat(
    [df_sens_scaled.reset_index(drop=True), df_pca.reset_index(drop=True)],
    axis=1,
)
corr = df_sens_plus_pca.corr()

fig = plt.figure(figsize=(15,12))
ax = fig.add_subplot(111)
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        cbar=1, cmap='RdBu')

In [None]:
%matplotlib notebook
%matplotlib inline
fig = plt.figure(figsize=(10, 6))
# Create the 3D axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on current matplotlib, which leaves
# the plot blank.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

# # for name, label in [('Bad', 0), ('Good', 1)]:
# #     ax.text3D(X[y == label, 0].mean(),
# #               X[y == label, 1].mean() + 1.5,
# #               X[y == label, 2].mean(), name,
# #               horizontalalignment='center',
# #               bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# # Reorder the labels to have colors matching the cluster results

# First three principal components: bad runs in red, good runs in blue.
ax.scatter(X_bad[:, 0], X_bad[:, 1], X_bad[:, 2], c='r',edgecolor='k')
ax.scatter(X_good[:, 0], X_good[:, 1], X_good[:, 2], c='b',edgecolor='k')

# Hide tick labels; xaxis/yaxis/zaxis replace the removed w_xaxis/w_yaxis/w_zaxis.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])
ax.set_xlabel('PCA 0')
ax.set_ylabel('PCA 1')
ax.set_zlabel('PCA 2')

plt.show()
# plt.close()

In [None]:
%matplotlib inline
n_comp = 4

# Per-class normalized histograms of the first n_comp principal components.
fig, axes = plt.subplots(1, n_comp, figsize=(15, 4))

for i in range(n_comp):
    panel = axes[i]
    good_vals = X_good[:, i]
    bad_vals = X_bad[:, i]
    # Common bin edges computed from both classes together.
    _, bins = np.histogram(list(good_vals) + list(bad_vals), bins=50)
    normed = 1
    panel.hist(good_vals, bins=bins, alpha=1, color='blue', linewidth=1.2,
               histtype='step', linestyle='-', density=normed)
    panel.hist(bad_vals, bins=bins, alpha=1, edgecolor='red', linewidth=2,
               histtype='step', density=normed)
    panel.set_title('PCA'+str(i))

Find variables most correlated with selected variable and their 2D distributions:

In [None]:
threshold = 0.5
min_n_vars = 4  # includes autocorr
n_pca = 20

# For every PCA component, list the variables correlated with it above the
# threshold, or the min_n_vars strongest ones if fewer pass the threshold.
for pca_comp in range(n_pca):
    selected_var = 'PCA'+str(pca_comp)
    column = corr[selected_var]
    strength = abs(column)

    var_lst = column[strength > threshold].index.tolist()
    if len(var_lst) < min_n_vars:
        var_lst = strength.sort_values(ascending=False)[:min_n_vars].index.tolist()

    # Strongest (absolute) correlation first; the first entry is skipped when
    # printing (expected to be the component itself).
    vars_corrs = sorted(
        [(var, column[var]) for var in var_lst],
        key=lambda x: -abs(x[1])
    )
    lines = [f'{v}({c:.2f})' for v, c in vars_corrs[1:]]
    vars_corrs_str = '\n\t'.join(lines)
    print(f'Variables most correlated with {selected_var}:\n\t{vars_corrs_str}')
#     for var in var_lst:
#         print(f'{var:20s}: {corr[selected_var][var]:6.2f}')
#     print(f'There are {len(var_lst)} such variables')

In [None]:
%matplotlib inline
selected_var = 'lambdaPull'
threshold = 5 # can be (0,1) or integer
if threshold < 1:
    # corr. coef. value threshold
    var_lst = corr[selected_var][abs(corr[selected_var]) > threshold].index.tolist()
else:
    # no. variables threshold
    var_lst = abs(corr[selected_var]).sort_values(ascending=False)[:threshold].index.tolist()

print(f'Variables most correlated with {selected_var}: ')
for var in var_lst:
    print(f'{var:20s}: {corr[selected_var][var]:6.2f}')
print(f'There are {len(var_lst)} such variables')

# sns.pairplot creates its own figure, so no plt.figure() call is made here (it
# would only leave an empty figure behind).
# g = sns.pairplot(df_sens_plus_pca, hue='good', vars=[c for c in df_pca.columns if 'PCA' in c], palette=pal, markers=['x', 'o'])
g = sns.pairplot(df_sens_plus_pca, hue='good', hue_order=[0,1], vars=var_lst, 
                 palette=pal, markers=['x', 'o'], plot_kws=dict(s=20, alpha=1)
                )

### Partial conclusions

Keeping the first 5, 10 and 15 PCA components explains 65%, 80% and 90% of the variance, respectively.

A significant number of variables could be removed without losing too much information -- but not necessarily replaced with PCA components, e.g. the correlation between PCA0 and its most correlated variables is 0.6-0.5, while the correlation between those variables themselves is 0.98-0.8.

Variables most correlated with consecutive PCA components create clusters which partially correspond to sets from 
[correlation check](#correlated_sets)

Some variables (e.g. PCA0), despite being highly discriminative, are weakly correlated with the goodness of a run - bad runs are frequently just distributed more widely than good ones.  
One may think about creating a simple metric that tells how discriminative a variable is, such as the accuracy of a decision tree with max depth = 2 or 4 (2 is enough to select the central peak).

# TODO

* test removing most correlated features

* NSigmasClf: 
    * simple product of n_sigmas of each variable
    * product of n_sigmas where n_sigma = 1 if n_sigma < 2,3
    