# Load

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
%%time 
# Read the merged trending table into `df_orig`; the working frame `df` is derived from it in the next cell.
df_orig = pd.read_csv('data/trending_merged_LHC18q_withGraphs.csv')

## Define target

In [None]:
target_col = 'alias_global_Warning'
#----------

def _keep_column(c):
    """Return True for the target column and for every column that is not filtered out."""
    if c == target_col:
        return True
    if c == 'dataType.fString':
        return False
    # drop every column whose name contains one of these substrings
    return not any(tag in c for tag in ('gr', 'alias', 'Unnamed'))


def rename(c):
    """Map the target column to the short name 'bad'; leave other names untouched."""
    return 'bad' if c == target_col else c


df = df_orig[[c for c in df_orig.columns if _keep_column(c)]].rename(columns=rename)

# Univariate analysis

Assumes the existence of the target column (`bad`).

## Reference tables

In [None]:
# Use the fully qualified option name: the short pattern 'max_columns' matches more
# than one option on recent pandas versions and then raises OptionError.
pd.set_option('display.max_columns', 50)
# Summary statistics of all numeric columns (both classes together).
df.describe(include=np.number)

In [None]:
# Summary statistics restricted to rows flagged bad (bad == 1).
df.query('bad==1').describe()

In [None]:
# Summary statistics restricted to rows not flagged bad (bad == 0).
df.query('bad==0').describe()

## 1D plots

In [None]:
def plot_hist1D(var, n_bins, q_low, q_high):
    """Draw three side-by-side step histograms of column `var`, split by the `bad` flag.

    Panels, left to right:
      0. whole value range,
      1. range limited to the [q_low, q_high] quantiles of all rows,
      2. same limited range, each class normalised to unit area.
    Rows with bad==0 are drawn in blue, rows with bad==1 in red. The limited
    range is marked with dashed vertical lines on the whole-range panel.

    Parameters
    ----------
    var : str
        Column of the notebook-level frame `df` to plot; must be numeric.
    n_bins : int
        Number of bins used in every panel.
    q_low, q_high : float
        Lower and upper quantile (between 0 and 1) delimiting the limited range.

    Returns
    -------
    None. The figure is created as a side effect; nothing is drawn when the
    column is non-numeric or has no non-missing values.
    """
    if not pd.api.types.is_numeric_dtype(df[var]):
        # quantiles and histogram edges are undefined for string-like columns
        print(f'{var}: not a numeric column, nothing to plot')
        return

    quantiles = [q_low, q_high]
    # Missing values would turn the quantiles into NaN, which np.histogram
    # rejects as a bin range, so they are dropped up front.
    var_good = df.query('bad==0')[var].dropna().tolist()
    var_bad  = df.query('bad==1')[var].dropna().tolist()
    var_all = var_good + var_bad
    if not var_all:
        print(f'{var}: no non-missing values, nothing to plot')
        return

    full_range = np.quantile(var_all, [0, 1])
    quantile_vals = np.quantile(var_all, quantiles)

    # (bin range, normalise per class) for each of the three panels
    panels = [(full_range, False), (quantile_vals, False), (quantile_vals, True)]

    # plt.subplots creates the figure itself; a preceding plt.figure() would
    # only leave an empty extra figure behind.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (value_range, normed) in zip(axes, panels):
        _, bins = np.histogram(var_all, bins=n_bins, range=value_range)
        ax.hist(var_good, bins=bins, alpha=1, color='blue', linewidth=1.2, histtype='step', linestyle='-', density=normed)
        ax.hist(var_bad, bins=bins, alpha=1, edgecolor='red', linewidth=2, histtype='step', density=normed)

    # Mark the limited range once, on the whole-range panel only.
    for qv in quantile_vals:
        axes[0].axvline(qv, linestyle='--', color='grey')

    axes[0].set_title('whole range')
    axes[1].set_title(f'limited range (q={quantiles[0]*100:.0f},{quantiles[1]*100:.0f}): {quantile_vals[0]:.3f}, {quantile_vals[1]:.3f}')
    axes[2].set_title('normalized per class')

    fig.suptitle(var)

#w_nbins = interactive(lambda n: n, n=(3,100,1));
#display(w_nbins)
#print(w_nbins.kwargs)
#w_varname.observe(update_x_range, 'value')

# plt.text?

# Offer only numeric columns: quantiles and histograms are undefined for string
# columns, and picking one made the plot callback fail. The commented-out batch
# version of this plot skipped non-numeric columns for the same reason.
wg_colname = widgets.Dropdown(
    description='column name',
    options=df.select_dtypes(include=np.number).columns.tolist(),
)
# continuous_update=False: redraw only when a slider is released, not while dragging.
wg_nbins = widgets.IntSlider(description='n bins', min=3, max=100, value=20, step=1, continuous_update=False)
wg_qlow = widgets.FloatSlider(description='lower quantile', min=0, max=0.3, value=0.01, step=0.01, continuous_update=False)
wg_qhigh = widgets.FloatSlider(description='upper quantile', min=0.7, max=1, value=0.99, step=0.01, continuous_update=False)

def make_plot(name, n_bins):
    """Print a one-line notice naming the column and the bin count; no plot is drawn here."""
    message = 'making plot of {} with {} bins'.format(name, n_bins)
    print(message)
# Two rows of controls: quantile sliders on top, column / bin selectors below.
ui2 = widgets.HBox(children=[wg_qlow, wg_qhigh])
ui = widgets.HBox(children=[wg_colname, wg_nbins])

# Each widget's current value is passed to plot_hist1D under the matching keyword;
# the plot is re-rendered into `out` whenever one of them changes.
out = widgets.interactive_output(
    plot_hist1D,
    dict(var=wg_colname, n_bins=wg_nbins, q_low=wg_qlow, q_high=wg_qhigh),
)
display(ui2, ui, out)

In [None]:
# Column names containing 'its' (case-insensitive).
list(filter(lambda name: 'its' in name.lower(), df.columns))

In [None]:
# %matplotlib inline
# plt.rcParams['figure.max_open_warning'] = 150

# quantiles = [0.05, 0.95]
# n_bins = 25



# for var in df.columns:
#     print(var)
#     if var == 'bad': continue
#     if not isinstance(df[var][0], (float, int, np.int64, np.float64)): continue
#     var_good = df.query('bad==0')[var].tolist()
#     var_bad  = df.query('bad==1')[var].tolist()
#     print('plot')
    
#     plt.figure()
#     fig, axes = plt.subplots(1,3, figsize=(18,5))
#     for i in range(3):
#         qs = [0,1] if not i else quantiles
#         quantile_vals = np.quantile(var_good+var_bad, qs)
#         _, bins = np.histogram(var_good + var_bad, bins=n_bins, range=quantile_vals)
#         normed = (i == 2)
#         axes[i].hist(var_good, bins=bins, alpha=1, color='blue', linewidth=1.2, histtype='step', linestyle='-', density=normed)
#         axes[i].hist(var_bad, bins=bins, alpha=1, edgecolor='red', linewidth=2, histtype='step', density=normed)
        
        
#     for qv in quantile_vals: axes[0].axvline(qv, linestyle='--', color='k')
#     axes[0].set_title('whole range')
#     axes[1].set_title(f'limited range (q={quantiles[0]*100:.0f},{quantiles[1]*100:.0f}): {quantile_vals[0]:.3f}, {quantile_vals[1]:.3f}')
#     axes[2].set_title('normalized')
        
#     plt.suptitle(var)

# Multivariate analysis

## Correlations

In [None]:
# Plain white seaborn style for all following figures.
sns.set(style="white")
# Two-colour palette (red-ish, blue-ish); not referenced by any other cell visible in this notebook.
colors = ['#F53E22', '#3E22F5']
pal = sns.color_palette(colors)

In [None]:
# Columns with (numerically) zero standard deviation.
# numeric_only=True: recent pandas raises on string columns instead of silently
# skipping them; the std is also computed once instead of twice.
col_std = df.std(numeric_only=True)
s = col_std[col_std < 1e-6]
s

In [None]:
# Columns left out of the correlation study by name.
nonphysical_cols = ['run', 'chunkID', 'time', 
                   'year', 'period.fString', 'pass.fString', 'runType.fString', 
                   'startTimeGRP', 'stopTimeGRP', 'duration', 
                   'chunkStart', 'chunkMean', 'chunkMedian', 'chunkRMS']

# Columns with (numerically) zero spread: their correlation coefficient is undefined.
# numeric_only=True keeps this working on pandas versions that no longer skip
# non-numeric columns silently.
col_std = df.std(numeric_only=True)
no_variance_cols = col_std[col_std < 1e-6].index.tolist()
cols_exclude_corr = nonphysical_cols + no_variance_cols

# List the columns that remain, in frame order.
excluded = set(cols_exclude_corr)
for c in df.columns:
    if c not in excluded:
        print(c)

### Corr matrix for all variables

The full matrix is unreadable at this size, but it shows plenty of very strong correlations.

In [None]:
# Pearson correlation between all retained columns.
# Non-numeric columns are dropped explicitly: older pandas skipped them silently
# inside corr(), newer versions raise instead.
corr = (
    df[[c for c in df.columns if c not in cols_exclude_corr]]
    .select_dtypes(include=[np.number, 'bool'])
    .corr()
)

fig, ax = plt.subplots(figsize=(26, 20))
# vmin/vmax pin the diverging colour map so that zero correlation sits at its
# centre, matching the heatmap of the selected columns further down.
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            cbar=True, cmap='RdBu',
            vmin=-1, vmax=1,
            ax=ax);

Correlation distribution and most correlated pairs

In [None]:
# Absolute correlation of every ordered pair (diagonal included), sorted descending.
# `so` is reused by the histogram and slicing cells below.
s = corr.abs().unstack()
so = s.sort_values(kind="quicksort", ascending=False)
# Number of variables in the correlation matrix, i.e. the number of trivial
# self-correlation entries in `so`. The frame `df` has more columns than `corr`,
# so df.shape[1] must not be used to skip the diagonal.
n_vars = corr.shape[0]

# Keep each unordered pair once and drop the diagonal by masking everything
# except the strict upper triangle.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
top_pairs = (
    corr.abs()
    .where(upper_triangle)
    .stack()
    .dropna()
    .sort_values(ascending=False)
)
print('Most correlated pairs:\n---------------\n', 
      top_pairs.head(20));

In [None]:
import matplotlib.patches as mpatches

# Absolute-correlation cut used below to prune redundant columns.
THRESHOLD = 0.9

# Distribution of all absolute correlation coefficients in `so`.
fig, ax = plt.subplots()
ax.hist(so, bins=100, histtype='step')
ax.set_title('histo of corr. coef.')

# Vertical extent of the marker arrow, as fractions of the current y-axis top.
y_bottom, y_top = ax.get_ylim()
arrow_tip = y_bottom + 0.15 * y_top
arrow_tail = y_bottom + 0.4 * y_top

# Red arrow pointing down at the threshold, labelled with its value.
ax.add_patch(
    mpatches.FancyArrowPatch(
        (THRESHOLD, arrow_tail), (THRESHOLD, arrow_tip),
        arrowstyle='fancy', color='red', mutation_scale=25,
    )
)
ax.text(THRESHOLD - 0.03, arrow_tail, f'{THRESHOLD}');

Remove columns whose absolute correlation with another column exceeds the threshold

In [None]:
# Greedy pruning: walk over all column pairs (c1 later than c2 in matrix order)
# and, whenever both are still kept and |corr| exceeds THRESHOLD, drop the
# earlier column c2.
cols_no_corr = corr.columns.tolist()

# Loop-invariant work hoisted out of the double loop: the absolute matrix and
# the column order were previously recomputed for every pair.
col_order = corr.columns.tolist()
abs_corr = corr.abs().values
still_kept = set(cols_no_corr)   # set for fast membership; the list keeps the order

for i1, c1 in enumerate(col_order):
    for i2 in range(i1):
        c2 = col_order[i2]
        cval = abs_corr[i2, i1]
        if cval > THRESHOLD and c1 in still_kept and c2 in still_kept:
            cols_no_corr.remove(c2)
            still_kept.discard(c2)
            print(f'{c2} removed due to its corr. with {c1} = {cval:.3f}')
print('\n\n', cols_no_corr)
print(f'\n{len(cols_no_corr)} out of {len(corr.columns)} columns were selected')

# pd.DataFrame(df.corr()['offsetdZCchi2Neg']).query('offsetdZCchi2Neg > 0.999')

In [None]:
# Correlation matrix of the columns that survived the pruning step.
corr_sel = df[cols_no_corr].corr()

fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(
    corr_sel,
    ax=ax,
    cmap='RdBu',
    vmin=-1, vmax=1,   # centre the diverging colour map on zero correlation
    cbar=1,
    xticklabels=corr_sel.columns,
    yticklabels=corr_sel.columns,
)

In [None]:
# NOTE(review): 256 looks like a hard-coded count of leading entries to skip — confirm
# it still matches the current column selection.
so_top = so[256:266]
# Slicing a MultiIndex keeps the full set of level values, so `.levels[1]` alone
# would list every variable; prune the unused ones to see only those in the slice.
print(so_top.index.remove_unused_levels().levels[1])

## 2D distribution

In [None]:
# Pair of columns to inspect and the quantile window applied to both.
var1, var2 = 'meanTPCncl', 'rmsTPCncl'
qs = [0.05, 0.9]

var1_vals = df[var1].to_list()
var2_vals = df[var2].to_list()

# Window edges for each variable.
q_vals1, q_vals2 = (np.quantile(vals, qs) for vals in (var1_vals, var2_vals))

# Query selecting the rows strictly inside the window in both variables.
quant_query = ' & '.join([
    f'{var1}>{q_vals1[0]}',
    f'{var1}<{q_vals1[1]}',
    f'{var2}>{q_vals2[0]}',
    f'{var2}<{q_vals2[1]}',
])

# Despite its name this holds the selected rows (a DataFrame), not just their index.
index_bad = df.query('bad==1 & ' + quant_query)
index_bad

In [None]:
def plot_hist2D(var1, var2, n_bins, q_low, q_high):
    """Scatter `var2` against `var1` for the rows inside a quantile window, split by `bad`.

    The window is [q_low, q_high] (as quantiles of all non-missing values),
    applied to both variables independently. Rows with bad==0 are drawn as
    hollow blue circles, rows with bad==1 as red crosses. The axis limits are
    the window edges padded by 10 % of the window width on each side.

    Parameters
    ----------
    var1, var2 : str
        Numeric columns of the notebook-level frame `df` (x and y axis).
    n_bins : int
        Unused; kept so that existing callers passing it keep working.
    q_low, q_high : float
        Lower and upper quantile (between 0 and 1) of the window.

    Returns
    -------
    None. The figure is created as a side effect; nothing is drawn when a
    column is non-numeric or has no non-missing values.
    """
    for name in (var1, var2):
        if not pd.api.types.is_numeric_dtype(df[name]):
            print(f'{name}: not a numeric column, nothing to plot')
            return
        if df[name].dropna().empty:
            print(f'{name}: no non-missing values, nothing to plot')
            return

    qs = [q_low, q_high]
    # Missing values are ignored when locating the window edges.
    q_vals1 = np.quantile(df[var1].dropna().values, qs)
    q_vals2 = np.quantile(df[var2].dropna().values, qs)

    # Boolean masks instead of a string-built query: a query string breaks on
    # column names that are not valid identifiers and on NaN edges. The window
    # is inclusive so that quantiles 0 and 1 keep the extreme points.
    in_window = (
        df[var1].between(q_vals1[0], q_vals1[1])
        & df[var2].between(q_vals2[0], q_vals2[1])
    )
    rows_good = df[in_window & (df['bad'] == 0)]
    rows_bad = df[in_window & (df['bad'] == 1)]

    fig, ax = plt.subplots()
    ax.scatter(rows_good[var1], rows_good[var2], color='b', facecolor='none')
    ax.scatter(rows_bad[var1], rows_bad[var2], marker='x', color='r')

    xrange = q_vals1[1] - q_vals1[0]
    yrange = q_vals2[1] - q_vals2[0]

    ax.set_xlim([q_vals1[0]-0.1*xrange, q_vals1[1]+0.1*xrange])
    ax.set_ylim([q_vals2[0]-0.1*yrange, q_vals2[1]+0.1*yrange])

    fig.suptitle(var1+':'+var2)

#w_nbins = interactive(lambda n: n, n=(3,100,1));
#display(w_nbins)
#print(w_nbins.kwargs)
#w_varname.observe(update_x_range, 'value')

# plt.text?

# Offer only numeric columns: the scatter plot needs quantiles of both variables.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
wg_colname1 = widgets.Dropdown(description='column name 1', options=numeric_cols)
wg_colname2 = widgets.Dropdown(description='column name 2', options=numeric_cols)
# The interactive output below passes `wg_nbins` to the plot function, so it is
# defined here as well; otherwise this cell only works after the 1D-plot cell
# has been run. The slider is not displayed for the 2D plot.
wg_nbins = widgets.IntSlider(description='n bins', min=3, max=100, value=20, step=1, continuous_update=False)
# Defaults 0 and 1 start with the full range of both variables.
wg_qlow = widgets.FloatSlider(description='lower quantile', min=0, max=0.3, value=0.0, step=0.01, continuous_update=False)
wg_qhigh = widgets.FloatSlider(description='upper quantile', min=0.7, max=1, value=1, step=0.01, continuous_update=False)

def make_plot(name, n_bins):
    """Print a one-line notice naming the column and the bin count; no plot is drawn here."""
    message = 'making plot of {} with {} bins'.format(name, n_bins)
    print(message)
# Two rows of controls: quantile sliders on top, the two column selectors below.
ui_quantiles = widgets.HBox(children=[wg_qlow, wg_qhigh])
ui_cols = widgets.HBox(children=[wg_colname1, wg_colname2])

# Each widget's current value is passed to plot_hist2D under the matching keyword;
# the plot is re-rendered into `out` whenever one of them changes.
out = widgets.interactive_output(
    plot_hist2D,
    dict(var1=wg_colname1, var2=wg_colname2, n_bins=wg_nbins, q_low=wg_qlow, q_high=wg_qhigh),
)
display(ui_quantiles, ui_cols, out)

# SANDBOX

In [None]:
# (rows, columns) of the working frame after the column filtering done at the top of the notebook.
df.shape

In [None]:
# Columns of the unfiltered frame whose name contains 'Warning'.
list(filter(lambda name: 'Warning' in name, df_orig.columns))

In [None]:
# Mean of the `alias_tpcItsMatchHighPtA_Warning` column for every run, computed in
# a single grouped pass instead of one full-frame query per run. Runs are printed
# in sorted order rather than in arbitrary set order.
warning_by_run = df_orig.groupby('run')['alias_tpcItsMatchHighPtA_Warning'].mean()
for xrun, aver in warning_by_run.items():
    # Flag runs whose mean lies strictly between 0 and 1, i.e. the column is not constant within the run.
    suffix = '\t<---------' if 0 < aver < 1 else ''
    print(f'\n {xrun} -- {aver} {suffix}') 