In [1]:
import os
import re
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections

from kolmov.core import ktable, kplot
from kolmov.core.constants import str_etbins_jpsiee, str_etabins
from itertools import product

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence


In [2]:
# Map "column name in the analysis table" -> "key path read from each tuned file".
# The entries are built as an ordered list of pairs so that the OrderedDict keeps
# the declared order by construction instead of relying on a dict literal.
# NOTE(review): the '#<n>' suffix presumably selects one element of the stored
# entry; the key names show '#1' feeding the *_passed columns and '#2' the
# *_total columns.
_working_points = ('tight', 'medium', 'loose', 'vloose')
_stages = ('ref', 'val', 'op')

_tuned_info_items = [
    # validation
    ('max_sp_val',    'summary/max_sp_val'),
    ('max_sp_pd_val', 'summary/max_sp_pd_val#0'),
    ('max_sp_fa_val', 'summary/max_sp_fa_val#0'),
    # operation
    ('max_sp_op',     'summary/max_sp_op'),
    ('max_sp_pd_op',  'summary/max_sp_pd_op#0'),
    ('max_sp_fa_op',  'summary/max_sp_fa_op#0'),
]

# pd / fa / sp of every cut-based reference (tight, medium, loose, very loose),
# for the ref, val and op stages.
for _wp in _working_points:
    _prefix = 'reference/%s_cutbased' % _wp
    for _stage in _stages:
        _tuned_info_items += [
            ('%s_pd_%s' % (_wp, _stage), '%s/pd_%s#0' % (_prefix, _stage)),
            ('%s_fa_%s' % (_wp, _stage), '%s/fa_%s#0' % (_prefix, _stage)),
            ('%s_sp_%s' % (_wp, _stage), '%s/sp_%s' % (_prefix, _stage)),
        ]

# counts (only stored for the tight reference)
for _stage in _stages:
    _tuned_info_items += [
        ('tight_pd_%s_passed' % _stage, 'reference/tight_cutbased/pd_%s#1' % _stage),
        ('tight_fa_%s_passed' % _stage, 'reference/tight_cutbased/fa_%s#1' % _stage),
        ('tight_pd_%s_total' % _stage,  'reference/tight_cutbased/pd_%s#2' % _stage),
        ('tight_fa_%s_total' % _stage,  'reference/tight_cutbased/fa_%s#2' % _stage),
    ]

# operation thresholds
_tuned_info_items += [
    ('%s_op_threshold' % _wp, 'reference/%s_cutbased/threshold_op' % _wp)
    for _wp in _working_points
]

tuned_info = collections.OrderedDict(_tuned_info_items)

In [3]:
# The three working directories come from the environment; a missing variable
# raises KeyError right here instead of failing later in the notebook.
base_path, tunes_path, analysis_path = (
    os.environ['DATA_PATH'],
    os.environ['TUNES_PATH'],
    os.environ['ANALYSIS_PATH'],
)
print('Variables path defined \n base: %s \n tunes: %s \n analysis: %s'
      % (base_path, tunes_path, analysis_path))

Variables path defined 
 base: /home/micael/Documents/NeuralRinger/jpsiee_data 
 tunes: /home/micael/Documents/NeuralRinger/jpsiee_tunes 
 analysis: /home/micael/Documents/NeuralRinger/jpsiee_analysis


In [4]:
# Table helper configured with the column -> key-path mapping defined above.
kt = ktable(tuned_info)

# Glob pattern matching every tuned file, three directory levels below the
# data17 tunes folder; all of them are loaded under the 'v1.data17' tag.
tuned_files_dir = os.path.join(tunes_path, 'data17_tuned_files')
kt.fill(tuned_files_dir + '/*/*/*.pic.gz', 'v1.data17')


2020-08-06 20:21:47,030 | Py.ktable                               INFO Reading file for v1.data17 tag...
2020-08-06 20:21:47,030 | Py.ktable                               INFO There are 1500 files for this task...
2020-08-06 20:21:47,031 | Py.ktable                               INFO Filling the table... 
2020-08-06 20:21:52,605 | Py.ktable                               INFO End of fill step, a pandas DataFrame was created...


In [5]:
# Fetch the filled table as a pandas DataFrame and preview its first five rows.
table = kt.get_pandas_table()
table.head()

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,tight_pd_val_total,tight_fa_val_total,tight_pd_op_passed,tight_fa_op_passed,tight_pd_op_total,tight_fa_op_total,tight_op_threshold,medium_op_threshold,loose_op_threshold,vloose_op_threshold
0,v1.data17,0,0,5,3,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944513,0.963457,...,2846,20500,27987,24750,28455,205005,-0.897117,-0.897117,-0.860319,-0.933001
1,v1.data17,0,0,6,3,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,1,0.944451,0.967323,...,2846,20500,27987,24702,28455,205005,-0.880173,-0.880173,-0.840672,-0.922216
2,v1.data17,0,0,7,3,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,2,0.944726,0.961701,...,2846,20500,27987,24688,28455,205005,-0.89182,-0.89182,-0.854874,-0.930133
3,v1.data17,0,0,8,3,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,3,0.945072,0.969079,...,2846,20500,27987,24922,28455,205005,-0.887782,-0.887782,-0.85244,-0.926832
4,v1.data17,0,0,5,1,2,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.943787,0.95397,...,2846,20500,27987,25457,28455,205005,-0.839659,-0.839659,-0.802934,-0.880908


### Get the best inits table

In [6]:
# Filter the initializations using the validation SP column ("max_sp_val");
# see kolmov's `filter_inits` for the exact selection rule.
best_inits = kt.filter_inits("max_sp_val")
best_inits.head(5)

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,tight_pd_val_total,tight_fa_val_total,tight_pd_op_passed,tight_fa_op_passed,tight_pd_op_total,tight_fa_op_total,tight_op_threshold,medium_op_threshold,loose_op_threshold,vloose_op_threshold
261,v1.data17,0,0,0,0,1,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.946916,0.95116,...,2846,20500,27987,23933,28455,205005,-0.892544,-0.892544,-0.852873,-0.938755
359,v1.data17,0,0,0,1,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944001,0.956079,...,2846,20500,27987,25498,28455,205005,-0.844698,-0.844698,-0.807455,-0.885452
80,v1.data17,0,0,0,2,1,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.9417,0.94624,...,2846,20500,27987,25581,28455,205005,-0.842062,-0.842062,-0.80697,-0.881531
283,v1.data17,0,0,0,3,4,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944651,0.968377,...,2846,20500,27987,24161,28455,205005,-0.885199,-0.885199,-0.845055,-0.928306
354,v1.data17,0,0,0,4,4,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944673,0.960998,...,2846,20500,27987,25467,28455,205005,-0.883971,-0.883971,-0.84843,-0.918506


# Create a boxplot 

In [7]:
# Metric column -> (label shown in the plot, short tag used in the output file name).
map_key_dict = dict([
    ('max_sp_val',    (r'$SP_{max}$ (Validation)', 'sp')),
    ('max_sp_pd_val', (r'$P_D$ (Validation)', 'pd')),
    ('max_sp_fa_val', (r'$F_A$ (Validation)', 'fa')),
    ('auc_val',       (r'AUC (Validation)', 'auc')),
])

# a simple helper function that makes it easier to plot all the needed measures
def create_cool_box_plot(df, key, mapped_key, output_name, tuning_flag):
    """Save a grid of box plots of one metric versus the number of hidden neurons.

    One panel is drawn per (E_T, eta) bin: rows are the E_T bins, columns are the
    eta bins, and every panel gets its own y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``model_idx``, ``et_bin``, ``eta_bin`` and ``key``.
    key : str
        Name of the column to plot.
    mapped_key : str
        Label that replaces ``key`` in the plot.
    output_name : str
        Short metric tag used in the output file name.
    tuning_flag : str
        Tuning tag appended to the output file name.

    Notes
    -----
    The figure is written to
    ``<analysis_path>/v1.data17/plots/box_plot_<output_name>_<tuning_flag>.png``
    (``analysis_path`` is the notebook-level variable) and then closed, so
    nothing is displayed inline.
    """
    # model index -> real number of neurons (index 0 is the 2-neuron model,
    # index 8 the 10-neuron one).
    neurons_by_idx = {idx: idx + 2 for idx in range(9)}
    et_labels = {idx: str_etbins_jpsiee[idx] for idx in range(3)}
    eta_labels = {idx: str_etabins[idx] for idx in range(5)}

    # Replace the bin/model indices by readable values and rename the columns
    # to the labels that should appear in the figure.
    plot_df = (df
               .replace({'model_idx': neurons_by_idx,
                         'et_bin': et_labels,
                         'eta_bin': eta_labels})
               .rename(columns={'model_idx': '# Neurons',
                                'et_bin': r'$E_T$',
                                'eta_bin': r'$\eta$',
                                key: mapped_key}))

    # `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed;
    # prefer the new name and fall back to the old one on older installations.
    facet_plot = getattr(sns, 'catplot', None)
    if facet_plot is None:
        facet_plot = sns.factorplot
    facet_plot(data=plot_df,
               x='# Neurons', y=mapped_key,
               col=r'$\eta$', row=r'$E_T$',
               kind='box', sharey=False)

    output_file = os.path.join(analysis_path,
                               'v1.data17/plots/box_plot_%s_%s.png' % (output_name, tuning_flag))
    # savefig does not create missing directories.
    output_dir = os.path.dirname(output_file)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    plt.savefig(output_file, dpi=300)
    plt.close()

In [8]:
# Box plot of the validation SP for every trained topology.
ikey = 'max_sp_val'
map_k, o_name = map_key_dict[ikey]
create_cool_box_plot(df=best_inits,
                     key=ikey,
                     mapped_key=map_k,
                     output_name=o_name,
                     tuning_flag='v1.data17.all_neurons')

In [9]:
# Topologies go from n_min to n_max hidden neurons; model_idx 0 is the
# n_min-neuron model.
n_min, n_max = 2, 10
model_add_tag = {idx: '.mlp%i' % (neuron)
                 for idx, neuron in enumerate(range(n_min, n_max + 1))}

# Suffix to append to each row's train_tag, e.g. '.mlp2' for model_idx 0.
topology_suffix = best_inits.model_idx.map(model_add_tag)
if topology_suffix.isnull().any():
    raise ValueError('model_idx outside the %i..%i neurons range' % (n_min, n_max))

# Drop any '.mlpN' suffix left by a previous run of this cell, so re-running it
# does not produce tags such as 'v1.data17.mlp2.mlp2'.
base_tag = best_inits.train_tag.str.replace(r'\.mlp\d+$', '', regex=True)

# `assign` returns a new frame, which avoids writing into a filtered view.
best_inits = best_inits.assign(train_tag=base_tag + topology_suffix)

In [10]:
# Preview the table: train_tag now carries the topology suffix.
best_inits.head(5)

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,tight_pd_val_total,tight_fa_val_total,tight_pd_op_passed,tight_fa_op_passed,tight_pd_op_total,tight_fa_op_total,tight_op_threshold,medium_op_threshold,loose_op_threshold,vloose_op_threshold
261,v1.data17.mlp2,0,0,0,0,1,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.946916,0.95116,...,2846,20500,27987,23933,28455,205005,-0.892544,-0.892544,-0.852873,-0.938755
359,v1.data17.mlp2,0,0,0,1,3,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944001,0.956079,...,2846,20500,27987,25498,28455,205005,-0.844698,-0.844698,-0.807455,-0.885452
80,v1.data17.mlp2,0,0,0,2,1,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.9417,0.94624,...,2846,20500,27987,25581,28455,205005,-0.842062,-0.842062,-0.80697,-0.881531
283,v1.data17.mlp2,0,0,0,3,4,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944651,0.968377,...,2846,20500,27987,24161,28455,205005,-0.885199,-0.885199,-0.845055,-0.928306
354,v1.data17.mlp2,0,0,0,4,4,/home/micael/Documents/NeuralRinger/jpsiee_tun...,0,0.944673,0.960998,...,2846,20500,27987,25467,28455,205005,-0.883971,-0.883971,-0.84843,-0.918506


**Note**: After applying the init filter, the table still contains every trained topology. In this example we focus on the models with 2, 5 and 10 neurons in the hidden layer. If you do not filter the table, the other models will also be used in the calculations, which can be a problem.

In [11]:
# Sanity check: the row count should equal (#topologies) x 15 x 10.
# NOTE(review): presumably 15 = 3 E_T bins x 5 eta bins and 10 = sorts per bin -- confirm.
expected_rows = best_inits.model_idx.nunique() * 15 * 10
best_inits.shape, expected_rows

((1350, 66), 1350)

In [12]:
# Keep only three topologies: 2, 5 and 10 neurons.
selected_tags = ['v1.data17.mlp2', 'v1.data17.mlp5', 'v1.data17.mlp10']

print('Dataframe size before model filter: %i' % (len(best_inits)))
best_inits = best_inits.loc[best_inits.train_tag.isin(selected_tags)]
print('Dataframe size after model filter: %i' % (len(best_inits)))

Dataframe size before model filter: 1350
Dataframe size after model filter: 450


In [13]:
# Same box plot as before, restricted to the three selected topologies.
ikey = 'max_sp_val'
map_k, o_name = map_key_dict[ikey]
create_cool_box_plot(df=best_inits,
                     key=ikey,
                     mapped_key=map_k,
                     output_name=o_name,
                     tuning_flag='v1.data17.selected_neurons')

In [14]:
# Persist the selected-topologies table (the DataFrame index is not written).
selected_info_csv = os.path.join(analysis_path,
                                 'v1.data17/files/selected_neurons_info_table.csv')
best_inits.to_csv(selected_info_csv, index=False)

### Get the Cross validation table

In [15]:
# Cross-validation summary of the selected tunings: the displayed table has one
# row per (train_tag, et_bin, eta_bin) with *_mean / *_std columns.
cv_table = kt.describe(best_inits)
cv_table.head(5)

Unnamed: 0,train_tag,et_bin,eta_bin,max_sp_val_mean,max_sp_val_std,max_sp_pd_val_mean,max_sp_pd_val_std,max_sp_fa_val_mean,max_sp_fa_val_std,max_sp_op_mean,...,tight_fa_op_total_mean,tight_fa_op_total_std,tight_op_threshold_mean,tight_op_threshold_std,medium_op_threshold_mean,medium_op_threshold_std,loose_op_threshold_mean,loose_op_threshold_std,vloose_op_threshold_mean,vloose_op_threshold_std
0,v1.data17.mlp2,0,0,0.945854,0.002518,0.958426,0.007121,0.066618,0.005874,0.944333,...,205005.0,0.0,-0.862378,0.035905,-0.862378,0.035905,-0.82366,0.034712,-0.903661,0.034384
1,v1.data17.mlp2,0,1,0.932883,0.004367,0.945623,0.010756,0.079739,0.007228,0.930188,...,171335.0,0.0,-0.895581,0.010817,-0.895581,0.010817,-0.940163,0.009558,-0.950781,0.010317
2,v1.data17.mlp2,0,2,0.890077,0.020242,0.905878,0.035211,0.125238,0.030917,0.870265,...,27116.0,0.0,-0.913143,0.006179,-0.919457,0.003749,-0.923564,0.002982,-0.925935,0.002324
3,v1.data17.mlp2,0,3,0.917786,0.005463,0.94456,0.009227,0.108573,0.009881,0.914631,...,229486.0,0.0,-0.913367,0.009927,-0.922209,0.009689,-0.922209,0.009689,-0.92354,0.009825
4,v1.data17.mlp2,0,4,0.92287,0.025178,0.974074,0.035136,0.126395,0.04735,0.895782,...,16670.0,0.0,-0.928051,0.008606,-0.928051,0.008606,-0.927018,0.008771,-0.928051,0.008606


In [16]:
# Persist the cross-validation table (the DataFrame index is not written).
cv_table_csv = os.path.join(analysis_path,
                            'v1.data17/files/selected_neurons_cv_table.csv')
cv_table.to_csv(cv_table_csv, index=False)

### Get the integrated table for v1 tags

In [17]:
# Integrated table for each selected tag, printed as percentages rounded to
# four decimals.
tags = ['v1.data17.mlp2', 'v1.data17.mlp5', 'v1.data17.mlp10']
separator = '-' * 15
for itag in tags:
    print('Integrating: %s' % (itag))
    int_table = kt.integrate(best_inits, itag)
    in_percent = (int_table.head() * 100).round(4)
    print(in_percent)
    print(separator)

Integrating: v1.data17.mlp2
      tight_pd_ref  tight_fa_ref  tight_pd_val  tight_fa_val  tight_pd_op  \
mean       97.7795       36.9054       97.7813       10.2992      97.7792   
std         0.0000        0.0000        0.0119        0.2970       0.0006   

      tight_fa_op  
mean      10.4958  
std        0.1102  
---------------
Integrating: v1.data17.mlp5
      tight_pd_ref  tight_fa_ref  tight_pd_val  tight_fa_val  tight_pd_op  \
mean       97.7795       36.9054       97.7869       10.3017      97.7793   
std         0.0000        0.0000        0.0126        0.3775       0.0004   

      tight_fa_op  
mean      10.4654  
std        0.1203  
---------------
Integrating: v1.data17.mlp10
      tight_pd_ref  tight_fa_ref  tight_pd_val  tight_fa_val  tight_pd_op  \
mean       97.7795       36.9054       97.7841       10.2748      97.7788   
std         0.0000        0.0000        0.0063        0.3206       0.0006   

      tight_fa_op  
mean      10.4682  
std        0.0920  
-------

### Create the Beamer Presentation

In [18]:
# Bin edges used by the report tables.
# NOTE(review): E_T edges are presumably in GeV -- confirm.
# Z->ee E_T edges (not used in the cells shown here).
zee_et_lims = [15, 20, 25, 30, 40, 50]
# J/psi->ee E_T edges: three bins.
jpsiee_et_lims = [4, 7, 10, 15]
# eta edges: five bins.
eta_lims = [0, 0.8, 1.37, 1.54, 2.37, 2.5]

In [22]:
# Write the beamer (LaTeX) report for the tight operation point, using the
# J/psi->ee E_T edges and the eta edges.
# NOTE(review): doPDF=False presumably skips compiling the .tex into a PDF -- confirm.
beamer_options = dict(
    pandas_best_inits=best_inits,
    etbins=jpsiee_et_lims,
    etabins=eta_lims,
    operation_points=['tight'],
    output_file_name='v1.data17',
    doPDF=False,
)
kt.dump_beamer_table(**beamer_options)

2020-08-05 13:39:37,610 | Py.BeamerTexReportTemplate1             INFO Started creating beamer file v1.data17.tex latex code...


### Create monitoring plots

In [19]:
tags = ['v1.data17.mlp2', 'v1.data17.mlp5', 'v1.data17.mlp10']

# Every tag is dumped into the same history directory.
output_path = os.path.join(analysis_path, 'v1.data17/history')
for itag in tags:
    print('Dumping history: %s' % (itag))
    # dump the whole v1 training history for the given train tag
    kt.dump_all_history(best_inits, output_path, itag)

Dumping history: v1.data17.mlp2
Dumping history: v1.data17.mlp5
Dumping history: v1.data17.mlp10


In [20]:
# Plot the monitoring (training) curves of every (E_T, eta) bin for the three
# selected topologies.
# Model name -> model index (index = number of neurons - 2).
aux_dict = {'mlp2' : 0, 'mlp5' : 3, 'mlp10' : 8}

history_path = os.path.join(analysis_path, 'v1.data17/history')
for imodel, idx in aux_dict.items():
    # one plotting tool per topology, labelled with the J/psi->ee E_T bins
    kplot_tool = kplot(history_path, idx, str_et_bins=str_etbins_jpsiee)
    # each topology gets its own output folder
    plot_path = os.path.join(analysis_path, 'v1.data17/plots/%s' % (imodel))
    # 3 E_T bins x 5 eta bins
    for iet in range(3):
        for ieta in range(5):
            plot_name = 'monitoring_v1.data17_et%i_eta%i' % (iet, ieta)
            kplot_tool.plot_training_curves(iet, ieta, plot_path, plot_name)

2020-08-06 20:22:45,709 | Py.kplot                                INFO Reading 450 files...
2020-08-06 20:24:08,053 | Py.kplot                                INFO Reading 450 files...
2020-08-06 20:25:22,361 | Py.kplot                                INFO Reading 450 files...
