# Calculate the Cross-validation values after Linear correction

The training and validation sets come from 2017 data.
The test set comes from 2018 data.

In [1]:
from kolmov import crossval_table, test_table, fit_table, get_color_fader, fit_table
from Gaugi import mkdir_p
from kepler import load as kload
from sklearn.model_selection import StratifiedKFold, KFold
from Gaugi import load as gload


from itertools import product
from tqdm import tqdm
import tensorflow as tf

import saphyra
import numpy as np
import pandas as pd
import collections
import os
import matplotlib
import matplotlib.pyplot as plt
from pprint import pprint
from copy import deepcopy, copy
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence

Applying ATLAS style settings...


### 1) Open all models:

In [2]:

def create_op_dict(op):
    """Build the key -> storage-path map for one working point.

    For the given working-point name `op` (e.g. 'tight'), every entry maps a
    column name such as `<op>_pd_ref` to a path of the form
    `reference/<op>_cutbased/<metric>[#slot]`.  Slot `#0` is used for the
    pd/fa values, `#1` for the passed counts and `#2` for the total counts;
    the sp entries carry no slot suffix.

    The insertion order (values first, then counts) is kept because the
    result is merged into an ordered mapping.
    """
    base = "reference/" + op + "_cutbased/"
    stages = ('ref', 'val', 'op')
    mapping = {}

    # Values: pd, fa and sp for each stage.
    for stage in stages:
        mapping[op + '_pd_' + stage] = base + 'pd_' + stage + '#0'
        mapping[op + '_fa_' + stage] = base + 'fa_' + stage + '#0'
        mapping[op + '_sp_' + stage] = base + 'sp_' + stage

    # Counts
    for stage in stages:
        for suffix, slot in (('_passed', '#1'), ('_total', '#2')):
            for rate in ('pd', 'fa'):
                mapping[op + '_' + rate + '_' + stage + suffix] = base + rate + '_' + stage + slot

    return mapping

# Ordered map from summary column names to their storage paths.
tuned_info = collections.OrderedDict([
    # validation
    ("max_sp_val",    'summary/max_sp_val'),
    ("max_sp_pd_val", 'summary/max_sp_pd_val#0'),
    ("max_sp_fa_val", 'summary/max_sp_fa_val#0'),
    # Operation
    ("max_sp_op",     'summary/max_sp_op'),
    ("max_sp_pd_op",  'summary/max_sp_pd_op#0'),
    ("max_sp_fa_op",  'summary/max_sp_fa_op#0'),
])

# Working points whose reference entries are appended to tuned_info.
pidnames = ['tight', 'medium', 'loose', 'vloose']

# Extend the map with the per-working-point keys, in pidnames order.
for pid in pidnames:
    tuned_info.update(create_op_dict(pid))

In [3]:
# Bin edges passed to the table helpers below (units are not stated in this notebook).
etbins = [15, 20, 30, 40, 50, 1_000_000]
etabins = [0.0, 0.8, 1.37, 1.54, 2.37, 2.5]

In [4]:
# Instantiate kolmov's crossval_table with the tuned_info key map and the et/eta bin edges.
cv  = crossval_table( tuned_info, etbins = etbins , etabins = etabins )

In [5]:
# Read the previously exported v8 table from CSV (presumably populates cv's internal table — confirm in kolmov).
cv.from_csv('output/crossval/table_v8.csv')

### 1.1) Get best inits and sorts:

In [6]:
# Filter initializations using the "max_sp_val" key (presumably the best init per sort — confirm in kolmov).
best_inits = cv.filter_inits("max_sp_val")
best_inits = best_inits.loc[(best_inits.model_idx==3)] # 5 neurons for all phase spaces
# Filter sorts among the selected inits using the 'max_sp_op' key.
best_sorts = cv.filter_sorts( best_inits , 'max_sp_op')
# Preview the selected initializations.
best_inits.head()

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,vloose_pd_ref_total,vloose_fa_ref_total,vloose_pd_val_passed,vloose_fa_val_passed,vloose_pd_val_total,vloose_fa_val_total,vloose_pd_op_passed,vloose_fa_op_passed,vloose_pd_op_total,vloose_fa_op_total
303,v8,0,0,3,0,4,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.978118,0.982948,...,232819,187639,23015,683,23282,18764,230153,7089,232819,187639
318,v8,0,0,3,1,7,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.975846,0.984237,...,232819,187639,23016,740,23282,18764,230153,7040,232819,187639
322,v8,0,0,3,2,2,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.975065,0.983421,...,232819,187639,23014,752,23282,18764,230153,6963,232819,187639
331,v8,0,0,3,3,2,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.976284,0.984795,...,232819,187639,23015,710,23282,18764,230153,6956,232819,187639
345,v8,0,0,3,4,5,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.977574,0.983464,...,232819,187639,23015,661,23282,18764,230153,6956,232819,187639


In [7]:
# Fetch the models for the selected sorts.
# NOTE(review): the meaning of remove_last / with_history is defined by kolmov, not shown here — confirm.
best_sort_models = cv.get_best_models(best_sorts, remove_last=True, with_history=False)

### 1.2) Get threshold configurations:

In [8]:
# Load the threshold configuration table (offset/slope columns per working point, per et/eta bin).
config_table = pd.read_csv('output/fitting/config_v8_table.csv')

In [9]:
# Preview the first rows of the threshold configuration.
config_table.head()

Unnamed: 0.1,Unnamed: 0,et_bin,eta_bin,sort,init,file_name,min_avgmu,max_avgmu,model_idx,tight_offset,tight_slope,medium_offset,medium_slope,loose_offset,loose_slope,vloose_offset,vloose_slope
0,0,0,0,5,2,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,16,100,3,0.72258,-0.012027,0.692966,-0.011851,0.248589,-0.010817,0.206654,-0.010792
1,1,0,1,5,0,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,16,100,3,0.442392,-0.012948,0.44186,-0.012945,0.094706,-0.012172,0.04197,-0.012568
2,2,0,2,4,8,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,16,100,3,0.70816,-0.01495,0.461989,-0.015294,0.361129,-0.016592,0.331879,-0.016387
3,3,0,3,1,3,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,16,100,3,0.64927,-0.009891,0.592983,-0.009787,0.467367,-0.01109,0.465764,-0.011102
4,4,1,0,2,3,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,16,100,3,1.652782,-0.015636,1.541345,-0.015422,1.02749,-0.014014,0.940379,-0.013809


### Rerun best inits:

In [10]:
def my_model_v8_generator( df ):
    """Build the model input list for the 'v8' tag from an event dataframe.

    Reads the 100 columns `trig_L2_cl_ring_0` .. `trig_L2_cl_ring_99`, casts
    them to float32 and divides every row by the absolute value of its sum.
    Rows whose sum is exactly zero are divided by 1 (left unchanged).

    Returns a one-element list holding the normalised (n_events, 100) array.
    """
    ring_columns = ['trig_L2_cl_ring_%d' % idx for idx in range(100)]
    rings = df[ring_columns].values.astype(np.float32)

    # Per-row |sum|, kept as a column vector so it broadcasts over the rings.
    row_norms = np.abs(rings.sum(axis=1, keepdims=True))
    # Avoid dividing by zero for all-zero (or exactly cancelling) rows.
    row_norms[row_norms == 0] = 1

    return [rings / row_norms]


# Input builders keyed by training tag.
model_generators = {
    'v8': my_model_v8_generator,
}

In [11]:
def my_data_generator( path ):
    """Load one event file with kepler's `load` and apply the offline selection.

    Rows with `target == 1.0` are kept only when their `el_lhmedium` flag is
    True; rows with `target == 0` are always kept.  Any other row is dropped.
    """
    offline_pid = 'el_lhmedium'
    events = kload(path)
    # NOTE: Offline filter lhvloose -> lhmedium (as the training procedure)
    signal_mask = (events[offline_pid] == True) & (events.target == 1.0)
    background_mask = events.target == 0
    return events.loc[signal_mask | background_mask]

### Open 2017 data:

In [12]:
# Template of the per-bin data files: directory followed by the file name with ET/ETA placeholders.
path = (
    '/home/jodafons/public/cern_data/new_files/data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins/'
    'data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins_et{ET}_eta{ETA}.npz'
)
# paths[et][eta] -> file for that bin (5 x 5 grid).
paths = [
    [path.format(ET=et_idx, ETA=eta_idx) for eta_idx in range(5)]
    for et_idx in range(5)
]

In [13]:
# 10-fold stratified splitter, shuffled with a fixed seed so the folds are reproducible.
kf = StratifiedKFold(n_splits=10, random_state=512, shuffle=True)

In [16]:
# Build kolmov's test_table helper from the splitter, the data/model generators and the bin edges.
cv_test = test_table.test_table( kf, my_data_generator, model_generators, etbins, etabins )

In [21]:
# Restrict the check to the first et bin.
best_inits_test = best_inits.loc[best_inits.et_bin==0]
# NOTE(review): this rebinds `best_inits`, which later cells also read; the recorded run of this
# cell stopped with "IndexError: index 0 is out of bounds" at 4/5 bins, so the assignment did not complete.
best_inits = cv_test.update( best_inits_test, paths, config_table )

 80%|████████  | 4/5 [00:38<00:09,  9.63s/it]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [30]:

# Recompute the table for the best inits with kolmov's fit_table.update_best_inits, using the threshold config.
table = fit_table.update_best_inits( best_inits, paths, my_data_generator, model_generators, kf, pidnames , config_table)


Applying ATLAS style settings...


100%|██████████| 5/5 [00:41<00:00,  8.28s/it]


In [51]:
# NOTE(review): rebinds `kf` from StratifiedKFold to a plain KFold; the execution count of this
# cell is out of order, so which splitter earlier results used depends on run order — confirm.
kf = KFold(n_splits=10, random_state=512, shuffle=True)

In [52]:
# Show the (train, validation) index pairs produced for a toy sequence of 10 items.
list(kf.split(list(range(10))))

[(array([0, 1, 2, 3, 5, 6, 7, 8, 9]), array([4])),
 (array([0, 1, 2, 3, 4, 5, 7, 8, 9]), array([6])),
 (array([0, 1, 2, 3, 4, 5, 6, 8, 9]), array([7])),
 (array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([9])),
 (array([0, 1, 2, 3, 4, 6, 7, 8, 9]), array([5])),
 (array([1, 2, 3, 4, 5, 6, 7, 8, 9]), array([0])),
 (array([0, 1, 2, 3, 4, 5, 6, 7, 9]), array([8])),
 (array([0, 1, 3, 4, 5, 6, 7, 8, 9]), array([2])),
 (array([0, 2, 3, 4, 5, 6, 7, 8, 9]), array([1])),
 (array([0, 1, 2, 4, 5, 6, 7, 8, 9]), array([3]))]

In [31]:
# Preview the first rows of the recomputed table.
table.head()

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,vloose_pd_ref_total,vloose_fa_ref_total,vloose_pd_val_passed,vloose_fa_val_passed,vloose_pd_val_total,vloose_fa_val_total,vloose_pd_op_passed,vloose_fa_op_passed,vloose_pd_op_total,vloose_fa_op_total
0,v8,0,0,3,0,4,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.978118,0.982948,...,232819,187639,23034,728,23282,18764,230297,7177,232818,187639
1,v8,0,0,3,1,7,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.975846,0.984237,...,232819,187639,23033,719,23282,18764,230298,7152,232818,187639
2,v8,0,0,3,2,2,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.975065,0.983421,...,232819,187639,23039,687,23282,18764,230321,7114,232818,187639
3,v8,0,0,3,3,2,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.976284,0.984795,...,232819,187639,23046,737,23282,18764,230318,7119,232818,187639
4,v8,0,0,3,4,5,/home/jodafons/public/tuning_data/Zee/v8/r0/us...,0,0.977574,0.983464,...,232819,187639,23019,701,23282,18764,230232,6941,232818,187639


In [32]:
# NOTE(review): the recorded run of this cell raised "ValueError: arrays must all be same length".
summary = cv.describe(table)

ValueError: arrays must all be same length

In [None]:



def _sp_index(pd_rate, fa_rate):
    """Return sqrt( sqrt(pd*(1-fa)) * 0.5*(pd+(1-fa)) ) for a detection/false-alarm pair."""
    return np.sqrt(np.sqrt(pd_rate * (1 - fa_rate)) * (0.5 * (pd_rate + (1 - fa_rate))))


def _count_passed(events, target):
    """Return (passed, total) for the rows of `events` whose `target` equals the given value."""
    is_class = events['target'] == target
    total = int(is_class.sum())
    # Parenthesised on purpose: `&` binds tighter than `==` in Python.
    passed = int((is_class & (events['decision'] == True)).sum())
    return passed, total


def update(best_inits, paths, data_generator, model_generator, kf, op_names, config_df):
    """Recompute the op/val efficiency columns of the best inits with linear thresholds.

    For every (et_bin, eta_bin, train_tag, sort) found in `best_inits` the
    matching model is evaluated on the bin's events, the threshold
    `slope * avgmu + offset` of each working point is applied, and the
    pd/fa/sp values and counts are recomputed for the full sample (`_op`)
    and for the validation fold of that sort (`_val`).  The summary and
    reference columns are copied unchanged from `best_inits`.

    Parameters
    ----------
    best_inits : pandas.DataFrame
        One row per (et_bin, eta_bin, train_tag, sort).  Its columns define
        the columns of the returned table.
    paths : nested sequence
        `paths[et_bin][eta_bin]` is handed to `data_generator`.
    data_generator : callable
        Returns the event dataframe of a bin; it must expose `avgmu` and
        `target` columns.
    model_generator : mapping
        `model_generator[train_tag](events)` builds the model input.
    kf : splitter
        Object with `split(X, y)`; the validation indices of fold `sort`
        are used.
    op_names : iterable of str
        Working-point names; `config_df` must hold `<op>_slope` and
        `<op>_offset` columns for each of them.
    config_df : pandas.DataFrame
        Threshold configuration with `et_bin` and `eta_bin` columns.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated (bin, train_tag, sort).

    Raises
    ------
    IndexError
        When `config_df` has no row for a bin present in `best_inits`.
    ZeroDivisionError
        When a class has no events in the full sample or validation fold.

    Notes
    -----
    `get_models` is not defined in this notebook; it must be available in
    the namespace where this function lives (presumably a kolmov helper
    returning a sequence indexable by sort, each item holding a 'model'
    entry — TODO confirm).  pandas raises when a column of `best_inits` is
    never filled here, because the collected lists then differ in length.
    """
    # Columns copied as-is from the selected row.
    copied_columns = ['max_sp_fa_op', 'max_sp_fa_val', 'max_sp_op', 'max_sp_pd_op',
                      'max_sp_pd_val', 'max_sp_val', 'file_name', 'sort', 'init',
                      'model_idx', 'tuned_idx']
    # Reference columns (per working point) copied as-is from the selected row.
    ref_suffixes = ['_fa_ref_passed', '_fa_ref_total', '_pd_ref_total', '_pd_ref_passed',
                    '_pd_ref', '_fa_ref', '_sp_ref']

    # create the final dataframe
    dataframe = {key: [] for key in best_inits.columns.values}

    def add(key, value):
        dataframe[key].append(value)

    bins = list(product(best_inits.et_bin.unique(), best_inits.eta_bin.unique()))

    for et_bin, eta_bin in tqdm(bins):

        # The product above may contain bins that have no rows at all.
        bin_inits = best_inits.loc[(best_inits.et_bin == et_bin) & (best_inits.eta_bin == eta_bin)]
        if bin_inits.empty:
            continue

        # Threshold configuration of this bin; fail early with a readable message.
        _config = config_df.loc[(config_df.et_bin == et_bin) & (config_df.eta_bin == eta_bin)]
        if len(op_names) and _config.empty:
            raise IndexError('no threshold configuration for et_bin=%s, eta_bin=%s' % (et_bin, eta_bin))

        # open event dataframe (used to update op and val columns)
        data_df = data_generator(paths[et_bin][eta_bin])

        for train_tag in best_inits.train_tag.unique():

            tag_inits = bin_inits.loc[bin_inits.train_tag == train_tag]
            if tag_inits.empty:
                continue

            # recover all models for this bin (only best inits)
            models = get_models(tag_inits)

            # Bracket access: `sort` as an attribute is fragile on DataFrames.
            for sort in tag_inits['sort'].unique():

                row = tag_inits.loc[tag_inits['sort'] == sort]

                add('et_bin', et_bin)
                add('eta_bin', eta_bin)
                add('train_tag', train_tag)
                for key in copied_columns:
                    add(key, row[key].values[0])

                model = models[sort]['model']
                output = model.predict(model_generator[train_tag](data_df), batch_size=1024).flatten()

                temp_df = pd.DataFrame({'output': output,
                                        'avgmu': data_df.avgmu.values,
                                        'target': data_df.target.values})

                # Validation fold associated with this sort.
                splits = list(kf.split(temp_df, temp_df.target))
                val_idx = splits[sort][1]

                for op in op_names:

                    slope = _config[op + '_slope'].values[0]
                    offset = _config[op + '_offset'].values[0]
                    # NOTE: avgmu is not clipped to the config's min_avgmu/max_avgmu
                    # (the clipping was already disabled in the original code).
                    thr = slope * temp_df.avgmu.values + offset
                    temp_df['decision'] = np.greater(temp_df.output.values, thr)

                    # reference
                    for suffix in ref_suffixes:
                        add(op + suffix, row[op + suffix].values[0])

                    # 'op' uses every event, 'val' only the validation fold
                    # (val_idx is positional, hence iloc).
                    for stage, events in (('op', temp_df), ('val', temp_df.iloc[val_idx])):

                        passed, total = _count_passed(events, 1)
                        add(op + '_pd_' + stage + '_passed', passed)
                        add(op + '_pd_' + stage + '_total', total)
                        _pd = passed / total
                        add(op + '_pd_' + stage, _pd)

                        passed, total = _count_passed(events, 0)
                        add(op + '_fa_' + stage + '_passed', passed)
                        add(op + '_fa_' + stage + '_total', total)
                        _fa = passed / total
                        add(op + '_fa_' + stage, _fa)

                        add(op + '_sp_' + stage, _sp_index(_pd, _fa))

    table = pd.DataFrame(dataframe)
    return table