In [None]:
import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
from time import time

import numba

# import helper.preprocessing.apply_cut
from helper.preprocessing import add_sorting_index, add_sorted_col, add_nth_val, apply_cut
from helper.utils import convert_float64_to_float32

In [None]:
# Plot / display configuration for the whole notebook.
plt.rcParams['font.size'] = 16
CACHE = {}

# Show wide frames in full: up to 200 columns, cell text truncated at 80 characters.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 80)

In [None]:
# Directory with the analysis output; its entries are listed at the end of this cell
# and each 'ptbin*' entry is later opened as <entry>/AnalysisResults.root.
data_dir = '../ana_results/iter2/LHC16h3/'
# Common prefix of the jet tree names; a suffix such as 'bJets' is appended on lookup.
tree_name_core = 'JetTree_AliAnalysisTaskJetExtractor_Jet_AKTChargedR040_tracks_pT0150_E_scheme_'
os.listdir(data_dir)

# def `add_features()` function
see: https://github.com/sbysiak/HF-jets/blob/master/analyses/helper/preprocessing/nested_features.py for docs

__JET:__ 'Jet_Pt', 'Jet_Phi', 'Jet_Eta', 'Jet_Area', 'Jet_NumTracks',  

__EVENT:__ 'Event_BackgroundDensity', 'Event_BackgroundDensityMass', 'Event_Vertex_X', 'Event_Vertex_Y', 'Event_Vertex_Z',  
'Event_Centrality', 'Event_Multiplicity', 'Event_ID', 'Event_MagneticField', 'Event_PtHard', 'Event_Weight', 'Event_ImpactParameter', 

__TRACKS:__
'Jet_Track_Pt', 'Jet_Track_Phi', 'Jet_Track_Eta', 'Jet_Track_Charge', 'Jet_Track_Label',   
'Jet_Track_IPd', 'Jet_Track_IPz', 'Jet_Track_CovIPd', 'Jet_Track_CovIPz', 'Jet_Track_ProdVtx_X', 'Jet_Track_ProdVtx_Y', 'Jet_Track_ProdVtx_Z',
       
__TRACKS-PID:__ 'Jet_Track_PID_ITS', 'Jet_Track_PID_TPC', 'Jet_Track_PID_TOF', 'Jet_Track_PID_TRD', 'Jet_Track_PID_Reconstructed', 'Jet_Track_PID_Truth', 

__SHAPE:__
'Jet_Shape_Mass_NoCorr', 'Jet_Shape_Mass_DerivCorr_1', 'Jet_Shape_Mass_DerivCorr_2',  
'Jet_Shape_pTD_DerivCorr_1',  'Jet_Shape_pTD_DerivCorr_2', 'Jet_Shape_LeSub_NoCorr', 'Jet_Shape_LeSub_DerivCorr',  
'Jet_Shape_Angularity', 'Jet_Shape_Angularity_DerivCorr_1', 'Jet_Shape_Angularity_DerivCorr_2',  
'Jet_Shape_Circularity_DerivCorr_1', 'Jet_Shape_Circularity_DerivCorr_2', 'Jet_Shape_Sigma2_DerivCorr_1', 'Jet_Shape_Sigma2_DerivCorr_2',   
'Jet_Shape_NumTracks_DerivCorr', 'Jet_Shape_MomentumDispersion', 'Jet_Shape_TrackPtMean',   'Jet_Shape_TrackPtMedian', 
       
__SPLITTINGS:__
'Jet_NumSplittings', 'Jet_Splitting_Theta', 'Jet_Splitting_RadiatorE', 'Jet_Splitting_kT', 'Jet_Splitting_SecVtx_Rank', 'Jet_Splitting_SecVtx_Index',
       
__MC:__ 'Jet_MC_MotherParton', 'Jet_MC_MotherHadron', 'Jet_MC_MotherIC', 'Jet_MC_TruePtFraction', 'Jet_MC_TruePtFraction_PartLevel',
  
__SEC. VERTEX:__ 'Jet_NumSecVertices', 'Jet_SecVtx_X', 'Jet_SecVtx_Y', 'Jet_SecVtx_Z', 'Jet_SecVtx_Mass', 'Jet_SecVtx_Lxy', 'Jet_SecVtx_SigmaLxy', 'Jet_SecVtx_Chi2', 'Jet_SecVtx_Dispersion', 

__TRIGGER:__ 'Jet_NumTriggerTracks', 'Jet_TriggerTrack_Pt', 'Jet_TriggerTrack_dEta', 'Jet_TriggerTrack_dPhi', 'Jet_Track_IPdNsigma', 'Jet_SecVtx_LxyNSigma',

In [None]:
def add_features(df):
    """Turn per-jet array columns (tracks, secondary vertices) into flat scalar features.

    Steps, in order:
      1. add per-track columns relative to the jet axis
         (DeltaPhi, DeltaEta, DeltaR, PtFrac);
      2. replace the impact-parameter covariances by their square roots
         (IPdSigma, IPzSigma) and add IPd/IPz absolute values and significances;
      3. add the secondary-vertex significance Jet_SecVtx_LxyNsigma;
      4. sort the per-track columns by Jet_Track_IPdNsigmaAbs and the per-vertex
         columns by Jet_SecVtx_LxyNsigma (both 'desc') and extract the first 10
         entries of every sorted column via the project helpers
         add_sorting_index / add_sorted_col / add_nth_val
         (their exact semantics live in helper.preprocessing -- see the link in
         the markdown cell above);
      5. keep only the columns whose value in the first row is not iterable
         (strings are kept).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per jet. Must contain Jet_Pt, Jet_Phi, Jet_Eta, the per-track
        columns Jet_Track_{Pt,Phi,Eta,IPd,IPz,CovIPd,CovIPz} and
        Jet_SecVtx_{Lxy,SigmaLxy}. Other columns listed in `track_params` /
        `sv_params` below are used only if present. Must have at least one row,
        because the final column selection inspects `df.iloc[0]`.

    Returns
    -------
    pandas.DataFrame
        New frame with the scalar columns only; the input frame is not modified.
    """

    def subtract_phi(phi1, phi2):
        """Return phi1 - phi2 wrapped into [-pi, pi] (NaN stays NaN)."""
        diff = phi1 - phi2
        if diff > np.pi:
            return diff - 2*np.pi
        if diff < -np.pi:
            return diff + 2*np.pi
        return diff

    def subtract_eta(eta1, eta2):
        """Return eta1 - eta2 (no wrapping needed for pseudorapidity)."""
        return eta1 - eta2

    # Work on a copy: the column assignments below would otherwise be written into
    # the caller's frame (which is typically the result of `.query(...)`).
    df = df.copy()

    ### add custom per-track features (arrays, one entry per track)
    df['Jet_Track_DeltaPhi'] = df.apply(lambda row: np.array([subtract_phi(tr_phi, row['Jet_Phi']) for tr_phi in row['Jet_Track_Phi']]), axis=1)
    df['Jet_Track_DeltaEta'] = df.apply(lambda row: np.array([subtract_eta(tr_eta, row['Jet_Eta']) for tr_eta in row['Jet_Track_Eta']]), axis=1)
    df['Jet_Track_DeltaR']   = df.apply(lambda row: np.array([np.sqrt(dphi**2 + deta**2) for dphi, deta in zip(row['Jet_Track_DeltaPhi'], row['Jet_Track_DeltaEta'])]), axis=1)
    df['Jet_Track_PtFrac']   = df.apply(lambda row: np.array([(tr_pt/row['Jet_Pt']) for tr_pt in row['Jet_Track_Pt']]), axis=1)

    ### impact-parameter uncertainties and significances
    df['Jet_Track_IPdSigma'] = df['Jet_Track_CovIPd'].pow(0.5)
    df['Jet_Track_IPzSigma'] = df['Jet_Track_CovIPz'].pow(0.5)
    df = df.drop(columns=['Jet_Track_CovIPd', 'Jet_Track_CovIPz'])

    df['Jet_Track_IPdAbs']       = abs(df['Jet_Track_IPd'])
    df['Jet_Track_IPzAbs']       = abs(df['Jet_Track_IPz'])
    df['Jet_Track_IPdNsigma']    = df['Jet_Track_IPd'] / df['Jet_Track_IPdSigma']
    df['Jet_Track_IPzNsigma']    = df['Jet_Track_IPz'] / df['Jet_Track_IPzSigma']
    df['Jet_Track_IPdNsigmaAbs'] = df['Jet_Track_IPdAbs'] / df['Jet_Track_IPdSigma']
    df['Jet_Track_IPzNsigmaAbs'] = df['Jet_Track_IPzAbs'] / df['Jet_Track_IPzSigma']

    # Optional pt-dependent cut flag on IPdSigma (disabled):
    # def cut_val(track_pt):
    #     return 0.004444561*track_pt**(-0.4790711) if track_pt < 10 else 0.0015
    #
    # df['Jet_Track_CutIPdSigmaVSPt'] = df.apply(lambda row:
    #                                     np.array([int(ipd_sigma < cut_val(pt))  for ipd_sigma, pt in zip(row['Jet_Track_IPdSigma'], row['Jet_Track_Pt'])]),
    #                                     axis=1
    #                                   )

    ### secondary-vertex significance
    df['Jet_SecVtx_LxyNsigma'] = df['Jet_SecVtx_Lxy'] / df['Jet_SecVtx_SigmaLxy']

    ### create index cols
    track_sorting_var = 'IPdNsigmaAbs'
    sv_sorting_var    = 'LxyNsigma'
    add_sorting_index(df, f'Jet_Track_{track_sorting_var}', 'desc')
    add_sorting_index(df, f'Jet_SecVtx_{sv_sorting_var}', 'desc')

    ### apply cuts a.k.a. filter index cols (all disabled in this configuration)
    # apply_cut(df, 'Jet_Track_IPdNsigmaAbs < 50', track_sorting_var, 'desc')
    # apply_cut(df, 'Jet_Track_Pt > 0.5', track_sorting_var, 'desc')
    # apply_cut(df, 'Jet_Track_CutIPdSigmaVSPt < 0.5', track_sorting_var, 'desc')
    # apply_cut(df, 'Jet_SecVtx_Chi2 < 10' ,'LxyNsigma', 'desc')
    # apply_cut(df, 'Jet_SecVtx_Dispersion < 0.01' ,'LxyNsigma', 'desc')
    # apply_cut(df, 'Jet_SecVtx_SigmaLxy < 0.1' ,'LxyNsigma', 'desc')

    ### create sorted cols
    # Candidate columns; only those actually present in df are used (see filter below).
    # Jet_Track_CovIPd / Jet_Track_CovIPz are not listed: they were dropped above.
    track_params = ['Jet_Track_Pt', 'Jet_Track_Phi', 'Jet_Track_Eta',
                    'Jet_Track_DeltaPhi', 'Jet_Track_DeltaEta', 'Jet_Track_PtFrac', 'Jet_Track_DeltaR',
                    'Jet_Track_Charge', 'Jet_Track_Label',
                    'Jet_Track_IPd', 'Jet_Track_IPz',
                    'Jet_Track_ProdVtx_X', 'Jet_Track_ProdVtx_Y', 'Jet_Track_ProdVtx_Z',

                    'Jet_Track_PID_ITS', 'Jet_Track_PID_TPC', 'Jet_Track_PID_TOF', 'Jet_Track_PID_TRD',
                    'Jet_Track_PID_Reconstructed', 'Jet_Track_PID_Truth',

                    'Jet_Track_IPdAbs'      , 'Jet_Track_IPzAbs',
                    'Jet_Track_IPdSigma'    , 'Jet_Track_IPzSigma',
                    'Jet_Track_IPdNsigma'   , 'Jet_Track_IPzNsigma',
                    'Jet_Track_IPdNsigmaAbs', 'Jet_Track_IPzNsigmaAbs',
                   ]

    sv_params    = ['Jet_SecVtx_X', 'Jet_SecVtx_Y', 'Jet_SecVtx_Z',
                    'Jet_SecVtx_Mass',
                    'Jet_SecVtx_Lxy', 'Jet_SecVtx_SigmaLxy', 'Jet_SecVtx_Chi2', 'Jet_SecVtx_Dispersion', 'Jet_SecVtx_LxyNsigma',
                   ]

    track_params = [par for par in track_params if par in df.columns]
    sv_params    = [par for par in sv_params    if par in df.columns]

    for param in track_params:
        add_sorted_col(df, param, track_sorting_var, 'desc')

    for param in sv_params:
        add_sorted_col(df, param, sv_sorting_var, 'desc')

    ### extract n-th value from sorted cols
    n_tracks, n_sv = 10, 10
    for param in track_params:
        for i in range(n_tracks):
            add_nth_val(df, col_name=f'{param}__sortby__{track_sorting_var}__desc', n=i, fillna=None)

    for param in sv_params:
        for i in range(n_sv):
            add_nth_val(df, col_name=f'{param}__sortby__{sv_sorting_var}__desc', n=i, fillna=None)

    ### drop temporary columns, i.e. those containing arrays, like 'Index__*' as well as
    ### initial columns used for extraction, like 'Jet_Track_Pt'.
    # The decision is taken from the first row only: a column is kept when its value
    # there is not iterable, or is a string.
    columns_to_keep = [col for col, val in zip(df.columns, df.iloc[0]) if not hasattr(val, '__iter__') or isinstance(val, str)]
    return df[columns_to_keep]


In [None]:
# Branches read for the single-file trial run below; disabled groups are kept as comments.
training_cols = [
    # jet-level
    'Jet_Pt', 'Jet_Phi', 'Jet_Eta', 'Jet_Area', 'Jet_NumTracks',
    # tracks
    'Jet_Track_Pt', 'Jet_Track_Phi', 'Jet_Track_Eta',
    'Jet_Track_IPd', 'Jet_Track_IPz', 'Jet_Track_CovIPd', 'Jet_Track_CovIPz',
    # track PID (disabled)
    # 'Jet_Track_PID_ITS', 'Jet_Track_PID_TPC', 'Jet_Track_PID_TOF', 'Jet_Track_PID_TRD', 'Jet_Track_PID_Reconstructed', 'Jet_Track_PID_Truth',
    # secondary vertices
    'Jet_SecVtx_Mass', 'Jet_SecVtx_Lxy', 'Jet_SecVtx_SigmaLxy', 'Jet_SecVtx_Chi2', 'Jet_SecVtx_Dispersion',
    # jet shapes (disabled)
    # 'Jet_Shape_Mass_NoCorr', 'Jet_Shape_Mass_DerivCorr_1', 'Jet_Shape_Mass_DerivCorr_2',
    # 'Jet_Shape_pTD_DerivCorr_1', 'Jet_Shape_pTD_DerivCorr_2', 'Jet_Shape_LeSub_NoCorr', 'Jet_Shape_LeSub_DerivCorr',
    # 'Jet_Shape_Angularity', 'Jet_Shape_Angularity_DerivCorr_1', 'Jet_Shape_Angularity_DerivCorr_2',
    # 'Jet_Shape_Circularity_DerivCorr_1', 'Jet_Shape_Circularity_DerivCorr_2', 'Jet_Shape_Sigma2_DerivCorr_1', 'Jet_Shape_Sigma2_DerivCorr_2',
    # 'Jet_Shape_NumTracks_DerivCorr', 'Jet_Shape_MomentumDispersion', 'Jet_Shape_TrackPtMean', 'Jet_Shape_TrackPtMedian',
]

# Trial run: read the b-jet tree of the first pt-hard bin, select 10 < Jet_Pt < 100
# and run the feature extraction once.
froot = uproot.open(os.path.join(data_dir, 'ptbin1/AnalysisResults.root'))
df = (
    froot[tree_name_core + 'bJets']
    .pandas.df(flatten=False, branches=training_cols)
    .query('Jet_Pt > 10 and Jet_Pt < 100')
)
print('tree reading done')
df_after = add_features(df)

In [None]:
# Names of all columns produced by add_features.
list(df_after.columns)

In [None]:
# Preview the first 10 rows of the frame that was passed to add_features above.
df.head(10)

# Read to dataframes & add features & write to csv

reading steps are:  
`for f in files:`  
`...   for batch in f:`  
`... ...  read`  
`... ...  process`  
`... ...  write to csv`

it's also possible to use generators from `uproot` but there is some performance issue:  
`generator = froot[tree_name_core+'udsgJets'].iterate(entrysteps=iter_entries, outputtype=pd.DataFrame)`

In [None]:
# Branches read from every tree in the production loops; disabled groups are kept as comments.
branches_to_read = [
    # jet-level
    'Jet_Pt', 'Jet_Phi', 'Jet_Eta', 'Jet_Area', 'Jet_NumTracks',
    # tracks
    'Jet_Track_Pt', 'Jet_Track_Phi', 'Jet_Track_Eta',
    'Jet_Track_IPd', 'Jet_Track_IPz', 'Jet_Track_CovIPd', 'Jet_Track_CovIPz',
    # track PID (disabled)
    # 'Jet_Track_PID_ITS', 'Jet_Track_PID_TPC', 'Jet_Track_PID_TOF', 'Jet_Track_PID_TRD', 'Jet_Track_PID_Reconstructed', 'Jet_Track_PID_Truth',
    # secondary vertices
    'Jet_SecVtx_Mass', 'Jet_SecVtx_Lxy', 'Jet_SecVtx_SigmaLxy', 'Jet_SecVtx_Chi2', 'Jet_SecVtx_Dispersion',
    # jet shapes (disabled)
    # 'Jet_Shape_Mass_NoCorr', 'Jet_Shape_Mass_DerivCorr_1', 'Jet_Shape_Mass_DerivCorr_2',
    # 'Jet_Shape_pTD_DerivCorr_1', 'Jet_Shape_pTD_DerivCorr_2', 'Jet_Shape_LeSub_NoCorr', 'Jet_Shape_LeSub_DerivCorr',
    # 'Jet_Shape_Angularity', 'Jet_Shape_Angularity_DerivCorr_1', 'Jet_Shape_Angularity_DerivCorr_2',
    # 'Jet_Shape_Circularity_DerivCorr_1', 'Jet_Shape_Circularity_DerivCorr_2', 'Jet_Shape_Sigma2_DerivCorr_1', 'Jet_Shape_Sigma2_DerivCorr_2',
    # 'Jet_Shape_NumTracks_DerivCorr', 'Jet_Shape_MomentumDispersion', 'Jet_Shape_TrackPtMean', 'Jet_Shape_TrackPtMedian',
]

# Jet selection applied to every batch right after reading.
query_str = 'Jet_Pt > 10 and Jet_Pt < 150'

# Tag embedded in the names of all output files written below.
custom_name = 'Tr-sortbyIPdNsigmaAbs-noCuts_SV-sortbyLxyNsigma-noCuts'

In [None]:
# Batch-wise production of the MC feature tables: for every pt-hard bin and every
# jet flavour, read `n_per_iter` tree entries at a time, add features and write one
# CSV per batch.
n_per_iter = 100000
n_to_be_saved = 1e6  # target number of rows per flavour and per hard-pt-bin (n_saved is reset for each flavour)

# Sort key: 'ptbinN' -> N. The entry 'ptbin1-2' is mapped to 3 only so that int()
# succeeds; every entry containing '-' is skipped inside the loop anyway.
for d in sorted(os.listdir(data_dir), key=lambda x: int(x.replace('ptbin','').replace('1-2', '3'))):
    print(d)
    if '-' in d:
        print('\t\t-- skipping')
        continue
    froot = uproot.open(os.path.join(data_dir, d, 'AnalysisResults.root'))
    for flavour in ['b', 'c', 'udsg']:
        print('\t', flavour)
        tree = froot[tree_name_core+f'{flavour}Jets']

        tic = time()  # started once per flavour, so the printed time is cumulative
        n_saved = 0
        # range() stops before tree.numentries, so no empty batch is requested when
        # the number of entries is an exact multiple of n_per_iter.
        for i, entrystart in enumerate(range(0, tree.numentries, n_per_iter)):
            df = tree.pandas.df(flatten=False, branches=branches_to_read, entrystart=entrystart, entrystop=entrystart + n_per_iter).query(query_str)
            if df.empty:
                # add_features inspects the first row and cannot handle an empty frame
                print(f'\t\titer {i}: no jets pass the selection -- skipping')
                continue

            df = convert_float64_to_float32(df)
            print('\t\t adding features...')
            df = add_features(df)
            df['ptbin'] = d
            # shuffle rows within the batch (fixed seed for reproducibility)
            df = df.sample(frac=1, random_state=123).reset_index(drop=True)
            print(f'\t\tN = {len(df)} \t exec. time = {time() - tic} sec ')
            df.to_csv(f'{flavour}jets_10-150GeV_{custom_name}_{d}_{i}.csv', index=False)
            n_saved += len(df)
            print(f'\t\titer {i} done, {n_saved} rows saved')
            # same stop condition as in the real-data loop below
            if n_saved >= n_to_be_saved: break

__investigate__

In [None]:
# df.shape

In [None]:
# df['Jet_Pt'].describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])

In [None]:
# df.head()

In [None]:
# # sum(df_b.memory_usage(deep=True))/1024/1024

__code__

In [None]:
import inspect

# Store the exact source of add_features next to the CSV files, under the same tag,
# as a record of how the features were produced.
# inspect.getsource returns one string, so write() is used (writelines() would
# iterate over it character by character).
add_features_src = inspect.getsource(add_features)
with open(f'add_features_10-150GeV_{custom_name}.txt', 'w') as fout:
    fout.write(add_features_src)

__real data, all flavours__

In [None]:
# Batch-wise production of the feature tables for real data (single 'allJets' tree):
# read `n_per_iter` entries at a time, add features and write one CSV per batch.
data_fpath = '../ana_results/iter2/LHC15n/AnalysisResults.root'

froot = uproot.open(data_fpath)
tree = froot[tree_name_core+'allJets']

n_per_iter = 1000000
n_to_be_saved = 1e9  # stop once this many rows have been written
n_saved = 0
# range() yields the first entry of every batch and stops before tree.numentries.
for i, entrystart in enumerate(range(0, tree.numentries, n_per_iter)):
    tic = time()
    df = tree.pandas.df(flatten=False, branches=branches_to_read, entrystart=entrystart, entrystop=entrystart + n_per_iter).query(query_str)
    if df.empty:
        # add_features inspects the first row and cannot handle an empty frame
        print(f'\titer {i}: no jets pass the selection -- skipping')
        continue

    df = convert_float64_to_float32(df)
    print('\t adding features...')
    df = add_features(df)
    df['ptbin'] = 'data'
    # shuffle rows within the batch (fixed seed for reproducibility)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    print(f'\tN = {len(df)} \t exec. time = {time() - tic} sec ')
    df.to_csv(f'alljets_10-150GeV_{custom_name}_{i}.csv', index=False)
    n_saved += len(df)
    print(f'\titer {i} done, {n_saved} rows saved')
    if n_saved >= n_to_be_saved: break

# Merging

In [None]:
# Marker cell: only prints 'ok' (presumably to confirm that the cells above finished).
print('ok')

In [None]:
from glob import glob
import pandas as pd
from helper.utils import convert_float64_to_float32

In [None]:
# Which set of per-batch CSV files to merge: jet flavour prefix + production tag.
flavour = 'all'
custom_name = 'Tr-sortbyPt-cuts-IPdLT02_SV-sortbyDispersion-noCuts'

# Common stem of the file names, the merged output file and the glob for the batches.
core = f'{flavour}jets_10-150GeV_{custom_name}'
output_fname = f'{core}.csv'
pattern = f'{core}_*.csv'

In [None]:
# Show which per-batch CSV files in the current directory match `pattern`.
print(glob(pattern))

In [None]:
# Read every per-batch CSV (downcast to float32) and stack them into one frame.
# The paths are sorted because glob() returns matches in arbitrary order and the
# row order of the concatenated frame determines the outcome of the seeded shuffle
# applied afterwards -- sorting keeps the merged file reproducible.
csv_paths = sorted(glob(pattern))
print(csv_paths)
df_merged = pd.concat([convert_float64_to_float32(pd.read_csv(f)) for f in csv_paths])

In [None]:
# Shuffle all rows with a fixed seed so that the row index no longer follows the
# order of the input files (removes the pt-dependence of the index).
df_merged = (
    df_merged
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)

In [None]:
# Write the merged, shuffled frame to a single CSV (without the row index).
print(output_fname)
df_merged.to_csv(output_fname,index=False)