# Jet Clustering

This workflow is for use with the jet samples, which contain both `ClusterTree` and `EventTree` (provided by the `MLTree` utility). It **cannot** handle data where the `EventTree` does not exist, because that tree contains the info needed to piece the clusters together into events*, as well as the baseline jet clustering.

\* This piecing together can be accomplished in workflows like `EventReconstructionPion.ipynb`, but it's rather complex.

#### TODO:

- finish up calculation of scores
- jet clustering (save to new file?)
- comparison of jets

#### 1) Setup

First, let's import a bunch of packages we know we'll need right off-the-bat.

Note that as we've set up our environment with `conda`, our `ROOT` installation has all the bells and whistles. This includes the `pythia8` library and its associated `ROOT` wrapper, `TPythia8`. We can optionally use this for jet-clustering, as it comes with `fj-core`.
Alternatively we could use the Pythonic interface for `fastjet` or [pyjet](https://github.com/scikit-hep/pyjet), but the latter requires linking an external fastjet build for speed and this doesn't seem to work when following their documentation.

In [1]:
# Imports - generic stuff

import numpy as np
import ROOT as rt
import uproot as ur
import sys, os, glob
import subprocess as sub
from pathlib import Path

# Project root used throughout the notebook; it is added to sys.path so that the
# local `util` package imported below can be found.
path_prefix = '/workspace/LCStudies/'
if(path_prefix not in sys.path): sys.path.append(path_prefix)
from util import ml_util as mu # for passing calo images to regression networks
from util import qol_util as qu # for progress bar

Welcome to JupyROOT 6.22/02


In [2]:
# TensorFlow / Keras setup.
# The log level is set before tensorflow is imported: '2' disables some of the
# tensorflow info printouts so that only errors are displayed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Request a single GPU, then read back how many replicas the strategy reports.
ngpu = 1
gpu_list = ['/gpu:{}'.format(gpu_idx) for gpu_idx in range(ngpu)]
strategy = tf.distribute.MirroredStrategy(devices=gpu_list)
ngpu = strategy.num_replicas_in_sync
print('Number of devices: {}'.format(ngpu))

# Registry of all neural network models that will be evaluated, keyed by a short name.
network_models = dict()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


In [3]:
# Directory layout. Every path is built from path_prefix, which already ends with '/',
# so the relative parts must not start with a separator.
data_dir = path_prefix + 'data/jet'
classification_dir = path_prefix + 'classifier/Models'
regression_dir = path_prefix + 'regression/Models'
# Location of the fastjet Python bindings (appended to sys.path before importing fastjet).
fj_dir = path_prefix + 'setup/fastjet/fastjet-install/lib/python3.8/site-packages'

In [4]:
# ----- Calorimeter meta-data -----
# All per-layer lists below are indexed in the same order as `layers`.
layers = ["EMB1", "EMB2", "EMB3", "TileBar0", "TileBar1", "TileBar2"]
nlayers = len(layers)

# Cell sizes along phi and eta for each layer.
cell_size_phi = [0.098, 0.0245, 0.0245, 0.1, 0.1, 0.1]
cell_size_eta = [0.0031, 0.025, 0.05, 0.1, 0.1, 0.2]

# Image dimensions along phi and eta for each layer.
len_phi = [4, 16, 16, 4, 4, 4]
len_eta = [128, 16, 8, 4, 4, 2]
assert(len(len_phi) == nlayers)
assert(len(len_eta) == nlayers)

# Lookup table: layer name -> cell size and image dimensions, both ordered (eta, phi).
meta_data = {
    layer_name: {
        'cell_size': (size_eta, size_phi),
        'dimensions': (dim_eta, dim_phi),
    }
    for layer_name, size_eta, size_phi, dim_eta, dim_phi
    in zip(layers, cell_size_eta, cell_size_phi, len_eta, len_phi)
}

In [5]:
# --- Flat classifiers ---
# Every .h5 file in the 'flat' directory is loaded and registered under the name
# found between 'model_' and '_flat' in its path.
print('Loading flat classification models... ')
flat_model_files = sorted(glob.glob(classification_dir + '/flat/' + '*.h5'))
flat_model_names = []
for model_file in flat_model_files:
    model_name = model_file.rpartition('model_')[2].partition('_flat')[0]
    print('\tLoading ' + model_name + '... ',end='')
    flat_model_names.append(model_name)
    network_models[model_name] = tf.keras.models.load_model(model_file)
    print('Done.')

# --- Combo classifier ---
print('Loading simple combo classification model... ',end='')
combo_model_file = '/'.join((classification_dir, 'simple', 'model_simple_do20.h5'))
network_models['combo'] = tf.keras.models.load_model(combo_model_file)
print('Done.')

# --- Energy regression networks (one for charged pions, one for neutral pions) ---
print('Loading charged-pion energy regression model... ',end='')
charged_energy_model_file = '/'.join((regression_dir, 'all_charged.h5'))
network_models['e_charged'] = tf.keras.models.load_model(charged_energy_model_file)
print('Done.')

print('Loading neutral-pion energy regression model... ',end='')
neutral_energy_model_file = '/'.join((regression_dir, 'all_neutral.h5'))
network_models['e_neutral'] = tf.keras.models.load_model(neutral_energy_model_file)
print('Done.')

Loading flat classification models... 
	Loading EMB1... Done.
	Loading EMB2... Done.
	Loading EMB3... Done.
	Loading TileBar0... Done.
	Loading TileBar1... Done.
	Loading TileBar2... Done.
Loading simple combo classification model... Done.
Loading charged-pion energy regression model... Done.
Loading neutral-pion energy regression model... Done.


Now we make a "local" copy of the jet data. We will only copy over certain branches, and we will skip any files that don't contain both a `ClusterTree` and an `EventTree`.

In [7]:
data_filenames = glob.glob(data_dir + '/' + '*.root')

# debugging: restrict the run to two hand-picked files (this overrides the glob result above)
data_filenames = [
    data_dir + '/' + debug_name
    for debug_name in (
        'user.angerami.21685345.OutputStream._000062.root',
        'user.angerami.21685345.OutputStream._000113.root',
    )
]

# our "local" data dir, where we create modified data files
jet_data_dir = path_prefix + 'jets/data'
Path(jet_data_dir).mkdir(parents=True, exist_ok=True)

# Get the original data, keyed by file path.
files = dict((name, rt.TFile(name, 'READ')) for name in data_filenames)

# Some data files might be missing an EventTree (or ClusterTree).
# For now, we will skip these because our methods count on both trees existing.
required_trees = ('ClusterTree', 'EventTree')
delete_keys = []
for key, val in files.items():
    file_keys = {x.GetName() for x in val.GetListOfKeys()}
    if not all(required in file_keys for required in required_trees):
        delete_keys.append(key)

# Removal happens in a second pass so the dict is not modified while iterating over it.
for key in delete_keys:
    print('Ignoring file:',key,'(no EventTree/ClusterTree found).')
    files.pop(key)

if(path_prefix not in sys.path): sys.path.append(path_prefix)
from  util import qol_util as qu # for progress bar

# now we make a local copy of the files in the jet_data_dir, keeping only certain branches
# Branches kept in the local copies, per tree key ('cluster' / 'event').
active_branches = {
    'cluster': [
        # event identification
        'runNumber', 'eventNumber',
        # truth-level quantities
        'truthE', 'truthPt', 'truthEta', 'truthPhi',
        # cluster bookkeeping and kinematics
        'clusterIndex', 'nCluster',
        'clusterE', 'clusterECalib', 'clusterPt', 'clusterEta', 'clusterPhi',
        'cluster_nCells', 'cluster_ENG_CALIB_TOT',
        # one branch per calorimeter layer
        'EMB1', 'EMB2', 'EMB3', 'TileBar0', 'TileBar1', 'TileBar2',
    ],
    'event': [
        'runNumber', 'eventNumber', 'lumiBlock', 'NPV', 'nTruthPart',
        'clusterCount', 'nCluster',
        'clusterE', 'clusterPt', 'clusterEta', 'clusterPhi',
    ] + [
        # AntiKt4 jet collections: Pt, Eta, Phi, E for each of EMTopo, LCTopo and Truth
        'AntiKt4' + collection + 'Jets' + quantity
        for collection in ('EMTopo', 'LCTopo', 'Truth')
        for quantity in ('Pt', 'Eta', 'Phi', 'E')
    ],
}

# Tree key -> name of the TTree inside each ROOT file.
tree_names = dict(cluster='ClusterTree', event='EventTree')
# Copy every input file into jet_data_dir, keeping only the branches listed in active_branches.
# data_filenames is rebuilt here so that, from now on, it lists the local copies.
data_filenames = []

n_files = len(files)
i = 0
qu.printProgressBarColor(i, n_files, prefix='Copying data files:', suffix='Complete', length=50)

for path, tfile in files.items():
    filename_new = jet_data_dir + '/' + path.split('/')[-1]
    old_trees = {x: tfile.Get(tree_names[x]) for x in tree_names}

    # Switch every branch off, then re-enable the ones to keep, so that the clone
    # below contains only the active branches.
    for key, tree in old_trees.items():
        tree.SetBranchStatus('*', 0)
        for bname in active_branches[key]: tree.SetBranchStatus(bname, 1)

    # The output file is opened before cloning so that the cloned trees are written into it.
    tfile_new = rt.TFile(filename_new, 'RECREATE')
    new_trees = {x: old_trees[x].CloneTree() for x in old_trees}
    tfile_new.Write()

    # Drop the Python handles to the trees, then close both files explicitly. The copies
    # are re-opened later (by uproot, and in 'UPDATE' mode to add the score tree), so they
    # should not be left open here until garbage collection happens to run.
    del new_trees
    del old_trees
    tfile_new.Close()
    tfile.Close()

    data_filenames.append(filename_new)
    i += 1
    qu.printProgressBarColor(i, n_files, prefix='Copying data files:', suffix='Complete', length=50)

Copying data files: |[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m| 100.0% Complete


In [8]:
# Access the local copies: ROOT file handles in `files`, uproot trees in `ur_trees`.
files = dict((name, rt.TFile(name, 'READ')) for name in data_filenames)
tree_names = dict(cluster='ClusterTree', event='EventTree')

# ur_trees[filename][tree_key] -> uproot tree ('cluster' or 'event') of that file
ur_trees = {}
for local_file in data_filenames:
    ur_trees[local_file] = {
        tree_key: ur.open(local_file)[tree_name]
        for tree_key, tree_name in tree_names.items()
    }

# reminder: how to get an awkward array for a particular branch, here a is a key corresponding to a filename
#ur_trees[a]['cluster'].array('EMB1')

Now we will loop over our data files. This isn't the most notebook-esque code, but it should avoid "out of memory" issues: As we are dealing with a large amount of data, preparing all the data in memory before operating on it will result in very high memory usage. Thus we will sacrifice a multi-cell approach of preparing all the data step-by-step, in order to make sure we don't load more stuff into memory at a time than we need.

In [28]:
# Branch buffer for filling the score trees: one single-element float64 array per
# output branch. Values are written into element 0 before each fill.
branch_buffer = {
    branch_name: np.zeros(1, dtype=np.dtype('f8'))
    for branch_name in (
        'charged_likelihood_combo',
        'clusterE_charged',
        'clusterE_neutral',
    )
}

# For each local data file: build the network inputs, evaluate the classification and
# energy-regression networks, and store the per-cluster outputs in a new 'ScoreTree'
# written into that same file. Everything is done file-by-file to limit memory usage.
for dfile, trees in ur_trees.items():

    print(dfile)
    # prep the calo images: one array per layer, plus all layers joined along axis 1
    print('\tPrepping calo images...')
    calo_images = {}
    for layer in layers:
        calo_images[layer] = mu.setupCells(trees['cluster'],layer)
    combined_images = np.concatenate(tuple([calo_images[layer] for layer in layers]), axis=1)

    # prep some extra combined input for energy regression
    print('\tPrepping extra inputs...')
    scaler_e = StandardScaler()
    scaler_cal = StandardScaler()
    scaler_eta = StandardScaler()

    e = trees['cluster'].array('clusterE')
    e_calib = trees['cluster'].array('cluster_ENG_CALIB_TOT')
    eta = trees['cluster'].array('clusterEta')

    # cleaning for e and e_calib (empirically needed for e_calib to remove values that are too large)
    # Values below epsilon are clamped to epsilon, which also keeps the logs below finite.
    epsilon = 1.0e-12
    e = np.where(e < epsilon, epsilon, e)
    e_calib = np.where(e_calib < epsilon, epsilon, e_calib)

    # Standardized columns. 's_logECalib' is not fed to the networks, but fitting scaler_cal
    # here is what allows the regression outputs to be inverse-transformed further down.
    regression_cols = {}
    regression_cols['s_logE'] = scaler_e.fit_transform(np.log(e).reshape(-1,1))
    regression_cols['s_logECalib'] = scaler_cal.fit_transform(np.log(e_calib).reshape(-1,1))
    regression_cols['s_eta'] = scaler_eta.fit_transform(eta.reshape(-1,1))

    s_combined,scaler_combined = mu.standardCells(combined_images, layers)
    regression_input = np.column_stack((regression_cols['s_logE'], regression_cols['s_eta'],s_combined))

    # now find network scores
    print('\tCalculating network outputs...')
    model_scores = {}

    print('\t\tClassification... ', end='')
    # 1) flat networks
    for layer in flat_model_names:
        model = network_models[layer]
        model_scores[layer] = model.predict(calo_images[layer])[:,1] # [:,1] based on Max's code, this is input to combo network. Likelihood of being charged (vs. neutral)

    # 2) combo network, fed with the flat-network scores (one column per layer)
    name = 'combo'
    model = network_models[name]
    input_scores = np.column_stack([model_scores[layer] for layer in layers])
    model_scores[name] = model.predict(input_scores)[:,1] # likelihood of being charged pion (versus neutral pion)
    print('Done.')

    print('\t\tRegression... ', end='')
    # 3) energy regression networks: the outputs are passed back through scaler_cal and
    #    exponentiated, i.e. the inverse of the log + standardization applied to e_calib above
    name = 'e_charged'
    model = network_models[name]
    model_scores[name] = np.exp(scaler_cal.inverse_transform(model.predict(regression_input)))

    name = 'e_neutral'
    model = network_models[name]
    model_scores[name] = np.exp(scaler_cal.inverse_transform(model.predict(regression_input)))
    print('Done.')

    # Now we should save these scores to a new tree.
    f = rt.TFile(dfile, 'UPDATE')
    tree_name = 'ScoreTree'
    t = rt.TTree(tree_name, tree_name)

    print('Saving network scores to tree ' + tree_name + '... ',end='')
    # --- Setup the branches using our buffer. This is a rather general/flexible code block. ---
    # The branch descriptor is '<name>[dim]...[dim]/<type code>'; the dimensions are omitted
    # for single-element buffers.
    branches = {}
    for bname, val in branch_buffer.items():
        descriptor = bname
        bshape = val.shape
        if(bshape != (1,)):
            for dim in bshape:
                descriptor += '[' + str(dim) + ']'
        descriptor += '/'
        if(val.dtype == np.dtype('i2')): descriptor += 'S'
        elif(val.dtype == np.dtype('i4')): descriptor += 'I'
        elif(val.dtype == np.dtype('i8')): descriptor += 'L'
        elif(val.dtype == np.dtype('f4')): descriptor += 'F'
        elif(val.dtype == np.dtype('f8')): descriptor += 'D'
        else:
            # Unsupported dtype: report the branch being set up (bname) and skip it.
            print('Warning, setup issue for branch: ', bname, '. Skipping.')
            continue
        branches[bname] = t.Branch(bname,val,descriptor)

    # Fill the model score tree, and save it to the local data file.
    # Each value is copied into element 0 of the buffer array bound to its branch.
    nentries = model_scores['combo'].shape[0]
    for i in range(nentries):
        branch_buffer['charged_likelihood_combo'][0] = model_scores['combo'][i]
        branch_buffer['clusterE_charged'][0] = model_scores['e_charged'][i]
        branch_buffer['clusterE_neutral'][0] = model_scores['e_neutral'][i]
        t.Fill()

    # kOverwrite replaces the previous ScoreTree if this cell is run again on the same file.
    t.Write('',rt.TObject.kOverwrite)
    f.Close()
    print('Done.')

/workspace/LCStudies/jets/data/user.angerami.21685345.OutputStream._000062.root
	Prepping calo images...
	Prepping extra inputs...
	Calculating network outputs...
		Classification... Done.
		Regression... 

  model_scores[name] = np.exp(scaler_cal.inverse_transform(model.predict(regression_input)))


Done.
/workspace/LCStudies/jets/data/user.angerami.21685345.OutputStream._000113.root
	Prepping calo images...
	Prepping extra inputs...
	Calculating network outputs...
		Classification... Done.
		Regression... 

  model_scores[name] = np.exp(scaler_cal.inverse_transform(model.predict(regression_input)))


Done.


Now we have classification and energy regression scores for all of our topo-clusters. Next, we want to perform jet-clustering, where we'll use the regressed energies (and the classification score will tell us which regressed energy to use for each cluster).

In [None]:
# Make the fastjet Python bindings in fj_dir importable.
sys.path.append(fj_dir)
import fastjet as fj

# Jet clustering params
R = 0.4
jet_def = fj.JetDefinition(fj.antikt_algorithm, R)

# Re-open the local files, this time including the ScoreTree written in the previous step.
files = {name:rt.TFile(name,'READ') for name in data_filenames}
tree_names = {'cluster':'ClusterTree','event':'EventTree','scores':'ScoreTree'}
ur_trees = {file:{tree_key:ur.open(file)[tree_name] for tree_key,tree_name in tree_names.items()} for file in data_filenames}

for dfile, trees in ur_trees.items():

    # event info: clusterCount is used as the index of each event's first cluster and
    # nCluster as the number of clusters in the event, so cluster_max is the last index (inclusive)
    cluster_min = trees['event'].array('clusterCount')
    cluster_max = cluster_min + trees['event'].array('nCluster') - 1

    # cluster info (pre-existing): columns are (pt, eta, phi)
    cluster_vec = np.column_stack(tuple(trees['cluster'].arrays(['clusterPt','clusterEta','clusterPhi']).values()))

    # cluster info (scores): energy columns are (charged, neutral)
    cluster_classification = trees['scores'].array('charged_likelihood_combo')
    cluster_energies = np.column_stack(tuple(trees['scores'].arrays(['clusterE_charged','clusterE_neutral']).values()))

    # Reused for every cluster to convert (pt, eta, phi, E) to Cartesian components.
    vec_polar = rt.Math.PtEtaPhiEVector()
    # loop over events
    nevents = trees['event'].numentries
    for i in range(nevents):
        # Integer indices of this event's clusters. np.arange is used because indices must be
        # integers; np.linspace returns floats, which cannot index the arrays below.
        cluster_idxs = np.arange(int(cluster_min[i]), int(cluster_max[i]) + 1)
        nCluster = cluster_idxs.shape[0]

        pseudojets = []
        for idx in cluster_idxs:
            # Use the charged-pion regressed energy by default, and the neutral-pion one
            # when the classification score is below 0.5.
            energy = cluster_energies[idx,0]
            if cluster_classification[idx] < 0.5: energy = cluster_energies[idx,1]
            vec_polar.SetCoordinates(cluster_vec[idx,0],cluster_vec[idx,1],cluster_vec[idx,2],energy)
            pseudojets.append(fj.PseudoJet(vec_polar.Px(), vec_polar.Py(), vec_polar.Pz(), vec_polar.E())) # fastjet uses Cartesian

        jets = jet_def(pseudojets) # perform jet clustering
        njets = len(jets)
        # TODO: save jet info to a TTree

        
    
    
