Input: unordered jets
Output: one head per particle

In [1]:
import h5py
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
import numpy as np
import uproot as ur
import json
import os

### Accessing root files, Events filtering, Labels assignment

In [2]:
def training_filter_multiple_jets(jets, electrons, muons, genparts, mets):
    '''
    Filters events down to training set and calculates jet-level labels

    Each jet is matched to the nearest generator-level quark (deltaR threshold 0.4)
    whose parent is a top quark or a W boson, and is labelled from that parent.
    Only events containing exactly one leptonic-top jet, one hadronic-top jet and
    two W jets are kept.

    Args:
        jets: selected jets after region filter
        electrons: selected electrons after region filter
        muons: selected muons after region filter
        genparts: selected genpart after region filter
        mets: selected MET after region filter

    Returns:
        jets: selected jets after training filter
        electrons: selected electrons after training filter
        muons: selected muons after training filter
        mets: selected MET after training filter
        labels: labels of jets within an event
            (24=W, 6=top_hadron, -6=top_lepton, 100=no valid genPart match)
    '''
    #### filter genPart to valid matching candidates ####

    # get rid of particles without parents
    genpart_parent = genparts.distinctParent
    genpart_filter = np.invert(ak.is_none(genpart_parent, axis=1))
    genparts = genparts[genpart_filter]
    genpart_parent = genparts.distinctParent

    # ensure that parents are top quark or W
    genpart_filter2 = ((np.abs(genpart_parent.pdgId)==6) | (np.abs(genpart_parent.pdgId)==24))
    genparts = genparts[genpart_filter2]

    # ensure particle itself is a quark
    genpart_filter3 = ((np.abs(genparts.pdgId)<7) & (np.abs(genparts.pdgId)>0))
    genparts = genparts[genpart_filter3]

    # get rid of duplicates
    genpart_filter4 = genparts.hasFlags("isLastCopy")
    genparts = genparts[genpart_filter4]

    #### get jet-level labels and filter events to training set

    # match jets to nearest valid genPart candidate
    nearest_genpart = jets.nearest(genparts, threshold=0.4)
    nearest_parent = nearest_genpart.distinctParent # parent of matched particle
    parent_pdgid = nearest_parent.pdgId # pdgId of parent particle
    # pdgId of the parent's grandchildren, i.e. the matched particle's "cousins"
    grandchild_pdgid = nearest_parent.distinctChildren.distinctChildren.pdgId

    # flatten innermost axis so that every jet carries one flat list of cousin ids
    abs_grandchildren = np.abs(ak.flatten(grandchild_pdgid, axis=-1))

    # PDG numbering: |id| in 11..18 are leptons; odd ids are charged leptons,
    # even ids are neutrinos
    in_lepton_range = (abs_grandchildren > 10) & (abs_grandchildren < 19)
    has_charged_lepton_cousin = ak.sum(in_lepton_range & (abs_grandchildren % 2 == 1), axis=-1) > 0
    has_neutrino_cousin = ak.sum(in_lepton_range & (abs_grandchildren % 2 == 0), axis=-1) > 0

    # a jet with both a charged-lepton cousin and a neutrino cousin sits on the
    # leptonic side; unmatched jets (None) count as False
    has_both_cousins = ak.fill_none((has_charged_lepton_cousin & has_neutrino_cousin), False)

    # get labels from parent pdgId (fill none with 100 to filter out events with those jets)
    labels = np.abs(ak.fill_none(parent_pdgid, 100))

    # relabel leptonic-side jets as -6; ak.where keeps the jagged per-event layout,
    # so no per-event NumPy round trip or in-place write is needed
    labels = ak.where(has_both_cousins, -6, labels)

    # mask for event validation (at least 4 jets with -6, 6, 24, 24, 100, 100,...)
    mask = (
        (ak.sum(labels == -6, axis=1) == 1)
        & (ak.sum(labels == 6, axis=1) == 1)
        & (ak.sum(labels == 24, axis=1) == 2)
    )

    # filter events
    jets = jets[mask]
    electrons = electrons[mask]
    muons = muons[mask]
    labels = labels[mask]
    mets = mets[mask]

    return jets, electrons, muons, mets, labels

In [None]:
# with open("nanoaod_inputs.json") as f:
#     file_info = json.load(f)

# for file in file_info['ttbar']['scaledown']['files']:
#     input_file = file['path']
#     print(input_file)
#     suffix = file['path'].replace('.root','.h5').split('_')[-3:]
#     file_suffix = '_'.join(suffix)
#     output_file = 'scaledown_SPANet_structure' + '_' + file_suffix
#     print(output_file)

In [7]:
# Convert every ttbar "scaleup" NanoAOD file listed in nanoaod_inputs.json into one
# HDF5 file with the groups INPUTS/Momenta, INPUTS/Met, TARGETS/ht and TARGETS/lt
# (plus empty PERMUTATION, REGRESSION and CLASSIFICATION groups).
# list of files
with open("nanoaod_inputs.json") as f:
    file_info = json.load(f)

for file in file_info['ttbar']['scaleup']['files']:
    input_file = file['path']
#     print(input_file)
    # output name = fixed prefix + last three '_'-separated pieces of the input path (.root -> .h5)
    suffix = file['path'].replace('.root','.h5').split('_')[-3:]
    file_suffix = '_'.join(suffix)
    # NOTE(review): the prefix is spelled 'scalup' (not 'scaleup'); files already written
    # use this spelling, so confirm with downstream consumers before changing it
    output_file = 'scalup_SPANet_structure' + '_' + file_suffix
#     print(output_file)

# for file in file_list:
#     input_file = file
# #     print(file)
#     suffix = file.replace('.root','.h5').split('_')[-3:]
#     file_suffix = '_'.join(suffix)
#     output_file = 'training_SPANet_structure' + '_' + file_suffix
# #     print(output_file)

    tree_name = 'Events'
    events = NanoEventsFactory.from_root(input_file, treepath=tree_name, schemaclass=NanoAODSchema).events()
    
    ### SPANet Event Filtering
    # object-level selections: pt/eta cuts plus ID and impact-parameter requirements
    selected_electrons = events.Electron[(events.Electron.pt > 30) & (np.abs(events.Electron.eta)<2.1) & 
                                            (events.Electron.cutBased==4) & (events.Electron.sip3d < 4)]
    selected_muons = events.Muon[(events.Muon.pt > 30) & (np.abs(events.Muon.eta)<2.1) & (events.Muon.tightId) & 
                                    (events.Muon.sip3d < 4) & (events.Muon.pfRelIso04_all < 0.15)]
    jet_filter = (events.Jet.pt > 30) & (np.abs(events.Jet.eta) < 2.4) & (events.Jet.isTightLeptonVeto)
    selected_jets = events.Jet[jet_filter]
    selected_genpart = events.GenPart
    selected_MET = events.MET # Part of SPANet features

    # single lepton requirement
    event_filters = ((ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)) == 1)
    # require at least 4 jets
    event_filters = event_filters & (ak.count(selected_jets.pt, axis=1) >= 4)
    # require at least one jet above B_TAG_THRESHOLD
    B_TAG_THRESHOLD = 0.5
    event_filters = event_filters & (ak.sum(selected_jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) >= 1)

    # apply event filters
    selected_electrons = selected_electrons[event_filters]
    selected_muons = selected_muons[event_filters]
    selected_jets = selected_jets[event_filters]
    selected_genpart = selected_genpart[event_filters]
    selected_MET = selected_MET[event_filters]

    ### only consider 4j2b (signal) region
    # NOTE: this cut uses a strict '>' while the event filter above uses '>='
    region_filter = ak.sum(selected_jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2 # at least two b-tagged jets
    selected_jets_region = selected_jets[region_filter] # all jets (not truncated to the leading four)
    selected_electrons_region = selected_electrons[region_filter]
    selected_muons_region = selected_muons[region_filter]
    selected_genpart_region = selected_genpart[region_filter]
    selected_MET_region = selected_MET[region_filter]
    
    # keep only events with a valid label assignment and get the per-jet labels
    jets, electrons, muons, mets, labels = training_filter_multiple_jets(selected_jets_region, selected_electrons_region, selected_muons_region, selected_genpart_region, selected_MET_region)
    

    # jet features (jagged awkward arrays, not padded yet)
    jets_btag = jets.btagCSVV2
    jets_mass = jets.mass
    jets_pt = jets.pt
    jets_eta = jets.eta
    jets_sin_phi = np.sin(jets.phi)
    jets_cos_phi = np.cos(jets.phi)

    # MET features (one value per event, so no padding needed)
    mets_sumEt = mets.sumEt
    mets_sin_phi = np.sin(mets.phi)
    mets_cos_phi = np.cos(mets.phi)
    
    num_events = ak.num(jets, axis=0)

    # per-event lepton feature buffers, zero-initialised
    leptons_btag = np.zeros(num_events)  # never filled below, so the lepton btag stays 0
    leptons_mass = np.zeros(num_events)
    leptons_pt = np.zeros(num_events)
    leptons_eta = np.zeros(num_events)
    leptons_sin_phi = np.zeros(num_events)
    leptons_cos_phi = np.zeros(num_events)

    # creating lepton mask (either one electron or one muon)
    has_electron = ak.num(electrons, axis=1) > 0
    has_muon = ak.num(muons, axis=1) > 0

    # fill electron features for events that have an electron
    leptons_mass[has_electron] = electrons.mass[has_electron][:, 0]
    leptons_pt[has_electron] = electrons.pt[has_electron][:, 0]
    leptons_eta[has_electron] = electrons.eta[has_electron][:, 0]
    leptons_sin_phi[has_electron] = np.sin(electrons.phi[has_electron][:, 0])
    leptons_cos_phi[has_electron] = np.cos(electrons.phi[has_electron][:, 0])
    # fill muon features for events that have a muon
    leptons_mass[has_muon] = muons.mass[has_muon][:, 0]
    leptons_pt[has_muon] = muons.pt[has_muon][:, 0]
    leptons_eta[has_muon] = muons.eta[has_muon][:, 0]
    leptons_sin_phi[has_muon] = np.sin(muons.phi[has_muon][:, 0])
    leptons_cos_phi[has_muon] = np.cos(muons.phi[has_muon][:, 0])
    

    # Combining momenta (leptons + jets)
    # NOTE(review): if no event survives the filters, ak.max presumably returns None and the
    # comparison below would fail - confirm
    max_num_jets = ak.max(ak.num(jets, axis=1))
    if max_num_jets > 16:
        # the whole file is skipped (no output written) when any event has more than 16 jets
        print('Maximum num_jets exceed 16: ', max_num_jets, output_file)
        continue
#         break
        
    # pad every file to the same fixed width of 16 jet slots
    max_num_jets = 16
    
    # padded 2d arrays of shape (num_events, max_num_jets + 1); slot 0 is the lepton, slots 1.. are jets
    momenta_btag = np.zeros((num_events, max_num_jets + 1))
    momenta_mass = np.zeros((num_events, max_num_jets + 1))
    momenta_pt = np.zeros((num_events, max_num_jets + 1))
    momenta_eta = np.zeros((num_events, max_num_jets + 1))
    momenta_sin_phi = np.zeros((num_events, max_num_jets + 1))
    momenta_cos_phi = np.zeros((num_events, max_num_jets + 1))

    # slot 0 holds the lepton information
    momenta_btag[:, 0] = leptons_btag # no btag for leptons, so this column is all zero
    momenta_mass[:, 0] = leptons_mass
    momenta_pt[:, 0] = leptons_pt
    momenta_eta[:, 0] = leptons_eta
    momenta_sin_phi[:, 0] = leptons_sin_phi
    momenta_cos_phi[:, 0] = leptons_cos_phi

    # fill jets into momenta, one jet position at a time
    for idx in range(max_num_jets):
        mask = ak.num(jets, axis=1) > idx # events that have a jet at position idx
        momenta_btag[mask, idx + 1] = jets_btag[mask][:, idx]
        momenta_mass[mask, idx + 1] = jets_mass[mask][:, idx]
        momenta_pt[mask, idx + 1] = jets_pt[mask][:, idx]
        momenta_eta[mask, idx + 1] = jets_eta[mask][:, idx]
        momenta_sin_phi[mask, idx + 1] = jets_sin_phi[mask][:, idx]
        momenta_cos_phi[mask, idx + 1] = jets_cos_phi[mask][:, idx]

    # a slot counts as filled when its mass is non-zero (padding was initialised to 0)
    # NOTE(review): a real jet/lepton stored with mass exactly 0 would be masked out - confirm
    # that this cannot happen in the inputs
    momenta_MASK = momenta_mass != 0

    #convert met to numpy
    mets_sumEt = mets_sumEt.to_numpy()
    mets_sin_phi = mets_sin_phi.to_numpy()
    mets_cos_phi = mets_cos_phi.to_numpy()
    
    ### TARGET indices ###
    # all targets are positions in the padded momenta arrays, hence the "+ 1" offsets below
    # (slot 0 is the lepton)
    # ht (hadronic top)
    b_had_indices = np.zeros(num_events, dtype=int)
    q1_indices = np.zeros(num_events, dtype=int)
    q2_indices = np.zeros(num_events, dtype=int)
    # lt (leptonic top)
    b_lep_indices = np.zeros(num_events, dtype=int)
    l_indices = np.zeros(num_events, dtype=int)  # left at 0: the lepton always sits in slot 0

    # no for-loop needed: the training filter keeps events with exactly one -6 and one 6 label
    b_lep_indices[:] = ak.argmax(labels == -6, axis=1) + 1
    b_had_indices[:] = ak.argmax(labels == 6, axis=1) + 1

    # Find indices for 24 (two W-labelled jets per event; q1 is the first, q2 the second)
    for idx, event in enumerate(labels):
        q_indices = ak.where(event == 24)[0]
        q1_indices[idx] = q_indices[0] + 1
        q2_indices[idx] = q_indices[1] + 1
    
        
    ### Preparing data for h5 file ###
    #INPUTS
    ##Momenta
    momenta_data = {
        'MASK': momenta_MASK, # capital MASK
        'mass': momenta_mass,
        'pt': momenta_pt,
        'eta': momenta_eta,
        'cos_phi': momenta_cos_phi,
        'sin_phi': momenta_sin_phi,
        'btag': momenta_btag,
    }
    ##Met
    met_data = {
        'sumet': mets_sumEt,
        'cos_phi': mets_cos_phi,
        'sin_phi': mets_sin_phi,
    }


    #TARGETS
    ht_target = {
        'b': b_had_indices,
        'q1': q1_indices,
        'q2': q2_indices,
    }

    lt_target = {
        'b': b_lep_indices,
        'l': l_indices,
    }
    
    # creating h5 file
    
    # Define the directory and file name
    directory = "events_h5_files/scaleup_h5_16maxJets"

#     file_name = "your_file_name.h5"

    # NOTE(review): directory creation is commented out, so the directory presumably has to
    # exist already for the write below to succeed - confirm
#     os.makedirs(directory, exist_ok=True)

    # Full path to the HDF5 file
    file_path = os.path.join(directory, output_file)

#     print(f"HDF5 file written to {file_path}")

    
    # mode 'w' is used, so the output file is opened for writing from scratch
    with h5py.File(file_path, 'w') as h5file:
        #INPUTS
        input_group = h5file.create_group('INPUTS')
        momenta_subgroup = input_group.create_group('Momenta')
        met_subgroup = input_group.create_group('Met')

        for key, value in momenta_data.items():
            momenta_subgroup.create_dataset(key, data=value)
        for key, value in met_data.items():
            met_subgroup.create_dataset(key, data=value)

        #TARGETS (the variable is named event_group, but the h5 group is 'TARGETS')
        event_group = h5file.create_group('TARGETS')
        ht_subgroup = event_group.create_group('ht')
        lt_subgroup = event_group.create_group('lt')

        for key, value in ht_target.items():
            ht_subgroup.create_dataset(key, data=value)

        for key, value in lt_target.items():
            lt_subgroup.create_dataset(key, data=value)

        # the remaining top-level groups are created empty
        #PERMUTATION
        perm_group = h5file.create_group('PERMUTATION')

        #REGRESSION
        regression_group = h5file.create_group('REGRESSION')

        #CLASSIFICATION
        classification_group = h5file.create_group('CLASSIFICATION')
    
    print(output_file,': convertion done')
    

scalup_SPANet_structure_ext3-v1_10000_0000.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0001.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0002.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0003.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0004.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0005.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0006.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0007.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0008.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0009.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0010.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0011.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0012.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0013.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_0014.h5 : convertion done
scalup_SPANet_structure_ext3-v1_10000_00

In [None]:
# Maximum num_jets exceed 16:  40 training_SPANet_structure_ext4-v1_00000_0024.h5

In [None]:
print('test')

### SPANet Demo Features extraction

In [10]:
%%bash
python utils/examine_hdf5.py training_SPANet_structure_ext4-v1_00000_0008.h5 --shape

| Structure for training_SPANet_structure_ext4-v1_00000_0008.h5 

|-CLASSIFICATION                
|-INPUTS                        
|---Met                         
|-----cos_phi                    :: float32  : (8923,)
|-----sin_phi                    :: float32  : (8923,)
|-----sumet                      :: float32  : (8923,)
|---Momenta                     
|-----MASK                       :: bool     : (8923, 17)
|-----btag                       :: float64  : (8923, 17)
|-----cos_phi                    :: float64  : (8923, 17)
|-----eta                        :: float64  : (8923, 17)
|-----mass                       :: float64  : (8923, 17)
|-----pt                         :: float64  : (8923, 17)
|-----sin_phi                    :: float64  : (8923, 17)
|-PERMUTATION                   
|-REGRESSION                    
|-TARGETS                       
|---ht                          
|-----b                          :: int64    : (8923,)
|-----q1                         :: int64   

In [11]:
%%bash
python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/data_h5_13maxJets/training_SPANet_structure_ext4-v1_00000_0007.h5 --shape

| Structure for /eos/user/n/nmuangko/semi_leptonnic_ttbar/data_h5_13maxJets/training_SPANet_structure_ext4-v1_00000_0007.h5 

|-CLASSIFICATION                
|-INPUTS                        
|---Met                         
|-----cos_phi                    :: float32  : (9832,)
|-----sin_phi                    :: float32  : (9832,)
|-----sumet                      :: float32  : (9832,)
|---Momenta                     
|-----MASK                       :: bool     : (9832, 14)
|-----btag                       :: float64  : (9832, 14)
|-----cos_phi                    :: float64  : (9832, 14)
|-----eta                        :: float64  : (9832, 14)
|-----mass                       :: float64  : (9832, 14)
|-----pt                         :: float64  : (9832, 14)
|-----sin_phi                    :: float64  : (9832, 14)
|-PERMUTATION                   
|-REGRESSION                    
|-TARGETS                       
|---ht                          
|-----b                          :: int

### Understanding SPANet Data/Model Construction (based on the demo)

In [17]:
# Read test of the h5 files stored on EOS: open the training and testing
# mass-variation subsets read-only. The handles are left open on purpose because
# the cells below read from train_file / test_file.
train_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation_subset.h5'
test_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/testing_mass_variation_subset.h5'
# filepath_test = 'testing_mass_variation.h5' #path from SWAN
# filepath_train = 'training_mass_variation.h5'
train_file = h5py.File(train_filepath_cloud,'r')
test_file = h5py.File(test_filepath_cloud,'r')

In [18]:
train_file.keys(), test_file.keys()

(<KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>,
 <KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>)

In [70]:
train_file['INPUTS']['Momenta']['MASK'][:5,0]

array([ True,  True,  True,  True,  True])

In [45]:
# Handles to the target entries of the training file:
# 'ht' holds b, q1 and q2; 'lt' holds b and l.
ht_side = train_file['TARGETS']['ht']
lt_side = train_file['TARGETS']['lt']

b_ht = ht_side['b']
q1_ht = ht_side['q1']
q2_ht = ht_side['q2']
b_lt = lt_side['b']
l_lt = lt_side['l']

In [46]:
# Print the target indices of a single event (event number idx) for both top decays.
idx = 5
print('Hadronic side::   ', ' b:', b_ht[idx], ' q1:', q1_ht[idx], ' q2:', q2_ht[idx])
print('Leptonic side::   ', ' b:', b_lt[idx], ' l:', l_lt[idx])
# Author's note: the lepton index is always 0 because every event has exactly one electron.
# This needs to be declared in the .yaml config so that the lepton target is associated
# with the lepton sequential input. (NOTE(review): confirm against the config actually used.)

Hadronic side::     b: 3  q1: 1  q2: 6
Leptonic side::     b: 2  l: 0


In [63]:
train_file['TARGETS']['ht']['b'][:3]

array([ 3,  1, -1])

In [42]:
%%bash
python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 --shape
# python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 --shape

| Structure for /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 

|-INPUTS                        
|---Met                         
|-----cos_phi                    :: float32  : (12427644,)
|-----met                        :: float32  : (12427644,)
|-----sin_phi                    :: float32  : (12427644,)
|-----sumet                      :: float32  : (12427644,)
|---Momenta                     
|-----MASK                       :: bool     : (12427644, 16)
|-----btag                       :: float32  : (12427644, 16)
|-----cos_phi                    :: float32  : (12427644, 16)
|-----eta                        :: float32  : (12427644, 16)
|-----etag                       :: float32  : (12427644, 16)
|-----mass                       :: float32  : (12427644, 16)
|-----pt                         :: float32  : (12427644, 16)
|-----qtag                       :: float32  : (12427644, 16)
|-----rapidity                   :: float32  : (12427644, 16)
|-----sin_phi       

In [None]:
# lt_dict, ht_dict = check_target_label(f_test)

# for key, value in lt_dict.items():
#     print('Leptonic top side:', key, value)
# for key, value in ht_dict.items():
#     print('Hadronic top side:', key, value)

In [None]:
# ### reducing testing and training size to avoid kernal crash
# def copy_group(source_group, target_group, n_events_to_copy):
#     for key in source_group.keys():
#         item = source_group[key]
#         if isinstance(item, h5py.Group):
#             # Create the group in the target file
#             print(f"Creating group {key}")
#             new_group = target_group.create_group(key)
#             # Recursively copy the subgroup
#             copy_group(item, new_group, n_events_to_copy)
#         elif isinstance(item, h5py.Dataset):
#             print(f"Copying dataset {key}")
#             # Copy the dataset with a subset of data
#             if item.ndim == 1:
#                 target_group.create_dataset(key, data=item[:n_events_to_copy])
#             else:
#                 target_group.create_dataset(key, data=item[:n_events_to_copy, :])
#         else:
#             print(f"Skipping unknown item type for {key}")

In [None]:
# newfile_path_test_subset = 'testing_mass_variation_subset_v2.h5'
# newfile_path_train_subset = 'training_mass_variation_subset_v2.h5'

# ratio = 0.8
# test_num_events = int(ratio*(1865837))
# train_num_events = int(ratio*(12427644))

In [None]:
# # Open the original HDF5 file
# with h5py.File(train_filepath_cloud, 'r') as train_file:
#     # Create a new HDF5 file
#     with h5py.File(newfile_path_train_subset, 'w') as new_train_file:
#         # Copy the structure and a subset of data
#         copy_group(train_file, new_train_file, train_num_events)

In [None]:
# with h5py.File(test_filepath_cloud, 'r') as test_file:
#     # Create a new HDF5 file
#     with h5py.File(newfile_path_test_subset, 'w') as new_test_file:
#         # Copy the structure and a subset of data
#         copy_group(test_file, new_test_file, test_num_events)

In [None]:
# %%bash
# python utils/examine_hdf5.py training_mass_variation_subset_v2.h5 --shape