Attention: dealing with jet symmetries (quarks)

Special Loss function: dealing with particle symmetries (top and anti-top)

Tensor Attention: encoding the symmetries

Input: unordered jets
Output: one head per particle

In [1]:
import h5py
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
import numpy as np

### Data Construction

### Converting from coffea to h5 file

In [115]:
#writing h5 from coffea schema
# Open one ttbar NanoAOD file with coffea; `events` is the input to the
# selection and HDF5 conversion in the following cells.
# Two files are listed: the 0000 file is marked for training, the 0001 file for
# testing. Swap which `file_path` line is commented out to switch samples.
# file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root' #for training
file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0001.root' #for testing
tree_name = 'Events'
# Alternative call form kept for reference.
# NOTE(review): presumably the signature of a different coffea release - confirm before switching.
# events = NanoEventsFactory.from_root({file_path: tree_name}, schemaclass=NanoAODSchema).events()
events = NanoEventsFactory.from_root(file_path, treepath=tree_name, schemaclass=NanoAODSchema).events()



In [116]:
# Object and event selection (adapted from jetassignment_training).
# Result: the *_region collections, i.e. single-lepton events with >= 4 jets
# of which at least two pass the b-tag threshold.

B_TAG_THRESHOLD = 0.5  # cut applied to Jet.btagCSVV2

# ---- object-level selection ----
electron_candidates = events.Electron
electron_filter = (
    (electron_candidates.pt > 30)
    & (np.abs(electron_candidates.eta) < 2.1)
    & (electron_candidates.cutBased == 4)
    & (electron_candidates.sip3d < 4)
)
selected_electrons = electron_candidates[electron_filter]

muon_candidates = events.Muon
muon_filter = (
    (muon_candidates.pt > 30)
    & (np.abs(muon_candidates.eta) < 2.1)
    & (muon_candidates.tightId)
    & (muon_candidates.sip3d < 4)
    & (muon_candidates.pfRelIso04_all < 0.15)
)
selected_muons = muon_candidates[muon_filter]

jet_candidates = events.Jet
jet_filter = (
    (jet_candidates.pt > 30)
    & (np.abs(jet_candidates.eta) < 2.4)
    & (jet_candidates.isTightLeptonVeto)
)
selected_jets = jet_candidates[jet_filter]

# generator particles are kept unfiltered at this stage
selected_genpart = events.GenPart

# ---- event-level selection ----
n_selected_leptons = ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)
n_selected_jets = ak.count(selected_jets.pt, axis=1)
n_btag_at_or_above = ak.sum(selected_jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1)

event_filters = (
    (n_selected_leptons == 1)      # exactly one selected electron or muon
    & (n_selected_jets >= 4)       # at least four selected jets
    & (n_btag_at_or_above >= 1)    # at least one jet at or above B_TAG_THRESHOLD
)

selected_electrons = selected_electrons[event_filters]
selected_muons = selected_muons[event_filters]
selected_jets = selected_jets[event_filters]
selected_genpart = selected_genpart[event_filters]

# ---- 4j2b (signal) region: at least two jets strictly above the threshold ----
n_btag_above = ak.sum(selected_jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1)
region_filter = n_btag_above >= 2
selected_jets_region = selected_jets[region_filter]  # all jets of the surviving events
selected_electrons_region = selected_electrons[region_filter]
selected_muons_region = selected_muons[region_filter]
selected_genpart_region = selected_genpart[region_filter]

In [117]:
def training_filter_multiple_jets(jets, electrons, muons, genparts):
    '''
    Filters events down to the training set and calculates jet-level labels.

    A jet is matched to the nearest generator-level quark (within 0.4) whose
    parent is a top quark or a W; the label is derived from that parent.
    Only events containing exactly one jet labelled -6, exactly one labelled 6
    and exactly two labelled 24 are kept.

    Args:
        jets: selected jets after region filter (all jets of each event)
        electrons: selected electrons after region filter
        muons: selected muons after region filter
        genparts: selected genpart after region filter

    Returns:
        jets: selected jets after training filter
        electrons: selected electrons after training filter
        muons: selected muons after training filter
        labels: per-jet labels within an event
                (24=W, 6=top_hadron, -6=top_lepton, 100=no matched parent)
    '''
    #### filter genPart to valid matching candidates ####

    # get rid of particles without parents
    genpart_parent = genparts.distinctParent
    genpart_filter = np.invert(ak.is_none(genpart_parent, axis=1))
    genparts = genparts[genpart_filter]
    genpart_parent = genparts.distinctParent

    # ensure that parents are top quark or W
    genpart_filter2 = ((np.abs(genpart_parent.pdgId) == 6) | (np.abs(genpart_parent.pdgId) == 24))
    genparts = genparts[genpart_filter2]

    # ensure particle itself is a quark (|pdgId| in 1..6)
    genpart_filter3 = ((np.abs(genparts.pdgId) < 7) & (np.abs(genparts.pdgId) > 0))
    genparts = genparts[genpart_filter3]

    # get rid of duplicates
    genpart_filter4 = genparts.hasFlags("isLastCopy")
    genparts = genparts[genpart_filter4]

    #### get jet-level labels and filter events to training set ####

    # match jets to nearest valid genPart candidate
    nearest_genpart = jets.nearest(genparts, threshold=0.4)
    nearest_parent = nearest_genpart.distinctParent  # parent of matched particle
    parent_pdgid = nearest_parent.pdgId  # pdgId of parent particle
    # pdgId of the grandchildren of the matched particle's parent
    grandchild_pdgid = nearest_parent.distinctChildren.distinctChildren.pdgId

    # flatten innermost axis for convenience
    grandchildren_flat = np.abs(ak.flatten(grandchild_pdgid, axis=-1))

    # |pdgId| 11..18 is the lepton family: odd ids are charged leptons,
    # even ids are neutrinos.
    in_lepton_family = (grandchildren_flat > 10) & (grandchildren_flat < 19)
    has_neutrino_cousin = ak.sum(in_lepton_family & (grandchildren_flat % 2 == 0), axis=-1) > 0
    has_charged_lepton_cousin = ak.sum(in_lepton_family & (grandchildren_flat % 2 == 1), axis=-1) > 0

    # jets without a match yield None here; treat them as "no leptonic cousins"
    has_both_cousins = ak.fill_none((has_charged_lepton_cousin & has_neutrino_cousin), False)

    # labels from parent pdgId; unmatched jets get 100 so they never count as 6 / 24
    labels = np.abs(ak.fill_none(parent_pdgid, 100))

    # Jets whose matched parent has both a charged-lepton and a neutrino
    # grandchild are relabelled -6. ak.where builds a new jagged array, so no
    # in-place write into awkward's buffers (and no per-event loop) is needed.
    labels = ak.where(has_both_cousins, -6, labels)

    # event validation: exactly one -6, one 6 and two 24 (any number of 100s)
    def _count(value):
        return ak.sum(labels == value, axis=1)

    mask = (_count(-6) == 1) & (_count(6) == 1) & (_count(24) == 2)

    # filter events
    jets = jets[mask]
    electrons = electrons[mask]
    muons = muons[mask]
    labels = labels[mask]

    return jets, electrons, muons, labels

def pad_array(array, max_jets=12, pad_value=0):
    """Pad each event of a jagged feature array and fill the padding.

    Used for the feature arrays; the MASK arrays are built separately.

    Args:
        array: jagged array with one list of values per event.
        max_jets: target length of every per-event list (axis 1).
        pad_value: value written into the padded (missing) slots.

    Returns:
        The array with every per-event list at least ``max_jets`` long.
        ``ak.pad_none`` is called without clipping, so longer lists are
        returned at their original length.
    """
    with_missing_slots = ak.pad_none(array, max_jets, axis=1)
    return ak.fill_none(with_missing_slots, pad_value, axis=1)

In [118]:
# Keep only fully matched events of the signal region and get their per-jet labels.
jets, electrons, muons, labels = training_filter_multiple_jets(
    selected_jets_region,
    selected_electrons_region,
    selected_muons_region,
    selected_genpart_region,
)

In [119]:
#creating padded features for each event
# Every jet feature is padded along axis 1 with pad_array's defaults
# (length 12, fill value 0).
Jets_pt_padded = pad_array(jets.pt)
Jets_mass_padded = pad_array(jets.mass)
Jets_eta_padded = pad_array(jets.eta)
Jets_phi_padded = pad_array(jets.phi)
Jets_btag_padded = pad_array(jets.btagCSVV2)
Jets_qgl_padded = pad_array(jets.qgl)

# Lepton features, taken from the electron collection only and padded the
# same way as the jets (length 12), although at most one lepton is expected.
# NOTE(review): the event selection requires exactly one selected electron OR
# muon, so an event whose lepton is a muon has an empty electron list and gets
# all-zero lepton features here - confirm this is intended.
lep_pt_padded = pad_array(electrons.pt)
lep_mass_padded = pad_array(electrons.mass)
lep_eta_padded = pad_array(electrons.eta)
lep_phi_padded = pad_array(electrons.phi)

In [120]:
# MASK for the jets: True where a real jet exists, False in the padded slots.
pad_check = ak.pad_none(jets, 12, axis=1)
Jets_mask_padded = np.invert(ak.is_none(pad_check, axis=1))

# MASK for the lepton. It must follow the padding of the lepton features
# (which are built from `electrons`), not the padding of the jets: reusing the
# jet padding would flag zero-filled lepton slots as real leptons.
lep_pad_check = ak.pad_none(electrons.pt, 12, axis=1)
lep_mask_padded = np.invert(ak.is_none(lep_pad_check, axis=1))

# Labels padded with -1 for the missing jets. Only the positions of the
# labelled jets are extracted from this array afterwards.
labels_padded = pad_array(labels, pad_value=-1)

In [121]:
#target indices fot jets and lepton
def label_indices(labels):
    """Turn per-jet labels into per-event target indices.

    Args:
        labels: jagged array of jet labels, one list per event, where
            -6 marks the b jet of the leptonic top, 6 the b jet of the
            hadronic top and 24 the two jets from the W.

    Returns:
        Tuple ``(l_indices, b_lep_indices, b_top_indices, w1_indices,
        w2_indices)`` of flat integer arrays with one entry per event:
        the lepton index (always 0, since the single lepton is the first
        entry of the lepton input), the position of the -6 jet, the position
        of the 6 jet, and the positions of the first and second 24 jet.

    Raises:
        ValueError: if some event does not contain exactly one -6 jet,
            exactly one 6 jet and at least two 24 jets. Without this check
            the outputs would no longer line up event by event.
    """
    n_events = len(labels)
    if n_events == 0:
        # nothing to index: return five empty integer arrays
        return tuple(ak.Array(np.zeros(0, dtype=np.int64)) for _ in range(5))

    # Force variable-length lists so that boolean masks below select per event
    # (regular arrays would be masked NumPy-style and come back flattened).
    labels = ak.from_regular(labels, axis=1)
    jet_position = ak.local_index(labels, axis=1)

    b_lep_positions = jet_position[labels == -6]  # b from leptonic top
    b_top_positions = jet_position[labels == 6]   # b from hadronic top
    w_positions = jet_position[labels == 24]      # the W jets

    well_formed = (
        ak.all(ak.num(b_lep_positions, axis=1) == 1)
        and ak.all(ak.num(b_top_positions, axis=1) == 1)
        and ak.all(ak.num(w_positions, axis=1) >= 2)
    )
    if not well_formed:
        raise ValueError(
            "label_indices expects exactly one -6 label, exactly one 6 label "
            "and at least two 24 labels in every event"
        )

    # always 0th because there is only one lepton per event
    l_indices = ak.Array(np.zeros(n_events, dtype=np.int64))
    b_lep_indices = b_lep_positions[:, 0]
    b_top_indices = b_top_positions[:, 0]
    w1_indices = w_positions[:, 0]
    w2_indices = w_positions[:, 1]

    return l_indices, b_lep_indices, b_top_indices, w1_indices, w2_indices
       
#     labels_indices = ak.Array([])
    
#     for event in labels:
#         l_index = ak.Array([[0]]) #always 0th because only 1 lepton(electron) in each event. lepton target associated with lepton sequential input
#         b_lep_index = ak.where(event==-6) #b_top_lep
#         b_top_index = ak.where(event==6) #b_top_had
#         w_index = ak.where(event==24)#giving two indices of W jets

#         #create array containing indices of particles in each event
#         indices_particle_level = ak.concatenate([l_index, b_lep_index, b_top_index, w_index], axis=1)
#         #create array of all events
#         label_indices = ak.concatenate([label_indices, indices_particle_level], axis=0)

#     return label_indices
    

In [122]:
# create label_indices from particles assignment labels
# The padded labels are passed in, so the returned positions index into the
# padded jet axis; the padding value -1 never equals one of the searched
# labels (-6, 6, 24).
l_indices, b_lep_indices, b_top_indices, w1_indices, w2_indices = label_indices(labels_padded)

In [123]:
#preparing sequential data for h5 file
# Each dict maps the dataset name written to the HDF5 file to its padded array.
# Jet inputs: validity mask plus six per-jet features.
jets_data = {
    'MASK': Jets_mask_padded, # capital MASK
    'mass': Jets_mass_padded,
    'pt': Jets_pt_padded,
    'eta': Jets_eta_padded,
    'phi': Jets_phi_padded,
    'btag': Jets_btag_padded,
    'qgl': Jets_qgl_padded,
}

# Lepton inputs: validity mask plus four kinematic features.
lep_data = {
    'MASK': lep_mask_padded, # capital MASK
    'mass':lep_mass_padded,
    'pt':lep_pt_padded,
    'eta':lep_eta_padded,
    'phi':lep_phi_padded
}

#preparing target data for h5 file
# 'ht' targets: index of the jet labelled 6 ('b') and of the two jets
# labelled 24 ('q1', 'q2').
ht_target = {
    'b': b_top_indices,
    'q1': w1_indices,
    'q2': w2_indices,
}

# 'lt' targets: index of the jet labelled -6 ('b') and the lepton index ('l').
lt_target = {
    'b': b_lep_indices,
    'l': l_indices,
}

In [124]:
# Write the padded inputs and the target indices to an HDF5 file.
with h5py.File('testing_converted.h5', 'w') as h5file:
    # INPUTS: one sub-group per object type, one dataset per feature.
    # Jets and Lepton are created directly under INPUTS (no intermediate
    # SEQUENTIAL / GLOBAL level).
    input_group = h5file.create_group('INPUTS')
    jet_2subgroup = input_group.create_group('Jets')
    lept_2subgroup = input_group.create_group('Lepton')

    for subgroup, feature_columns in ((jet_2subgroup, jets_data), (lept_2subgroup, lep_data)):
        for key, value in feature_columns.items():
            # h5py needs a NumPy array, so convert the padded awkward array
            subgroup.create_dataset(key, data=ak.to_numpy(value))

    # TARGETS: one sub-group per top-quark side, one dataset per decay product.
    event_group = h5file.create_group('TARGETS')
    ht_subgroup = event_group.create_group('ht')
    lt_subgroup = event_group.create_group('lt')

    for subgroup, target_columns in ((ht_subgroup, ht_target), (lt_subgroup, lt_target)):
        for key, value in target_columns.items():
            subgroup.create_dataset(key, data=ak.to_numpy(value))

    # Remaining top-level groups are created empty.
    perm_group, regression_group, classification_group = (
        h5file.create_group(group_name)
        for group_name in ('PERMUTATION', 'REGRESSION', 'CLASSIFICATION')
    )

In [126]:
# Re-open the converted file (testing_converted.h5) read-only to check what was written.
# NOTE: the handle `f` is left open for interactive inspection; call f.close() when done.
filename_test = 'testing_converted.h5'
f = h5py.File(filename_test, 'r')

In [127]:
# First three entries of the 'ht' side 'b' target in the converted file.
f['TARGETS']['ht']['b'][:3]

array([3, 3, 0])

In [128]:
%%bash
# Print the group/dataset structure of the file together with dataset shapes.
# NOTE(review): this inspects training_converted.h5, while the cells above write
# and re-open testing_converted.h5 - confirm which file is meant.
python utils/examine_hdf5.py training_converted.h5 --shape

| Structure for training_converted.h5 

|-CLASSIFICATION                
|-INPUTS                        
|---Jets                        
|-----MASK                       :: bool     : (10736, 12)
|-----btag                       :: float64  : (10736, 12)
|-----eta                        :: float64  : (10736, 12)
|-----mass                       :: float64  : (10736, 12)
|-----phi                        :: float64  : (10736, 12)
|-----pt                         :: float64  : (10736, 12)
|-----qgl                        :: float64  : (10736, 12)
|---Lepton                      
|-----MASK                       :: bool     : (10736, 12)
|-----eta                        :: float64  : (10736, 12)
|-----mass                       :: float64  : (10736, 12)
|-----phi                        :: float64  : (10736, 12)
|-----pt                         :: float64  : (10736, 12)
|-PERMUTATION                   
|-REGRESSION                    
|-TARGETS                       
|---ht               

### Understanding SPANet Data/Model Construction (based on the demo)

In [None]:
def check_target_label(file_read):
    """Collect the distinct target values stored for each decay product.

    Args:
        file_read: opened HDF5 file (or any nested mapping) that provides
            ``['TARGETS']['lt'][key]`` for ``'l'`` and ``'b'`` and
            ``['TARGETS']['ht'][key]`` for ``'q1'``, ``'q2'`` and ``'b'``.
            Each entry must support ``[:]`` and hold hashable scalar values.

    Returns:
        Tuple ``(lt_dict, ht_dict)``. Each dict maps the key to the list of
        distinct values found in that dataset, in order of first appearance.
    """
    def _distinct_in_order(dataset):
        # dict keys keep insertion order and give O(1) membership, so this is
        # a single pass instead of a list scan per element
        return list(dict.fromkeys(dataset[:]))

    lt_dict = {key: _distinct_in_order(file_read['TARGETS']['lt'][key]) for key in ('l', 'b')}
    ht_dict = {key: _distinct_in_order(file_read['TARGETS']['ht'][key]) for key in ('q1', 'q2', 'b')}

    return lt_dict, ht_dict

In [43]:
#cloud h5 files reading test
# Open the training and testing subset files read-only.
# NOTE: both handles stay open for the inspection cells below; close them
# (train_file.close(), test_file.close()) when done.
train_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation_subset.h5'
test_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/testing_mass_variation_subset.h5'
# filepath_test = 'testing_mass_variation.h5' #path from SWAN
# filepath_train = 'training_mass_variation.h5'
train_file = h5py.File(train_filepath_cloud,'r')
test_file = h5py.File(test_filepath_cloud,'r')

In [44]:
# Top-level groups of the training and testing files.
train_file.keys(), test_file.keys()

(<KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>,
 <KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>)

In [45]:
# Dataset handles for the targets of the training file, grouped by side.
training_targets = train_file['TARGETS']
ht_side, lt_side = training_targets['ht'], training_targets['lt']

# 'ht' side: b, q1, q2
b_ht, q1_ht, q2_ht = (ht_side[name] for name in ('b', 'q1', 'q2'))
# 'lt' side: b, l
b_lt, l_lt = (lt_side[name] for name in ('b', 'l'))

In [46]:
# Print the target indices of one example event (event number `idx`).
idx = 5
print('Hadronic side::   ', ' b:', b_ht[idx], ' q1:', q1_ht[idx], ' q2:', q2_ht[idx])
print('Leptonic side::   ', ' b:', b_lt[idx], ' l:', l_lt[idx])
# The lepton index is always 0 because every event has exactly one lepton
# (this needs to be declared in the .yaml so that the lepton target is
# associated with the lepton sequential input).

Hadronic side::     b: 3  q1: 1  q2: 6
Leptonic side::     b: 2  l: 0


In [63]:
# First three entries of the 'ht' side 'b' target in the training file.
# NOTE(review): the displayed sample contains -1; presumably this marks a
# missing target - confirm against the dataset's documentation.
train_file['TARGETS']['ht']['b'][:3]

array([ 3,  1, -1])

In [42]:
%%bash
# Print the group/dataset structure of the full training file together with dataset shapes.
python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 --shape
# python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 --shape

| Structure for /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 

|-INPUTS                        
|---Met                         
|-----cos_phi                    :: float32  : (12427644,)
|-----met                        :: float32  : (12427644,)
|-----sin_phi                    :: float32  : (12427644,)
|-----sumet                      :: float32  : (12427644,)
|---Momenta                     
|-----MASK                       :: bool     : (12427644, 16)
|-----btag                       :: float32  : (12427644, 16)
|-----cos_phi                    :: float32  : (12427644, 16)
|-----eta                        :: float32  : (12427644, 16)
|-----etag                       :: float32  : (12427644, 16)
|-----mass                       :: float32  : (12427644, 16)
|-----pt                         :: float32  : (12427644, 16)
|-----qtag                       :: float32  : (12427644, 16)
|-----rapidity                   :: float32  : (12427644, 16)
|-----sin_phi       

In [None]:
# lt_dict, ht_dict = check_target_label(f_test)

# for key, value in lt_dict.items():
#     print('Leptonic top side:', key, value)
# for key, value in ht_dict.items():
#     print('Hadronic top side:', key, value)

In [None]:
# ### reducing testing and training size to avoid kernel crash
# def copy_group(source_group, target_group, n_events_to_copy):
#     for key in source_group.keys():
#         item = source_group[key]
#         if isinstance(item, h5py.Group):
#             # Create the group in the target file
#             print(f"Creating group {key}")
#             new_group = target_group.create_group(key)
#             # Recursively copy the subgroup
#             copy_group(item, new_group, n_events_to_copy)
#         elif isinstance(item, h5py.Dataset):
#             print(f"Copying dataset {key}")
#             # Copy the dataset with a subset of data
#             if item.ndim == 1:
#                 target_group.create_dataset(key, data=item[:n_events_to_copy])
#             else:
#                 target_group.create_dataset(key, data=item[:n_events_to_copy, :])
#         else:
#             print(f"Skipping unknown item type for {key}")

In [None]:
# newfile_path_test_subset = 'testing_mass_variation_subset_v2.h5'
# newfile_path_train_subset = 'training_mass_variation_subset_v2.h5'

# ratio = 0.8
# test_num_events = int(ratio*(1865837))
# train_num_events = int(ratio*(12427644))

In [None]:
# # Open the original HDF5 file
# with h5py.File(train_filepath_cloud, 'r') as train_file:
#     # Create a new HDF5 file
#     with h5py.File(newfile_path_train_subset, 'w') as new_train_file:
#         # Copy the structure and a subset of data
#         copy_group(train_file, new_train_file, train_num_events)

In [None]:
# with h5py.File(test_filepath_cloud, 'r') as test_file:
#     # Create a new HDF5 file
#     with h5py.File(newfile_path_test_subset, 'w') as new_test_file:
#         # Copy the structure and a subset of data
#         copy_group(test_file, new_test_file, test_num_events)

In [None]:
# %%bash
# python utils/examine_hdf5.py training_mass_variation_subset_v2.h5 --shape

### Attention

### Loss

### Training

### Testing