Attention: dealing with jet symmetries (quarks)

Special Loss function: dealing with particle symmetries (top and anti-top)

Tensor Attention: encoding the symmetries

Input: unordered jets
Output: one head per particle

In [1]:
import h5py
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
import numpy as np

### Data Construction

### Converting from coffea to h5 file

In [16]:
# Writing an h5 file from the coffea NanoAOD schema.
# Remote ROOT file (accessed through an https URL) and the name of the tree to read from it.
file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root'
tree_name = 'Events'
# NOTE(review): alternative call form kept for reference; presumably the signature
# of a different coffea version -- confirm against the installed release.
# events = NanoEventsFactory.from_root({file_path: tree_name}, schemaclass=NanoAODSchema).events()
# Build the event collection from the tree, interpreted with NanoAODSchema.
events = NanoEventsFactory.from_root(file_path, treepath=tree_name, schemaclass=NanoAODSchema).events()



In [17]:
# Code from jetassignment_training.
# Object-level selection: keep only electrons, muons and jets passing the
# kinematic / identification cuts spelled out below.
selected_electrons = events.Electron[(events.Electron.pt > 30) & (np.abs(events.Electron.eta)<2.1) & 
                                        (events.Electron.cutBased==4) & (events.Electron.sip3d < 4)]
selected_muons = events.Muon[(events.Muon.pt > 30) & (np.abs(events.Muon.eta)<2.1) & (events.Muon.tightId) & 
                                (events.Muon.sip3d < 4) & (events.Muon.pfRelIso04_all < 0.15)]
jet_filter = (events.Jet.pt > 30) & (np.abs(events.Jet.eta) < 2.4) & (events.Jet.isTightLeptonVeto)
selected_jets = events.Jet[jet_filter]
# Generator-level particles are kept unfiltered at object level.
selected_genpart = events.GenPart
# Per-event flag: True when the event number is even.
# NOTE(review): presumably intended for a train/test split -- it is not used
# anywhere else in the visible cells; confirm.
even = (events.event%2==0)
    
# single lepton requirement: exactly one selected electron OR muon in the event
event_filters = ((ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)) == 1)
# require at least 4 jets
event_filters = event_filters & (ak.count(selected_jets.pt, axis=1) >= 4)
# require at least one jet above B_TAG_THRESHOLD
B_TAG_THRESHOLD = 0.5
event_filters = event_filters & (ak.sum(selected_jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) >= 1)
    
# apply event filters to every collection so they stay aligned event-by-event
selected_electrons = selected_electrons[event_filters]
selected_muons = selected_muons[event_filters]
selected_jets = selected_jets[event_filters]
selected_genpart = selected_genpart[event_filters]
even = even[event_filters]
    
### only consider 4j2b (signal) region
# NOTE(review): this comparison is strict (>) while the event filter above uses
# >= for the same threshold -- confirm which one is intended.
region_filter = ak.sum(selected_jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2 # at least two b-tagged jets
# Keep at most the first 10 jets of each event.
# NOTE(review): "top 10" assumes the jets are already ordered (presumably by pt) -- confirm.
selected_jets_region = selected_jets[region_filter][:,:10] # only keep top 10 jets
selected_electrons_region = selected_electrons[region_filter]
selected_muons_region = selected_muons[region_filter]
selected_genpart_region = selected_genpart[region_filter]
even = even[region_filter]

In [18]:
def pad_array(array, max_jets=10, pad_value=0):
    """Pad (and clip) a jagged per-event array to a fixed length along axis 1.

    Parameters
    ----------
    array : awkward array
        Jagged array with one variable-length list per event.
    max_jets : int, optional
        Target length of every per-event list (default 10).
    pad_value : scalar, optional
        Value written into the padded slots (default 0).

    Returns
    -------
    awkward array
        Array in which every per-event list has exactly ``max_jets`` entries:
        shorter lists are filled with ``pad_value``, longer lists are truncated.
    """
    # clip=True truncates lists longer than max_jets, so the result is always
    # rectangular and can be converted with ak.to_numpy. Without it, any longer
    # list would be passed through unchanged and leave the array ragged.
    padded_array = ak.pad_none(array, max_jets, axis=1, clip=True)
    padded_array_filled = ak.fill_none(padded_array, pad_value, axis=1)
    return padded_array_filled

In [19]:
# Creating padded features for each event (one fixed-length row per event).
Jets_pt_padded = pad_array(selected_jets_region.pt)
Jets_mass_padded = pad_array(selected_jets_region.mass)
Jets_eta_padded = pad_array(selected_jets_region.eta)
Jets_phi_padded = pad_array(selected_jets_region.phi)
Jets_btag_padded = pad_array(selected_jets_region.btagCSVV2)
Jets_qgl_padded = pad_array(selected_jets_region.qgl)

# Lepton features.
# The event selection requires exactly one selected electron OR muon, so events
# whose lepton is a muon have an empty electron list. Reading electrons alone
# would give those events an all-zero lepton row; concatenating both collections
# per event (axis=1) yields the single selected lepton whatever its flavour.
# NOTE(review): the padded width is left at the pad_array default so the dataset
# shape is unchanged, although only one lepton per event is expected.
lep_pt_padded = pad_array(
    ak.concatenate([selected_electrons_region.pt, selected_muons_region.pt], axis=1)
)
lep_mass_padded = pad_array(
    ak.concatenate([selected_electrons_region.mass, selected_muons_region.mass], axis=1)
)
lep_eta_padded = pad_array(
    ak.concatenate([selected_electrons_region.eta, selected_muons_region.eta], axis=1)
)
lep_phi_padded = pad_array(
    ak.concatenate([selected_electrons_region.phi, selected_muons_region.phi], axis=1)
)

In [20]:
# Boolean jet mask per event: True where a slot holds a real jet,
# False where the slot only exists because of padding.
pad_check = ak.pad_none(selected_jets_region.pt, target=10, axis=1)
Jets_mask_padded = ~ak.is_none(pad_check, axis=1)


In [21]:
# Collect the padded per-event arrays under the dataset names that will be
# written to the h5 file (one entry per feature).
jet_data = dict(
    mask=Jets_mask_padded,
    mass=Jets_mass_padded,
    pt=Jets_pt_padded,
    eta=Jets_eta_padded,
    phi=Jets_phi_padded,
    btag=Jets_btag_padded,
    qgl=Jets_qgl_padded,
)

lep_data = dict(
    mass=lep_mass_padded,
    pt=lep_pt_padded,
    eta=lep_eta_padded,
    phi=lep_phi_padded,
)

In [None]:
# Need some kind of labels.
# TODO: STILL NEED TO WORK ON IT -- the target assignments are not built yet.
# Every entry is a None placeholder so that this cell is valid Python and the
# notebook can run top to bottom; replace each None with the real label array.
t1 = {
    'b': None,
    'q1': None,
    'q2': None,
}
t2 = {
    'b': None,
    'mu': None,
    'nu': None,
}

In [23]:
# Write the padded inputs to test2.h5 and create the (still empty) groups for
# targets, permutations, regressions and classifications.
with h5py.File('test2.h5', 'w') as h5file:
    # INPUTS: per-object (SEQUENTIAL) and per-event (GLOBAL) feature groups
    input_group = h5file.create_group('INPUTS')
    seq_subgroup = input_group.create_group('SEQUENTIAL')
    global_subgroup = input_group.create_group('GLOBAL')

    jet_2subgroup = seq_subgroup.create_group('Jet')
    lept_2subgroup = seq_subgroup.create_group('Lepton')

    # One dataset per feature: jets first, then leptons
    for subgroup, features in ((jet_2subgroup, jet_data), (lept_2subgroup, lep_data)):
        for key, value in features.items():
            subgroup.create_dataset(key, data=ak.to_numpy(value))

    # TARGETS: one (currently empty) subgroup per top quark
    event_group = h5file.create_group('TARGETS')
    t1_subgroup = event_group.create_group('t1')
    t2_subgroup = event_group.create_group('t2')

    # Remaining top-level groups are created empty for now
    perm_group = h5file.create_group('PERMUTATION')
    regression_group = h5file.create_group('REGRESSION')
    classification_group = h5file.create_group('CLASSIFICATION')

In [25]:
# Read back the file written above (test2.h5) to check its content.
# NOTE(review): the handle `f` is never closed in the visible cells; it is left
# open because the next cell indexes into it.
filename_test = 'test2.h5'
f = h5py.File(filename_test, 'r')

In [26]:
# Padded jet pt values of the first event in the file
f['INPUTS']['SEQUENTIAL']['Jet']['pt'][0]

array([141.625  ,  40.53125,  34.375  ,  33.96875,   0.     ,   0.     ,
         0.     ,   0.     ,   0.     ,   0.     ])

In [24]:
%%bash
# Print the group/dataset structure of test2.h5, including dataset shapes
python utils/examine_hdf5.py test2.h5 --shape

| Structure for test2.h5 

|-CLASSIFICATION                
|-INPUTS                        
|---GLOBAL                      
|---SEQUENTIAL                  
|-----Jet                       
|-------btag                     :: float64  : (28616, 10)
|-------eta                      :: float64  : (28616, 10)
|-------mask                     :: bool     : (28616, 10)
|-------mass                     :: float64  : (28616, 10)
|-------phi                      :: float64  : (28616, 10)
|-------pt                       :: float64  : (28616, 10)
|-------qgl                      :: float64  : (28616, 10)
|-----Lepton                    
|-------eta                      :: float64  : (28616, 10)
|-------mass                     :: float64  : (28616, 10)
|-------phi                      :: float64  : (28616, 10)
|-------pt                       :: float64  : (28616, 10)
|-PERMUTATION                   
|-REGRESSION                    
|-TARGETS                       
|---t1                     

### Understanding SPANet Data/Model Construction (based on the demo)

In [43]:
def check_target_label(file_read):
    """Collect the distinct target labels stored for each top-quark decay product.

    Reads ``file_read['TARGETS']['lt']`` (entries 'l' and 'b') and
    ``file_read['TARGETS']['ht']`` (entries 'q1', 'q2' and 'b') and gathers,
    for every entry, the distinct values in order of first appearance.

    Parameters
    ----------
    file_read : mapping
        Opened file (or any nested mapping) exposing the layout above.

    Returns
    -------
    tuple of dict
        ``(lt_dict, ht_dict)``; each maps an entry name to its list of
        distinct labels.
    """
    layout = {'lt': ('l', 'b'), 'ht': ('q1', 'q2', 'b')}
    collected = {}

    for side, particles in layout.items():
        distinct_by_particle = {}
        for particle in particles:
            distinct = []
            for label in file_read['TARGETS'][side][particle][:]:
                if label in distinct:
                    continue
                distinct.append(label)
            distinct_by_particle[particle] = distinct
        collected[side] = distinct_by_particle

    return collected['lt'], collected['ht']

In [5]:
# Reading test of the h5 files stored on the cloud (EOS) area.
train_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5'
test_filepath_cloud = '/eos/user/n/nmuangko/semi_leptonnic_ttbar/testing_mass_variation.h5'
# filepath_test = 'testing_mass_variation.h5' #path from SWAN
# filepath_train = 'training_mass_variation.h5'
# Open both files read-only.
# NOTE(review): these handles are never closed in the visible cells, and later
# cells rebind `train_file` / `test_file` inside `with` blocks.
train_file = h5py.File(train_filepath_cloud,'r')
test_file = h5py.File(test_filepath_cloud,'r')

In [92]:
# Top-level groups of the training and testing files, shown side by side
train_file.keys(), test_file.keys()

(<KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>,
 <KeysViewHDF5 ['INPUTS', 'REGRESSIONS', 'TARGETS']>)

In [93]:
%%bash
# Print the structure (with dataset shapes) of the testing file, then of the training file
python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/testing_mass_variation.h5 --shape
python utils/examine_hdf5.py /eos/user/n/nmuangko/semi_leptonnic_ttbar/training_mass_variation.h5 --shape

| Structure for /eos/user/n/nmuangko/semi_leptonnic_ttbar/testing_mass_variation.h5 

|-INPUTS                        
|---Met                         
|-----cos_phi                    :: float32  : (1865837,)
|-----met                        :: float32  : (1865837,)
|-----sin_phi                    :: float32  : (1865837,)
|-----sumet                      :: float32  : (1865837,)
|---Momenta                     
|-----MASK                       :: bool     : (1865837, 16)
|-----btag                       :: float32  : (1865837, 16)
|-----cos_phi                    :: float32  : (1865837, 16)
|-----eta                        :: float32  : (1865837, 16)
|-----etag                       :: float32  : (1865837, 16)
|-----mass                       :: float32  : (1865837, 16)
|-----pt                         :: float32  : (1865837, 16)
|-----qtag                       :: float32  : (1865837, 16)
|-----rapidity                   :: float32  : (1865837, 16)
|-----sin_phi                    :

In [94]:
# List the distinct target labels found in the testing file.
# `test_file` is the read handle on the testing HDF5 file opened earlier in the
# notebook; the previously used name `f_test` is not defined anywhere, so the
# cell failed on a fresh kernel.
lt_dict, ht_dict = check_target_label(test_file)

for key, value in lt_dict.items():
    print('Leptonic top side:', key, value)
for key, value in ht_dict.items():
    print('Hadronic top side:', key, value)

Leptonic top side: l [0]
Leptonic top side: b [2, 6, 3, 1, 4, -1, 5, 8, 7, 11, 10, 9, 14, 12]
Hadronic top side: q1 [3, 4, 2, 1, -1, 5, 6, 7, 8, 9, 10, 11]
Hadronic top side: q2 [-1, 7, 4, 5, 3, 6, 2, 8, 9, 10, 11, 12, 13]
Hadronic top side: b [-1, 2, 1, 3, 6, 4, 5, 7, 8, 9, 12, 10, 11, 14]


In [2]:
### Reducing the testing and training sizes to avoid a kernel crash
def copy_group(source_group, target_group, n_events_to_copy):
    """Recursively copy an HDF5 group, keeping only the first events of each dataset.

    Parameters
    ----------
    source_group : h5py.Group or h5py.File
        Group whose content is read.
    target_group : h5py.Group or h5py.File
        Group that receives the copy; sub-groups are created with the same names.
    n_events_to_copy : int
        Number of leading entries (along the first axis) kept from every
        non-scalar dataset.

    Notes
    -----
    Each dataset slice is read fully into memory before being written.
    Items that are neither groups nor datasets are reported and skipped.
    """
    for key in source_group.keys():
        item = source_group[key]
        if isinstance(item, h5py.Group):
            # Create the group in the target file, then descend into it
            print(f"Creating group {key}")
            new_group = target_group.create_group(key)
            copy_group(item, new_group, n_events_to_copy)
        elif isinstance(item, h5py.Dataset):
            print(f"Copying dataset {key}")
            if item.ndim == 0:
                # Scalar datasets have no event axis to slice: copy the value whole
                subset = item[()]
            else:
                # Slicing the leading axis only keeps all trailing axes intact,
                # whatever the dataset rank is
                subset = item[:n_events_to_copy]
            target_group.create_dataset(key, data=subset)
        else:
            print(f"Skipping unknown item type for {key}")

In [3]:
# Fraction of the events kept in each reduced file
ratio = 0.8

# Output paths of the reduced files
newfile_path_train_subset = 'training_mass_variation_subset_v2.h5'
newfile_path_test_subset = 'testing_mass_variation_subset_v2.h5'

# Number of events to copy: the fraction above applied to the hard-coded
# event counts, truncated to an integer
train_num_events = int(ratio * 12_427_644)
test_num_events = int(ratio * 1_865_837)

In [6]:
# Open the original training file for reading and the new subset file for
# writing, then copy the group structure with the first train_num_events events.
with h5py.File(train_filepath_cloud, 'r') as train_file, \
        h5py.File(newfile_path_train_subset, 'w') as new_train_file:
    copy_group(train_file, new_train_file, train_num_events)

Creating group INPUTS
Creating group Met
Copying dataset cos_phi
Copying dataset met
Copying dataset sin_phi
Copying dataset sumet
Creating group Momenta
Copying dataset MASK
Copying dataset btag
Copying dataset cos_phi
Copying dataset eta
Copying dataset etag
Copying dataset mass
Copying dataset pt
Copying dataset qtag
Copying dataset rapidity
Copying dataset sin_phi
Copying dataset utag
Creating group REGRESSIONS
Creating group EVENT
Copying dataset invariant_mass
Copying dataset log_invariant_mass
Copying dataset log_neutrino_eta
Copying dataset log_neutrino_px
Copying dataset log_neutrino_py
Copying dataset log_neutrino_pz
Copying dataset mt
Copying dataset mx
Copying dataset neutrino_eta
Copying dataset neutrino_px
Copying dataset neutrino_py
Copying dataset neutrino_pz
Creating group ht
Creating group PARTICLE
Copying dataset invariant_mass
Copying dataset log_invariant_mass
Creating group lt
Creating group PARTICLE
Copying dataset invariant_mass
Copying dataset log_invariant_mas

In [7]:
with h5py.File(test_filepath_cloud, 'r') as test_file:
    # Create a new HDF5 file
    with h5py.File(newfile_path_test_subset, 'w') as new_test_file:
        # Copy the structure and a subset of data
        copy_group(test_file, new_test_file, test_num_events)

Creating group INPUTS
Creating group Met
Copying dataset cos_phi
Copying dataset met
Copying dataset sin_phi
Copying dataset sumet
Creating group Momenta
Copying dataset MASK
Copying dataset btag
Copying dataset cos_phi
Copying dataset eta
Copying dataset etag
Copying dataset mass
Copying dataset pt
Copying dataset qtag
Copying dataset rapidity
Copying dataset sin_phi
Copying dataset utag
Creating group REGRESSIONS
Creating group EVENT
Copying dataset invariant_mass
Copying dataset log_invariant_mass
Copying dataset log_neutrino_eta
Copying dataset log_neutrino_px
Copying dataset log_neutrino_py
Copying dataset log_neutrino_pz
Copying dataset neutrino_eta
Copying dataset neutrino_px
Copying dataset neutrino_py
Copying dataset neutrino_pz
Creating group ht
Creating group PARTICLE
Copying dataset invariant_mass
Copying dataset log_invariant_mass
Creating group lt
Creating group PARTICLE
Copying dataset invariant_mass
Copying dataset log_invariant_mass
Creating group TARGETS
Creating grou

In [8]:
%%bash
# Print the structure (with dataset shapes) of the reduced training file.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this path -- confirm the subset file exists in the working directory.
python utils/examine_hdf5.py training_mass_variation_subset_v2.h5 --shape

Traceback (most recent call last):
  File "/eos/home-i00/n/nmuangko/semi_leptonnic_ttbar/utils/examine_hdf5.py", line 22, in <module>
    main(arguments.filepath, arguments.shape)
  File "/eos/home-i00/n/nmuangko/semi_leptonnic_ttbar/utils/examine_hdf5.py", line 8, in main
    with h5py.File(filepath, 'r') as file:
  File "/cvmfs/sft.cern.ch/lcg/views/LCG_105a_swan/x86_64-el9-gcc13-opt/lib/python3.9/site-packages/h5py/_hl/files.py", line 567, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
  File "/cvmfs/sft.cern.ch/lcg/views/LCG_105a_swan/x86_64-el9-gcc13-opt/lib/python3.9/site-packages/h5py/_hl/files.py", line 231, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py/h5f.pyx", line 106, in h5py.h5f.open
FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'training_mass

CalledProcessError: Command 'b'python utils/examine_hdf5.py training_mass_variation_subset_v2.h5 --shape\n'' returned non-zero exit status 1.

### Attention

### Loss

### Training

### Testing