In [1]:
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
import numpy as np

In [490]:
def training_filter_multiple_jets(jets, electrons, muons, genparts):
    '''
    Filters events down to the training set and calculates jet-level labels.

    Every jet is matched to the nearest generator-level quark whose parent is a
    top quark or a W boson, and labelled with the absolute pdgId of that parent.
    Only events containing exactly one -6 jet, exactly one 6 jet and exactly two
    24 jets are kept; any number of additional unmatched (100) jets is allowed.

    Args:
        jets: selected jets after region filter (this function applies no
            per-event cut on the number of jets)
        electrons: selected electrons after region filter
        muons: selected muons after region filter
        genparts: selected genpart after region filter

    Returns:
        jets: selected jets after training filter
        electrons: selected electrons after training filter
        muons: selected muons after training filter
        labels: labels of jets within an event
            (24=W, 6=top_hadron, -6=top_lepton, 100=no matched candidate)
    '''
    #### filter genPart to valid matching candidates ####

    # get rid of particles without parents
    genparts = genparts[np.invert(ak.is_none(genparts.distinctParent, axis=1))]

    # ensure that parents are top quark (|pdgId| == 6) or W (|pdgId| == 24)
    parent_abs_pdgid = np.abs(genparts.distinctParent.pdgId)
    genparts = genparts[(parent_abs_pdgid == 6) | (parent_abs_pdgid == 24)]

    # ensure particle itself is a quark (0 < |pdgId| < 7)
    abs_pdgid = np.abs(genparts.pdgId)
    genparts = genparts[(abs_pdgid < 7) & (abs_pdgid > 0)]

    # get rid of duplicates
    genparts = genparts[genparts.hasFlags("isLastCopy")]

    #### get jet-level labels and filter events to training set ####

    # match jets to nearest valid genPart candidate
    nearest_genpart = jets.nearest(genparts, threshold=0.4)
    nearest_parent = nearest_genpart.distinctParent  # parent of matched particle
    parent_pdgid = nearest_parent.pdgId  # pdgId of parent particle
    # pdgId of the grandchildren of the matched particle's parent
    grandchild_pdgid = nearest_parent.distinctChildren.distinctChildren.pdgId

    # flatten innermost axis for convenience
    grandchildren_flat = np.abs(ak.flatten(grandchild_pdgid, axis=-1))

    # |pdgId| 11..18 are leptons; in the PDG numbering scheme the odd codes
    # (11, 13, 15) are charged leptons and the even codes (12, 14, 16) neutrinos
    is_lepton_code = (grandchildren_flat > 10) & (grandchildren_flat < 19)
    has_neutrino_cousin = ak.sum(is_lepton_code & (grandchildren_flat % 2 == 0), axis=-1) > 0
    has_charged_lepton_cousin = ak.sum(is_lepton_code & (grandchildren_flat % 2 == 1), axis=-1) > 0

    # jets whose matched particle has both a charged-lepton and a neutrino cousin;
    # unmatched jets (None) count as False
    has_both_cousins = ak.fill_none(has_charged_lepton_cousin & has_neutrino_cousin, False)

    # get labels from parent pdgId (unmatched jets are filled with 100)
    labels = np.abs(ak.fill_none(parent_pdgid, 100))

    # relabel jets with both cousins as -6. ak.where builds a new jagged array, so this
    # neither depends on to_numpy() returning a writable view nor mutates an input buffer
    labels = ak.where(has_both_cousins, -6, labels)

    # event validation: exactly one -6, exactly one 6 and exactly two 24 per event
    mask = (
        (ak.sum(labels == -6, axis=1) == 1)
        & (ak.sum(labels == 6, axis=1) == 1)
        & (ak.sum(labels == 24, axis=1) == 2)
    )

    # filter events
    jets = jets[mask]
    electrons = electrons[mask]
    muons = muons[mask]
    labels = labels[mask]

    return jets, electrons, muons, labels

In [3]:
# Input: a single remote NanoAOD file, read through the 'Events' tree with the NanoAOD schema
treename = 'Events'
file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root'
events = NanoEventsFactory.from_root(
    file_path,
    treepath=treename,
    schemaclass=NanoAODSchema,
).events()



In [4]:
# Event filtering
# (code from jetassignment_training)

# b-tag discriminator cut shared by the event filter and the region filter below
B_TAG_THRESHOLD = 0.5

# ---- object-level selection ----
selected_electrons = events.Electron[
    (events.Electron.pt > 30)
    & (np.abs(events.Electron.eta) < 2.1)
    & (events.Electron.cutBased == 4)
    & (events.Electron.sip3d < 4)
]
selected_muons = events.Muon[
    (events.Muon.pt > 30)
    & (np.abs(events.Muon.eta) < 2.1)
    & (events.Muon.tightId)
    & (events.Muon.sip3d < 4)
    & (events.Muon.pfRelIso04_all < 0.15)
]
jet_filter = (events.Jet.pt > 30) & (np.abs(events.Jet.eta) < 2.4) & (events.Jet.isTightLeptonVeto)
selected_jets = events.Jet[jet_filter]
selected_genpart = events.GenPart
even = (events.event % 2 == 0)

# ---- event-level selection ----
event_filters = (
    # single lepton requirement: exactly one selected electron or muon
    ((ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)) == 1)
    # require at least 4 jets
    & (ak.count(selected_jets.pt, axis=1) >= 4)
    # require at least one jet at or above B_TAG_THRESHOLD
    # NOTE(review): this cut uses >= while the region filter below uses > -- confirm which is intended
    & (ak.sum(selected_jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) >= 1)
)

# apply event filters to every per-event collection
selected_electrons, selected_muons, selected_jets, selected_genpart, even = (
    collection[event_filters]
    for collection in (selected_electrons, selected_muons, selected_jets, selected_genpart, even)
)

### only consider 4j2b (signal) region: at least two b-tagged jets
region_filter = ak.sum(selected_jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2

# selected_jets_region = selected_jets[region_filter][:,:10] # only keep top 10 jets
selected_jets_region = selected_jets[region_filter]  # all jets
selected_electrons_region = selected_electrons[region_filter]
selected_muons_region = selected_muons[region_filter]
selected_genpart_region = selected_genpart[region_filter]
even = even[region_filter]

In [492]:
# Keep every jet of every signal-region event: the trailing [:,:] slice takes all events
# and all jets, so no per-event truncation is applied here.
selected_jets_region = selected_jets[region_filter][:,:] # all jets

In [493]:
# Run the training filter on the signal-region collections
jets, electrons, muons, labels = training_filter_multiple_jets(
    selected_jets_region,
    selected_electrons_region,
    selected_muons_region,
    selected_genpart_region,
)

### Breakdown: the steps of `training_filter_multiple_jets`, run one cell at a time

In [424]:
# Step 1 -- drop generator particles that have no distinct parent
genpart_parent = selected_genpart_region.distinctParent
genpart_filter = ~ak.is_none(genpart_parent, axis=1)
genparts = selected_genpart_region[genpart_filter]
genpart_parent = genparts.distinctParent

# Step 2 -- keep particles whose parent is a top quark (|pdgId| == 6) or a W (|pdgId| == 24)
genpart_filter2 = (
    (np.abs(genpart_parent.pdgId) == 6)
    | (np.abs(genpart_parent.pdgId) == 24)
)
genparts = genparts[genpart_filter2]

# Step 3 -- keep only quarks, i.e. 0 < |pdgId| < 7
genpart_filter3 = (
    (np.abs(genparts.pdgId) < 7)
    & (np.abs(genparts.pdgId) > 0)
)
genparts = genparts[genpart_filter3]

# Step 4 -- remove duplicates by keeping only entries flagged "isLastCopy"
genpart_filter4 = genparts.hasFlags("isLastCopy")
genparts = genparts[genpart_filter4]

In [425]:
# Match each jet to its nearest valid genPart candidate (the `genparts` left after the filters above).
# NOTE(review): threshold=0.4 is presumably a maximum delta-R, with jets that have no candidate
# inside it coming back as None -- confirm against coffea's `nearest` documentation.
nearest_genpart = selected_jets_region.nearest(genparts, threshold=0.4)
nearest_parent = nearest_genpart.distinctParent # parent of matched particle
parent_pdgid = nearest_parent.pdgId # pdgId of parent particle
grandchild_pdgid = nearest_parent.distinctChildren.distinctChildren.pdgId # pdgId of the grandchildren of the matched particle's parent

# absolute pdgIds with the innermost axis flattened away, for convenience
grandchildren_flat = np.abs(ak.flatten(grandchild_pdgid,axis=-1)) # flatten innermost axis for convenience

In [426]:
# True where any of the flattened grandchildren has an EVEN |pdgId| strictly between 10 and 19.
# NOTE: in the PDG numbering scheme the even lepton codes (12, 14, 16) are NEUTRINOS, so despite
# its name this mask flags a neutrino cousin. Only the AND of the two masks is used afterwards,
# so the swapped names do not change the result.
has_lepton_cousin = (ak.sum(((grandchildren_flat%2==0) & (grandchildren_flat>10) & (grandchildren_flat<19)),
                            axis=-1)>0)

# True where any of the flattened grandchildren has an ODD |pdgId| strictly between 10 and 19.
# NOTE: the odd lepton codes (11, 13, 15) are CHARGED LEPTONS, so despite its name this mask
# flags a charged-lepton cousin.
has_neutrino_cousin = (ak.sum(((grandchildren_flat%2==1) & (grandchildren_flat>10) & (grandchildren_flat<19)),
                                axis=-1)>0)

In [427]:
# True for jets whose matched particle has both a lepton cousin and a neutrino cousin.
# Missing entries (None) are replaced by False.
has_both_cousins = ak.fill_none((has_lepton_cousin & has_neutrino_cousin), False)# left as an awkward array; the .to_numpy() conversion is disabled

In [472]:
# Jet labels = absolute pdgId of the matched particle's parent.
# Missing entries (None) are filled with 100 so that events containing such jets can be filtered out later.
labels = np.abs(ak.fill_none(parent_pdgid,100))# left as an awkward array; the .to_numpy() conversion is disabled

In [474]:
# Relabel selected jets while keeping the jagged (per-event) awkward layout.
# ak.where is used instead of in-place NumPy assignment, which awkward arrays do not support.
def top_lep_label(labels, has_both_cousins, label_num):
    """Return the labels with ``label_num`` substituted wherever ``has_both_cousins`` is True.

    Args:
        labels: jagged awkward array of per-jet labels (events x jets).
        has_both_cousins: boolean awkward array with the same jagged structure as
            ``labels``; it must not contain None (the caller fills None with False).
        label_num: value written at every position where ``has_both_cousins`` is True.

    Returns:
        A new awkward array with the same structure as ``labels``. The input array
        is left untouched, so the result does not depend on ``to_numpy()`` returning
        a writable view of the underlying buffer.
    """
    return ak.where(has_both_cousins, label_num, labels)

In [477]:
# Relabel jets whose matched particle has both lepton-type cousins as -6
# (documented as "top_lepton" in training_filter_multiple_jets).
labels = top_lep_label(labels, has_both_cousins, label_num=-6)

In [463]:
# a valid event has exactly one jet labelled -6, exactly one labelled 6 and exactly two labelled 24; all remaining jets are labelled 100 (unmatched)

# creating mask
# def event_validation(labels):
#     mask = ak.Array([])
#     for idx in range(len(labels)):
#         event_valid_bool = (ak.sum(labels[idx]==-6)==1) & (ak.sum(labels[idx]==6)==1) & (ak.sum(labels[idx]==24)==2)
#         mask = ak.concatenate([mask, ak.Array([event_valid_bool])], axis=0)
#     return labels[mask]

In [467]:
# labels = event_validation(labels)

In [None]:
# max(ak.num(labels, axis=1)) # 12