In [1]:
import json
import uproot
import pickle as pkl
import os
import sys
sys.path.append("../")

import numpy as np
import pandas as pd
import awkward as ak
from coffea import processor
from coffea import nanoevents
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema, BaseSchema
from coffea.nanoevents.methods import candidate, vector
from coffea.analysis_tools import Weights, PackedSelection

import warnings
# Hide warnings whose message begins with "Found duplicate branch "
# (the `message` argument is matched against the start of the warning text).
warnings.filterwarnings("ignore", message="Found duplicate branch ")
# Hide every DeprecationWarning, whatever its origin.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Do not report invalid floating-point operations.
# np.seterr returns the previous error settings; as the last expression of the
# cell, that dict is what gets echoed as the cell output.
np.seterr(invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
def pad_val(
    arr: ak.Array,
    value: float,
    target: int = None,
    axis: int = 0,
    to_numpy: bool = False,
    clip: bool = True,
):
    """
    Replace every missing (None) entry of ``arr`` with ``value``, optionally
    padding first.

    Parameters
    ----------
    arr : ak.Array
        Input array; its nested structure is preserved.
    value : float
        Fill value used for every None entry (both pre-existing ones and the
        ones introduced by padding).
    target : int, optional
        If given, ``arr`` is first padded with None up to length ``target``
        along ``axis`` (``ak.pad_none``). ``None`` (default) skips padding.
    axis : int
        Axis along which to pad. Only used when ``target`` is given.
    to_numpy : bool
        If True, return ``ret.to_numpy()`` instead of the awkward array.
    clip : bool
        Forwarded to ``ak.pad_none``. Only used when ``target`` is given.

    Returns
    -------
    The filled awkward array, or its numpy conversion when ``to_numpy`` is True.
    """
    # Compare against None explicitly: a plain truthiness test would silently
    # ignore an explicit ``target=0`` and return the array unpadded/unclipped.
    if target is not None:
        arr = ak.pad_none(arr, target, axis=axis, clip=clip)
    ret = ak.fill_none(arr, value, axis=None)
    return ret.to_numpy() if to_numpy else ret

def build_p4(cand):
    """
    Zip the kinematic fields of ``cand`` into a "PtEtaPhiMCandidate" record.

    Reads ``pt``, ``eta``, ``phi``, ``mass`` and ``charge`` from ``cand`` and
    returns an awkward array named "PtEtaPhiMCandidate" that carries
    ``candidate.behavior``.
    """
    field_names = ("pt", "eta", "phi", "mass", "charge")
    components = {name: getattr(cand, name) for name in field_names}
    return ak.zip(
        components,
        with_name="PtEtaPhiMCandidate",
        behavior=candidate.behavior,
    )

In [3]:
# Particle ID codes compared against |GenPart.pdgId| throughout the notebook.

# quarks, gluon and top
d_PDGID = 1
c_PDGID = 4
b_PDGID = 5
g_PDGID = 21
TOP_PDGID = 6

# charged leptons and their neutrinos (the "v" prefix marks the neutrino)
ELE_PDGID = 11
vELE_PDGID = 12
MU_PDGID = 13
vMU_PDGID = 14
TAU_PDGID = 15
vTAU_PDGID = 16

# bosons
Z_PDGID = 23
W_PDGID = 24
HIGGS_PDGID = 25

def getParticles(genparticles, lowid=22, highid=25, flags=None):
    """
    Select generator particles by |pdgId| range and status flags.

    Parameters
    ----------
    genparticles
        Object exposing ``pdgId``, ``hasFlags(...)`` and boolean-mask indexing
        (e.g. ``events.GenPart``).
    lowid, highid : int
        Inclusive bounds on ``abs(pdgId)``.
    flags : list of str, str, or other iterable of str, optional
        Status flags that must all be set. Defaults to
        ``["fromHardProcess", "isLastCopy"]`` when omitted or None.

    Returns
    -------
    ``genparticles`` masked to the entries that satisfy
    ``lowid <= abs(pdgId) <= highid`` and ``hasFlags(flags)``.
    """
    # Build the default inside the function so no mutable object is shared
    # between calls, and always hand ``hasFlags`` a plain list, which is the
    # form the original calls in this notebook used.
    if flags is None:
        flags = ["fromHardProcess", "isLastCopy"]
    elif isinstance(flags, str):
        flags = [flags]
    else:
        flags = list(flags)

    absid = abs(genparticles.pdgId)
    in_id_range = (absid >= lowid) & (absid <= highid)
    return genparticles[in_id_range & genparticles.hasFlags(flags)]

## Load a signal file

In [4]:
# Load a ROOT file into the coffea-friendly NanoAOD structure.
# uproot and NanoEventsFactory are already imported in the first cell, so they
# are not re-imported here.
f = uproot.open("./sig.root")
num = f["Events"].num_entries  # number of entries in the "Events" tree of this file

events = NanoEventsFactory.from_root(f, "Events").events()
nevents = len(events)

print(f'number of events per file is {nevents}')

# generator-level particle collection used by the matching cells below
genparticles = events.GenPart

number of events per file is 98400


In [5]:
### define helper functions that select candidate lepton and candidate jet

def get_candidatelep(events):
    """
    Return, per event, the highest-pt lepton that passes the selection below.

    Muons:     pt > 30, |eta| < 2.4, |dz| < 0.1, |dxy| < 0.05, sip3d <= 4.0,
               mediumId.
    Electrons: pt > 38, |eta| < 2.4 excluding 1.44 <= |eta| <= 1.57,
               |dz| < 0.1, |dxy| < 0.05, sip3d <= 4.0, mvaFall17V2noIso_WP90.

    Selected muons and electrons are concatenated per event, sorted by
    descending pt, and the first one is returned via ``ak.firsts`` (events with
    no selected lepton therefore have no candidate).
    """
    muons = events.Muon
    electrons = events.Electron

    # muon selection mask
    muon_mask = (
        (muons.pt > 30)
        & (np.abs(muons.eta) < 2.4)
        & (np.abs(muons.dz) < 0.1)
        & (np.abs(muons.dxy) < 0.05)
        & (muons.sip3d <= 4.0)
        & muons.mediumId
    )

    # electron selection mask
    ele_abs_eta = np.abs(electrons.eta)
    electron_mask = (
        (electrons.pt > 38)
        & (ele_abs_eta < 2.4)
        & ((ele_abs_eta < 1.44) | (ele_abs_eta > 1.57))
        & (np.abs(electrons.dz) < 0.1)
        & (np.abs(electrons.dxy) < 0.05)
        & (electrons.sip3d <= 4.0)
        & (electrons.mvaFall17V2noIso_WP90)
    )

    # merge both flavours per event, order by pt, keep the leading one
    leptons = ak.concatenate([muons[muon_mask], electrons[electron_mask]], axis=1)
    pt_order = ak.argsort(leptons.pt, ascending=False)
    return ak.firsts(leptons[pt_order])

def get_candidatefj(events, candidatelep):
    """
    Return, per event, the selected fat jet closest in delta-R to the lepton.

    Fat jets must satisfy pt > 200, |eta| < 2.5 and isTight. Jets with
    delta_r(lepton) <= 0.1 are removed; among the remaining ones the jet with
    the smallest delta_r to ``candidatelep`` is returned via ``ak.firsts``.
    """
    lep_p4 = build_p4(candidatelep)  # four-vector record for the candidate lepton

    # fat-jet quality selection, ordered by descending pt
    all_fatjets = events.FatJet
    passes_selection = (
        (all_fatjets.pt > 200) & (abs(all_fatjets.eta) < 2.5) & all_fatjets.isTight
    )
    selected = all_fatjets[passes_selection]
    selected = selected[ak.argsort(selected.pt, ascending=False)]

    # leptonic channel: keep only jets that are more than 0.1 away from the lepton ...
    away_from_lep = selected.delta_r(lep_p4) > 0.1
    selected = selected[away_from_lep]

    # ... then take the jet nearest to the lepton
    nearest_idx = ak.argmin(selected.delta_r(lep_p4), axis=1, keepdims=True)
    return ak.firsts(selected[nearest_idx])

## Construct matching function

In [6]:
# select candidate fj
# The candidate lepton is computed first because get_candidatefj picks the
# fat jet relative to that lepton.
candidatelep = get_candidatelep(events)
candidatefj = get_candidatefj(events, candidatelep)

In [7]:
# Gen-level Higgs bosons (fromHardProcess + isLastCopy, the getParticles defaults)
higgs = getParticles(genparticles, lowid=HIGGS_PDGID, highid=HIGGS_PDGID)

# keep only Higgs bosons for which *every* child has |pdgId| == W, i.e. H->WW
is_hWW = ak.all(abs(higgs.children.pdgId) == W_PDGID, axis=2)
higgs = higgs[is_hWW]

# Higgs closest to the candidate fat jet (threshold=0.8 passed to `nearest`)
matched_higgs = candidatefj.nearest(higgs, axis=1, threshold=0.8)
matched_higgs_children = matched_higgs.children
children_mass = matched_higgs_children.mass

# WW daughters: distinct children of the matched Higgs children, flattened
# down to one list per event
GEN_FLAGS = ["fromHardProcess", "isLastCopy"]
daughters = ak.flatten(
    ak.flatten(matched_higgs_children.distinctChildren, axis=2), axis=2
)

# keep only daughters that carry the hard-process / last-copy flags
daughters = daughters[daughters.hasFlags(GEN_FLAGS)]
daughters_pdgId = abs(daughters.pdgId)

# per-event multiplicities used to encode the decay mode
n_quarks = ak.sum(daughters_pdgId <= b_PDGID, axis=1)
n_electrons = ak.sum(daughters_pdgId == ELE_PDGID, axis=1)
n_muons = ak.sum(daughters_pdgId == MU_PDGID, axis=1)
n_taus = ak.sum(daughters_pdgId == TAU_PDGID, axis=1)

# Decay code = sum of the weights of all conditions that hold:
#   2 quarks -> 1, 1 electron -> 3, 1 muon -> 5, 1 tau -> 7,
#   4 quarks -> 11, 3 quarks -> 13
# e.g. 2 quarks + 1 electron = 4, 2 quarks + 1 muon = 6, 2 quarks + 1 tau = 8.
# NOTE: the sum is not unique for every combination (1 electron + 1 muon is
# also 3 + 5 = 8).
decay = (
    (n_quarks == 2) * 1
    + (n_electrons == 1) * 3
    + (n_muons == 1) * 5
    + (n_taus == 1) * 7
    + (n_quarks == 4) * 11
    + (n_quarks == 3) * 13
)

In [8]:
### get number of c-quarks
# c (anti)quarks flagged fromHardProcess + isLastCopy ...
prompt_c = getParticles(
    genparticles,
    lowid=c_PDGID,
    highid=c_PDGID,
    flags=["fromHardProcess", "isLastCopy"],
)
# ... whose distinct parent is a W boson
c_has_W_parent = abs(prompt_c.distinctParent.pdgId) == W_PDGID
prompt_c = prompt_c[c_has_W_parent]

# per-event count (entries with pt > 0).
# NOTE: this is taken from all gen particles of the event; it is not
# restricted to the Higgs matched to the candidate fat jet.
n_cquarks = ak.sum(prompt_c.pt > 0, axis=1)
print(f"number of c-quarks {n_cquarks}")

number of c-quarks [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ... 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1]


In [9]:
def to_label(array: ak.Array) -> ak.Array:
    """Cast the values of ``array`` to 32-bit integers (e.g. True/False -> 1/0)."""
    label_dtype = np.int32
    return ak.values_astype(array, label_dtype)

# decay == 6 is the code for 2 quarks (weight 1) + 1 muon (weight 5).
# None entries are dropped before printing.
# NOTE: the printed captions say "1 quark" / "0 quark"; the number there refers
# to the c-quark count, not to the total number of quarks.
a = to_label(decay == 6)  # 1 muon && 2 quarks
print(f"events with one muon and 1 quark \n {a[~ak.is_none(a)]}")

a = to_label((decay == 6) & (n_cquarks == 0)) # 1 muon && 2 quarks && 0 c quarks
print(f"events with one muon and 0 quark (c-quark) \n {a[~ak.is_none(a)]}")

a = to_label((decay == 6) & (n_cquarks == 1)) # 1 muon && 2 quarks && 1 c quark
print(f"events with one muon and 1 quark (c-quark) \n {a[~ak.is_none(a)]}")

events with one muon and 1 quark 
 [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ... 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
events with one muon and 0 quark (c-quark) 
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
events with one muon and 1 quark (c-quark) 
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
### with this in mind we can build the array of classes

# decay-mode masks (codes come from the `decay` encoding:
# 4 quarks = 11, 3 quarks = 13, 2 quarks + electron = 4, 2 quarks + muon = 6)
is_qqqq = decay == 11
is_qqq = decay == 13
is_qq_ev = decay == 4
is_qq_mv = decay == 6

# c-quark multiplicity masks
has_0c = n_cquarks == 0
has_1c = n_cquarks == 1
has_2c = n_cquarks == 2

# one int32 label per (decay mode, number of c quarks) class
genLabelVars = {
    "label_H_WqqWqq_0c": to_label(is_qqqq & has_0c),
    "label_H_WqqWqq_1c": to_label(is_qqqq & has_1c),
    "label_H_WqqWqq_2c": to_label(is_qqqq & has_2c),
    "label_H_WqqWq_0c": to_label(is_qqq & has_0c),
    "label_H_WqqWq_1c": to_label(is_qqq & has_1c),
    "label_H_WqqWq_2c": to_label(is_qqq & has_2c),
    "label_H_WqqWev_0c": to_label(is_qq_ev & has_0c),
    "label_H_WqqWev_1c": to_label(is_qq_ev & has_1c),
    "label_H_WqqWmv_0c": to_label(is_qq_mv & has_0c),
    "label_H_WqqWmv_1c": to_label(is_qq_mv & has_1c),

    # Tau labels are disabled. NOTE(review): the drafts below test decay == 11,
    # which is the 4-quark code; 2 quarks + 1 tau encodes as 1 + 7 = 8.
    # "label_H_WqqWtauhv_0c": to_label(decay == 11),  # force c=0
    # "label_H_WqqWtauhv_1c": to_label(decay == 11),  # force c=1
}