In [6]:
import uproot
import awkward as ak
from coffea import nanoevents
from coffea.nanoevents.methods.base import NanoEventsArray
from coffea.analysis_tools import PackedSelection

import pickle
import numpy as np

from typing import Optional, List, Dict
from copy import copy

import matplotlib.pyplot as plt
import mplhep as hep
from matplotlib import colors

from tqdm import tqdm
import fastjet
import jetnet
import vector

import os


In [2]:
plot_dir = "../../../plots/ScaleFactors/Nov14/"
_ = os.system(f"mkdir -p {plot_dir}")


In [3]:
events = nanoevents.NanoEventsFactory.from_root(
    # "/eos/uscms/store/user/lpcpfnano/cmantill/v2_3/2017/HH_gen/GluGluToHHTobbVV_node_cHHH1_TuneCP5_13TeV-powheg-pythia8/GluGluToHHTobbVV_node_cHHH1/221017_221918/0000/nano_mc2017_100.root",
    "/eos/uscms/store/user/lpcpfnano/drankin/v2_2/2017/TTbar/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/TTToSemiLeptonic_ext1/211112_132937/0000/nano_mc2017_100.root",
    schemaclass=nanoevents.NanoAODSchema,
).events()




### Boosted Top Pre-selection

Based on selection in https://indico.cern.ch/event/1208247/#10-lund-plane-reweighting-for

In [97]:
def pad_val(
    arr: ak.Array,
    target: int,
    value: float,
    axis: int = 0,
    to_numpy: bool = True,
    clip: bool = True,
):
    """
    pads awkward array up to ``target`` index along axis ``axis`` with value ``value``,
    optionally converts to numpy array
    """
    ret = ak.fill_none(ak.pad_none(arr, target, axis=axis, clip=clip), value, axis=axis)
    return ret.to_numpy() if to_numpy else ret


def add_selection(
    name: str,
    sel: np.ndarray,
    selection: PackedSelection,
    cutflow: dict = None,
    isData: bool = False,
    signGenWeights: ak.Array = None,
):
    """adds selection to PackedSelection object and the cutflow dictionary"""
    selection.add(name, sel)
    if cutflow is not None:
        cutflow[name] = (
            np.sum(selection.all(*selection.names))
            if isData
            # add up sign of genWeights for MC
            else np.sum(signGenWeights[selection.all(*selection.names)])
        )


preselection_cut_vals = {"pt": 250, "msd": 20}
num_jets = 1

preselection_cut = np.prod(
    pad_val(
        (events.FatJet.pt > preselection_cut_vals["pt"])
        * (events.FatJet.msoftdrop > preselection_cut_vals["msd"]),
        num_jets,
        False,
        axis=1,
    ),
    axis=1,
).astype(bool)

presel_events = events[preselection_cut]


In [29]:
isData = False
signGenWeights = None if isData else np.sign(presel_events["genWeight"])
n_events = len(presel_events) if isData else int(np.sum(signGenWeights))
selection = PackedSelection()

cutflow = {}
cutflow["presel"] = len(presel_events)

In [8]:
d_PDGID = 1
b_PDGID = 5
g_PDGID = 21
TOP_PDGID = 6

ELE_PDGID = 11
vELE_PDGID = 12
MU_PDGID = 13
vMU_PDGID = 14
TAU_PDGID = 15
vTAU_PDGID = 16

Z_PDGID = 23
W_PDGID = 24
HIGGS_PDGID = 25

b_PDGIDS = [511, 521, 523]

GRAV_PDGID = 39

GEN_FLAGS = ["fromHardProcess", "isLastCopy"]

FILL_NONE_VALUE = -99999


skim_vars = {
    "eta": "eta",
    "phi": "phi",
    "mass": "mass",
    "pt": "pt",
}

# finding the two gen tops
tops = presel_events.GenPart[
    (abs(presel_events.GenPart.pdgId) == TOP_PDGID) * presel_events.GenPart.hasFlags(GEN_FLAGS)
]

In [59]:
deltaR = 0.8

In [67]:
tops_children = tops.distinctChildren
tops_children = tops_children[tops_children.hasFlags(GEN_FLAGS)]
ws = ak.flatten(tops_children[np.abs(tops_children.pdgId) == W_PDGID], axis=2)
had_top_sel = np.all(np.abs(ws.children.pdgId) <= 5, axis=2)
had_ws = ak.flatten(ws[had_top_sel])
had_tops = ak.flatten(tops[had_top_sel])
had_top_children = ak.flatten(tops_children[had_top_sel], axis=1)
had_bs = had_top_children[np.abs(had_top_children.pdgId) == 5]
add_selection("hadronic bs", np.any(had_bs.pdgId, axis=1), selection, cutflow, isData, signGenWeights)


In [100]:
had_w_jet_match = ak.all(had_ws.children.delta_r(presel_events.FatJet[:, 0]) < deltaR, axis=1)
had_b_jet_match = ak.flatten(pad_val(had_bs.delta_r(presel_events.FatJet[:, 0]) < deltaR, 1, False, axis=1, to_numpy=False))

In [91]:
had_bs.delta_r(presel_events.FatJet[:, 0]) < deltaR

<Array [[True], [False], ... [False], [False]] type='12352 * option[var * ?bool]'>

In [99]:
ak.flatten(pad_val(had_bs.delta_r(presel_events.FatJet[:, 0]) < deltaR, 1, False, axis=1, to_numpy=False))

<Array [True, False, False, ... False, False] type='12352 * bool'>

In [89]:
ak.flatten(pad_val(had_bs.delta_r(presel_events.FatJet[:, 0]) < deltaR, 1, False, axis=1, to_numpy=False), axis=0)

<Array [[True], [False], ... [False], [False]] type='12352 * union[1 * bool, bool]'>

In [65]:
unmatched_selection = PackedSelection()

In [66]:
had_top_jet_match = presel_events.FatJet[:, 0].delta_r(hadronic_tops) < deltaR
add_selection("jet not matched", ~had_top_jet_match, unmatched_selection)

In [63]:
np.sum(presel_events.FatJet[:, 0].delta_r(hadronic_tops) < deltaR)

7850

In [41]:
presel_events.FatJet.pt

<Array [[380], [261, 196, ... 218], [308, 189]] type='12352 * var * float32[para...'>

In [40]:
presel_events.FatJet.delta_r(had_bs)

ValueError: in ListOffsetArray64, cannot broadcast nested list

(https://github.com/scikit-hep/awkward-1.0/blob/1.10.2/src/cpu-kernels/awkward_ListArray_broadcast_tooffsets.cpp#L27)

In [46]:
np.sum(had_bs.delta_r(presel_events.FatJet[:, 0]) < 0.8)

4718

In [53]:
np.sum(ak.pad_none(presel_events.FatJet, 2, axis=1)[:, 1].delta_r(had_bs) < 1)

1578

In [47]:
had_bs.delta_r(ak.pad_none(presel_events.FatJet, 2, axis=1)[:, 1] < 1)

ValueError: no overloads for custom types: less(FatJet, int)

(https://github.com/scikit-hep/awkward-1.0/blob/1.10.2/src/awkward/_connect/_numpy.py#L245)

In [30]:
add_selection("hadronic bs", np.any(had_bs.pdgId, axis=1), selection, cutflow, isData, signGenWeights)

In [100]:
ws[np.all(np.abs(ws.children.pdgId) <= 5, axis=2)]

<GenParticleArray [[GenParticle], ... [GenParticle]] type='12352 * var * ?genPar...'>

In [None]:

# saving 4-vector info
GenTopsVars = {f"GenTops{key}": tops[var].to_numpy() for (var, key) in skim_vars.items()}

higgs_children = higgs.children

# saving whether H->bb or H->VV
GenHiggsVars["GenHiggsChildren"] = abs(higgs_children.pdgId[:, :, 0]).to_numpy()

# finding bb and VV children
is_bb = abs(higgs_children.pdgId) == b_PDGID
is_VV = (abs(higgs_children.pdgId) == W_PDGID) + (abs(higgs_children.pdgId) == Z_PDGID)

# checking that there are 2 b's and 2 V's
has_bb = ak.sum(ak.flatten(is_bb, axis=2), axis=1) == 2
has_VV = ak.sum(ak.flatten(is_VV, axis=2), axis=1) == 2

# only select presel_events with 2 b's and 2 V's
# add_selection("has_bbVV", has_bb * has_VV, selection, cutflow, False, signGenWeights)

# saving bb and VV 4-vector info
bb = ak.flatten(higgs_children[is_bb], axis=2)
VV = ak.flatten(higgs_children[is_VV], axis=2)

# have to pad to 2 because of some 4V presel_events
GenbbVars = {
    f"Genbb{key}": pad_val(bb[var], 2, FILL_NONE_VALUE, axis=1) for (var, key) in skim_vars.items()
}

# selecting only up to the 2nd index because of some 4V presel_events
# (doesn't matter which two are selected since these presel_events will be excluded anyway)
GenVVVars = {f"GenVV{key}": VV[var][:, :2].to_numpy() for (var, key) in skim_vars.items()}

# checking that each V has 2 q children
VV_children = VV.children

quarks = abs(VV_children.pdgId) <= b_PDGID
all_q = ak.all(ak.all(quarks, axis=2), axis=1)
# add_selection("all_q", all_q, selection, cutflow, False, signGenWeights)

V_has_2q = ak.count(VV_children.pdgId, axis=2) == 2
has_4q = ak.values_astype(ak.prod(V_has_2q, axis=1), bool)
# add_selection("has_4q", has_4q, selection, cutflow, False, signGenWeights)

# saving 4q 4-vector info
Gen4qVars = {
    f"Gen4q{key}": ak.to_numpy(
        ak.fill_none(
            ak.pad_none(ak.pad_none(VV_children[var], 2, axis=1, clip=True), 2, axis=2, clip=True),
            FILL_NONE_VALUE,
        )
    )
    for (var, key) in skim_vars.items()
}

gen_cut = has_bb * has_VV * all_q * has_4q
gen_vars = {**GenHiggsVars, **GenbbVars, **GenVVVars, **Gen4qVars}

sel_events = presel_events[gen_cut]
gen_vars = {key: np.squeeze(np.array(value[gen_cut])) for (key, value) in gen_vars.items()}

is_VV = abs(gen_vars["GenHiggsChildren"] == W_PDGID) + abs(gen_vars["GenHiggsChildren"] == Z_PDGID)
