In [6]:
import argparse

import json
import os
import pickle as pkl
import sys
import time
import warnings

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import uproot
from coffea import nanoevents, processor
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory

sys.path.append("../")

import json
import os
import pathlib
import pickle as pkl
import shutil
import warnings
from collections import defaultdict
from typing import List, Optional

import awkward as ak
import numpy as np
import pandas as pd
import pyarrow as pa
from coffea import processor
from coffea.analysis_tools import PackedSelection, Weights
from coffea.nanoevents.methods import candidate, vector

warnings.filterwarnings("ignore", message="Found duplicate branch ")
warnings.filterwarnings("ignore", category=DeprecationWarning)
np.seterr(invalid="ignore")


{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [7]:
! ls ../rootfiles

hww1.root  hww2.root


In [8]:
# load a root file into coffea-friendly NanoAOD structure
import uproot
f = uproot.open(f"../rootfiles/hww1.root")
num = f['Events'].num_entries   ### checks number of events per file 
print(f'number of events per file is {num}')

events = nanoevents.NanoEventsFactory.from_root(f, "Events").events()

number of events per file is 98400


In [10]:
def build_p4(cand):
    return ak.zip(
        {
            "pt": cand.pt,
            "eta": cand.eta,
            "phi": cand.phi,
            "mass": cand.mass,
            "charge": cand.charge,
        },
        with_name="PtEtaPhiMCandidate",
        behavior=candidate.behavior,
    )

In [49]:
### make selections
nevents = len(events)

# define muon objects
loose_muons = (
    (((events.Muon.pt > 30) & (events.Muon.pfRelIso04_all < 0.25)) |
     (events.Muon.pt > 55))
    & (np.abs(events.Muon.eta) < 2.4)
    & (events.Muon.looseId)
)
n_loose_muons = ak.sum(loose_muons, axis=1)

good_muons = (
    (events.Muon.pt > 28)
    & (np.abs(events.Muon.eta) < 2.4)
    & (np.abs(events.Muon.dz) < 0.1)
    & (np.abs(events.Muon.dxy) < 0.05)
    & (events.Muon.sip3d <= 4.0)
    & events.Muon.mediumId
)
n_good_muons = ak.sum(good_muons, axis=1)

# define electron objects
loose_electrons = (
    (((events.Electron.pt > 38) & (events.Electron.pfRelIso03_all < 0.25)) |
     (events.Electron.pt > 120))
    & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
    & (events.Electron.cutBased >= events.Electron.LOOSE)
)
n_loose_electrons = ak.sum(loose_electrons, axis=1)

good_electrons = (
    (events.Electron.pt > 38)
    & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
    & (np.abs(events.Electron.dz) < 0.1)
    & (np.abs(events.Electron.dxy) < 0.05)
    & (events.Electron.sip3d <= 4.0)
    & (events.Electron.mvaFall17V2noIso_WP90)
)
n_good_electrons = ak.sum(good_electrons, axis=1)

# leading lepton
goodleptons = ak.concatenate([events.Muon[good_muons], events.Electron[good_electrons]], axis=1)
goodleptons = goodleptons[ak.argsort(goodleptons.pt, ascending=False)]
candidatelep = ak.firsts(goodleptons)

# candidate leptons
candidatelep_p4 = build_p4(candidatelep)

# MET
met = events.MET
mt_lep_met = np.sqrt(
    2. * candidatelep_p4.pt * met.pt * (ak.ones_like(met.pt) - np.cos(candidatelep_p4.delta_phi(met)))
)

# JETS
goodjets = events.Jet[
    (events.Jet.pt > 30)
    & (abs(events.Jet.eta) < 2.5)
    & events.Jet.isTight
    & (events.Jet.puId > 0)
]
ht = ak.sum(goodjets.pt, axis=1)

# FATJETS
fatjets = events.FatJet

good_fatjets = (
    (fatjets.pt > 200)
    & (abs(fatjets.eta) < 2.5)
    & fatjets.isTight
)
n_fatjets = ak.sum(good_fatjets, axis=1)

good_fatjets = fatjets[good_fatjets]
good_fatjets = good_fatjets[ak.argsort(good_fatjets.pt, ascending=False)]

# for lep channel: first clean jets and leptons by removing overlap, then pick candidate_fj closest to the lepton
lep_in_fj_overlap_bool = good_fatjets.delta_r(candidatelep_p4) > 0.1
good_fatjets = good_fatjets[lep_in_fj_overlap_bool]
fj_idx_lep = ak.argmin(good_fatjets.delta_r(candidatelep_p4), axis=1, keepdims=True)
candidatefj = ak.firsts(good_fatjets[fj_idx_lep])

In [50]:
candidatefj.pt

<Array [None, None, None, ... None, None, None] type='98400 * ?float32[parameter...'>

In [51]:
from typing import Dict
from coffea.nanoevents.methods.base import NanoEventsArray

def get_pfcands_features(
    tagger_vars: dict,
    preselected_events: NanoEventsArray,
    fj_idx_lep,
    fatjet_label: str = "FatJetAK15",
    pfcands_label: str = "FatJetPFCands",
    normalize: bool = True,
) -> Dict[str, np.ndarray]:
    """
    Extracts the pf_candidate features specified in the ``tagger_vars`` dict from the
    ``preselected_events`` and returns them as a dict of numpy arrays
    """

    feature_dict = {}

    jet = ak.firsts(preselected_events[fatjet_label][fj_idx_lep])

    msk = preselected_events[pfcands_label].jetIdx == ak.firsts(fj_idx_lep)
    jet_ak_pfcands = preselected_events[pfcands_label][msk]
    jet_pfcands = preselected_events.PFCands[jet_ak_pfcands.pFCandsIdx]

    # sort them by pt
    pfcand_sort = ak.argsort(jet_pfcands.pt, ascending=False)
    jet_pfcands = jet_pfcands[pfcand_sort]

    # negative eta jets have -1 sign, positive eta jets have +1
    eta_sign = ak.ones_like(jet_pfcands.eta)
    eta_sign = eta_sign * (ak.values_astype(jet.eta > 0, int) * 2 - 1)
    feature_dict["pfcand_etarel"] = eta_sign * (jet_pfcands.eta - jet.eta)
    feature_dict["pfcand_phirel"] = jet.delta_phi(jet_pfcands)
    feature_dict["pfcand_abseta"] = np.abs(jet_pfcands.eta)

    feature_dict["pfcand_pt_log_nopuppi"] = np.log(jet_pfcands.pt)
    feature_dict["pfcand_e_log_nopuppi"] = np.log(jet_pfcands.energy)

    pdgIds = jet_pfcands.pdgId
    feature_dict["pfcand_isEl"] = np.abs(pdgIds) == 11
    feature_dict["pfcand_isMu"] = np.abs(pdgIds) == 13
    feature_dict["pfcand_isChargedHad"] = np.abs(pdgIds) == 211
    feature_dict["pfcand_isGamma"] = np.abs(pdgIds) == 22
    feature_dict["pfcand_isNeutralHad"] = np.abs(pdgIds) == 130

    feature_dict["pfcand_charge"] = jet_pfcands.charge
    feature_dict["pfcand_VTX_ass"] = jet_pfcands.pvAssocQuality
    feature_dict["pfcand_lostInnerHits"] = jet_pfcands.lostInnerHits
    feature_dict["pfcand_quality"] = jet_pfcands.trkQuality

    feature_dict["pfcand_normchi2"] = np.floor(jet_pfcands.trkChi2)

    if "Cdz" in jet_ak_pfcands.fields:
        feature_dict["pfcand_dz"] = jet_ak_pfcands["Cdz"][pfcand_sort]
        feature_dict["pfcand_dxy"] = jet_ak_pfcands["Cdxy"][pfcand_sort]
        feature_dict["pfcand_dzsig"] = jet_ak_pfcands["Cdzsig"][pfcand_sort]
        feature_dict["pfcand_dxysig"] = jet_ak_pfcands["Cdxysig"][pfcand_sort]
    else:
        # this is for old PFNano (<= v2.3)
        feature_dict["pfcand_dz"] = jet_pfcands.dz
        feature_dict["pfcand_dxy"] = jet_pfcands.d0
        feature_dict["pfcand_dzsig"] = jet_pfcands.dz / jet_pfcands.dzErr
        feature_dict["pfcand_dxysig"] = jet_pfcands.d0 / jet_pfcands.d0Err

    feature_dict["pfcand_px"] = jet_pfcands.px
    feature_dict["pfcand_py"] = jet_pfcands.py
    feature_dict["pfcand_pz"] = jet_pfcands.pz
    feature_dict["pfcand_energy"] = jet_pfcands.E
    # feature_dict["pfcand_energy"] = jet_pfcands.energy

    # btag vars
    for var in tagger_vars["pf_features"]["var_names"]:
        if "btag" in var:
            feature_dict[var] = jet_ak_pfcands[var[len("pfcand_") :]][pfcand_sort]

    # pfcand mask
    feature_dict["pfcand_mask"] = (
        ~(
            ma.masked_invalid(
                ak.pad_none(
                    feature_dict["pfcand_abseta"],
                    tagger_vars["pf_features"]["var_length"],
                    axis=1,
                    clip=True,
                ).to_numpy()
            ).mask
        )
    ).astype(np.float32)

    # if no padding is needed, mask will = 1.0
    if isinstance(feature_dict["pfcand_mask"], np.float32):
        feature_dict["pfcand_mask"] = np.ones(
            (
                len(feature_dict["pfcand_abseta"]),
                tagger_vars["pf_features"]["var_length"],
            )
        ).astype(np.float32)

    repl_values_dict = {
        "pfcand_normchi2": [-1, 999],
        "pfcand_dz": [-1, 0],
        "pfcand_dzsig": [1, 0],
        "pfcand_dxy": [-1, 0],
        "pfcand_dxysig": [1, 0],
    }

    # convert to numpy arrays and normalize features
    if "pf_vectors" in tagger_vars.keys():
        variables = set(tagger_vars["pf_features"]["var_names"] + tagger_vars["pf_vectors"]["var_names"])
    else:
        variables = tagger_vars["pf_features"]["var_names"]

    for var in variables:
        a = (
            ak.pad_none(
                feature_dict[var],
                tagger_vars["pf_features"]["var_length"],
                axis=1,
                clip=True,
            )
            .to_numpy()
            .filled(fill_value=0)
        ).astype(np.float32)
        a = np.nan_to_num(a)

        # replace values to match PKU's
        if var in repl_values_dict:
            vals = repl_values_dict[var]
            a[a == vals[0]] = vals[1]

        if normalize:
            if var in tagger_vars["pf_features"]["var_names"]:
                info = tagger_vars["pf_features"]["var_infos"][var]
            else:
                info = tagger_vars["pf_vectors"]["var_infos"][var]

            a = (a - info["median"]) * info["norm_factor"]
            a = np.clip(a, info.get("lower_bound", -5), info.get("upper_bound", 5))

        feature_dict[var] = a

    return feature_dict

In [44]:
! ls ../boostedhiggs/tagger_resources/

05_10_ak8_ttbarwjets.json
ak8_MD_vminclv2ParT_manual_fixwrap.json
particlenet_hww_inclv2_pre2.json
particlenet_hww_inclv2_pre2_noreg.json
README.md
triton_config_05_10_ak8_ttbarwjets.json
triton_config_ak8_MD_vminclv2ParT_manual_fixwrap.json
triton_config_particlenet_hww_inclv2_pre2_noreg.json
triton_config_particlenet_hww_inclv2_pre.json


In [45]:
tagger_resources_path = "../boostedhiggs/tagger_resources/"
model_name = "particlenet_hww_inclv2_pre2_noreg"

with open(f"{tagger_resources_path}/triton_config_{model_name}.json") as f:
    triton_config = json.load(f)
        
with open(f"{tagger_resources_path}/{triton_config['model_name']}.json") as f:
    tagger_vars = json.load(f)

In [46]:
fatjet_label = "FatJet"
pfcands_label = "FatJetPFCands"
svs_label = "FatJetSVs"
    
get_pfcands_features(tagger_vars, events, fj_idx_lep, fatjet_label, pfcands_label)

AttributeError: no field named 'energy'

(https://github.com/scikit-hep/awkward-1.0/blob/1.10.2/src/awkward/highlevel.py#L1124)

In [35]:
preselected_events = events
feature_dict = {}

jet = ak.firsts(preselected_events[fatjet_label][fj_idx_lep])

msk = preselected_events[pfcands_label].jetIdx == ak.firsts(fj_idx_lep)
jet_ak_pfcands = preselected_events[pfcands_label][msk]
jet_pfcands = preselected_events.PFCands[jet_ak_pfcands.pFCandsIdx]

# sort them by pt
pfcand_sort = ak.argsort(jet_pfcands.pt, ascending=False)
jet_pfcands = jet_pfcands[pfcand_sort]

# negative eta jets have -1 sign, positive eta jets have +1
eta_sign = ak.ones_like(jet_pfcands.eta)
eta_sign = eta_sign * (ak.values_astype(jet.eta > 0, int) * 2 - 1)
feature_dict["pfcand_etarel"] = eta_sign * (jet_pfcands.eta - jet.eta)
feature_dict["pfcand_phirel"] = jet.delta_phi(jet_pfcands)
feature_dict["pfcand_abseta"] = np.abs(jet_pfcands.eta)

feature_dict["pfcand_pt_log_nopuppi"] = np.log(jet_pfcands.pt)
feature_dict["pfcand_e_log_nopuppi"] = np.log(jet_pfcands.energy)

pdgIds = jet_pfcands.pdgId
feature_dict["pfcand_isEl"] = np.abs(pdgIds) == 11
feature_dict["pfcand_isMu"] = np.abs(pdgIds) == 13
feature_dict["pfcand_isChargedHad"] = np.abs(pdgIds) == 211
feature_dict["pfcand_isGamma"] = np.abs(pdgIds) == 22
feature_dict["pfcand_isNeutralHad"] = np.abs(pdgIds) == 130

feature_dict["pfcand_charge"] = jet_pfcands.charge
feature_dict["pfcand_VTX_ass"] = jet_pfcands.pvAssocQuality
feature_dict["pfcand_lostInnerHits"] = jet_pfcands.lostInnerHits
feature_dict["pfcand_quality"] = jet_pfcands.trkQuality

feature_dict["pfcand_normchi2"] = np.floor(jet_pfcands.trkChi2)

if "Cdz" in jet_ak_pfcands.fields:
    feature_dict["pfcand_dz"] = jet_ak_pfcands["Cdz"][pfcand_sort]
    feature_dict["pfcand_dxy"] = jet_ak_pfcands["Cdxy"][pfcand_sort]
    feature_dict["pfcand_dzsig"] = jet_ak_pfcands["Cdzsig"][pfcand_sort]
    feature_dict["pfcand_dxysig"] = jet_ak_pfcands["Cdxysig"][pfcand_sort]
else:
    # this is for old PFNano (<= v2.3)
    feature_dict["pfcand_dz"] = jet_pfcands.dz
    feature_dict["pfcand_dxy"] = jet_pfcands.d0
    feature_dict["pfcand_dzsig"] = jet_pfcands.dz / jet_pfcands.dzErr
    feature_dict["pfcand_dxysig"] = jet_pfcands.d0 / jet_pfcands.d0Err

feature_dict["pfcand_px"] = jet_pfcands.px
feature_dict["pfcand_py"] = jet_pfcands.py
feature_dict["pfcand_pz"] = jet_pfcands.pz
feature_dict["pfcand_energy"] = jet_pfcands.E

AttributeError: no field named 'energy'

(https://github.com/scikit-hep/awkward-1.0/blob/1.10.2/src/awkward/highlevel.py#L1124)

In [36]:
jet_pfcands.fields

['d0',
 'd0Err',
 'dz',
 'dzErr',
 'eta',
 'mass',
 'phi',
 'pt',
 'puppiWeight',
 'puppiWeightNoLep',
 'trkChi2',
 'vtxChi2',
 'charge',
 'lostInnerHits',
 'pdgId',
 'pvAssocQuality',
 'trkQuality']