# Debugging inference 

Using latest version of coffea

In [1]:
from collections import defaultdict
import pickle as pkl
import pyarrow as pa
import awkward as ak
import numpy as np
import pandas as pd
import json
import os
import shutil
import pathlib
from typing import List, Optional, Dict
import pyarrow.parquet as pq

import importlib.resources

from coffea import processor
from coffea.nanoevents.methods import candidate, vector
from coffea.analysis_tools import Weights, PackedSelection

from coffea.nanoevents.methods.base import NanoEventsArray
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema, PFNanoAODSchema
from coffea.nanoevents.methods import candidate, vector
import matplotlib.pyplot as plt
import numpy as np
import json

import warnings
# Silence the noisy-but-harmless messages that show up while reading the file.
warnings.filterwarnings(action="ignore", message="Found duplicate branch ")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
warnings.filterwarnings(action="ignore", message="Missing cross-reference index ")

# Do not report invalid floating-point operations; as the last expression of
# the cell, the previous numpy error settings are displayed.
np.seterr(invalid="ignore")

  numba.core.entrypoints.init_all()


{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

## NanoAODschema

We want to use `PFNanoAODSchema` because it loads PFCands as candidate particles, i.e. they have 4-vector properties.

https://coffeateam.github.io/coffea/api/coffea.nanoevents.PFNanoAODSchema.html
https://github.com/CoffeaTeam/coffea/blob/7dd4f863837a6319579f078c9e445c61d9106943/coffea/nanoevents/schemas/nanoaod.py#L282

Additionally, PFNanoAODSchema loads SecondaryVertices as SecondaryVertex:
https://github.com/CoffeaTeam/coffea/blob/7dd4f863837a6319579f078c9e445c61d9106943/coffea/nanoevents/schemas/nanoaod.py#L68
https://github.com/CoffeaTeam/coffea/blob/f2a99631dcf95b46bd0225b242b3ba512a30a89a/coffea/nanoevents/methods/nanoaod.py#L388

We do not necessarily want this, since it means we don't have a candidate 4-vector for the SecondaryVertex and can't do operations like `delta_phi`. 
We need to override this so that `SV` uses the `PFCand` mixin instead.

In [2]:
# show which mixins the schema assigns before the override
for collection in ("PFCands", "SV"):
    print(PFNanoAODSchema.mixins[collection])

# interpret SV with PFCand behavior
PFNanoAODSchema.mixins["SV"] = "PFCand"

# confirm the override took effect
print(PFNanoAODSchema.mixins["SV"])

PFCand
SecondaryVertex
PFCand


## Opening file w PFNanoAODSchema

Here we manually open the file (only 200 entries), and we use `PFNanoAODSchema`.
We also open the JSON file that specifies how many PF candidates/SVs are going to be used in the tagger, as well as any normalization that should be applied.

In [3]:
# Input file; only the first 200 entries are read.
file = 'dy_sample.root'
events = NanoEventsFactory.from_root(
    file,
    schemaclass=PFNanoAODSchema,
    entry_stop=200,
).events()

# Tagger configuration (number of points per collection and normalization info).
with open("03_31_ak8.json") as f:
    tagger_vars = json.load(f)

We define the selection such that it mimics the boostedHiggs selection. We use the fatjet closest to the lepton and obtain its index.

In [4]:
def build_p4(cand):
    """Repackage ``cand`` as a ``PtEtaPhiMCandidate`` array.

    Only the ``pt``, ``eta``, ``phi``, ``mass`` and ``charge`` attributes of
    ``cand`` are read; the zipped result carries ``candidate.behavior``.
    """
    field_names = ("pt", "eta", "phi", "mass", "charge")
    components = {name: getattr(cand, name) for name in field_names}
    return ak.zip(
        components,
        with_name="PtEtaPhiMCandidate",
        behavior=candidate.behavior,
    )

# --- lepton selection -------------------------------------------------------
good_muons = (
    (events.Muon.pt > 30)
    & (np.abs(events.Muon.eta) < 2.4)
    & (np.abs(events.Muon.dz) < 0.1)
    & (np.abs(events.Muon.dxy) < 0.05)
    & (events.Muon.sip3d <= 4.0)
    & events.Muon.mediumId
)
good_electrons = (
    (events.Electron.pt > 38)
    & (np.abs(events.Electron.eta) < 2.4)
    # exclude 1.44 <= |eta| <= 1.57
    & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
    & (np.abs(events.Electron.dz) < 0.1)
    & (np.abs(events.Electron.dxy) < 0.05)
    & (events.Electron.sip3d <= 4.0)
    & (events.Electron.mvaFall17V2noIso_WP90)
)

# get candidate lepton
goodleptons = ak.concatenate([events.Muon[good_muons], events.Electron[good_electrons]], axis=1)    # concat muons and electrons
goodleptons = goodleptons[ak.argsort(goodleptons.pt, ascending=False)]      # sort by pt
candidatelep = ak.firsts(goodleptons)   # pick highest pt
candidatelep_p4 = build_p4(candidatelep)

# --- fatjet selection -------------------------------------------------------
fatjets = events.FatJet
good_fatjets = (
    (fatjets.pt > 200)
    & (abs(fatjets.eta) < 2.5)
    & fatjets.isTight
)
n_fatjets = ak.sum(good_fatjets, axis=1)

# Remember where every surviving jet sits in the *unfiltered* events.FatJet
# collection. The index handed to the feature functions is used to index
# events.FatJet and is compared against jetIdx, so it must refer to the
# original collection, not to the filtered/sorted one.
good_fatjets_idx = ak.local_index(fatjets.pt, axis=1)[good_fatjets]

good_fatjets = fatjets[good_fatjets]        # select good fatjets
pt_order = ak.argsort(good_fatjets.pt, ascending=False)
good_fatjets = good_fatjets[pt_order]    # sort by pt
good_fatjets_idx = good_fatjets_idx[pt_order]   # keep the indices aligned with the jets

# NOTE(review): this only checks that the leading good fatjet and the candidate
# lepton both exist; the 0.1 threshold does not influence the result because
# only None-ness is tested. Confirm that this is the intended overlap removal.
lep_in_fj_overlap_bool = ~ak.is_none(ak.firsts(good_fatjets.delta_r(candidatelep_p4) > 0.1))
good_fatjets = ak.mask(good_fatjets, lep_in_fj_overlap_bool)
good_fatjets_idx = ak.mask(good_fatjets_idx, lep_in_fj_overlap_bool)

# position (within good_fatjets) of the fatjet closest to the lepton
closest_good_fj = ak.argmin(good_fatjets.delta_r(candidatelep_p4), axis=1, keepdims=True)
candidatefj_lep = ak.firsts(good_fatjets[closest_good_fj])

# index of that same fatjet inside events.FatJet (kept as singletons, like argmin with keepdims)
fj_idx_lep = good_fatjets_idx[closest_good_fj]

In [5]:
# names of the collections used for the jet and its constituents
fatjet_label = "FatJet"
pfcands_label = "FatJetPFCands"
svs_label = "FatJetSVs"

# get the jet given the index (take firsts to avoid singletons)
jet = ak.firsts(events[fatjet_label][fj_idx_lep])

# print indices of events where we actually have a jet
print('Indices where jet pt > 0')
jet_pt_filled = ak.fill_none(jet.pt, -1)     # events without a jet get pt = -1
has_jet = (jet_pt_filled > 0).to_numpy()
print(np.where(has_jet))

Indices where jet pt > 0
(array([ 17,  48,  66,  74,  80,  81,  82,  90,  92, 104, 105, 116, 123,
       159, 160, 164]),)


### Debug PFCand features and mask

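The mask is an array with one sub-array per jet, whose length is the number of points used by the network (e.g. 100); an entry is 1 where a real candidate is present and 0 where the sub-array was padded.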

For example, if for a given jet we only have 2 PF Candidates and we have 100 PFcandidate points in our network, we would have:
```
mask = [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
 ```

In [6]:
# PF candidates that belong to the selected jet
jet = ak.firsts(events[fatjet_label][fj_idx_lep])
msk = events[pfcands_label].jetIdx == ak.firsts(fj_idx_lep)
jet_ak_pfcands = events[pfcands_label][msk]
jet_pfcands = events.PFCands[jet_ak_pfcands.pFCandsIdx]

# get any feature of pfcands
pfcand_abseta = np.abs(jet_pfcands.eta)

# index for jet w pfcands
test_idx = 48

print(f'See how pfcand variable looks for jet {test_idx}')
print(pfcand_abseta[test_idx])

# pad/clip every sub-array to the configured length; the result is a numpy
# masked array (its raw mask can be inspected via `.mask`)
print('Pad until sub-arrays have the same length')
n_pf_points = tagger_vars["pf_points"]["var_length"]
pfcand_abseta_padded = ak.pad_none(pfcand_abseta, n_pf_points, axis=1, clip=True).to_numpy()
print(pfcand_abseta_padded)

# the numpy mask is True on padded entries, so invert it to flag real candidates
print('Invert the mask')
pfcand_is_real = ~pfcand_abseta_padded.mask
print(pfcand_is_real)

print('Convert mask to ones')
pfcand_mask = pfcand_is_real.astype(np.float32)
print(pfcand_mask)

print(f'See how mask looks for jet {test_idx}')
print(pfcand_mask[test_idx])

See how pfcand variable looks for jet 48
[1.26, 1.46, 1.45, 1.41, 1.34, 1.28, 1.44, 1.06]
Pad until sub-arrays have the same length
[[-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]
 ...
 [-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]]
Invert the mask
[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]
Convert mask to ones
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
See how mask looks for jet 48
[1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.

In [7]:
### Debug SV features and mask

In [8]:
# selected jet and the SV links that point at it
jet = ak.firsts(events[fatjet_label][fj_idx_lep])
msk = events[svs_label].jetIdx == ak.firsts(fj_idx_lep)

# find index where msk is true and svIdx is not -1
# (multiplying two boolean arrays acts as an element-wise AND)
sv_msk = (events[svs_label].sVIdx != -1) * msk
n_sv_per_event = ak.sum(sv_msk, axis=1)
count_sv = ak.fill_none(n_sv_per_event, -1, axis=0).to_numpy()   # -1 where the count is missing
print(count_sv)
print('Index of events where we actually have SVs')
print(np.where(count_sv > 0))

# get jet svs
jet_svs = events.SV[events[svs_label].sVIdx[sv_msk]]

# index for jet w svs
test_idx = 66

# get sv feature
eta_sign = ak.values_astype(jet_svs.eta > 0, int) * 2 - 1
sv_etarel = eta_sign * (jet_svs.eta - jet.eta)
print(f'See how sv variable looks for jet {test_idx}')
print(sv_etarel[test_idx])

# get mask: pad/clip to the configured length, then flag the non-padded entries
n_sv_points = tagger_vars["sv_points"]["var_length"]
sv_etarel_padded = ak.pad_none(sv_etarel, n_sv_points, axis=1, clip=True).to_numpy()
sv_mask = (~sv_etarel_padded.mask).astype(np.float32)
print('Convert mask to ones and zeros')
print(sv_mask)
print(f'See how mask looks for jet {test_idx}')
print(sv_mask[test_idx])
print(f'Delta phi for jet {test_idx}')
print(jet_svs.delta_phi(jet)[test_idx])
print('Fields for jet svs')
print(jet_svs.fields)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1
 -1 -1  0 -1 -1 -1 -1 -1  1  0  0 -1 -1 -1 -1 -1 -1 -1  0 -1  0 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1  2  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1
 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  2  0 -1 -1 -1  1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1]
Index of events where we actually have SVs
(array([ 66,  80, 104, 123, 159, 164]),)
See how sv variable looks for jet 66
[0.0429]
Convert mask to ones and zeros
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
See how mask looks for jet 66
[1. 0. 0. 0. 0. 0. 0.]
Delta phi for jet 66

## Now get feature functions

In [9]:
def get_pfcands_features(
    tagger_vars: dict,
    preselected_events: NanoEventsArray,
    fj_idx_lep,
    fatjet_label: str = "FatJetAK15",
    pfcands_label: str = "FatJetPFCands",
    normalize: bool = True,
) -> Dict[str, np.ndarray]:
    """
    Extracts the pf_candidate features specified in the ``tagger_vars`` dict from the
    ``preselected_events`` and returns them as a dict of numpy arrays

    Args:
        tagger_vars: tagger configuration; ``pf_points.var_length``,
            ``pf_features.var_names`` and (when normalizing)
            ``pf_features.var_infos`` are read.
        preselected_events: events providing the ``fatjet_label``,
            ``pfcands_label`` and ``PFCands`` collections.
        fj_idx_lep: per-event index of the jet inside
            ``preselected_events[fatjet_label]``; it is also compared against
            ``jetIdx`` of the ``pfcands_label`` collection.
            (presumably singleton lists as produced by ``ak.argmin(..., keepdims=True)`` - verify against caller)
        fatjet_label: name of the fatjet collection.
        pfcands_label: name of the jet -> PF candidate link collection.
        normalize: if True, shift/scale each feature with its ``median`` and
            ``norm_factor`` and clip it to ``lower_bound``/``upper_bound``
            (defaults -5/5).

    Returns:
        Dict keyed by feature name. Every feature listed in
        ``pf_features.var_names`` and ``pfcand_mask`` are float32 numpy arrays
        padded/clipped to ``pf_points.var_length`` candidates per jet; features
        computed here but not listed in ``var_names`` are left as computed.
    """

    feature_dict = {}

    jet = ak.firsts(preselected_events[fatjet_label][fj_idx_lep])

    # links whose jetIdx points at the selected jet -> the PF candidates themselves
    msk = preselected_events[pfcands_label].jetIdx == ak.firsts(fj_idx_lep)
    jet_ak_pfcands = preselected_events[pfcands_label][msk]
    jet_pfcands = preselected_events.PFCands[jet_ak_pfcands.pFCandsIdx]

    # candidates with eta > 0 get sign +1, all others -1
    eta_sign = ak.values_astype(jet_pfcands.eta > 0, int) * 2 - 1
    feature_dict["pfcand_etarel"] = eta_sign * (jet_pfcands.eta - jet.eta)
    feature_dict["pfcand_phirel"] = jet.delta_phi(jet_pfcands)
    feature_dict["pfcand_abseta"] = np.abs(jet_pfcands.eta)

    feature_dict["pfcand_pt_log_nopuppi"] = np.log(jet_pfcands.pt)
    feature_dict["pfcand_e_log_nopuppi"] = np.log(jet_pfcands.energy)

    pdgIds = jet_pfcands.pdgId
    feature_dict["pfcand_isEl"] = np.abs(pdgIds) == 11
    feature_dict["pfcand_isMu"] = np.abs(pdgIds) == 13
    feature_dict["pfcand_isChargedHad"] = np.abs(pdgIds) == 211
    feature_dict["pfcand_isGamma"] = np.abs(pdgIds) == 22
    feature_dict["pfcand_isNeutralHad"] = np.abs(pdgIds) == 130

    feature_dict["pfcand_charge"] = jet_pfcands.charge
    feature_dict["pfcand_VTX_ass"] = jet_pfcands.pvAssocQuality
    feature_dict["pfcand_lostInnerHits"] = jet_pfcands.lostInnerHits
    feature_dict["pfcand_quality"] = jet_pfcands.trkQuality

    feature_dict["pfcand_normchi2"] = np.floor(jet_pfcands.trkChi2)

    feature_dict["pfcand_dz"] = jet_pfcands.dz
    feature_dict["pfcand_dxy"] = jet_pfcands.d0
    feature_dict["pfcand_dzsig"] = jet_pfcands.dz / jet_pfcands.dzErr
    feature_dict["pfcand_dxysig"] = jet_pfcands.d0 / jet_pfcands.d0Err

    # btag vars are read from the link collection, dropping the "pfcand_" prefix
    for var in tagger_vars["pf_features"]["var_names"]:
        if "btag" in var:
            feature_dict[var] = jet_ak_pfcands[var[len("pfcand_"):]]

    n_points = tagger_vars["pf_points"]["var_length"]

    # pfcand mask: 1 where a real candidate is present, 0 where the array was padded
    feature_dict["pfcand_mask"] = (
        ~(ak.pad_none(feature_dict["pfcand_abseta"], n_points, axis=1, clip=True).to_numpy().mask)
    ).astype(np.float32)

    # positions where the (floored) chi2 carries the -1 sentinel; recorded BEFORE
    # normalization so no float equality on transformed values is needed
    chi2_missing = None

    # convert to numpy arrays and normalize features
    for var in tagger_vars["pf_features"]["var_names"]:
        a = (
            ak.pad_none(feature_dict[var], n_points, axis=1, clip=True)
            .to_numpy()
            .filled(fill_value=0)
        ).astype(np.float32)

        if normalize:
            if var == "pfcand_normchi2":
                chi2_missing = a == -1

            info = tagger_vars["pf_features"]["var_infos"][var]
            a = (a - info["median"]) * info["norm_factor"]
            a = np.clip(a, info.get("lower_bound", -5), info.get("upper_bound", 5))

        feature_dict[var] = a

    # Entries that were -1 before normalization are set to the upper bound.
    # The previous code looked for ``-1 - median * norm_factor`` in the
    # normalized array, which is not what -1 transforms to (the transform is
    # ``(a - median) * norm_factor``), so the replacement never matched the
    # sentinel unless norm_factor happened to be 1.
    if normalize and chi2_missing is not None:
        chi2_info = tagger_vars["pf_features"]["var_infos"]["pfcand_normchi2"]
        feature_dict["pfcand_normchi2"][chi2_missing] = chi2_info.get("upper_bound", 5)

    return feature_dict

In [10]:
def get_svs_features(
    tagger_vars: dict,
    preselected_events: NanoEventsArray,
    fj_idx_lep,
    fatjet_label: str = "FatJetAK15",
    svs_label: str = "JetSVsAK15",
    normalize: bool = True,
) -> Dict[str, np.ndarray]:
    """
    Build the secondary-vertex inputs listed in ``tagger_vars`` for the jet
    selected by ``fj_idx_lep`` in each of the ``preselected_events``.

    Every feature named in ``tagger_vars["sv_features"]["var_names"]`` is
    padded/clipped to ``tagger_vars["sv_points"]["var_length"]`` vertices,
    zero-filled, cast to float32 and - when ``normalize`` is set - shifted,
    scaled and clipped using its ``var_infos`` entry. ``sv_mask`` flags the
    non-padded entries with 1.
    """

    def _padded(values):
        # fixed-length numpy masked array (padding entries are masked)
        return ak.pad_none(
            values, tagger_vars["sv_points"]["var_length"], axis=1, clip=True
        ).to_numpy()

    jet = ak.firsts(preselected_events[fatjet_label][fj_idx_lep])

    # links that belong to the selected jet and point at a real SV (sVIdx != -1);
    # the product of two boolean arrays acts as an element-wise AND
    belongs_to_jet = preselected_events[svs_label].jetIdx == ak.firsts(fj_idx_lep)
    has_vertex = preselected_events[svs_label].sVIdx != -1
    selected_sv_idx = preselected_events[svs_label].sVIdx[has_vertex * belongs_to_jet]
    jet_svs = preselected_events.SV[selected_sv_idx]

    # vertices with eta > 0 get sign +1, all others -1
    eta_sign = ak.values_astype(jet_svs.eta > 0, int) * 2 - 1

    feature_dict = {
        "sv_etarel": eta_sign * (jet_svs.eta - jet.eta),
        "sv_phirel": jet_svs.delta_phi(jet),
        "sv_abseta": np.abs(jet_svs.eta),
        "sv_mass": jet_svs.mass,
        "sv_pt_log": np.log(jet_svs.pt),
        "sv_ntracks": jet_svs.ntracks,
        "sv_normchi2": jet_svs.chi2,
        "sv_dxy": jet_svs.dxy,
        "sv_dxysig": jet_svs.dxySig,
        "sv_d3d": jet_svs.dlen,
        "sv_d3dsig": jet_svs.dlenSig,
        "sv_costhetasvpv": -np.cos(jet_svs.pAngle),
    }

    # 1 for real vertices, 0 for padding
    feature_dict["sv_mask"] = (~_padded(feature_dict["sv_etarel"]).mask).astype(np.float32)

    # dense float32 arrays, optionally normalized
    for name in tagger_vars["sv_features"]["var_names"]:
        dense = _padded(feature_dict[name]).filled(fill_value=0).astype(np.float32)

        if normalize:
            spec = tagger_vars["sv_features"]["var_infos"][name]
            dense = (dense - spec["median"]) * spec["norm_factor"]
            dense = np.clip(dense, spec.get("lower_bound", -5), spec.get("upper_bound", 5))

        feature_dict[name] = dense

    return feature_dict

In [11]:
fatjet_label = "FatJet"
pfcands_label = "FatJetPFCands"
svs_label = "FatJetSVs"

# keep only events whose candidate fatjet has pt > 200
selection = candidatefj_lep.pt > 200

# apply the selection once and reuse it for both feature extractors
selected_events = events[selection]
selected_fj_idx = fj_idx_lep[selection]

# PF-candidate and SV features merged into a single dict
# (the PF-candidate features are computed once; a previous extra call whose
# result was immediately overwritten has been dropped)
feature_dict = {
    **get_pfcands_features(tagger_vars, selected_events, selected_fj_idx, fatjet_label, pfcands_label),
    **get_svs_features(tagger_vars, selected_events, selected_fj_idx, fatjet_label, svs_label),
}

In [12]:
# Dump every extracted feature array (PF-candidate and SV) for inspection.
print(feature_dict)

{'pfcand_etarel': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'pfcand_phirel': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'pfcand_abseta': array([[-0.96000004, -0.96000004, -0.96000004, ..., -0.96000004,
        -0.96000004, -0.96000004],
       [-0.96000004, -0.96000004, -0.96000004, ..., -0.96000004,
        -0.96000004, -0.96000004],
       [-0.96000004, -0.96000004, -0.96000004, ..., -0.96000004,
        -0.96000004, -0.96000004],
       ...,
       [-0.96000004, -0.96000004, -0.96000004, ..., -0.96000004,
        -0.96000004, -0.96000004],
       [-0.96000004, -