In [15]:
import argparse
import json
import os
import pickle as pkl
import sys
import time
import warnings

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import uproot
from coffea import nanoevents, processor
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory

sys.path.append("../")
import json
import os
import pathlib
import pickle as pkl
import shutil
import warnings
from collections import defaultdict
from typing import Dict, List, Optional

import awkward as ak
import numpy as np
import pandas as pd
import pyarrow as pa
from coffea import processor
from coffea.analysis_tools import PackedSelection, Weights
from coffea.nanoevents.methods import candidate, vector

# Hide warnings whose message starts with "Found duplicate branch " and every
# DeprecationWarning, so they do not flood the cell outputs below.
warnings.filterwarnings("ignore", message="Found duplicate branch ")
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Do not warn on invalid floating-point operations (e.g. NaN results).
# np.seterr returns the previous settings, which the notebook echoes as this cell's output.
np.seterr(invalid="ignore")

### awkward 1.10.0

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [16]:
def build_p4(cand):
    """Zip the kinematic fields of ``cand`` into a ``PtEtaPhiMCandidate`` record array.

    Parameters
    ----------
    cand
        Object exposing ``pt``, ``eta``, ``phi``, ``mass`` and ``charge`` attributes.

    Returns
    -------
    The result of ``ak.zip`` over those five fields, tagged with the
    ``"PtEtaPhiMCandidate"`` record name and coffea's candidate behaviors.
    """
    field_names = ("pt", "eta", "phi", "mass", "charge")
    # dict order is preserved, so the zipped record keeps this field order
    kinematics = {name: getattr(cand, name) for name in field_names}
    return ak.zip(kinematics, with_name="PtEtaPhiMCandidate", behavior=candidate.behavior)

In [17]:
! ls ../rootfiles

[34mHWW[m[m       dy2.root  hww1.root hww3.root qcd2.root
dy1.root  dy3.root  hww2.root qcd1.root qcd3.root


In [None]:
### schema
# register the record type used for the PFCands and SV collections
nanoevents.PFNanoAODSchema.mixins.update({"PFCands": "PFCand", "SV": "PFCand"})

# open the ROOT file and report how many entries its "Events" tree holds
f = uproot.open("../rootfiles/hww1.root")
num = f["Events"].num_entries  ### checks number of events per file
print(f"number of events per file is {num}")

# load the root file into coffea-friendly NanoAOD structure
# (the open file handle is handed to the factory, so it is deliberately not closed here)
events = nanoevents.NanoEventsFactory.from_root(
    f,
    "Events",
    schemaclass=nanoevents.PFNanoAODSchema,
).events()

# Get candidate jet

In [19]:
### make selections
# Builds per-object boolean masks, per-event object counts, the leading-lepton
# candidate, and the candidate fat jet that later cells feed to the tagger.
nevents = len(events)

# define muon objects
# loose muons: (pt > 30 and relative isolation < 0.25) or pt > 55, within |eta| < 2.4, passing looseId
loose_muons = (
    (((events.Muon.pt > 30) & (events.Muon.pfRelIso04_all < 0.25)) |
     (events.Muon.pt > 55))
    & (np.abs(events.Muon.eta) < 2.4)
    & (events.Muon.looseId)
)
# number of muons passing the loose mask in each event
n_loose_muons = ak.sum(loose_muons, axis=1)

# good muons: pt, |eta|, impact-parameter (dz, dxy, sip3d) and mediumId requirements
good_muons = (
    (events.Muon.pt > 28)
    & (np.abs(events.Muon.eta) < 2.4)
    & (np.abs(events.Muon.dz) < 0.1)
    & (np.abs(events.Muon.dxy) < 0.05)
    & (events.Muon.sip3d <= 4.0)
    & events.Muon.mediumId
)
n_good_muons = ak.sum(good_muons, axis=1)

# define electron objects
# the |eta| requirement excludes the 1.44-1.57 window
# NOTE(review): no upper |eta| bound is applied to electrons here - confirm this is intended
loose_electrons = (
    (((events.Electron.pt > 38) & (events.Electron.pfRelIso03_all < 0.25)) |
     (events.Electron.pt > 120))
    & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
    & (events.Electron.cutBased >= events.Electron.LOOSE)
)
n_loose_electrons = ak.sum(loose_electrons, axis=1)

good_electrons = (
    (events.Electron.pt > 38)
    & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
    & (np.abs(events.Electron.dz) < 0.1)
    & (np.abs(events.Electron.dxy) < 0.05)
    & (events.Electron.sip3d <= 4.0)
    & (events.Electron.mvaFall17V2noIso_WP90)
)
n_good_electrons = ak.sum(good_electrons, axis=1)

# leading lepton
# merge good muons and good electrons per event, order them by descending pt and
# take the first one (ak.firsts yields None for events without any good lepton)
goodleptons = ak.concatenate([events.Muon[good_muons], events.Electron[good_electrons]], axis=1)
goodleptons = goodleptons[ak.argsort(goodleptons.pt, ascending=False)]
candidatelep = ak.firsts(goodleptons)

# candidate leptons
candidatelep_p4 = build_p4(candidatelep)

# MET
# transverse mass of the lepton + MET system: sqrt(2 * pt_lep * pt_met * (1 - cos(delta_phi)))
met = events.MET
mt_lep_met = np.sqrt(
    2. * candidatelep_p4.pt * met.pt * (ak.ones_like(met.pt) - np.cos(candidatelep_p4.delta_phi(met)))
)

# JETS
goodjets = events.Jet[
    (events.Jet.pt > 30)
    & (abs(events.Jet.eta) < 2.5)
    & events.Jet.isTight
    & (events.Jet.puId > 0)
]
# scalar sum of the selected jets' pt in each event
ht = ak.sum(goodjets.pt, axis=1)

# FATJETS
fatjets = events.FatJet

# NOTE: `good_fatjets` is first a boolean mask and is then rebound below to the
# selected (and later sorted / cleaned) fat-jet collection
good_fatjets = (
    (fatjets.pt > 200)
    & (abs(fatjets.eta) < 2.5)
    & fatjets.isTight
)
# counted from the mask, i.e. before the lepton-overlap cleaning applied below
n_fatjets = ak.sum(good_fatjets, axis=1)

good_fatjets = fatjets[good_fatjets]
good_fatjets = good_fatjets[ak.argsort(good_fatjets.pt, ascending=False)]

# for lep channel: first clean jets and leptons by removing overlap, then pick candidate_fj closest to the lepton
# keep only fat jets with delta_r > 0.1 from the candidate lepton
lep_in_fj_overlap_bool = good_fatjets.delta_r(candidatelep_p4) > 0.1
good_fatjets = good_fatjets[lep_in_fj_overlap_bool]
# keepdims=True keeps one index per event wrapped in a length-1 list, so it can be used to slice the jagged array
# NOTE(review): this index refers to the cleaned, pt-sorted `good_fatjets`, not to `events.FatJet`;
# confirm any later consumer of `fj_idx_lep` indexes the same collection
fj_idx_lep = ak.argmin(good_fatjets.delta_r(candidatelep_p4), axis=1, keepdims=True)
candidatefj = ak.firsts(good_fatjets[fj_idx_lep])

In [None]:
# Quick look at the candidate fat-jet pt; entries are None for events with no candidate
# (presumably events without a good lepton or without a selected fat jet - TODO confirm)
candidatefj.pt

<Array [None, None, None, ... None, None, None] type='98400 * ?float32[parameter...'>

## Build PFcands and SVs

In [None]:
# import modules to build the tagger inputs
import sys
sys.path.append("../boostedhiggs/")
from get_tagger_inputs import get_pfcands_features, get_svs_features

In [None]:
# directory holding the Triton and tagger JSON configuration files
tagger_resources_path = "../boostedhiggs/tagger_resources/"

# use this model
model_name = "particlenet_hww_inclv2_pre2"

# the different models we can use: model name -> (pversion label, name of the Triton output tensor)
pversion, out_name = {
    "05_10_ak8_ttbarwjets": ("PN_UCSD", "softmax__0"),
    "particlenet_hww_inclv2_pre2": ("ParticleNet", "output__0"),
    "particlenet_hww_inclv2_pre2_noreg": ("PN_v2_noreg", "softmax__0"),
    "ak8_MD_vminclv2ParT_manual_fixwrap": ("ParT_noreg", "softmax"),
    "ak8_MD_vminclv2ParT_manual_fixwrap_all_nodes": ("ParT", "softmax"),
}[model_name]

# Triton settings (model url, batch size, model name) for the chosen model
with open(f"{tagger_resources_path}/triton_config_{model_name}.json") as triton_config_file:
    triton_config = json.load(triton_config_file)

# input variable definitions of the model named in the Triton config
with open(f"{tagger_resources_path}/{triton_config['model_name']}.json") as tagger_vars_file:
    tagger_vars = json.load(tagger_vars_file)

In [None]:
# labels handed to the feature builders (presumably the names of the collections in `events` - TODO confirm)
fatjet_label = "FatJet"
pfcands_label = "FatJetPFCands"
svs_label = "FatJetSVs"

# NOTE(review): `fj_idx_lep` was computed on the cleaned, pt-sorted fat-jet selection;
# confirm that the helpers below index the same collection rather than the raw one

# get pfcands
pfcands_features = get_pfcands_features(tagger_vars, events, fj_idx_lep, fatjet_label, pfcands_label)

# get svs
svs_features = get_svs_features(tagger_vars, events, fj_idx_lep, fatjet_label, svs_label)

In [None]:
# this is our input to the tagger: PF-candidate features first, then the SV features
# (if a key appears in both, the SV entry replaces the PF-candidate one)
feature_dict = dict(pfcands_features)
feature_dict.update(svs_features)

## Triton

In [None]:
# adapted from https://github.com/lgray/hgg-coffea/blob/triton-bdts/src/hgg_coffea/tools/chained_quantile.py
class wrapped_triton:
    """Callable wrapper that runs batched inference against a Triton server.

    Parameters
    ----------
    model_url : str
        URL of the form ``<prefix>+<protocol>://<address>/<model>/<version>``,
        where ``<protocol>`` must be ``grpc`` or ``http`` when the wrapper is called.
    batch_size : int
        Maximum number of rows (entries along the first axis) sent per request.
    out_name : str
        Name of the output tensor requested from the model.
    """

    def __init__(self, model_url: str, batch_size: int, out_name: str = "softmax__0") -> None:
        fullprotocol, location = model_url.split("://")
        _, protocol = fullprotocol.split("+")
        address, model, version = location.split("/")

        self._protocol = protocol
        self._address = address
        self._model = model
        self._version = version

        self._batch_size = batch_size
        self._out_name = out_name

    def _connect(self):
        """Return ``(client_module, client)`` for the configured protocol.

        The tritonclient modules are imported here so that the class works
        regardless of which notebook cells have been executed before the call.

        Raises
        ------
        ValueError
            If the protocol parsed from the model URL is neither grpc nor http.
        """
        if self._protocol == "grpc":
            import tritonclient.grpc as triton_protocol

            client = triton_protocol.InferenceServerClient(url=self._address, verbose=False)
        elif self._protocol == "http":
            import tritonclient.http as triton_protocol

            client = triton_protocol.InferenceServerClient(
                url=self._address,
                verbose=False,
                concurrency=12,
            )
        else:
            raise ValueError(f"{self._protocol} does not encode a valid protocol (grpc or http)")

        return triton_protocol, client

    def __call__(self, input_dict: Dict[str, np.ndarray]) -> np.ndarray:
        """Run inference on ``input_dict`` in chunks of ``batch_size`` rows.

        Parameters
        ----------
        input_dict : Dict[str, np.ndarray]
            Maps model input names to arrays. The number of rows is taken from
            the first array; all arrays are sliced with the same row ranges.

        Returns
        -------
        np.ndarray
            The per-batch outputs concatenated along the first axis, or an
            empty array when the inputs contain no rows.

        Raises
        ------
        ValueError
            If the protocol is invalid or ``input_dict`` has no entries.
        """
        from tqdm import tqdm

        triton_protocol, client = self._connect()
        try:
            if not input_dict:
                raise ValueError("input_dict must contain at least one input array")

            # manually split into batches for gpu inference
            input_size = next(iter(input_dict.values())).shape[0]

            outs = [
                self._do_inference(
                    {key: array[start : start + self._batch_size] for key, array in input_dict.items()},
                    triton_protocol,
                    client,
                )
                for start in tqdm(range(0, input_size, self._batch_size))
            ]
        finally:
            # a new client is created on every call, so release its connection(s) here
            client.close()

        if not outs:
            # no request was made, so the model's real output dtype/shape is unknown;
            # return an empty array to honour the declared return type
            return np.empty((0,), dtype=np.float32)

        return np.concatenate(outs)

    def _do_inference(self, input_dict: Dict[str, np.ndarray], triton_protocol, client) -> np.ndarray:
        """Send one batch to the server and return the requested output as a numpy array."""
        # Infer
        inputs = []

        for key, array in input_dict.items():
            # inputs are declared as FP32 - assumes the arrays are float32, TODO confirm
            infer_input = triton_protocol.InferInput(key, array.shape, "FP32")
            infer_input.set_data_from_numpy(array)
            inputs.append(infer_input)

        output = triton_protocol.InferRequestedOutput(self._out_name)

        request = client.infer(
            self._model,
            model_version=self._version,
            inputs=inputs,
            outputs=[output],
        )

        return request.as_numpy(self._out_name)

In [None]:
# wrap the configured Triton model so it can be called directly on a dict of input arrays
triton_model = wrapped_triton(
    model_url=triton_config["model_url"],
    batch_size=triton_config["batch_size"],
    out_name=out_name,
)

In [None]:
# Build one array per model input by stacking that input's feature arrays along a new axis 1.
# Models whose output tensor is named "softmax" take the bare input name as key;
# every other model expects the name suffixed with the input's position ("<name>__<i>").
# (The earlier loop that called np.expand_dims and discarded the result was dead code,
# and the two branches only differed in the key, so they are merged here.)
tagger_inputs = {
    (input_name if out_name == "softmax" else f"{input_name}__{i}"): np.concatenate(
        [np.expand_dims(feature_dict[key], 1) for key in tagger_vars[input_name]["var_names"]],
        axis=1,
    )
    for i, input_name in enumerate(tagger_vars["input_names"])
}

In [None]:
import tritonclient.grpc as triton_grpc
import tritonclient.http as triton_http
from tqdm import tqdm

# run inference on the tagger inputs; wrapped_triton.__call__ splits the rows into
# batches of `batch_size` and concatenates the per-batch outputs
tagger_outputs = triton_model(tagger_inputs)