In [37]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List, Optional

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### schema
import uproot
from coffea import nanoevents, processor
from coffea.analysis_tools import PackedSelection, Weights
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory
from coffea.nanoevents.methods import candidate, vector

### awkward 1.10.0 (NOTE(review): presumably the awkward version this notebook targets — confirm)
# Put the parent directory on the import path; later cells import from
# `boostedhiggs.utils`, which presumably lives there — TODO confirm.
sys.path.append("../")

# Map the "PFCands" and "SV" collections of the PFNano schema to the "PFCand" mixin.
nanoevents.PFNanoAODSchema.mixins["PFCands"] = "PFCand"
nanoevents.PFNanoAODSchema.mixins["SV"] = "PFCand"

# Silence warnings whose message starts with the given text, plus every
# DeprecationWarning, so the cell outputs below stay readable.
warnings.filterwarnings("ignore", message="Found duplicate branch ")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="Missing cross-reference index ")
warnings.filterwarnings("ignore", message="divide by zero encountered in log")
# Do not report invalid floating-point operations. np.seterr returns the previous
# settings, which the notebook displays because this is the cell's last expression.
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [36]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
def build_p4(cand):
    """Repackage a candidate's kinematics and charge as a ``PtEtaPhiMCandidate`` array.

    Reads ``pt``, ``eta``, ``phi``, ``mass`` and ``charge`` from ``cand`` and zips
    them together, attaching the behaviors from ``candidate.behavior``.
    """
    field_names = ("pt", "eta", "phi", "mass", "charge")
    components = {field: getattr(cand, field) for field in field_names}
    return ak.zip(components, with_name="PtEtaPhiMCandidate", behavior=candidate.behavior)

In [6]:
# List the HWW sample files available under ../rootfiles.
! ls ../rootfiles/HWW

file1.root file2.root file3.root ggF.root


# Get candidate jet

In [7]:
def get_candidatefj(events):
    """Return, per event, the good fat jet nearest in delta-R to the leading good lepton.

    Steps:
      1. Build muon and electron selection masks (pt, eta, impact-parameter and ID cuts).
      2. Merge the selected leptons, sort them by descending pt and take the first.
      3. Keep fat jets with pt > 200, |eta| < 2.5 and ``isTight``.
      4. Pick the selected fat jet with the smallest ``delta_r`` to that lepton.

    Parameters
    ----------
    events
        Event array exposing ``Muon``, ``Electron`` and ``FatJet`` collections
        with the fields read below.

    Returns
    -------
    One fat-jet entry per event. Entries can be missing; the notebook cells
    that call this function drop them with ``ak.is_none``.
    """
    good_muons = (
        (events.Muon.pt > 28)
        & (np.abs(events.Muon.eta) < 2.4)
        & (np.abs(events.Muon.dz) < 0.1)
        & (np.abs(events.Muon.dxy) < 0.05)
        & (events.Muon.sip3d <= 4.0)
        & events.Muon.mediumId
    )

    good_electrons = (
        (events.Electron.pt > 38)
        # exclude the 1.44 <= |eta| <= 1.57 window
        & ((np.abs(events.Electron.eta) < 1.44) | (np.abs(events.Electron.eta) > 1.57))
        & (np.abs(events.Electron.dz) < 0.1)
        & (np.abs(events.Electron.dxy) < 0.05)
        & (events.Electron.sip3d <= 4.0)
        & (events.Electron.mvaFall17V2noIso_WP90)
    )

    # Merge both lepton flavours per event and order them by descending pt.
    goodleptons = ak.concatenate([events.Muon[good_muons], events.Electron[good_electrons]], axis=1)
    goodleptons = goodleptons[ak.argsort(goodleptons.pt, ascending=False)]

    # Slicing 0:1 before ak.firsts avoids indexing into events with no good lepton.
    candidatelep = ak.firsts(goodleptons[:, 0:1])

    # Four-vector of the candidate lepton, needed for the delta-R computation below.
    candidatelep_p4 = build_p4(candidatelep)

    # Fat-jet selection.
    fatjets = events.FatJet
    msk_good_fatjets = (fatjets.pt > 200) & (np.abs(fatjets.eta) < 2.5) & fatjets.isTight
    good_fatjets = fatjets[msk_good_fatjets]

    # keepdims=True keeps a length-1 inner list so the index can be used to
    # slice good_fatjets; ak.firsts then flattens it back to one entry per event.
    fj_idx_lep = ak.argmin(good_fatjets.delta_r(candidatelep_p4), axis=1, keepdims=True)
    candidatefj = ak.firsts(good_fatjets[fj_idx_lep])

    return candidatefj

# Matching

## Higgs matching

In [229]:
from boostedhiggs.utils import match_H

# Open the HWW sample and build NanoEvents with the PFNano schema; entry_stop
# caps how many entries are read so this check stays quick.
f = uproot.open("../rootfiles/HWW/file1.root")
events = nanoevents.NanoEventsFactory.from_root(
    f,
    "Events",
    entry_stop=10000,
    schemaclass=nanoevents.PFNanoAODSchema,
).events()

num = len(events)  # number of events actually loaded (capped by entry_stop above)
print(f'number of events per file is {num}')

number of events per file is 10000


In [231]:
# Candidate fat jet per event; entries are missing where none was found.
candidatefj = get_candidatefj(events)

# Boolean mask of events that do have a candidate fat jet.
selection = ~ak.is_none(candidatefj)
candidatefj[selection]

<FatJetArray [FatJet, FatJet, ... FatJet, FatJet] type='2119 * ?fatJet'>

In [232]:
# Run the Higgs gen-matching on events with a candidate jet: the gen particles
# and the jets are filtered with the same `selection` mask. The helper returns
# two objects, kept as GenVars and matched_mask.
GenVars, matched_mask = match_H(events.GenPart[selection], candidatefj[selection])



In [233]:
GenVars

{'fj_genH_pt': <Array [[313], [256], ... [219], [294]] type='2119 * var * float32[parameters={"...'>,
 'fj_genH_jet': <Array [0.107, 0.202, ... 0.0598, 0.0615] type='2119 * ?float32'>,
 'fj_genV_dR': <Array [0.188, 0.33, 0.0778, ... 0.178, 0.0802] type='2119 * ?float32'>,
 'fj_genVstar': <Array [0.307, 0.385, 0.142, ... 0.397, 0.172] type='2119 * ?float32'>,
 'genV_genVstar_dR': <Array [0.448, 0.595, 0.114, ... 0.575, 0.245] type='2119 * ?float32'>,
 'fj_nquarks': <Array [1, 2, 1, 0, 0, 2, ... 0, 2, 0, 2, 1, 1] type='2119 * ?int64'>,
 'fj_ncquarks': <Array [0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0] type='2119 * ?int64'>,
 'fj_lepinprongs': <Array [1, 1, 1, 0, 0, 1, ... 2, 1, 2, 1, 1, 1] type='2119 * ?int64'>,
 'fj_H_VV_4q': <Array [0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0] type='2119 * int32'>,
 'fj_H_VV_elenuqq': <Array [1, 1, 0, 0, 0, 0, ... 0, 0, 0, 0, 1, 0] type='2119 * int32'>,
 'fj_H_VV_munuqq': <Array [0, 0, 1, 0, 1, 1, ... 0, 1, 0, 1, 0, 1] type='2119 * int32'>,
 'fj_H_VV_taunuqq':

## VJet matching

In [31]:
from boostedhiggs.utils import match_V

# Open the WJets sample and build NanoEvents with the PFNano schema; entry_stop
# caps how many entries are read so this check stays quick.
f = uproot.open("../rootfiles/WJets/file1.root")
events = nanoevents.NanoEventsFactory.from_root(
    f,
    "Events",
    entry_stop=100,
    schemaclass=nanoevents.PFNanoAODSchema,
).events()

num = len(events)  # number of events actually loaded (capped by entry_stop above)
print(f'number of events per file is {num}')

number of events per file is 100


In [32]:
# Candidate fat jet per event; entries are missing where none was found.
candidatefj = get_candidatefj(events)

# Boolean mask of events that do have a candidate fat jet.
selection = ~ak.is_none(candidatefj)
candidatefj[selection]

<FatJetArray [FatJet, FatJet, ... FatJet, FatJet] type='8 * ?fatJet'>

In [33]:
# Run the V-boson gen-matching on events with a candidate jet: the gen particles
# and the jets are filtered with the same `selection` mask. The helper returns
# two objects, kept as GenVars and matched_mask.
GenVars, matched_mask = match_V(events.GenPart[selection], candidatefj[selection])

In [34]:
GenVars

{'fj_nprongs': <Array [0, 0, 1, 1, 1, 0, 1, 0] type='8 * ?int64'>,
 'fj_lepinprongs': <Array [0, 0, 1, 1, 1, 0, 1, 0] type='8 * ?int64'>,
 'fj_ncquarks': <Array [0, 0, 0, 0, 0, 0, 0, 0] type='8 * ?int64'>,
 'fj_V_isMatched': <Array [False, False, True, ... True, False] type='8 * ?bool'>,
 'fj_V_2q': <Array [0, 0, 0, 0, 0, 0, 0, 0] type='8 * ?int32'>,
 'fj_V_elenu': <Array [1, 0, 1, 1, 0, 1, 1, 0] type='8 * ?int32'>,
 'fj_V_munu': <Array [0, 1, 0, 0, 1, 0, 0, 1] type='8 * ?int32'>,
 'fj_V_taunu': <Array [0, 0, 0, 0, 0, 0, 0, 0] type='8 * ?int32'>}

In [35]:
matched_mask

<Array [False, False, True, ... True, False] type='8 * ?bool'>

## QCD matching

In [55]:
from boostedhiggs.utils import match_QCD

# Open the QCD sample and build NanoEvents with the PFNano schema; entry_stop
# caps how many entries are read so this check stays quick.
f = uproot.open("../rootfiles/QCD/file1.root")
events = nanoevents.NanoEventsFactory.from_root(
    f,
    "Events",
    entry_stop=3000,
    schemaclass=nanoevents.PFNanoAODSchema,
).events()

num = len(events)  # number of events actually loaded (capped by entry_stop above)
print(f'number of events per file is {num}')

number of events per file is 3000


In [56]:
# Candidate fat jet per event; entries are missing where none was found.
candidatefj = get_candidatefj(events)

# Boolean mask of events that do have a candidate fat jet.
selection = ~ak.is_none(candidatefj)
candidatefj[selection]

<FatJetArray [FatJet, FatJet, ... FatJet, FatJet] type='17 * ?fatJet'>

In [57]:
# Run the QCD gen-matching on events with a candidate jet: the gen particles
# and the jets are filtered with the same `selection` mask. The helper returns
# two objects, kept as GenVars and matched_mask.
GenVars, matched_mask = match_QCD(events.GenPart[selection], candidatefj[selection])

In [58]:
GenVars

{'fj_isQCDb': <Array [0, 0, 0, 0, 1, 0, ... 0, 0, 0, 0, 0, 0] type='17 * ?int32'>,
 'fj_isQCDbb': <Array [0, 0, 0, 0, 0, 0, ... 1, 0, 0, 0, 1, 1] type='17 * ?int32'>,
 'fj_isQCDc': <Array [0, 0, 0, 0, 0, 1, ... 0, 1, 1, 0, 0, 0] type='17 * ?int32'>,
 'fj_isQCDcc': <Array [1, 0, 0, 1, 0, 0, ... 0, 0, 0, 0, 0, 0] type='17 * ?int32'>,
 'fj_isQCDothers': <Array [0, 1, 1, 0, 0, 0, ... 0, 0, 0, 1, 0, 0] type='17 * ?int32'>}

## Top matching

In [222]:
from boostedhiggs.utils import match_Top

# Open the TTbar sample and build NanoEvents with the PFNano schema; entry_stop
# caps how many entries are read so this check stays quick.
f = uproot.open("../rootfiles/TTbar/file1.root")
events = nanoevents.NanoEventsFactory.from_root(
    f,
    "Events",
    entry_stop=1000,
    schemaclass=nanoevents.PFNanoAODSchema,
).events()

num = len(events)  # number of events actually loaded (capped by entry_stop above)
print(f'number of events per file is {num}')

number of events per file is 1000


In [223]:
# Candidate fat jet per event; entries are missing where none was found.
candidatefj = get_candidatefj(events)

# Boolean mask of events that do have a candidate fat jet.
selection = ~ak.is_none(candidatefj)
candidatefj[selection]

<FatJetArray [FatJet, FatJet, ... FatJet, FatJet] type='74 * ?fatJet'>

In [224]:
# Run the top-quark gen-matching on events with a candidate jet: the gen particles
# and the jets are filtered with the same `selection` mask. The helper returns
# two objects, kept as GenVars and matched_mask.
GenVars, matched_mask = match_Top(events.GenPart[selection], candidatefj[selection])

In [225]:
GenVars

{'fj_Top_isMatched': <Array [True, True, True, ... True, True, True] type='74 * ?bool'>,
 'fj_Top_numMatched': <Array [1, 1, 1, 1, 1, 1, ... 1, 1, 1, 2, 1, 1] type='74 * ?int64'>,
 'fj_Top_nquarksnob': <Array [0, 2, 0, 2, 2, 2, ... 0, 0, 0, 1, 0, 0] type='74 * ?int64'>,
 'fj_Top_nbquarks': <Array [1, 1, 1, 0, 0, 0, ... 1, 1, 1, 0, 1, 0] type='74 * ?int64'>,
 'fj_Top_ncquarks': <Array [0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0] type='74 * ?int64'>,
 'fj_Top_nleptons': <Array [1, 0, 1, 0, 0, 0, ... 0, 0, 1, 1, 1, 1] type='74 * ?int64'>,
 'fj_Top_nele': <Array [0, 0, 1, 0, 0, 0, ... 0, 0, 1, 1, 0, 0] type='74 * ?int64'>,
 'fj_Top_nmu': <Array [1, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 1, 1] type='74 * ?int64'>,
 'fj_Top_ntau': <Array [0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0] type='74 * ?int64'>,
 'fj_Top_taudecay': <Array [0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0] type='74 * int64'>}

In [226]:
matched_mask

<Array [True, True, True, ... True, True, True] type='74 * ?bool'>

In [228]:
GenVars["fj_Top_nleptons"]

<Array [1, 0, 1, 0, 0, 0, ... 0, 0, 1, 1, 1, 1] type='74 * ?int64'>