# Sensitivity study

Author: Raghav Kansal

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import mplhep as hep
from matplotlib import colors

from boostedhh import utils, hh_vars, plotting
from bbtautau import bbtautau_vars

import logging

# Root logging configuration: INFO level, records formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# The "boostedhh.utils" logger is set to DEBUG, i.e. more verbose than the root level.
logger = logging.getLogger("boostedhh.utils")
logger.setLevel(logging.DEBUG)

In [None]:
# Enable IPython's autoreload extension so that edits to imported files are
# picked up automatically (these are IPython magics, not plain Python)
%load_ext autoreload
%autoreload 2

In [None]:
# Year and skimmer output tags of the samples read in this notebook.
year = "2022"
signal_samples_tag = "24Nov21ParTMass_v12_private_signal"
data_samples_tag = "24Nov21ParTMass_v12_private_signal"

# Base directory (three levels above the working directory) and the folder
# that receives the plots; the folder is created if it does not exist yet.
MAIN_DIR = Path("../../../")
plot_dir = MAIN_DIR / "plots/SensitivityStudy/24Nov21"
plot_dir.mkdir(exist_ok=True, parents=True)

## Define and load samples

In [None]:
base_dir = Path("/ceph/cms/store/user/rkansal/bbtautau/skimmer/")

# Data and signal samples are read from tag-named sub-directories of ``base_dir``.
data_dir = base_dir / data_samples_tag
signal_dir = base_dir / signal_samples_tag

# One row per sample: (directory, selector, plot label, isData flag).
sample_specs = {
    "jetmet": (data_dir, "JetHT|JetMET", "JetMET", True),
    "tau": (data_dir, "Tau_Run", "Tau", True),
    "bbtt": (signal_dir, hh_vars.bbtt_sigs["bbtt"], r"HHbb$\tau\tau$", False),
}

# Build the Sample objects; every sample shares the same ``year``.
samples = {
    key: utils.Sample(path=path, selector=selector, label=label, isData=is_data, year=year)
    for key, (path, selector, label, is_data) in sample_specs.items()
}

In [None]:
# Preselection handed to the loader: a single group of
# (column, operator, value) conditions.
# Conditions that are currently disabled, kept for reference:
#   ("('ak8FatJetMsd', '0')", ">=", msd_cut), ("('ak8FatJetMsd', '1')", ">=", msd_cut)
#   with msd_cut = 40, and ("('ak8FatJetPNetXbb', '0')", ">=", 0.8)
preselection = [
    ("('ak8FatJetPt', '0')", ">=", 250),
    ("('ak8FatJetPNetmassLegacy', '0')", ">=", 50),
    ("('ak8FatJetPt', '1')", ">=", 200),
]
filters = [preselection]

# Single dictionary holding the loaded events of every sample, keyed by sample name.
events_dict = {key: utils.load_sample(sample, filters) for key, sample in samples.items()}

# Replace the inclusive "bbtt" entry by one entry per GenTau* flag
# (hh, hmu, he), each keeping only the rows where that flag is set.
signal_events = events_dict.pop("bbtt")
for channel in ("hh", "hmu", "he"):
    events_dict[f"bbtt{channel}"] = signal_events[signal_events[f"GenTau{channel}"][0]]
del signal_events

# Cutflow table with one row per entry of ``events_dict``.
cutflow = pd.DataFrame(index=list(events_dict))
utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
cutflow

## Triggers

In [None]:
skeys = ["bbtthh", "bbtthmu", "bbtthe"]

# For each signal channel keep only the events for which the summed
# HLT_hh trigger decisions are non-zero.
for skey in skeys:
    sig_events = events_dict[skey]
    fired = [sig_events[hlt][0] for hlt in bbtautau_vars.HLT_hh]
    triggered = np.sum(fired, axis=0).astype(bool)
    events_dict[skey] = sig_events[triggered]

### Data

In [None]:
# Trigger lists evaluated on each data sample, keyed by the name of the
# resulting mask.
trigger_groups = {
    "all": bbtautau_vars.HLT_hh,
    "jets": bbtautau_vars.HLT_jets,
    "taus": bbtautau_vars.HLT_taus,
}

trigdict = {}
for key in ("jetmet", "tau"):
    data_events = events_dict[key]

    # Boolean mask per group: True where the summed trigger decisions are non-zero.
    d = {
        name: np.sum([data_events[hlt][0] for hlt in hlts], axis=0).astype(bool)
        for name, hlts in trigger_groups.items()
    }

    # Exclusive categories: jet triggers without tau triggers, and vice versa.
    d["jetnotau"] = d["jets"] & ~d["taus"]
    d["nojettau"] = d["taus"] & ~d["jets"]
    trigdict[key] = d

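Checking event loss when each trigger group is taken from only one dataset: for each dataset, the fraction of its events that fire a trigger group but are missing from the corresponding selection in the other dataset.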

In [None]:
# Each check is ((dataset A, mask A), (dataset B, mask B)): count the event
# numbers selected in A that do not appear among those selected in B, and
# print that count as a fraction of all events in A.
overlap_checks = [
    (("jetmet", "nojettau"), ("tau", "nojettau")),
    (("tau", "jetnotau"), ("jetmet", "jets")),
]

for (key_a, trig_a), (key_b, trig_b) in overlap_checks:
    xor = np.setdiff1d(
        events_dict[key_a][trigdict[key_a][trig_a]]["event"][0],
        events_dict[key_b][trigdict[key_b][trig_b]]["event"][0],
    )
    print(len(xor) / len(events_dict[key_a]))

In [None]:
# Trigger mask applied to each data sample: "jetmet" keeps events passing the
# jet triggers, "tau" keeps events passing the tau triggers but not the jet ones.
for key, trig in {"jetmet": "jets", "tau": "nojettau"}.items():
    events_dict[key] = events_dict[key][trigdict[key][trig]]

In [None]:
# Record the post-trigger state of every sample under the "Triggers" label and
# display the table (presumably finalWeight-weighted yields — confirm against
# utils.add_to_cutflow).
utils.add_to_cutflow(events_dict, "Triggers", "finalWeight", cutflow)
cutflow

## Taggers

In [None]:
# ParT output nodes that are summed into the QCD and top scores.
qcdouts = ["QCD0HF", "QCD1HF", "QCD2HF"]
topouts = ["TopW", "TopbW", "TopbWev", "TopbWmv", "TopbWtauhv", "TopbWq", "TopbWqq"]

taggers_dict = {}

for key, events in events_dict.items():
    p_qcd = sum(events[f"ak8FatJetParT{out}"] for out in qcdouts)
    p_top = sum(events[f"ak8FatJetParT{out}"] for out in topouts)
    tvars = {"PQCD": p_qcd, "PTop": p_top}

    # Discriminants of the form P(X) / (P(X) + P(background)).
    for disc in ["Xbb", "Xtauhtauh"]:
        p_sig = events[f"ak8FatJetParT{disc}"]
        tvars[f"{disc}vsQCD"] = p_sig / (p_sig + p_qcd)
        tvars[f"{disc}vsQCDTop"] = p_sig / (p_sig + p_qcd + p_top)

    # The jet (index 0 or 1) with the larger XbbvsQCD is flagged in "bb_mask"
    # (jet 0 on ties); "tautau_mask" flags the other one. One row per event,
    # one column per jet.
    xbb_vs_qcd = tvars["XbbvsQCD"]
    bb_mask = xbb_vs_qcd[1] > xbb_vs_qcd[0]
    tvars["bb_mask"] = np.vstack([~bb_mask, bb_mask]).T
    tvars["tautau_mask"] = ~tvars["bb_mask"]

    taggers_dict[key] = tvars

In [None]:
def get_jet_vals(vals, mask):
    """Return the entries of the first two columns of ``vals`` selected by ``mask``.

    ``vals`` must expose a 2D ``.values`` array (e.g. a pandas DataFrame); only
    its first two columns are considered, and ``mask`` is used to index that
    two-column array.
    """
    first_two_cols = vals.values[:, :2]
    return first_two_cols[mask]

In [None]:
from sklearn.metrics import roc_curve

sig_key = "bbtthh"
bg_keys = ["jetmet", "tau"]
roc_discs = ["XbbvsQCD", "XbbvsQCDTop", "XtauhtauhvsQCD", "XtauhtauhvsQCDTop"]

# Event weights are the same for every jet / discriminant combination, so they
# are assembled once: background (data) first, then signal.
bg_weights = np.concatenate([events_dict[key]["finalWeight"] for key in bg_keys])
sig_weights = events_dict[sig_key]["finalWeight"]
roc_weights = np.concatenate([bg_weights, sig_weights])

# rocs[jet][disc] -> ROC arrays plus plotting metadata.
rocs = {}

for jet in ["bb", "tautau"]:
    mask_name = f"{jet}_mask"
    rocs[jet] = {}

    for i, disc in enumerate(roc_discs):
        print(disc)

        # Scores of the jet selected by the mask, data samples used as background.
        bg_scores = np.concatenate(
            [get_jet_vals(taggers_dict[key][disc], taggers_dict[key][mask_name]) for key in bg_keys]
        )
        sig_scores = get_jet_vals(taggers_dict[sig_key][disc], taggers_dict[sig_key][mask_name])

        # Truth labels: 0 for background, 1 for signal, in the same order as the scores.
        roc_labels = np.concatenate([np.zeros_like(bg_scores), np.ones_like(sig_scores)])
        roc_scores = np.concatenate([bg_scores, sig_scores])

        fpr, tpr, thresholds = roc_curve(roc_labels, roc_scores, sample_weight=roc_weights)

        rocs[jet][disc] = {
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds,
            "label": disc,
            "color": plt.cm.tab10.colors[i],
        }

In [None]:
# Fractions of "bbtthh" events in which jet 0 has the larger XbbvsQCD score,
# and in which jet 1 has the larger XtauhtauhvsQCD score.
hh_xbb = taggers_dict["bbtthh"]["XbbvsQCD"]
hh_xtt = taggers_dict["bbtthh"]["XtauhtauhvsQCD"]

print(np.mean(hh_xbb[0] > hh_xbb[1]))
print(np.mean(hh_xtt[1] > hh_xtt[0]))

In [None]:
# Fraction of "bbtthh" events where jet 1 has the larger XbbvsQCD score AND
# jet 0 has the larger XtauhtauhvsQCD score.
tvars = taggers_dict["bbtthh"]
bb_is_jet1 = tvars["XbbvsQCD"][1] > tvars["XbbvsQCD"][0]
tautau_is_jet0 = tvars["XtauhtauhvsQCD"][0] > tvars["XtauhtauhvsQCD"][1]
np.mean(bb_is_jet1 & tautau_is_jet0)

In [None]:
# One ROC figure per candidate jet, written to ``plot_dir`` as "roc_<jet>".
roc_titles = {"bb": "bb FatJet", "tautau": r"$\tau\tau$ FatJet"}

for jet, title in roc_titles.items():
    plotting.multiROCCurveGrey(
        {"": rocs[jet]},
        title=title,
        show=True,
        plot_dir=plot_dir,
        name=f"roc_{jet}",
    )

## Cut-and-count

In [None]:
sig_key = "bbtthh"

# Working points: the discriminant threshold at which the ROC curve's signal
# efficiency (tpr) is closest to the target value. The threshold has to be read
# from the "thresholds" array of the ROC dictionary; the dictionaries built in
# the ROC cell have no "auc" entry, so indexing "auc" raised a KeyError.
bb_roc = rocs["bb"]["XbbvsQCD"]
tt_roc = rocs["tautau"]["XtauhtauhvsQCD"]
txbbcut = bb_roc["thresholds"][plotting._find_nearest(bb_roc["tpr"], 0.45)]
txttcut = tt_roc["thresholds"][plotting._find_nearest(tt_roc["tpr"], 0.25)]
print(txbbcut, txttcut)

bg_yield = 0
sig_yield = 0

for key in [sig_key, "jetmet", "tau"]:
    # Tagger scores of the bb / tautau candidate jets, and mass / pT of the bb candidate.
    txbbs = get_jet_vals(taggers_dict[key]["XbbvsQCD"], taggers_dict[key]["bb_mask"])
    txtts = get_jet_vals(taggers_dict[key]["XtauhtauhvsQCD"], taggers_dict[key]["tautau_mask"])
    massbb = get_jet_vals(events_dict[key]["ak8FatJetPNetmassLegacy"], taggers_dict[key]["bb_mask"])
    ptbb = get_jet_vals(events_dict[key]["ak8FatJetPt"], taggers_dict[key]["bb_mask"])
    weights = events_dict[key]["finalWeight"]

    # Selection shared by signal and background: both tagger cuts and the bb-jet pT cut.
    tagger_cut = (txbbs > txbbcut) & (txtts > txttcut) & (ptbb > 250)

    if key == sig_key:
        # Signal is counted inside the bb-jet mass window (100, 150).
        cut = tagger_cut & (massbb > 100) & (massbb < 150)
        sig_yield = np.sum(weights[cut])
        print("Sig yield", sig_yield)
    else:
        # Background is taken from the two data mass sidebands (75, 100) and
        # (150, 175), whose combined width equals that of the signal window.
        msb1 = (massbb > 75) & (massbb < 100)
        msb2 = (massbb > 150) & (massbb < 175)
        bg_yield += np.sum(weights[tagger_cut & msb1])
        bg_yield += np.sum(weights[tagger_cut & msb2])

print("BG yield", bg_yield)

# Rough sensitivity estimate, 2 * sqrt(B) / S, computed once and then divided
# by sqrt(scale) for each projection (scale factors as chosen by the author).
limit = 2 * np.sqrt(bg_yield) / sig_yield
print("limit", limit)
print("limit scaled to 22-23 all channels", limit / np.sqrt(12))
print("limit scaled to 22-24 all channels", limit / np.sqrt(32))
print("limit scaled to Run 3 all channels", limit / np.sqrt(60))