Takes the skimmed parquet files (output of bbVVSkimmer) and evaluates the HWW Tagger.

Author(s): Raghav Kansal

In [1]:
from __future__ import annotations

import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd

# Apply the CMS plotting style from mplhep, then raise the default font size.
# NOTE(review): the two style calls below look equivalent — confirm one can be dropped.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
plt.rcParams.update({"font.size": 24})

import pickle
from pathlib import Path

import plotting
import postprocessing
import utils
from tqdm import tqdm

from HHbbVV.hh_vars import (
    nonres_samples,
    nonres_sig_keys,
    res_samples,
    res_sig_keys,
    samples,
)

# Relative base directory from which the plot directory is built.
MAIN_DIR = Path("../../../")

# Merge the non-resonant samples into the common sample map (``|`` is the dict-union operator).
samples = samples | nonres_samples

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def find_nearest(array, value):
    """Return the index of the entry of ``array`` that lies closest to ``value``.

    ``array`` is converted with ``np.asarray``.  The index comes from ``argmin`` on the
    absolute differences, so the first closest entry wins on ties and, for
    multi-dimensional input, the index refers to the flattened array.
    """
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()

In [17]:
# Output directory for every plot written by this notebook (created if missing).
plot_dir = MAIN_DIR / "plots/TaggerAnalysis/25May23Eras"
# plot_dir = MAIN_DIR / "plots/BDT/24Apr9"
plot_dir.mkdir(parents=True, exist_ok=True)

DATA_DIR = Path("/ceph/cms/store/user/rkansal/bbVV/skimmer/")

# Base directory read by the BDT cells further down (``roc_dict.pkl``, ``inferences/``).
# It used to exist only as a commented-out line, so those cells failed with a NameError.
# NOTE(review): this is the last value recorded in the notebook — confirm it still points
# at the intended BDT outputs.
samples_dir = f"{MAIN_DIR}/../data/skimmer/24Mar14UpdateData"
sig_samples_dir = DATA_DIR / "25Jan9UpdateLPFix"  # nonres
# sig_samples_dir = DATA_DIR / "25Feb6XHY"  # res
bg_samples_dir = DATA_DIR / "24Mar6AllYearsBDTVars"

# Data-taking year to load, and whether to analyse the resonant (True) or
# non-resonant (False) signal samples.
year = "2017"
RES = False

In [6]:
from math import sqrt


def p(mx, my, mh=125):
    """Evaluate the two-body decay momentum formula for a parent of mass ``mx``.

    Computes ``sqrt(lambda(mh^2, my^2, mx^2)) / (2 * mx)``, where ``lambda`` is the
    Kallen triangle function of the three squared masses.  The argument names
    suggest the decay X -> H Y with ``mh`` defaulting to 125.

    Raises:
        ValueError: from ``sqrt`` when the triangle function is negative
            (e.g. ``mx < mh + my``).
        ZeroDivisionError: when ``mx`` is zero.
    """
    quartic_terms = mh**4 + my**4 + mx**4
    triangle = (
        quartic_terms
        - 2 * (mh**2) * (mx**2)
        - 2 * (mh**2) * (my**2)
        - 2 * (mx**2) * (my**2)
    )
    return sqrt(triangle) / (2 * mx)

In [None]:
# Example: evaluate the formula for mx = 4000 and my = 80 (mh keeps its default of 125).
p(4000, 80)

In [6]:
# Non-resonant signal: restrict to the HHbbVV sample
# (this replaces the ``nonres_sig_keys`` imported from ``HHbbVV.hh_vars``).
nonres_sig_keys = ["HHbbVV"]
nonres_sig_samples = {sig: samples[sig] for sig in nonres_sig_keys}

# Resonant signal: (mX, mY) mass points to analyse.
res_mass_points = [
    (4000, 80),
    (4000, 125),
    # (4000, 150),
    (4000, 190),
    (4000, 250),
    # (4000, 300),
    (4000, 400),
    (4000, 500),
    (4000, 600),
]

# res_mass_points = [
#     (1600, 80),
#     (1600, 125),
#     (1600, 150),
#     (1600, 190),
#     (1800, 250),
# ]

# Sample keys follow the "X[mX]->H(bb)Y[mY](VV)" naming used to index ``res_samples``.
res_sig_keys = [f"X[{m_x}]->H(bb)Y[{m_y}](VV)" for m_x, m_y in res_mass_points]
res_sig_samples = {sig: res_samples[sig] for sig in res_sig_keys}

# Pick the signal set used by the rest of the notebook according to ``RES``.
if RES:
    sig_keys, sig_samples = res_sig_keys, res_sig_samples
else:
    sig_keys, sig_samples = nonres_sig_keys, nonres_sig_samples

Skip ahead to the "BDT Plots" section if you only need the BDT plots.

In [None]:
# Columns to load, as (column name, number of subcolumns).
# The signal samples additionally load "ak8FatJetHVV", which the ROC cells use for
# their generator-level selection.
load_columns = [
    ("weight", 1),
    ("weight_noTrigEffs", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetHVV", 2),
    # ("ak8FatJetParticleNetMass", 2),
    ("ak8FatJetParticleNetMD_Txbb", 2),
    ("VVFatJetParTMD_THWWvsT", 1),
    # ("VVFatJetParTMD_probHWW4q", 1),
    # ("VVFatJetParTMD_probHWW3q", 1),
    # ("VVFatJetParTMD_probQCD", 1),
    # ("VVFatJetParTMD_probT", 1),
    # ("GenHiggsChildren", 2),
]

# # Both Jet's Regressed Mass above 50
# Signal samples first.
events_dict = postprocessing.load_samples(
    sig_samples_dir,
    sig_samples,
    year,
    # filters=postprocessing.load_filters,
    columns=utils.format_columns(load_columns),
    variations=False,
)

# Backgrounds use the same columns, in the same order, minus "ak8FatJetHVV".
load_columns = [column for column in load_columns if column[0] != "ak8FatJetHVV"]

bg_events = postprocessing.load_samples(
    bg_samples_dir,
    {key: samples[key] for key in ["QCD", "TT"]},
    year,
    # filters=postprocessing.load_filters,
    columns=utils.format_columns(load_columns),
    variations=False,
)

# Signal entries first, then QCD and TT.
events_dict = {**events_dict, **bg_events}

In [None]:
# Cutflow table with one row per loaded sample; the first column is the
# "Preselection" entry filled from the "finalWeight" column.
cutflow = pd.DataFrame(index=[*events_dict])
utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
cutflow

In [9]:
# Per-sample jet-assignment masks.  Later cells select with ``~bb_masks[key]``;
# presumably True marks the bb-candidate jet, so the negation picks the VV candidate —
# confirm against ``postprocessing.bb_VV_assignment``.
bb_masks = postprocessing.bb_VV_assignment(events_dict)

### Cuts

In [10]:
# ``cuts_dict`` has the format:
# {
#     sample1: {
#         "cut1var1_min_max_cut1var2...": cut1,
#         "cut2var2...": cut2,
#         ...
#     },
#     sample2...
# }

pt_key = "Pt"
msd_key = "Msd"
var_prefix = "ak8FatJet"

# Short names used when building the cut-string keys.
cutvars_dict = {"Pt": "pt", "Msd": "msoftdrop"}

# Each entry maps a variable key to its [min, max) window; all windows of an entry are ANDed.
all_cuts = [
    {pt_key: [300, 3000]},
    # {pt_key: [400, 600], msd_key: [60, 150]},
    # {pt_key: [600, 1000], msd_key: [30, 250]},
    # {pt_key: [300, 1500], msd_key: [110, 140]},
]

var_labels = {pt_key: r"$p_T$", msd_key: r"$m_{SD}$"}

cuts_dict = {}
cut_labels = {}  # labels for plot titles, formatted as "var1label: [min, max] var2label..."

for sample, events in events_dict.items():
    sample_cuts = {}
    cuts_dict[sample] = sample_cuts
    for cutvars in all_cuts:
        # Key such as "pt_300_3000" and the matching human-readable plot label.
        cutstr = "_".join(
            f"{cutvars_dict[var]}_{low}_{high}" for var, (low, high) in cutvars.items()
        )
        label = " ".join(
            f"{var_labels[var]}: [{low}, {high}]" for var, (low, high) in cutvars.items()
        )

        # Lower bound inclusive, upper bound exclusive, for every variable of this cut.
        cuts = []
        for var, (low, high) in cutvars.items():
            values = events[f"{var_prefix}{var}"]
            cuts += [values >= low, values < high]

        # Element-wise AND of all conditions.
        sample_cuts[cutstr] = np.all(cuts, axis=0)
        cut_labels.setdefault(cutstr, label)

### Histograms

In [11]:
# Tagger scores to plot, keyed by a short tag used in output file names and as the
# innermost key of ``rocs``.  Each entry gives the plot title, the events column
# holding the score, and the line colour.
plot_vars = {
    # "th4q": {
    #     "title": "ParticleNet Non-MD Th4q",
    #     "score_label": "ak8FatJetParticleNet_Th4q",
    #     "colour": "orange",
    # },
    # "thvv4q": {
    #     "title": "ParT MD THVV",
    #     "score_label": "ak8FatJetParTMD_THWW4q",
    #     "colour": "green",
    # },
    "thvv4qt": {
        "title": r"ParT $T_{HWW}$",
        "score_label": "VVFatJetParTMD_THWWvsT",
        "colour": "green",
    },
}

In [None]:
# NOTE: a ``samples = {"tt": "TT", "qcd": "QCD", "HHbbVV": "HHbbVV"}`` assignment used to
# open this cell.  The mapping was never read, and it overwrote the ``samples`` dictionary
# from ``HHbbVV.hh_vars`` that the loading cells index, so re-running those cells after
# this one failed.  It has been removed.

# NOTE(review): the style is applied *after* the font-size update, so the style's own
# font size may override the 16 set here — confirm the intended order.
plt.rcParams.update({"font.size": 16})
plt.style.use(hep.style.CMS)
hep.style.use("CMS")

# One normalised score histogram per (tagger, cut), overlaying signal and QCD.
for t, pvars in plot_vars.items():
    for cutstr in cut_labels:
        plt.figure(figsize=(16, 12))
        # plt.suptitle(f"HVV FatJet {pvars['title']} Scores", y=0.95)
        # plt.title(cut_labels[cutstr], fontsize=20)

        # ``skip`` thins the sample by keeping every skip-th selected entry.
        for sample, colour, skip in [("HHbbVV", "red", 1), ("QCD", "deepskyblue", 4)]:
            selection = cuts_dict[sample][cutstr]
            _ = plt.hist(
                events_dict[sample][pvars["score_label"]][selection][::skip],
                histtype="step",
                bins=np.linspace(0, 1, 101),
                label=f"{sample}",
                linewidth=2,
                color=colour,
                density=True,
                weights=events_dict[sample]["weight"][selection][::skip],
            )

        plt.ylabel("# Jets (A.U.)")
        plt.xlabel(f"{pvars['title']} Score")
        plt.legend()
        plt.savefig(
            f"{plot_dir}/{t}_hist_{cutstr}.pdf",
            bbox_inches="tight",
        )

### ROCs

In [None]:
from scipy import integrate
from sklearn.metrics import auc, roc_curve

# ROC curves, stored as rocs[cutstr][sig_key][bg_label][tagger] with the keys
# "fpr", "tpr", "thresholds", "auc" and "label".
rocs = {}
# sig_key = "HHbbVV"
tot_bg_keys = ["TT", "QCD"]
bg_skip = 1  # keep every bg_skip-th background entry
weight_key = "finalWeight"

# Background groupings: all backgrounds together, then each one on its own.
bg_groups = {"Combined": tot_bg_keys} | {bg_key: [bg_key] for bg_key in tot_bg_keys}

for cutstr in cut_labels:
    print(cutstr)
    rocs[cutstr] = {}
    # for sig_key in tqdm(nonres_sig_keys + res_sig_keys):
    for sig_key in tqdm(sig_keys):
        # Keep signal events whose ak8FatJetHVV flags agree with the negated bb mask
        # for every jet.
        gensel = np.all(
            events_dict[sig_key]["ak8FatJetHVV"].to_numpy().astype(bool)
            == ~bb_masks[sig_key].to_numpy(),
            axis=1,
        )
        rocs[cutstr][sig_key] = {}
        sig_cut = cuts_dict[sig_key][cutstr][~bb_masks[sig_key]] * gensel
        for bg_label, bg_keys in bg_groups.items():
            rocs[cutstr][sig_key][bg_label] = {}
            bg_cuts = {bg_key: cuts_dict[bg_key][cutstr][~bb_masks[bg_key]] for bg_key in bg_keys}

            sig_weights = events_dict[sig_key][weight_key][sig_cut]
            weights = np.concatenate(
                [sig_weights]
                + [
                    events_dict[bg_key][weight_key][bg_cuts[bg_key]][::bg_skip]
                    for bg_key in bg_keys
                ],
            )

            # Labels are sized from the arrays actually concatenated above.  The previous
            # ceil(sum(len) / bg_skip) disagreed with the per-sample ``[::bg_skip]`` slices
            # when bg_skip > 1 and several backgrounds were combined.
            n_sig = len(sig_weights)
            y_true = np.concatenate([np.ones(n_sig), np.zeros(len(weights) - n_sig)])
            # print(y_true[np.sum(sig_cut):])

            for t, pvars in plot_vars.items():
                score_label = pvars["score_label"]
                scores = np.concatenate(
                    [events_dict[sig_key][score_label][sig_cut]]
                    + [
                        events_dict[bg_key][score_label][bg_cuts[bg_key]][::bg_skip]
                        for bg_key in bg_keys
                    ],
                )
                # print(scores[np.sum(sig_cut):])
                fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)
                rocs[cutstr][sig_key][bg_label][t] = {
                    "fpr": fpr,
                    "tpr": tpr,
                    "thresholds": thresholds,
                    # ``integrate.trapz`` is deprecated/removed in recent SciPy;
                    # ``trapezoid`` is the same function under its current name.
                    "auc": integrate.trapezoid(tpr, fpr),
                    "label": bg_label,
                }

#### All signals in same plot

Resonant Y->WW plots for JME paper

In [None]:
# ROC curves of every signal in one figure, per background, written once with and once
# without the preliminary label.
# NOTE(review): ``cutstr`` must be a key of ``rocs``; this key only exists when the
# matching pT/mSD entry of ``all_cuts`` is enabled.
cutstr = "pt_600_1000_msoftdrop_30_250"
t = "thvv4qt"

for plabel, prelim in zip(["prelim_", ""], [True, False]):
    for bkey, blabel in zip(["QCD", "TT"], ["QCD", r"t$\rightarrow$bW"]):
        procs = {"all": {}}
        for skey in sig_keys:
            # Copy so that relabelling does not modify the entry stored in ``rocs``.
            roc = rocs[cutstr][skey][bkey][t].copy()
            # roc["fpr"] = roc["fpr"]  # mass reweighting
            # Extract mY from keys of the form "X[mX]->H(bb)Y[mY](VV)"; keys without
            # "Y[" (e.g. the non-resonant signal) raise IndexError here.
            mY = skey.split("Y[")[1].split("]")[0]
            roc["label"] = rf"$m_Y = {mY}$ GeV"
            procs["all"][skey] = roc

        plotting.multiROCCurve(
            procs,
            [],
            title=rf"Y$\rightarrow$WW 4q vs {blabel}",
            xlim=[0, 1],
            ylim=[1e-4, 1],
            year="all",  # this is just to not plot any year at all
            kin_label=r"600 < $p_T$ < 1000 GeV, |$\eta$| < 2.4" "\n" r"$m_{SD}>30$ GeV",
            plot_dir=plot_dir,
            name=f"{plabel}XHY_ROC_{bkey}",
            prelim=prelim,
            show=prelim,
        )

    #     break
    # break

Resonant Y->WW plots for B2G search

In [None]:
# Resonant-signal ROC curves against QCD, top and the combined background, written
# once with and once without the preliminary label.
cutstr = "pt_300_3000"
t = "thvv4qt"

background_labels = {"QCD": "QCD", "TT": r"t$\rightarrow$bW", "Combined": r"QCD+t"}

for plabel, prelim in (("prelim_", True), ("", False)):
    for bkey, blabel in background_labels.items():
        procs = {"all": {}}
        for skey in sig_keys:
            # Work on a copy so the relabelling leaves ``rocs`` untouched.
            roc = dict(rocs[cutstr][skey][bkey][t])
            # roc["fpr"] = roc["fpr"]  # mass reweighting
            # mY is the text between "Y[" and the following "]" of the sample key.
            mY = skey.split("Y[")[1].split("]")[0]
            roc["label"] = rf"$m_Y = {mY}$ GeV"
            procs["all"][skey] = roc

        plotting.multiROCCurve(
            procs,
            [0.6],
            # title=rf"Y$\rightarrow$WW 4q vs {blabel}",
            xlim=[0, 1],
            ylim=[1e-4, 1],
            year="all",  # this is just to not plot any year at all
            # kin_label=r"600 < $p_T$ < 1000 GeV, |$\eta$| < 2.4" "\n" r"$m_{SD}>30$ GeV",
            plot_dir=plot_dir,
            name=f"{plabel}XHY_ROC_{bkey}",
            prelim=prelim,
            show=prelim,
        )

    #     break
    # break

SM HH samples

In [None]:
# SM HH ROC curves against QCD, top and the combined background, written once with
# and once without the preliminary label.
cutstr = "pt_300_3000"
t = "thvv4qt"

for plabel, prelim in zip(["prelim_", ""], [True, False]):
    for bkey, blabel in zip(["QCD", "TT", "Combined"], ["QCD", r"t$\rightarrow$bW", r"QCD+t"]):
        procs = {"all": {}}
        for skey in sig_keys:
            # Copy so that relabelling does not modify the entry stored in ``rocs``.
            roc = rocs[cutstr][skey][bkey][t].copy()
            # roc["fpr"] = roc["fpr"]  # mass reweighting
            roc["label"] = "SM HWW"
            procs["all"][skey] = roc

        plotting.multiROCCurve(
            procs,
            [0.6],
            # title=rf"Y$\rightarrow$WW 4q vs {blabel}",
            xlim=[0, 1],
            ylim=[1e-4, 1],
            year="all",  # this is just to not plot any year at all
            # kin_label=r"600 < $p_T$ < 1000 GeV, |$\eta$| < 2.4" "\n" r"$m_{SD}>30$ GeV",
            plot_dir=plot_dir,
            # Own file name: the copy-pasted "XHY_ROC" name made these SM HH plots
            # overwrite (and be mistaken for) the resonant X->HY ROC plots.
            name=f"{plabel}HHbbVV_ROC_{bkey}",
            prelim=prelim,
            show=prelim,
        )

    #     break
    # break

## ROC using Congqiao's samples

In [50]:
import uproot

# Directory holding the inference output files read below.
glopart_dir = "/ceph/cms/store/user/rkansal/bbVV/coli/glopart_inference_files"

# Range tags of the QCD files ("pred_qcd<tag>.root").
qcd_bins = [
    "170to300",
    "300to470",
    "470to600",
    "600to800",
    "800to1000",
    "1000to1400",
    "1400to1800",
    "1800to2400",
    "2400to3200",
    "3200toinf",
]

qcd_files = [f"{glopart_dir}/pred_qcd{bin_tag}.root" for bin_tag in qcd_bins]
ttbar_files = [f"{glopart_dir}/pred_ttbar.root"]

# Replace the QCD and TT entries of ``events_dict`` with lazily-read arrays from these files.
# NOTE(review): ``uproot.lazy`` is an uproot 4 API (uproot 5 replaced it with
# ``uproot.dask``) — confirm the installed version.
events_dict["QCD"] = uproot.lazy(qcd_files)
events_dict["TT"] = uproot.lazy(ttbar_files)

In [None]:
import awkward as ak

# Kinematic window (exclusive on both sides) applied to "fj_pt" and "fj_sdmass".
pt_cuts = [600, 1000]
msd_cuts = [50, 250]

# Output-class labels whose probabilities are summed into the three scores below.
WW4q = [f"HWqqWqq{n}c" for n in "012"]
WW3q = [f"HWqqWq{n}c" for n in "012"]
TTBARbWall = [
    f"Top{n}"
    for n in ["bWqq0c", "bWqq1c", "bWq0c", "bWq1c", "bWev", "bWmv", "bWtauev", "bWtaumv", "bWtauhv"]
]
QCD = [f"QCD{n}" for n in ["b", "bb", "c", "cc", "others"]]

prob_prefix = "pfMassDecorrelatedInclParticleTransformerV1JetTags_prob"


def _summed_prob(evts, labels):
    """Return the entry-wise sum of the ``prob_prefix + label`` branches of ``evts``."""
    return np.sum([evts[f"{prob_prefix}{label}"].to_numpy() for label in labels], axis=0)


for key in ["QCD", "TT"]:
    print(key)
    events = events_dict[key]

    pt = events["fj_pt"]
    msd = events["fj_sdmass"]
    cut = (pt > pt_cuts[0]) & (pt < pt_cuts[1]) & (msd > msd_cuts[0]) & (msd < msd_cuts[1])

    events = events[cut]
    score_hww = _summed_prob(events, WW4q + WW3q)
    score_tt = _summed_prob(events, TTBARbWall)
    score_qcd = _summed_prob(events, QCD)

    # Discriminant HWW / (HWW + top + QCD), stored under the column name the ROC cells read.
    events["VVFatJetParTMD_THWWvsT"] = score_hww / (score_hww + score_tt + score_qcd)
    events_dict[key] = events

In [None]:
# Scratch check: the selected signal weights as a flat array.
# Relies on ``sig_key``, ``weight_key`` and ``sig_cut`` left over from a previously run ROC cell.
events_dict[sig_key][weight_key][sig_cut].to_numpy().flatten()

In [None]:
from scipy import integrate
from sklearn.metrics import auc, roc_curve

# ROC curves against the alternative background samples, stored as
# rocs[cutstr][sig_key][bg_label][tagger].  Backgrounds enter unweighted (weight 1)
# and without a further cut here.
rocs = {}
# sig_key = "HHbbVV"
tot_bg_keys = ["TT", "QCD"]
bg_skip = 1  # keep every bg_skip-th background entry
weight_key = "weight_noTrigEffs"

# Background groupings: all backgrounds together, then each one on its own.
bg_groups = {"Combined": tot_bg_keys} | {bg_key: [bg_key] for bg_key in tot_bg_keys}

for cutstr in cut_labels:
    # Only this kinematic window is evaluated in this cell.
    if cutstr != "pt_600_1000_msoftdrop_30_250":
        continue

    print(cutstr)
    rocs[cutstr] = {}
    # for sig_key in tqdm(nonres_sig_keys + res_sig_keys):
    for sig_key in tqdm(sig_keys):
        rocs[cutstr][sig_key] = {}
        sig_cut = cuts_dict[sig_key][cutstr][~bb_masks[sig_key]]
        for bg_label, bg_keys in bg_groups.items():
            rocs[cutstr][sig_key][bg_label] = {}
            # bg_cuts = [cuts_dict[bg_key][cutstr] for bg_key in bg_keys]

            sig_weights = events_dict[sig_key][weight_key][sig_cut].to_numpy().flatten()
            weights = np.concatenate(
                [sig_weights]
                + [np.ones(len(events_dict[bg_key]))[::bg_skip] for bg_key in bg_keys],
            )

            # Labels are sized from the arrays actually concatenated above.  The previous
            # ceil(sum(len) / bg_skip) disagreed with the per-sample ``[::bg_skip]`` slices
            # when bg_skip > 1 and several backgrounds were combined.
            n_sig = len(sig_weights)
            y_true = np.concatenate([np.ones(n_sig), np.zeros(len(weights) - n_sig)])
            # print(y_true[np.sum(sig_cut):])

            for t, pvars in plot_vars.items():
                score_label = pvars["score_label"]
                scores = np.concatenate(
                    [events_dict[sig_key][score_label].to_numpy().flatten()[sig_cut]]
                    + [
                        events_dict[bg_key][score_label].to_numpy()[::bg_skip] for bg_key in bg_keys
                    ],
                )
                # print(scores[np.sum(sig_cut):])
                fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)
                rocs[cutstr][sig_key][bg_label][t] = {
                    "fpr": fpr,
                    "tpr": tpr,
                    "thresholds": thresholds,
                    # ``integrate.trapz`` is deprecated/removed in recent SciPy;
                    # ``trapezoid`` is the same function under its current name.
                    "auc": integrate.trapezoid(tpr, fpr),
                    "label": bg_label,
                }

#### All signals in same plot

Resonant Y->WW plots

In [None]:
# ROC curves of every signal in one figure, per background, written once with and once
# without the preliminary label.
cutstr = "pt_600_1000_msoftdrop_30_250"
t = "thvv4qt"

for plabel, prelim in zip(["prelim_", ""], [True, False]):
    for bkey, blabel in zip(["QCD", "TT"], ["QCD", r"t$\rightarrow$bW"]):
        procs = {"all": {}}
        for skey in sig_keys:
            # Copy before relabelling, as the other ROC plotting cells do; without it
            # the "label" stored in ``rocs`` was overwritten in place.
            roc = rocs[cutstr][skey][bkey][t].copy()
            # mY is the text between "Y[" and the following "]" of the sample key.
            mY = skey.split("Y[")[1].split("]")[0]
            roc["label"] = rf"$m_Y = {mY}$ GeV"
            procs["all"][skey] = roc

        plotting.multiROCCurve(
            procs,
            [],
            title=rf"Y$\rightarrow$WW 4q vs {blabel}",
            xlim=[0, 1],
            ylim=[1e-4, 1],
            year="all",  # this is just to not plot any year at all
            kin_label=r"600 < $p_T$ < 1000 GeV, |$\eta$| < 2.4" "\n" r"$m_{SD}>30$ GeV",
            plot_dir=plot_dir,
            name=f"colisamples_{plabel}XHY_ROC_{bkey}",
            prelim=prelim,
            show=prelim,
        )

    #     break
    # break

In [None]:
# roc = rocs[cutstr][sig_key]["Combined"][t]
# plotting.rocCurve(roc["fpr"], roc["tpr"], show=True, plot_dir=plot_dir, name="THVV", log=False, auc=roc["auc"])

# One ROC curve per background grouping for a single signal.
# Relies on ``cutstr``, ``sig_key`` and ``t`` as left by previously run cells.
bg_rocs = {key: val[t] for key, val in rocs[cutstr][sig_key].items()}
plotting.multiROCCurveGrey(
    {"all": bg_rocs},
    [],
    ylim=[1e-4, 1],
    xlim=[0, 1],
    show=True,
    plot_dir=plot_dir,
    log=True,
    name="THVV_sep_bgs",
)

In [None]:
# ROC curves for groups of signals with markers at selected score thresholds.
# NOTE(review): this cell looks stale relative to the ROC cells of this notebook:
#   * ``sig_splits`` is not defined anywhere in the visible notebook;
#   * it indexes ``rocs[cutstr][sig_key][t]``, whereas the ROC cells store
#     ``rocs[cutstr][sig_key][bg_label][t]``.
# Confirm before running.
xlim = [0, 0.8]
ylim = [1e-6, 1]

# Score thresholds to mark on each curve, with one marker colour per threshold.
plot_thresholds = [0.98, 0.96, 0.94, 0.9, 0.8, 0.6, 0.4]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3", "#ff7f00", "#7CB518", "#EDB458", "#36213E"]

plt.rcParams.update({"font.size": 24})

# Line colours, indexed by the position of the signal within its split.
sig_colours = [
    "#23CE6B",
    "#ffbaba",
    "#ff7b7b",
    "#ff5252",
    # "#EDB458",
    "#a70000",
    "#885053",
    "#3C0919",
]

# NOTE(review): this redefines the notebook-wide ``plot_vars`` with a different
# "score_label" than the earlier definition — confirm this is intended.
plot_vars = {
    # "th4q": {
    #     "title": "ParticleNet Non-MD Th4q",
    #     "score_label": "ak8FatJetParticleNet_Th4q",
    #     "colour": "orange",
    # },
    # "thvv4q": {
    #     "title": "ParT MD THWW",
    #     "score_label": "ak8FatJetParTMD_THWW4q",
    #     "colour": "green",
    # },
    "thvv4qt": {
        "title": "ParT MD THVV",
        "score_label": "ak8FatJetParTMD_THWWvsT",
        "colour": "green",
    },
}


for cutstr in cut_labels:
    for t, pvars in plot_vars.items():
        for j, plot_sig_keys in enumerate(sig_splits):
            split_str = "allsigs" if len(sig_splits) == 1 else f"sigs{j}"
            # Per threshold: [signal efficiencies, background efficiencies], one entry per signal.
            pths = {th: [[], []] for th in plot_thresholds}
            plt.figure(figsize=(12, 12))
            for i, sig_key in enumerate(plot_sig_keys):
                roc = rocs[cutstr][sig_key][t]
                c = sig_colours[i]
                plt.plot(
                    roc["tpr"],
                    roc["fpr"],
                    # label=f"{sig_key} AUC: {roc['auc']:.2f}",
                    label=f"{sig_key}",
                    linewidth=2,
                    color=c,
                )
                for th in plot_thresholds:
                    # Working point whose ROC threshold is closest to ``th``.
                    idx = find_nearest(roc["thresholds"], th)
                    pths[th][0].append(roc["tpr"][idx])
                    pths[th][1].append(roc["fpr"][idx])

                # plt.vlines(
                #     x=roc["tpr"][np.searchsorted(roc["fpr"], 0.01)],
                #     ymin=0,
                #     ymax=0.01,
                #     colors=c,
                #     linestyles="dashed",
                # )

            # One scatter series per threshold, drawn on top of the curves.
            for k, th in enumerate(plot_thresholds):
                plt.scatter(
                    *pths[th],
                    marker="o",
                    s=40,
                    label=f"THVV > {th:.2f}",
                    color=th_colours[k],
                    zorder=100,
                )
            hep.cms.label(data=False, rlabel="")
            # plt.hlines(y=0.01, xmin=0, xmax=1, colors="lightgrey", linestyles="dashed")
            plt.yscale("log")
            plt.xlabel("Signal efficiency")
            plt.ylabel("Background efficiency")
            # plt.suptitle(f"HVV FatJet {pvars['title']} ROC", y=0.95)
            # plt.title(cut_labels[cutstr], fontsize=20)
            plt.xlim(*xlim)
            plt.ylim(*ylim)
            plt.legend(loc="lower right")
            plt.savefig(f"{plot_dir}/roccurve_{split_str}_{t}_{cutstr}.pdf", bbox_inches="tight")

Old vs New Tagger in same plot

In [None]:
# All taggers of ``plot_vars`` on one ROC figure per cut, with a dashed marker at a
# background efficiency of 0.01.
# NOTE(review): this indexes ``rocs[cutstr][t]``, whereas the ROC cells of this notebook
# store ``rocs[cutstr][sig_key][bg_label][t]`` — confirm before running.
xlim = [0, 0.6]
ylim = [1e-6, 1]

for cutstr in cut_labels:
    plt.figure(figsize=(12, 12))
    for t, pvars in plot_vars.items():
        # Every 10th ROC point is drawn.
        plt.plot(
            rocs[cutstr][t]["tpr"][::10],
            rocs[cutstr][t]["fpr"][::10],
            label=f"{pvars['title']} AUC: {rocs[cutstr][t]['auc']:.2f}",
            linewidth=2,
            color=pvars["colour"],
        )
        # Vertical line at the signal efficiency where the background efficiency reaches 0.01.
        plt.vlines(
            x=rocs[cutstr][t]["tpr"][np.searchsorted(rocs[cutstr][t]["fpr"], 0.01)],
            ymin=0,
            ymax=0.01,
            colors=pvars["colour"],
            linestyles="dashed",
        )
    plt.hlines(y=0.01, xmin=0, xmax=1, colors="lightgrey", linestyles="dashed")
    plt.yscale("log")
    plt.xlabel("Signal Eff.")
    plt.ylabel("BG Eff.")
    plt.suptitle("HVV FatJet ROC Curves", y=0.95)
    plt.title(cut_labels[cutstr], fontsize=20)
    plt.xlim(*xlim)
    plt.ylim(*ylim)
    plt.legend()
    plt.savefig(f"{plot_dir}/roccurve_{cutstr}.pdf", bbox_inches="tight")

### BDT Plots

Checking thresholds

In [None]:
# Load the pickled ROC dictionary of the BDT training into ``roc``.
# Requires ``samples_dir`` to be defined beforehand (see the configuration cell).
# NOTE: ``pickle.load`` executes arbitrary code — only open files you produced yourself.
with open(f"{samples_dir}/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/roc_dict.pkl", "rb") as f:
    roc = pickle.load(f)

In [None]:
# Plot the loaded BDT ROC with two lists of score thresholds passed to ``multiROCCurve``.
plotting.multiROCCurve(
    {"test": roc},
    thresholds=[[0.98, 0.99, 0.994, 0.995, 0.9965], [0.99, 0.997, 0.998, 0.999, 0.9997]],
    plot_dir=plot_dir,
    name="roc_thresholds",
    show=True,
)

In [None]:
# BDT ROC curve (from ``roc``) with markers at selected BDT score thresholds.
xlim = [0, 0.3]
ylim = [1e-6, 1e-2]

# plot_thresholds = [0.6, 0.9, 0.96, 0.99, 0.997, 0.998, 0.999]
plot_thresholds = [0.997, 0.998, 0.999, 0.9993, 0.9995, 0.9997, 0.9999]
# One marker colour per threshold.
th_colours = [
    "#36213E",
    "#9381FF",
    "#1f78b4",
    # "#a6cee3",
    # "#32965D",
    "#7CB518",
    "#EDB458",
    "#ff7f00",
    "#a70000",
]

roc_colours = [
    "#23CE6B",
    "#a70000",
]

plt.rcParams.update({"font.size": 24})

plt.figure(figsize=(12, 12))
c = roc_colours[0]

plt.plot(
    roc["tpr"],
    roc["fpr"],
    # label=roc_labels[inf],
    linewidth=2,
    color=c,
)

# Per threshold: [[signal efficiency], [background efficiency]] at the ROC point whose
# threshold is closest to it.
pths = {}
for th in plot_thresholds:
    idx = find_nearest(roc["thresholds"], th)
    pths[th] = [[roc["tpr"][idx]], [roc["fpr"][idx]]]

for th, th_colour in zip(plot_thresholds, th_colours):
    plt.scatter(
        *pths[th],
        marker="o",
        s=40,
        label=f"BDT Score > {th}",
        color=th_colour,
        zorder=100,
    )

hep.cms.label(data=False, rlabel="")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.xlim(*xlim)
plt.ylim(*ylim)
plt.legend(loc="lower right")
# plt.savefig(f"{plot_dir}/roc_bdt.pdf", bbox_inches="tight")
plt.show()

Comparing binary vs multi-class

In [None]:
# Load the ROC arrays saved as text files for the multiclass inference.
# Requires ``samples_dir`` to be defined beforehand (see the configuration cell).
inferences_dir = f"{samples_dir}/inferences/"
inferences_dir_binary = f"{samples_dir}/inferences_binary/"
rocs = {"inferences": {}, "inferences_binary": {}}

for arr in ["tpr", "fpr", "thresholds"]:
    print(arr)
    rocs["inferences"][arr] = np.loadtxt(f"{inferences_dir}/{arr}.txt")
    # Loading of the binary inference is disabled, so rocs["inferences_binary"] stays empty.
    # rocs["inferences_binary"][arr] = np.loadtxt(f"{inferences_dir_binary}/{arr}.txt")

# Legend labels per inference set.
roc_labels = {"inferences": "Multiclass", "inferences_binary": "Binary"}

In [None]:
# Overlay the multiclass and binary BDT ROC curves.
xlim = [0, 1]
ylim = [1e-6, 1]

roc_colours = [
    "#23CE6B",
    "#a70000",
]

plt.rcParams.update({"font.size": 24})

# (The unused ``pths`` dictionary that used to be built here was dropped; it only made
# the cell depend on a ``plot_thresholds`` left over from another cell.)
plt.figure(figsize=(12, 12))
for i, inf in enumerate(["inferences", "inferences_binary"]):
    roc = rocs[inf]
    if "tpr" not in roc or "fpr" not in roc:
        # Nothing was loaded for this inference set; skip it instead of failing
        # with a KeyError.
        continue
    c = roc_colours[i]

    plt.plot(
        roc["tpr"],
        roc["fpr"],
        label=roc_labels[inf],
        linewidth=2,
        color=c,
    )

hep.cms.label(data=False, rlabel="")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.xlim(*xlim)
plt.ylim(*ylim)
plt.legend(loc="upper left")
plt.savefig(f"{plot_dir}/roc_multi_binary_bdt.pdf", bbox_inches="tight")

Comparing train vs test

In [None]:
# Load the pickled train and test ROC dictionaries into ``rocs["train"]`` / ``rocs["test"]``.
# Requires ``samples_dir`` to be defined beforehand (see the configuration cell).
# NOTE: ``pickle.load`` executes arbitrary code — only open files you produced yourself.
inferences_dir = f"{samples_dir}/inferences/"
rocs = {}

for inf in ["train", "test"]:
    with open(f"{inferences_dir}/23_11_02_{inf}_roc_dict.pkl", "rb") as f:
        rocs[inf] = pickle.load(f)

# Legend labels per ROC set.
roc_labels = {"train": "Train", "test": "Test"}

In [None]:
# Overlay the train and test BDT ROC curves with markers at selected score thresholds.
xlim = [0, 1]
ylim = [1e-6, 1]

plot_thresholds = [0.6, 0.9, 0.96, 0.99, 0.997, 0.998, 0.999]
# One marker colour per threshold.
th_colours = [
    "#36213E",
    "#9381FF",
    "#1f78b4",
    # "#a6cee3",
    # "#32965D",
    "#7CB518",
    "#EDB458",
    "#ff7f00",
    "#a70000",
]

roc_colours = [
    "#23CE6B",
    "#a70000",
]

plt.rcParams.update({"font.size": 24})

# Per threshold: [signal efficiencies, background efficiencies], one entry per curve.
pths = {th: [[], []] for th in plot_thresholds}
plt.figure(figsize=(12, 12))
# Iterate over the keys the loading cell fills ("train", "test").  The previous
# ["inferences"] key does not exist in that ``rocs`` and raised a KeyError.
for i, inf in enumerate(["train", "test"]):
    roc = rocs[inf]
    c = roc_colours[i]

    plt.plot(
        roc["tpr"],
        roc["fpr"],
        # Label the curves now that more than one is drawn.
        label=roc_labels.get(inf, inf),
        linewidth=2,
        color=c,
    )

    for th in plot_thresholds:
        # Working point whose ROC threshold is closest to ``th``.
        idx = find_nearest(roc["thresholds"], th)
        pths[th][0].append(roc["tpr"][idx])
        pths[th][1].append(roc["fpr"][idx])

for k, th in enumerate(plot_thresholds):
    plt.scatter(
        *pths[th],
        marker="o",
        s=40,
        label=f"BDT Score > {th}",
        color=th_colours[k],
        zorder=100,
    )

hep.cms.label(data=False, rlabel="")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.xlim(*xlim)
plt.ylim(*ylim)
plt.legend(loc="lower right")
# plt.savefig(f"{plot_dir}/roc_bdt.pdf", bbox_inches="tight")

Trimming features

In [None]:
# Hard-coded results of the feature-trimming study, stored flat as consecutive
# triplets; the name suggests the order (fpr, tpr, threshold) per triplet.
fpr_tpr_th = np.array(
    [
        6.841912584548129e-05,
        0.06309407980527611,
        0.997972309589386,
        5.5970691547507736e-05,
        0.06309112513484341,
        0.9981555342674255,
        6.062037416775686e-05,
        0.06309433205678786,
        0.9980131387710571,
        6.282697172598808e-05,
        0.06309181657972177,
        0.9981679916381836,
        7.333505621545634e-05,
        0.06308829354373516,
        0.9979708790779114,
        7.116201530873543e-05,
        0.06309218251750907,
        0.9979798197746277,
        7.2475727068077e-05,
        0.06310060756610338,
        0.9979470372200012,
        7.738615489911652e-05,
        0.06309448855598651,
        0.9979099631309509,
        8.047411596104979e-05,
        0.06309097845150874,
        0.9977815747261047,
        8.503279607366034e-05,
        0.06308554741097783,
        0.9978626370429993,
        8.495399767538146e-05,
        0.06309224241230124,
        0.9976242184638977,
        8.807786735948429e-05,
        0.06308749819238176,
        0.9974913597106934,
        0.00034950987699599445,
        0.06309245484995998,
        0.9928722381591797,
    ]
)

# First value of every triplet.
fpr = fpr_tpr_th[::3]

In [None]:
# Plot ``fpr`` against its array index, labelled as the number of features removed.
plt.figure(figsize=(12, 8))
plt.plot(fpr)
plt.xlabel("Features Removed")
plt.ylabel("Background efficiency")
# Scientific notation on the y axis.
plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig(f"{plot_dir}/feats_removed.pdf", bbox_inches="tight")

In [None]:
# Hard-coded (features removed, median expected upper limit) pairs.
uls = np.array(
    [[0, 53.25], [1, 50.25], [3, 49.5], [4, 49.9375], [6, 50.25], [8, 52.75], [11, 58.8]]
)

plt.figure(figsize=(12, 8))
# First column on the x axis, second column on the y axis.
n_removed, upper_limits = uls[:, 0], uls[:, 1]
plt.plot(n_removed, upper_limits)
plt.xlabel("Features Removed")
plt.ylabel("Median expected upper limit xSM")
# plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
plt.savefig(f"{plot_dir}/feats_removed_uls.pdf", bbox_inches="tight")

## WW vs ZZ Check

In [None]:
# Split the signal by the generator-level Higgs children: events with any child equal
# to 23 are labelled ZZ, events with any child equal to 24 are labelled WW.
# Requires the "GenHiggsChildren" column to have been loaded for the signal sample.
sig_key = "HHbbVV"
zz_events = np.any(events_dict[sig_key]["GenHiggsChildren"] == 23, axis=1)
ww_events = np.any(events_dict[sig_key]["GenHiggsChildren"] == 24, axis=1)

In [None]:
from scipy import integrate
from sklearn.metrics import auc, roc_curve

# ROC curves separately for the ZZ and WW signal events, stored as
# rocs[cutstr][slabel][bg_label][tagger].  No kinematic cut is applied here, so the
# curves are identical for every ``cutstr``.
rocs = {}
sig_key = "HHbbVV"
tot_bg_keys = ["TT", "QCD"]
bg_skip = 1  # keep every bg_skip-th background entry
weight_key = "finalWeight"

# Background groupings: all backgrounds together, then each one on its own.
bg_groups = {"Combined": tot_bg_keys} | {bg_key: [bg_key] for bg_key in tot_bg_keys}

for cutstr in cut_labels:
    # print(cutstr)
    rocs[cutstr] = {}
    # for sig_key in tqdm(nonres_sig_keys + res_sig_keys):
    for slabel, sig_sel in zip(["ZZ", "WW"], [zz_events, ww_events]):
        rocs[cutstr][slabel] = {}
        # sig_cut = cuts_dict[sig_key][cutstr]
        for bg_label, bg_keys in bg_groups.items():
            rocs[cutstr][slabel][bg_label] = {}
            # bg_cuts = [cuts_dict[bg_key][cutstr] for bg_key in bg_keys]

            sig_weights = events_dict[sig_key][weight_key][sig_sel]
            weights = np.concatenate(
                [sig_weights]
                + [events_dict[bg_key][weight_key][::bg_skip] for bg_key in bg_keys],
            )

            # Labels are sized from the arrays actually concatenated above.  The previous
            # ceil(sum(len) / bg_skip) disagreed with the per-sample ``[::bg_skip]`` slices
            # when bg_skip > 1 and several backgrounds were combined.
            n_sig = len(sig_weights)
            y_true = np.concatenate([np.ones(n_sig), np.zeros(len(weights) - n_sig)])

            for t, pvars in plot_vars.items():
                score_label = pvars["score_label"]
                scores = np.concatenate(
                    [events_dict[sig_key][score_label][sig_sel]]
                    + [events_dict[bg_key][score_label][::bg_skip] for bg_key in bg_keys],
                )
                # print(scores[np.sum(sig_cut):])
                fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)
                rocs[cutstr][slabel][bg_label][t] = {
                    "fpr": fpr,
                    "tpr": tpr,
                    "thresholds": thresholds,
                    # ``integrate.trapz`` is deprecated/removed in recent SciPy;
                    # ``trapezoid`` is the same function under its current name.
                    "auc": integrate.trapezoid(tpr, fpr),
                    "label": slabel,
                }

In [None]:
# Overlay the ZZ and WW ROC curves against the combined background.
# Relies on ``cutstr`` and ``t`` as left by the preceding ROC loop.
bg_rocs = {key: val["Combined"][t] for key, val in rocs[cutstr].items()}
plotting.multiROCCurve(
    {"all": bg_rocs},
    [0.6],
    ylim=[1e-4, 1],
    xlim=[0, 1],
    show=True,
    plot_dir=plot_dir,
    # log=True,
    name="THVV_sep_zzww",
)

In [None]:
# Scratch calculation: ratio of two hard-coded numbers (their origin is not recorded here).
548 / 426

## Plotting different years

Using outputs produced by `for year in 2016 2016APV 2017 2018; do python3 InferenceAnalysis.py --year $year --plots-tag 25May23Eras; done`

In [None]:
years = ["2016", "2016APV", "2017", "2018"]

# Load ROCs for each year.
# The loop variable is deliberately not called ``year``: that name holds the
# notebook-wide year setting read by the sample-loading cell, and looping over it
# silently left it at the last entry of ``years``.
# NOTE: ``pickle.load`` executes arbitrary code — only open files you produced yourself.
year_rocs = {}
for roc_year in years:
    with open(plot_dir / roc_year / f"{roc_year}__ROC_QCD.pkl", "rb") as f:
        year_rocs[roc_year] = pickle.load(f)

# Collect the ROCs of all years into a single group.
combined_rocs = {"all": {}}
for roc_year in years:
    for skey, roc in year_rocs[roc_year]["all"].items():
        # Create a unique key for each year+signal combination
        year_skey = f"{skey}_{roc_year}"
        # Copy so that relabelling does not modify the loaded dictionaries.
        combined_rocs["all"][year_skey] = roc.copy()
        combined_rocs["all"][year_skey]["label"] = f"{skey} ({roc_year})"

# Plot combined ROCs
plotting.multiROCCurve(
    combined_rocs,
    [],
    ylim=[1e-4, 1],
    xlim=[0, 1],
    year="all",
    plot_dir=plot_dir,
    name="ROC_QCD_all_years",
    show=True,
)