In [None]:
from __future__ import annotations

import argparse
import itertools
import json
import os
import pickle
import sys
import warnings
from collections import OrderedDict
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path

import corrections

# from pandas.errors import SettingWithCopyWarning
import hist
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import mplhep as hep
import plotting
import postprocessing
import utils
from corrections import get_lpsf, postprocess_lpsfs
from hist import Hist
from regions import (
    Region,
    get_nonres_selection_regions,
    get_nonres_vbf_selection_regions,
    get_res_selection_regions,
)
from utils import ShapeVar

from HHbbVV import hh_vars
from HHbbVV.hh_vars import (
    bg_keys,
    data_key,
    hbb_bg_keys,
    jec_shifts,
    jmsr_shifts,
    nonres_samples,
    nonres_sig_keys,
    norm_preserving_weights,
    qcd_key,
    res_samples,
    res_sig_keys,
    samples,
    years,
)

# Select the CMS plotting style through both matplotlib and mplhep.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")

# Scalar tick formatter with math-text rendering and power limits of (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Kept after the style calls, which may set their own font size.
plt.rcParams["font.size"] = 16

# ignore these because they don't seem to apply
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# IPython autoreload extension in mode 2, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# (mX, mY) mass points of the X -> H(bb)Y(VV) signals to use in this notebook.
# res_mps = [(900, 80), (1200, 190), (2000, 125), (3000, 250), (4000, 150)]
res_mps = [(900, 80)]

# Display key -> sample name for each mass point; this replaces the
# `res_samples` / `res_sig_keys` imported from hh_vars.
res_samples = OrderedDict(
    (f"X[{mX}]->H(bb)Y[{mY}](VV)", f"NMSSM_XToYHTo2W2BTo4Q2B_MX-{mX}_MY-{mY}")
    for mX, mY in res_mps
)

res_sig_keys = [*res_samples]

In [None]:
# Nonresonant signals used in this notebook; the imported `nonres_sig_keys`
# and `nonres_samples` are narrowed to exactly these keys.
# del nonres_samples["VBFHHbbVV"]
nonres_sig_keys = [
    "HHbbVV",
    "VBFHHbbVV",
    "qqHH_CV_1_C2V_1_kl_2_HHbbVV",
    "qqHH_CV_1_C2V_2_kl_1_HHbbVV",
]
nonres_samples = dict((k, nonres_samples[k]) for k in nonres_sig_keys)

# bg_keys = ["QCD", "TT", "ST", "V+Jets", "Diboson"]
# samples = {key: samples[key] for key in ["Data"] + bg_keys}

In [None]:
# Base directory (relative to the notebook's working directory) under which
# the plot outputs are written.
MAIN_DIR = Path("../../../")
# samples_dir = MAIN_DIR / "../data/skimmer/24Mar14UpdateData"
samples_dir = "/ceph/cms/store/user/rkansal/bbVV/skimmer/24Mar14UpdateData"
# samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb24"
# nonres_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Jun10"
# res_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Apr11"
# samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Feb24"
# nonres_signal_samples_dir = "/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/"
# res_signal_samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Apr11/"
year = "2018"

# Tag used to namespace this session's plots and templates.
date = "25Feb24ExcessChecks"
plot_dir = MAIN_DIR / f"plots/PostProcessing/{date}/"
templates_dir = Path(f"templates/{date}/")

# Create the output directories with pathlib instead of `os.system("mkdir -p ...")`:
# no shell interpolation of the paths, and a failure raises here instead of
# being silently ignored.
for out_dir in (
    plot_dir / "ControlPlots" / year,
    plot_dir / "cutflows",
    plot_dir / "templates" / "wshifts",
    plot_dir / "templates" / "jshifts",
    plot_dir / "templates" / "hists2d",
    templates_dir / "cutflows" / year,
):
    out_dir.mkdir(parents=True, exist_ok=True)

Load samples

In [None]:
# Systematics container, seeded with an empty entry for the current year.
systematics = {year: {}}
samples_dir = "/ceph/cms/store/user/rkansal/bbVV/skimmer/24Mar14UpdateData"

# Only the data sample is loaded here.
# load_samples = list(samples.keys()) + list(nonres_samples.keys()) + list(res_samples.keys())
load_samples = {"Data": "JetHT"}

# save cutflow as pandas table
cutflow = pd.DataFrame(index=[*load_samples])

# A fresh copy is handed to the loader.  # , **res_samples, **samples},
samples_to_load = dict(load_samples)
events_dict = postprocessing.load_samples(
    samples_dir, samples_to_load, year, postprocessing.load_filters, variations=False
)

utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
cutflow

In [None]:
# Normalised distribution of the AK8 jet ParticleNet mass for every loaded sample.
# matplotlib.pyplot (plt), matplotlib.ticker (mticker) and mplhep (hep) are
# already imported in the notebook's first cell, so they are not re-imported here.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# this is needed for some reason to update the font size for the first plot
warmup_fig, warmup_ax = plt.subplots(1, 1, figsize=(12, 12))
plt.rcParams.update({"font.size": 24})
plt.close(warmup_fig)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
for sample, events in events_dict.items():
    ax.hist(
        # reshape(-1) flattens the column block into a single 1D array of values
        events["ak8FatJetParticleNetMass"].to_numpy().reshape(-1),
        bins=np.arange(50, 250, 10),
        label=sample,
        histtype="step",
        density=True,
    )

ax.legend()
# The plotted column is the ParticleNet mass, not pT, so label it as a mass.
ax.set_xlabel(r"AK8 jet $m_{reg}$ (GeV)")
ax.set_ylabel("Normalized to unit area")
plt.show()

Scale factors and bb VV assignment

In [None]:
# postprocessing.qcd_sf(events_dict, cutflow)

# bb/VV jet assignment masks first, then the derived variables that use them.
bb_masks = postprocessing.bb_VV_assignment(events_dict)

derive_options = {"resonant": True, "nonres_vars": False, "do_jshifts": False}
postprocessing.derive_variables(events_dict, bb_masks, **derive_options)

cutflow

## Data mass sculpting

In [None]:
# Data events with the tagger scores and masses of the bb- and VV-assigned jets.
events = events_dict["Data"]
bb_mask = bb_masks["Data"]
vvtagger = utils.get_feat(events, "VVFatJetParTMD_THWWvsT", bb_mask)
bbtagger = utils.get_feat(events, "bbFatJetParticleNetMD_Txbb", bb_mask)
bbregmass = utils.get_feat(events, "bbFatJetParticleNetMass", bb_mask)

# (tagger scores, cut values, tagger label) for each sculpting plot, in plot order.
sculpting_taggers = (
    (vvtagger, (0.0, 0.4, 0.6, 0.8, 0.9, 0.96), r"$T_{HVV}$"),
    (bbtagger, (0.8, "LP", "MP", "HP"), r"$T_{Xbb}$"),
)

# Mass variable suffix -> short label passed to the plotting helper.
sculpting_masses = {"Msd": "SD", "ParticleNetMass": "reg"}

for mass_var, mlabel in sculpting_masses.items():
    vvmass = utils.get_feat(events, f"VVFatJet{mass_var}", bb_mask)
    bbmass = utils.get_feat(events, f"bbFatJet{mass_var}", bb_mask)
    for tagger_scores, cut_values, tagger_label in sculpting_taggers:
        plotting.plotMassSculpting(
            bbmass,
            vvmass,
            events["finalWeight"],
            tagger_scores,
            list(cut_values),  # fresh list per call
            mlabel,
            tagger_label,
            year,
            show=True,
        )

In [None]:
# Combine the per-year mass-sculpting histograms (read from pickles) and plot the sum.
mplot_dir = Path("../../../plots/PostProcessing/25Feb24Sculpting/MassSculpting")

for key in ["Data", "QCD"]:
    for mass_var, mlabel in zip(["Msd", "ParticleNetMass"], ["SD", "reg"]):
        for tagger_cuts, tlabel, tagger in zip(
            [[0.0, 0.4, 0.6, 0.8, 0.9, 0.96], [0.8, "LP", "MP", "HP"]],
            [r"$T_{HVV}$", r"$T_{Xbb}$"],
            ["vv", "bb"],
        ):
            hists = {}
            for jet in ["bb", "VV"]:
                per_year_hists = []
                # Loop variable is `yr`, not `year`: the module-level `year` is used by
                # later cells for output paths and must not be overwritten here.
                for yr in years:
                    pkl_path = mplot_dir / f"{yr}/pickles/{jet}_{key}_{tagger}cuts_{mlabel}.pkl"
                    # NOTE: pickle.load executes arbitrary code; only read trusted files.
                    with pkl_path.open("rb") as f:
                        per_year_hists.append(pickle.load(f))

                # Sum over years, entry by entry (indexed by the first year's length).
                n_entries = len(per_year_hists[0])
                hists[jet] = [
                    sum(year_hists[i] for year_hists in per_year_hists)
                    for i in range(n_entries)
                ]

            plotting.plotMassSculptingAllYears(
                hists,
                tagger_cuts,
                mlabel,
                tlabel,
                mplot_dir / f"{key}_{tagger}cuts_{mlabel}.pdf",
                show=True,
            )

# hists = {}
# for year in years:
#     with Path(f"../../../plots/PostProcessing/25Feb24Sculpting/MassSculpting/{year}/pickles/VV_Data_vvcuts_reg.pkl").open("rb") as f:
#         hists.append(pickle.load(f))

# summed_hists = []
# for i in range(len(hists[0])):
#     summed_hists.append(sum([hist[i] for hist in hists]))

In [None]:
# NOTE(review): `bb_hists` is not assigned in any visible cell of this notebook, so this
# presumably raises NameError on a fresh kernel — confirm, then define it or drop the cell.
bb_hists

## Control Plots

In [None]:
# Variables to draw in the control plots: a list of ShapeVar(var=..., label=..., bins=...).
# Commented-out entries are kept as a menu of other variables that can be re-enabled.
# NOTE(review): `bins` looks like [n_bins, low, high] — confirm against ShapeVar.
control_plot_vars = [
    # ShapeVar(var="MET_pt", label=r"$p^{miss}_T$ (GeV)", bins=[50, 0, 300]),
    # ShapeVar(var="DijetEta", label=r"$\eta^{jj}$", bins=[30, -8, 8]),
    # ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[30, 0, 750]),
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    # ShapeVar(var="bbFatJetEta", label=r"$\eta^{bb}$", bins=[20, -2.4, 2.4]),
    # ShapeVar(
    #     var="bbFatJetPt", label=r"$p^{bb}_T$ (GeV)", bins=[20, 300, 2300], significance_dir="right"
    # ),
    # ShapeVar(
    #     var="bbFatJetParticleNetMass",
    #     label=r"$m^{bb}_{reg}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    ShapeVar(var="bbFatJetMsd", label=r"$m^{bb}_{msd}$ (GeV)", bins=[20, 0, 300]),
    # ShapeVar(var="bbFatJetParticleNetMD_Txbb", label=r"$T^{bb}_{Xbb}$", bins=[50, 0.8, 1]),
    # ShapeVar(var="VVFatJetEta", label=r"$\eta^{VV}$", bins=[30, -2.4, 2.4]),
    # ShapeVar(var="VVFatJetPt", label=r"$p^{VV}_T$ (GeV)", bins=[20, 300, 2300]),
    # ShapeVar(var="VVFatJetParticleNetMass", label=r"$m^{VV}_{reg}$ (GeV)", bins=[20, 50, 250]),
    # ShapeVar(var="VVFatJetMsd", label=r"$m^{VV}_{msd}$ (GeV)", bins=[40, 50, 250]),
    # ShapeVar(var="VVFatJetParticleNet_Th4q", label=r"Prob($H \to 4q$) vs Prob(QCD) (Non-MD)", bins=[50, 0, 1]),
    # ShapeVar(var="VVFatJetParTMD_THWW4q", label=r"Prob($H \to VV \to 4q$) vs Prob(QCD) (Mass-Decorrelated)", bins=[50, 0, 1]),
    # ShapeVar(var="VVFatJetParTMD_probT", label=r"Prob(Top) (Mass-Decorrelated)", bins=[50, 0, 1]),
    # ShapeVar(var="VVFatJetParTMD_THWWvsT", label=r"$T^{VV}_{HWW}$", bins=[50, 0, 1]),
    # ShapeVar(var="bbFatJetPtOverDijetPt", label=r"$p^{bb}_T / p_T^{jj}$", bins=[50, 0, 40]),
    # ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[50, 0, 40]),
    # ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[50, 0.4, 2.0]),
    # ShapeVar(var="nGoodMuonsHbb", label=r"# of Muons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodElectronsHbb", label=r"# of Electrons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodJets", label=r"# of AK4 B-Jets", bins=[5, 0, 5]),
    # removed if not ggF nonresonant - needs to be the last variable!
    # ShapeVar(var="BDTScore", label=r"BDT Score", bins=[50, 0, 1]),
]

# All signal keys (nonresonant followed by resonant) and the per-signal values
# passed as `sig_scale_dict`.
control_sig_keys = nonres_sig_keys + res_sig_keys
control_sig_scales = {"HHbbVV": 1e5, "VBFHHbbVV": 2e6} | dict.fromkeys(res_sig_keys, 2e4)

hists = postprocessing.control_plots(
    events_dict,
    bb_masks,
    control_sig_keys,
    control_plot_vars,
    plot_dir / f"ControlPlots/{year}",
    year,
    bg_keys=bg_keys,
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    sig_scale_dict=control_sig_scales,
    show=True,
)

In [None]:
# Read back previously pickled control-plot histograms (2016, "24Mar6Mass" outputs).
hists2_path = MAIN_DIR / "plots/PostProcessing/24Mar6Mass/ControlPlots/2016/hists.pkl"
with hists2_path.open("rb") as pkl_file:
    hists2 = pickle.load(pkl_file)

Overall LP SF

In [None]:
from postprocessing import Region, nonres_shape_vars

# NOTE(review): CUT_MAX_VAL was used below without being imported or defined in any
# visible cell (NameError on a fresh kernel). It is assumed to be exported by the
# project's `utils` module — confirm the location.
from utils import CUT_MAX_VAL

# temp region to check systematics
# Each cut is a [lower, upper] pair; CUT_MAX_VAL is used as the upper bound.
selection_regions = {
    "pass": Region(
        cuts={
            "bbFatJetParticleNetMD_Txbb": [0.97, CUT_MAX_VAL],
            "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
        },
        label="Pass",
    ),
    # Same VV-tagger cut as "pass" but without the Txbb requirement.
    "lpsf": Region(
        cuts={
            "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
        },
        label="LP SF",
    ),
}

In [None]:
from collections import OrderedDict

from tqdm import tqdm

# Selection masks (and cutflow) for the cuts of the "lpsf" region.
lpsf_cuts = selection_regions["lpsf"].cuts
sel, cf = utils.make_selection(lpsf_cuts, events_dict, bb_masks, prev_cutflow=cutflow)

# One row per signal: formatted SF plus the individual uncertainties.
sf_table = OrderedDict()

for sig_key in tqdm(nonres_sig_keys + res_sig_keys):
    sig_systs = systematics[sig_key] = {}

    # calculate only for current year
    sig_events = postprocessing.postprocess_lpsfs(events_dict[sig_key])
    events_dict[sig_key] = sig_events
    lp_sf, unc, uncs = postprocessing.get_lpsf(sig_events, sel[sig_key])
    # print(f"BDT LP Scale Factor for {sig_key}: {lp_sf:.2f} ± {unc:.2f}")
    # print(uncs)

    # Store the scale factor and its uncertainty relative to the scale factor.
    sig_systs["lp_sf"] = lp_sf
    sig_systs["lp_sf_unc"] = unc / lp_sf

    sf_table[sig_key] = {"SF": f"{lp_sf:.2f} ± {unc:.2f}", **uncs}

In [None]:
# Tabulate the LP scale factors: one row per signal, one column per `sf_table` entry key.
sf_sig_keys = nonres_sig_keys + res_sig_keys
sf_df = pd.DataFrame(index=sf_sig_keys)

# Column names are taken from the last signal's entry, looked up explicitly rather than
# through a `sig_key` loop variable left over from another cell.
sf_columns = list(sf_table[sf_sig_keys[-1]]) if sf_sig_keys else []
for column in sf_columns:
    sf_df[column] = [sf_table[skey][column] for skey in sf_sig_keys]

sf_df.to_clipboard()
sf_df

Templates

In [None]:
# Rebind `selection_regions` to the regions returned by get_res_selection_regions
# for `year`, replacing the temporary regions defined for the LP SF check.
selection_regions = postprocessing.get_res_selection_regions(year)
# del selection_regions["fail"], selection_regions["failBlinded"]

In [None]:
# Keyword options for a single get_templates call: no weight shifts, no jet shift.
ht_options = dict(
    # bg_keys=["QCD", "TT", "V+Jets", "Diboson", "Hbb"],
    plot_dir=plot_dir / "templates",
    prev_cutflow=cutflow,
    sig_scale_dict={"HHbbVV": 1e3, "VBFHHbbVV": 1e4} | dict.fromkeys(res_sig_keys, 1e2),
    # sig_splits=sig_splits[:2],
    weight_shifts={},
    jshift="",
    lpsfs=True,
    plot_shifts=False,
    pass_ylim=500,
    fail_ylim=40000,
    # blind_pass=True,
    show=True,
)

# Signal keys: only "HHbbVV" here (alternatives: nonres_sig_keys + res_sig_keys, res_sig_keys).
# Shape variables: nonres_shape_vars (alternative: res_shape_vars[:1]).
ht = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    ["HHbbVV"],
    selection_regions,
    nonres_shape_vars,
    systematics,
    templates_dir,
    **ht_options,
)

In [None]:
# Collect the templates of every jet shift into one dict (only the nominal "" for now).
templates = {}

for jshift in [""]:  # + jec_shifts + jmsr_shifts:
    print(jshift)
    shift_options = dict(
        plot_dir=plot_dir / "templates",
        prev_cutflow=cutflow,
        sig_scale_dict={"HHbbVV": 1e3, "VBFHHbbVV": 2e4} | dict.fromkeys(res_sig_keys, 1e2),
        weight_shifts=postprocessing.weight_shifts,
        jshift=jshift,
        lpsfs=True,
        pass_ylim=500,
        fail_ylim=40000,
        # blind_pass=True,
        show=False,
        plot_shifts=True,
    )
    ttemps = postprocessing.get_templates(
        events_dict,
        bb_masks,
        year,
        nonres_sig_keys,
        selection_regions,  # res_selection_regions[year],
        nonres_shape_vars,  # res_shape_vars,
        systematics,
        templates_dir,
        **shift_options,
    )

    # Later shifts overwrite earlier entries that share a key.
    templates.update(ttemps)

In [None]:
# Display the collected templates.
templates

In [None]:
# Persist the templates (pickle) and the systematics (JSON) under `templates_dir`.
templates_pkl_path = f"{templates_dir}/{year}_templates.pkl"
systematics_json_path = f"{templates_dir}/systematics.json"

with open(templates_pkl_path, "wb") as pkl_file:
    pickle.dump(templates, pkl_file)

with open(systematics_json_path, "w") as json_file:
    json.dump(systematics, json_file)

In [None]:
# Replace `templates` with the pickled 2017 templates from the "Apr10" directory.
apr10_templates_path = "templates/Apr10//2017_templates.pkl"
with open(apr10_templates_path, "rb") as pkl_file:
    templates = pickle.load(pkl_file)

In [None]:
# Length of the third axis (index 2) of the "pass" template.
len(templates["pass"].axes[2])

In [None]:
# 2D template plots for the listed regions and samples, written under plot_dir/templates/hists2d/.
# NOTE(review): `selection_regions_label` is not defined in any visible cell of this
# notebook, so this presumably raises NameError on a fresh kernel — confirm and define it.
plotting.hist2ds(
    templates,
    f"{plot_dir}/templates/hists2d/",
    regions=["pass", "fail", "passBlinded", "failBlinded"],
    region_labels=selection_regions_label,
    samples=["Data", "TT", "V+Jets", "X[3000]->H(bb)Y[190](VV)"],
    # fail_zlim=5e3,
    # pass_zlim=1.0,
)

In [None]:
# Display the systematics collected so far.
systematics

In [None]:
# Per-year templates for the current `date` tag, keyed by year.
templates_dict = {}

# Loop variable is `yr`, not `year`, so the module-level `year` set in the
# configuration cell keeps its value for cells that are (re)run afterwards.
for yr in years:
    # NOTE: pickle.load executes arbitrary code; only read trusted files.
    with open(f"templates/{date}/{yr}_templates.pkl", "rb") as f:
        templates_dict[yr] = pickle.load(f)

In [None]:
# Templates from the "Apr7" directory, one list entry per year in `years` order.
# Note that this rebinds `templates` (a dict in earlier cells) to a list.
templates = []
# Loop variable is `yr`, not `year`, so the module-level `year` set in the
# configuration cell keeps its value for cells that are (re)run afterwards.
for yr in years:
    # NOTE: pickle.load executes arbitrary code; only read trusted files.
    with open(f"templates/Apr7//{yr}_templates.pkl", "rb") as f:
        templates.append(pickle.load(f))

HIG BTV OR Check

In [None]:
# HHbbVV events in which every "ak8FatJetParticleNetMD_Txbb" column exceeds 0.9714.
hh_events = events_dict["HHbbVV"]
all_jets_pass_txbb = np.all(hh_events["ak8FatJetParticleNetMD_Txbb"] > 0.9714, axis=1)
hbb2 = hh_events[all_jets_pass_txbb]

In [None]:
# For each sorting variable, pick the jet with the larger value in every event and
# print the mean of "ak8FatJetHbb" for the picked jets.
higher_txbbjet = np.argmax(hbb2["ak8FatJetParticleNetMD_Txbb"].values, axis=1)
higher_pt = np.argmax(hbb2["ak8FatJetPt"].values, axis=1)
# The original printed the pt-sorted number under the "eta" label (copy-paste slip);
# compute a real eta-based index instead.
# NOTE(review): assumes an "ak8FatJetEta" column exists, named like the other
# ak8FatJet* columns — confirm.
higher_eta = np.argmax(hbb2["ak8FatJetEta"].values, axis=1)
higher_mpnet = np.argmax(hbb2["ak8FatJetParticleNetMass"].values, axis=1)

event_idx = np.arange(len(hbb2))
hbb_flags = hbb2["ak8FatJetHbb"].values

for sort_label, jet_idx in (
    ("txbb", higher_txbbjet),
    ("pt", higher_pt),
    ("eta", higher_eta),
    ("mpnet", higher_mpnet),
):
    print(f"higher {sort_label} sorting", np.mean(hbb_flags[event_idx, jet_idx]))

In [None]:
# Per sample: unweighted fraction of events with every Txbb column > 0.9714
# and the VV-jet THWWvsT score > 0.6.
for sample, sample_events in events_dict.items():
    print(sample)
    txbb_pass = np.all(sample_events["ak8FatJetParticleNetMD_Txbb"] > 0.9714, axis=1)
    thww_pass = sample_events["VVFatJetParTMD_THWWvsT"].values.squeeze() > 0.6
    print(np.mean(txbb_pass * thww_pass))

In [None]:
# Same unweighted fraction as in the per-sample loop, for the HHbbVV signal only.
hh_sample = events_dict["HHbbVV"]
hh_txbb_pass = np.all(hh_sample["ak8FatJetParticleNetMD_Txbb"] > 0.9714, axis=1)
hh_thww_pass = hh_sample["VVFatJetParTMD_THWWvsT"].values.squeeze() > 0.6
np.mean(hh_txbb_pass * hh_thww_pass)