In [None]:
import utils
import plotting
import postprocessing
import corrections

from utils import CUT_MAX_VAL, ShapeVar
from hh_vars import (
    years,
    data_key,
    qcd_key,
    bg_keys,
    samples,
    nonres_sig_keys,
    # res_samples,
    # res_sig_keys,
    nonres_samples,
    txbb_wps,
    jec_shifts,
    jmsr_shifts,
    LUMI,
)
from postprocessing import res_shape_vars, new_filters, old_filters

from collections import OrderedDict

import numpy as np
import pandas as pd
import pickle, json
import hist
from hist import Hist

import os
from copy import deepcopy
from inspect import cleandoc
import warnings

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# (mX, mY) mass points of the resonant signal samples used in this notebook.
res_mps = [(900, 80), (1200, 190), (2000, 125), (3000, 250), (4000, 150)]

# Map each plotting label to the corresponding sample name, in mass-point order.
res_samples = OrderedDict(
    (f"X[{mX}]->H(bb)Y[{mY}](VV)", f"NMSSM_XToYHTo2W2BTo4Q2B_MX-{mX}_MY-{mY}")
    for mX, mY in res_mps
)

# Resonant signal labels, in the same order as `res_samples`.
res_sig_keys = list(res_samples)

In [None]:
# Restrict the imported non-resonant signal samples to the keys listed here.
# del nonres_samples["VBFHHbbVV"]
nonres_sig_keys = ["HHbbVV", "VBFHHbbVV"]
nonres_samples = dict((key, nonres_samples[key]) for key in nonres_sig_keys)

# Restrict the imported samples to data plus the backgrounds listed here.
bg_keys = ["QCD", "TT", "ST", "V+Jets", "Diboson"]
samples = dict((key, samples[key]) for key in ["Data", *bg_keys])

In [None]:
# Input locations of the skimmer outputs (relative to this notebook).
MAIN_DIR = "../../../"
samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb24"
nonres_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Jun10"
res_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Apr11"
# samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Feb24"
# nonres_signal_samples_dir = "/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/"
# res_signal_samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Apr11/"
year = "2018"

# Output locations; `date` tags this round of plots and templates.
date = "24Jan13Res"
plot_dir = f"../../../plots/PostProcessing/{date}/"
templates_dir = f"templates/{date}/"

# Create the output directories with os.makedirs rather than shelling out to
# `mkdir -p`: it is portable and raises on failure (e.g. permission errors)
# instead of silently returning a non-zero exit status.
for out_dir in (
    f"{plot_dir}/ControlPlots/{year}",
    f"{plot_dir}/cutflows",
    f"{plot_dir}/templates/wshifts",
    f"{plot_dir}/templates/jshifts",
    f"{plot_dir}/templates/hists2d",
    f"{templates_dir}",
):
    os.makedirs(out_dir, exist_ok=True)

# Selection regions for the resonant analysis for the configured year.
selection_regions = postprocessing.get_res_selection_regions(year)

Load samples

In [None]:
# Load one skimmer output pickle of a single resonant signal sample for inspection.
# `pickle` is already imported in the notebook's import cell, so it is not
# re-imported here; the path follows the configured `year` instead of a
# hard-coded "2018".
# NOTE: pickle.load can execute arbitrary code - only open files produced by a
# trusted skimmer run.
cf_path = (
    f"{res_signal_samples_dir}/{year}/NMSSM_XToYHTo2W2BTo4Q2B_MX-2000_MY-125/pickles/out_0.pkl"
)

with open(cf_path, "rb") as f:
    cf = pickle.load(f)

In [None]:
# Display the object unpickled from the skimmer output file.
cf

In [None]:
# Systematics container, keyed by year (signal keys are added in later cells).
systematics = {year: {}}

# save cutflow as pandas table
# NOTE(review): the non-resonant signal keys are loaded below but are not part
# of this index - confirm that leaving them out of the cutflow is intended.
cutflow = pd.DataFrame(index=[*samples.keys(), *res_samples.keys()])

# hem cleaning in load_samples not implemented yet for res samples
hem_cleaning = False


def _load(sample_dir, sample_map):
    """Load `sample_map` from `sample_dir` with this notebook's year, filters and HEM setting."""
    return utils.load_samples(sample_dir, sample_map, year, new_filters, hem_cleaning=hem_cleaning)


# utils.remove_empty_parquets(samples_dir, year)
# Resonant signals first, then non-resonant signals, then data and backgrounds.
events_dict = _load(res_signal_samples_dir, res_samples)
for sample_dir, sample_map in (
    (nonres_signal_samples_dir, nonres_samples),
    (samples_dir, samples),
):
    events_dict |= _load(sample_dir, sample_map)

utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)

print("")
# print weighted sample yields
for sample, events in events_dict.items():
    tot_weight = np.sum(events["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

In [None]:
# Additional single-Higgs samples: label -> sample name pattern.
higgs_samples = OrderedDict(
    Hbb="*HToBB",
    # ("HWW", ("*HToWW", "*HToNonbb")),
    # ("HH", ("VBF_HHTobbVV_CV_1_C2V_1_C3_1", "GluGluToHHTo4B_node_cHHH1_preUL")),
)

# NOTE(review): unlike the other load_samples calls in this notebook, no filters
# or hem_cleaning argument is passed here - confirm the defaults are intended.
events_dict |= utils.load_samples(samples_dir, higgs_samples, year)

# Rebuild the cutflow table so that its index also contains the Higgs samples.
cutflow = pd.DataFrame(index=[*samples.keys(), *res_samples.keys(), *higgs_samples.keys()])
utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)

print("")
# print weighted sample yields
for sample, events in events_dict.items():
    tot_weight = np.sum(events["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

Scale factors and bb VV assignment

In [None]:
# Apply the event weights / scale factors for this year; the cutflow table is
# passed in (presumably updated in place - confirm in postprocessing.apply_weights).
postprocessing.apply_weights(events_dict, year, cutflow)
# Masks from the bb / VV jet assignment; indexed per sample (bb_masks[sample])
# by the plotting and selection cells further down.
bb_masks = postprocessing.bb_VV_assignment(events_dict)
# Derive additional variables for the loaded events.
postprocessing.derive_variables(events_dict)
# Show the cutflow table.
cutflow

Control Plots

In [None]:
# Keep the list of loaded sample keys under its own name: rebinding `samples`
# to a list here would clobber the `samples` dict that the sample-loading cells
# rely on (`samples.keys()`, `samples[key]`), breaking them when re-run.
sample_keys = list(events_dict.keys())
weight_key = "finalWeight"

# One {variable: (Regular-axis arguments, axis label)} dict per jet: phi and eta
# of the bb and VV fat jets.
control_plot_2d_vars = [
    {
        f"{jet}FatJetPhi": ([40, -3.5, 3.5], rf"$\varphi^{{{jet}}}$"),
        f"{jet}FatJetEta": ([40, -3, 3], rf"$\eta^{{{jet}}}$"),
    }
    for jet in ["bb", "VV"]
]

# One weighted 2D histogram (with a sample category axis) per jet.
hists2d = []

for vars2d in control_plot_2d_vars:
    h = Hist(
        hist.axis.StrCategory(sample_keys, name="Sample"),
        *[hist.axis.Regular(*bins, name=var, label=label) for var, (bins, label) in vars2d.items()],
        storage=hist.storage.Weight(),
    )

    for sample in sample_keys:
        events = events_dict[sample]

        fill_data = {var: utils.get_feat(events, var, bb_masks[sample]) for var in vars2d}
        weight = events[weight_key].values.squeeze()

        # if selection is not None:
        #     sel = selection[sample]
        #     fill_data[var] = fill_data[var][sel]
        #     weight = weight[sel]

        # Skip samples without any events.
        if len(weight):
            h.fill(Sample=sample, **fill_data, weight=weight)

    hists2d.append(h)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import mplhep as hep

# Samples to draw: one row per sample, one column per jet histogram in `hists2d`.
plot_keys = ["Data", "QCD", "TT", "HHbbVV", "X[3000]->H(bb)Y[250](VV)"]

fig, axs = plt.subplots(
    len(plot_keys),
    2,
    figsize=(20, 8 * len(plot_keys)),
    gridspec_kw={"wspace": 0.25, "hspace": 0.25},
)

for j, key in enumerate(plot_keys):
    for i in range(2):
        ax = axs[j][i]
        # hist2dplot returns the artists it created, including the colorbar, so the
        # colorbar is labelled through that public handle rather than through the
        # private `ax._children` list.
        artists = hep.hist2dplot(hists2d[i][key, ...], cmap="turbo", ax=ax)
        hep.cms.label(
            "Work in Progress", data=True, lumi=round(LUMI[year] * 1e-3), year=year, ax=ax
        )
        ax.set_title(key, y=1.07)
        artists.cbar.set_label("Events")

plt.savefig(f"{plot_dir}/ControlPlots/{year}/HEM2d.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Variables to draw in the control plots, each described by a ShapeVar
# (variable name, axis label, bins). Commented-out entries are currently disabled.
control_plot_vars = [
    # ShapeVar(var="MET_pt", label=r"$p^{miss}_T$ (GeV)", bins=[50, 0, 300]),
    # ShapeVar(var="DijetEta", label=r"$\eta^{jj}$", bins=[30, -8, 8]),
    # ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[30, 0, 750]),
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    # ShapeVar(var="bbFatJetEta", label=r"$\eta^{bb}$", bins=[30, -2.4, 2.4]),
    # ShapeVar(
    #     var="bbFatJetPt", label=r"$p^{bb}_T$ (GeV)", bins=[30, 300, 1500], significance_dir="right"
    # ),
    # ShapeVar(
    #     var="bbFatJetParticleNetMass",
    #     label=r"$m^{bb}_{reg}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(var="bbFatJetMsd", label=r"$m^{bb}_{msd}$ (GeV)", bins=[50, 0, 300]),
    ShapeVar(var="bbFatJetParticleNetMD_Txbb", label=r"$T^{bb}_{Xbb}$", bins=[50, 0.8, 1]),
    # ShapeVar(var="VVFatJetEta", label=r"$\eta^{VV}$", bins=[30, -2.4, 2.4]),
    # ShapeVar(var="VVFatJetPt", label=r"$p^{VV}_T$ (GeV)", bins=[30, 300, 1500]),
    # ShapeVar(var="VVParticleNetMass", label=r"$m^{VV}_{reg}$ (GeV)", bins=[20, 50, 250]),
    # ShapeVar(var="VVFatJetMsd", label=r"$m^{VV}_{msd}$ (GeV)", bins=[40, 50, 250]),
    # ShapeVar(var="VVFatJetParticleNet_Th4q", label=r"Prob($H \to 4q$) vs Prob(QCD) (Non-MD)", bins=[50, 0, 1]),
    # ShapeVar(var="VVFatJetParTMD_THWW4q", label=r"Prob($H \to VV \to 4q$) vs Prob(QCD) (Mass-Decorrelated)", bins=[50, 0, 1]),
    # ShapeVar(var="VVFatJetParTMD_probT", label=r"Prob(Top) (Mass-Decorrelated)", bins=[50, 0, 1]),
    ShapeVar(var="VVFatJetParTMD_THWWvsT", label=r"$T^{VV}_{HWW}$", bins=[50, 0, 1]),
    # ShapeVar(var="bbFatJetPtOverDijetPt", label=r"$p^{bb}_T / p_T^{jj}$", bins=[50, 0, 40]),
    # ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[50, 0, 40]),
    # ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[50, 0.4, 2.0]),
    # ShapeVar(var="nGoodMuons", label=r"# of Muons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodElectrons", label=r"# of Electrons", bins=[3, 0, 3]),
    # ShapeVar(var="nGoodJets", label=r"# of AK4 B-Jets", bins=[5, 0, 5]),
    # removed if not ggF nonresonant - needs to be the last variable!
    # ShapeVar(var="BDTScore", label=r"BDT Score", bins=[50, 0, 1]),
]

# Control plots without any extra selection, written to the per-year
# ControlPlots directory. Each signal is given a per-key scale through
# `sig_scale_dict`; all resonant signals share the same value.
hists = postprocessing.control_plots(
    events_dict,
    bb_masks,
    nonres_sig_keys + res_sig_keys,
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    bg_keys=bg_keys,
    sig_scale_dict={"HHbbVV": 1e5, "VBFHHbbVV": 2e6} | {key: 2e4 for key in res_sig_keys},
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    show=True,
)

In [None]:
# Selection with tagger cuts on both jets and a bb regressed-mass window of [110, 145].
selection, _ = utils.make_selection(
    {
        "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
        "bbFatJetParticleNetMD_Txbb": [0.98, CUT_MAX_VAL],
        "bbFatJetParticleNetMass": [110, 145],
    },
    events_dict,
    bb_masks,
)
# Tag identifying this selection in the control_plots call.
cutstr = "pass_noveto"

# `sig_splits` is neither defined nor imported anywhere in this notebook, so
# passing it raised a NameError; it is dropped here, matching the unselected
# control_plots call, which does not pass it either.
postprocessing.control_plots(
    events_dict,
    bb_masks,
    nonres_sig_keys + res_sig_keys,
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    hists={},
    # bg_keys=bg_keys + list(higgs_samples.keys()),
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    bg_keys=["QCD", "TT", "ST", "V+Jets"],
    sig_scale_dict={key: 10 for key in nonres_sig_keys + res_sig_keys},
    selection=selection,
    cutstr=cutstr,
    show=True,
)

In [None]:
# Control plots in the bb-mass sidebands [sb1, 110] and [145, sb2], for three
# choices of the outer sideband edges.
for sb1, sb2 in [[0, 300], [75, 180], [92.5, 162.5]]:
    selection, _ = utils.make_selection(
        {
            "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
            "bbFatJetParticleNetMD_Txbb": [0.98, CUT_MAX_VAL],
            "bbFatJetParticleNetMass": [[sb1, 110], [145, sb2]],
        },
        events_dict,
        bb_masks,
    )
    cutstr = f"sidebands_{sb1}_{sb2}"

    # `sig_splits` is neither defined nor imported anywhere in this notebook, so
    # passing it raised a NameError; it is dropped here, matching the unselected
    # control_plots call, which does not pass it either.
    postprocessing.control_plots(
        events_dict,
        bb_masks,
        nonres_sig_keys + res_sig_keys,
        control_plot_vars,
        f"{plot_dir}/ControlPlots/{year}/",
        year,
        hists={},
        # bg_keys=bg_keys + list(higgs_samples.keys()),
        bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
        selection=selection,
        cutstr=cutstr,
        show=True,
    )

Overall LP SF

In [None]:
from collections import OrderedDict
from tqdm import tqdm

# Per-sample selection for the "lpsf" region, continuing from the existing cutflow.
sel, cf = utils.make_selection(
    selection_regions["lpsf"].cuts,
    events_dict,
    bb_masks,
    prev_cutflow=cutflow,
)

# Per-signal table rows: formatted scale factor plus the individual uncertainties.
sf_table = OrderedDict()

for sig_key in tqdm(res_sig_keys):
    # calculate only for current year
    sig_events = postprocessing.postprocess_lpsfs(events_dict[sig_key])
    events_dict[sig_key] = sig_events
    lp_sf, unc, uncs = postprocessing.get_lpsf(sig_events, sel[sig_key])
    # print(f"BDT LP Scale Factor for {sig_key}: {lp_sf:.2f} ± {unc:.2f}")
    # print(uncs)

    # Store the scale factor together with its relative uncertainty.
    systematics[sig_key] = {"lp_sf": lp_sf, "lp_sf_unc": unc / lp_sf}

    sf_table[sig_key] = {"SF": f"{lp_sf:.2f} ± {unc:.2f}", **uncs}

In [None]:
# Build the table only from the signals that actually have an entry in
# `sf_table`. Indexing it with signal keys for which no scale factor was
# computed raises a KeyError, and taking the column names from a leaked loop
# variable makes the cell depend on hidden state.
sf_keys = list(sf_table)
sf_df = pd.DataFrame(index=sf_keys)

# Column names come from the first entry; every row is read with the same keys.
sf_columns = list(sf_table[sf_keys[0]]) if sf_keys else []

for key in sf_columns:
    sf_df[key] = [sf_table[skey][key] for skey in sf_keys]

# Copy the table to the system clipboard, then display it.
sf_df.to_clipboard()
sf_df

Templates

In [None]:
# Selection regions for the templates. Use the configured `year` rather than a
# hard-coded "2017", so the regions match the year of the loaded events (as in
# the earlier get_res_selection_regions(year) call).
selection_regions = postprocessing.get_res_selection_regions(year, txbb_wp="HP", thww_wp=0.8)
# Drop the fail regions; only the remaining regions are used for the templates below.
del selection_regions["fail"], selection_regions["failBlinded"]

In [None]:
# Make templates for the resonant signals only, using just the first resonant
# shape variable, with no weight shifts and no jet shift (jshift="").
# Templates are written under `templates_dir`, plots under `{plot_dir}/templates/`.
h = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    # nonres_sig_keys + res_sig_keys[:10],
    res_sig_keys,
    selection_regions,
    res_shape_vars[:1],
    systematics,
    templates_dir,
    bg_keys=["QCD", "TT", "V+Jets", "Diboson", "Hbb"],
    plot_dir=f"{plot_dir}/templates/",
    prev_cutflow=cutflow,
    # sig_splits=sig_splits[:2],
    weight_shifts={},
    jshift="",
    plot_shifts=False,
    pass_ylim=70,
    fail_ylim=40000,
    blind_pass=True,
    show=True,
)

In [None]:
# NOTE(review): this call looks like it was pasted from a script. The bare name
# `get_templates` and the names `args`, `sig_keys`, `shape_vars`, `template_dir`,
# `weight_shifts` and `jshift` are not defined or imported in the visible
# notebook cells, so this cell is expected to raise a NameError on a fresh
# kernel - confirm whether it should be adapted to the notebook's names or removed.
get_templates(
    events_dict,
    bb_masks,
    args.year,
    sig_keys,
    selection_regions,
    shape_vars,
    systematics,
    template_dir,
    bg_keys=bg_keys,
    plot_dir=plot_dir,
    prev_cutflow=cutflow,
    # sig_splits=sig_splits,
    weight_shifts=weight_shifts,
    jshift=jshift,
    blind_pass=True if args.resonant else False,
    show=False,
    plot_shifts=args.plot_shifts,
)

In [None]:
# Collect templates for the nominal case ("") and for every JEC / JMSR shift.
# NOTE(review): `res_selection_regions` and `sig_splits` are not defined or
# imported in the visible notebook cells. This call also omits the `systematics`
# and `templates_dir` positional arguments passed in the single get_templates
# call earlier in the notebook, and unpacks two return values where that call
# binds a single result - confirm against the current
# postprocessing.get_templates signature.
templates = {}

for jshift in [""] + jec_shifts + jmsr_shifts:
    print(jshift)
    ttemps, tsyst = postprocessing.get_templates(
        events_dict,
        bb_masks,
        year,
        nonres_sig_keys + res_sig_keys,
        res_selection_regions[year],
        res_shape_vars,
        bg_keys=["QCD", "TT", "V+Jets"],
        # plots are only made for the nominal (unshifted) templates
        plot_dir=plot_dir if jshift == "" else "",
        prev_cutflow=cutflow,
        sig_splits=sig_splits,
        weight_shifts=postprocessing.weight_shifts,
        jshift=jshift,
        pass_ylim=7,
        fail_ylim=40000,
        blind_pass=True,
        show=False,
        plot_shifts=False,
    )

    # Merge this shift's templates into the combined dict (later keys overwrite earlier ones).
    templates = {**templates, **ttemps}
    # Only the nominal pass updates the systematics for this year.
    if jshift == "":
        systematics[year] = tsyst

In [None]:
# Persist the templates (pickle) and the systematics (JSON) under `templates_dir`.
templates_path = f"{templates_dir}/{year}_templates.pkl"
systematics_path = f"{templates_dir}/systematics.json"

with open(templates_path, "wb") as pkl_file:
    pickle.dump(templates, pkl_file)

with open(systematics_path, "w") as json_file:
    json.dump(systematics, json_file)

In [None]:
# Load a previously saved set of 2017 templates from the Apr10 directory.
old_templates_path = "templates/Apr10//2017_templates.pkl"
with open(old_templates_path, "rb") as pkl_file:
    templates = pickle.load(pkl_file)

In [None]:
# Length of the third axis (index 2) of the "pass" region template.
len(templates["pass"].axes[2])

In [None]:
# 2D template plots for the listed regions and samples, written to
# `{plot_dir}/templates/hists2d/`.
# NOTE(review): `selection_regions_label` is not defined or imported in the
# visible notebook cells - confirm where it should come from.
plotting.hist2ds(
    templates,
    f"{plot_dir}/templates/hists2d/",
    regions=["pass", "fail", "passBlinded", "failBlinded"],
    region_labels=selection_regions_label,
    samples=["Data", "TT", "V+Jets", "X[3000]->H(bb)Y[190](VV)"],
    # fail_zlim=5e3,
    # pass_zlim=1.0,
)

In [None]:
# Display the systematics collected so far.
systematics

In [None]:
# Load the saved templates of every year for the current `date` tag.
templates_dict = {}

# The loop variable is deliberately not called `year`: reusing that name would
# overwrite the notebook-wide `year` setting and silently change the behavior
# of any cell run afterwards.
for template_year in years:
    with open(f"templates/{date}/{template_year}_templates.pkl", "rb") as f:
        templates_dict[template_year] = pickle.load(f)

In [None]:
# Load the saved Apr7 templates of every year into a list, in the order of `years`.
templates = []
# The loop variable is deliberately not called `year`: reusing that name would
# overwrite the notebook-wide `year` setting and silently change the behavior
# of any cell run afterwards.
for template_year in years:
    with open(f"templates/Apr7//{template_year}_templates.pkl", "rb") as f:
        templates.append(pickle.load(f))