In [116]:
import os
import pickle, json
import warnings
from collections import OrderedDict
from copy import deepcopy
from inspect import cleandoc

import hist
import matplotlib
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
from hist import Hist
from tqdm import tqdm

import utils
import plotting
import postprocessing
import corrections

from utils import CUT_MAX_VAL
from hh_vars import (
    years,
    data_key,
    qcd_key,
    bg_keys,
    samples,
    nonres_sig_keys,
    # res_samples,
    # res_sig_keys,
    nonres_samples,
    txbb_wps,
    jec_shifts,
    jmsr_shifts,
    LUMI,
)
from postprocessing import res_shape_vars, new_filters, old_filters

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# (mX, mY) mass points of the resonant X -> H(bb) Y(VV) signals used in this notebook.
res_mps = [(900, 80), (1200, 190), (2000, 125), (3000, 250), (4000, 150)]

# Map each signal's plotting key to the name of its NMSSM dataset, keeping the
# order of `res_mps`.
res_samples = OrderedDict(
    (f"X[{mX}]->H(bb)Y[{mY}](VV)", f"NMSSM_XToYHTo2W2BTo4Q2B_MX-{mX}_MY-{mY}")
    for mX, mY in res_mps
)

res_sig_keys = [sig_key for sig_key in res_samples]

In [117]:
# Narrow the sample dictionaries imported from `hh_vars` down to the processes
# used in this notebook. Both filters are idempotent, so the cell can be re-run.
nonres_sig_keys = ["HHbbVV", "VBFHHbbVV"]
nonres_samples = dict((sig_key, nonres_samples[sig_key]) for sig_key in nonres_sig_keys)

bg_keys = ["QCD", "TT", "ST", "V+Jets", "Diboson"]
samples = dict((proc_key, samples[proc_key]) for proc_key in ["Data", *bg_keys])

In [None]:
# ---- Input locations (relative to this notebook) ----
MAIN_DIR = "../../../"
samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb24"
nonres_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Jun10"
res_signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/23Aug22_5xhy"
# samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Feb24"
# nonres_signal_samples_dir = "/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/"
# res_signal_samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Apr11/"
year = "2018"

# ---- Output locations, tagged by the date of this processing round ----
date = "23Aug23"
plot_dir = f"{MAIN_DIR}plots/PostProcessing/{date}/"
templates_dir = f"templates/{date}/"

# Create every output directory up front. `os.makedirs(..., exist_ok=True)` is the
# pure-Python equivalent of `mkdir -p`: no shell is spawned, and a real failure
# (e.g. a permissions problem) raises instead of being silently ignored.
for out_dir in (
    f"{plot_dir}/ControlPlots/{year}",
    f"{plot_dir}/cutflows",
    f"{plot_dir}/templates/wshifts",
    f"{plot_dir}/templates/jshifts",
    f"{plot_dir}/templates/hists2d",
    templates_dir,
):
    os.makedirs(out_dir, exist_ok=True)

# Selection regions for the configured year, as returned by the postprocessing module.
selection_regions = postprocessing.get_res_selection_regions(year)

Load samples

In [118]:
# Systematics container, seeded with an empty entry for the configured year.
systematics = {year: {}}

# Cutflow table indexed by sample key.
# NOTE(review): the non-resonant signal keys are loaded below but are not part of
# this index — confirm that is intended.
cutflow = pd.DataFrame(index=[*samples.keys(), *res_samples.keys()])

# utils.remove_empty_parquets(samples_dir, year)
# Resonant signals first, then non-resonant signals, then data and backgrounds;
# later loads are merged into the same dictionary.
events_dict = utils.load_samples(res_signal_samples_dir, res_samples, year, new_filters)
for sample_dir, sample_map in (
    (nonres_signal_samples_dir, nonres_samples),
    (samples_dir, samples),
):
    events_dict |= utils.load_samples(sample_dir, sample_map, year, new_filters)

utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)

print("")
# Print the summed "weight" column of every loaded sample.
for sample, sample_events in events_dict.items():
    tot_weight = np.sum(sample_events["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

Removing 1430 events
Loaded NMSSM_XToYHTo2W2BTo4Q2B_MX-900_MY-80              : 144441 entries
Removing 1873 events
Loaded NMSSM_XToYHTo2W2BTo4Q2B_MX-1200_MY-190            : 210988 entries
Removing 2717 events
Loaded NMSSM_XToYHTo2W2BTo4Q2B_MX-2000_MY-125            : 292440 entries
Removing 2795 events
Loaded NMSSM_XToYHTo2W2BTo4Q2B_MX-3000_MY-250            : 321055 entries
Removing 2673 events
Loaded NMSSM_XToYHTo2W2BTo4Q2B_MX-4000_MY-150            : 327318 entries
Removing 2014 events
Loaded GluGluToHHTobbVV_node_cHHH1                       : 188941 entries
Removing 277 events
Loaded VBF_HHTobbVV_CV_1_C2V_1_C3_1                      : 17539 entries
Loaded JetHT_Run2018A                                    : 376541 entries


  hem_cut = np.any((events["ak8FatJetEta"] > -3.2) * (events["ak8FatJetEta"] <-1.3) * (events["ak8FatJetPhi"] > -1.57) * (events["ak8FatJetPhi"] <-0.87 ), axis=1)


Removing 13922 events
Loaded JetHT_Run2018D                                    : 833289 entries
Removing 2895 events
Loaded JetHT_Run2018C                                    : 171794 entries
Loaded JetHT_Run2018B                                    : 175146 entries
Removing 0 events
Loaded QCD_HT300to500                                    : 23 entries
Removing 0 events
Loaded QCD_HT200to300                                    : 0 entries
Removing 3351 events
Loaded QCD_HT700to1000                                   : 197372 entries
Removing 1762 events
Loaded QCD_HT1000to1500                                  : 122855 entries
Removing 455 events
Loaded QCD_HT2000toInf                                   : 73168 entries
Removing 1390 events
Loaded QCD_HT1500to2000                                  : 134020 entries
Removing 340 events
Loaded QCD_HT500to700                                    : 18460 entries


  


Removing 7609 events
Loaded TTToSemiLeptonic                                  : 557747 entries


  


Removing 10387 events
Loaded TTToHadronic                                      : 749409 entries
Removing 290 events
Loaded ST_tW_top_5f_inclusiveDecays                      : 26038 entries
Removing 197 events
Loaded ST_tW_antitop_5f_inclusiveDecays                  : 19168 entries
Removing 277 events
Loaded ST_s-channel_4f_leptonDecays                      : 19853 entries
Removing 643 events
Loaded ST_t-channel_antitop_4f_InclusiveDecays           : 42693 entries
Removing 0 events
Loaded WJetsToQQ_HT-200to400                             : 0 entries
Removing 0 events
Loaded ZJetsToQQ_HT-200to400                             : 0 entries
Removing 21 events
Loaded ZJetsToQQ_HT-400to600                             : 685 entries
Removing 2276 events
Loaded WJetsToQQ_HT-800toInf                             : 133078 entries
Removing 1559 events
Loaded ZJetsToQQ_HT-600to800                             : 75028 entries
Removing 563 events
Loaded WJetsToQQ_HT-600to800                             : 

In [None]:
# Additional single-Higgs background(s), keyed by plotting label -> dataset selector.
higgs_samples = OrderedDict(
    [
        ("Hbb", "*HToBB"),
        # ("HWW", ("*HToWW", "*HToNonbb")),
        # ("HH", ("VBF_HHTobbVV_CV_1_C2V_1_C3_1", "GluGluToHHTo4B_node_cHHH1_preUL")),
    ]
)

# `filters` is not defined anywhere in this notebook (only `new_filters` and
# `old_filters` are imported), so the original call raised NameError on a fresh
# kernel. Use `new_filters`, the same filters applied to every other sample above.
events_dict |= utils.load_samples(samples_dir, higgs_samples, year, new_filters)

# Rebuild the cutflow table so that it also has rows for the Higgs samples.
cutflow = pd.DataFrame(
    index=list(samples.keys()) + list(res_samples.keys()) + list(higgs_samples.keys())
)
utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)

print("")
# print weighted sample yields
for sample in events_dict:
    tot_weight = np.sum(events_dict[sample]["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

Scale factors and bb VV assignment

In [None]:
# Apply the event weights / scale factors for `year`; `cutflow` is passed in as well.
# NOTE(review): presumably this adds the "finalWeight" column used by the plotting
# cells below and appends a cutflow column — confirm in postprocessing.apply_weights.
postprocessing.apply_weights(events_dict, year, cutflow)
# Per-sample masks consumed by utils.get_feat / make_selection / control_plots below
# (presumably flagging which fat jet is the bb candidate — TODO confirm).
bb_masks = postprocessing.bb_VV_assignment(events_dict)
# postprocessing.derive_variables(events_dict)
# Display the cutflow table as the cell output.
cutflow

Control Plots

In [None]:
# Keys of every loaded sample. Deliberately NOT named `samples`: that name holds the
# {key: dataset} dict from the configuration cell, and rebinding it to a list made
# the earlier cells (`samples.keys()`, `samples[key]`) fail when re-run.
sample_keys = list(events_dict.keys())
weight_key = "finalWeight"

# One {variable: (bins, label)} pair of (phi, eta) axes per fat jet.
control_plot_2d_vars = [
    {
        f"{jet}FatJetPhi": ([40, -3.5, 3.5], rf"$\varphi^{{{jet}}}$"),
        f"{jet}FatJetEta": ([40, -3, 3], rf"$\eta^{{{jet}}}$"),
    }
    for jet in ["bb", "VV"]
]

hists2d = []

for vars2d in control_plot_2d_vars:
    # Weighted histogram with a sample category axis plus one regular axis per variable.
    hist2d = Hist(
        hist.axis.StrCategory(sample_keys, name="Sample"),
        *[hist.axis.Regular(*bins, name=var, label=label) for var, (bins, label) in vars2d.items()],
        storage=hist.storage.Weight(),
    )

    for sample in sample_keys:
        events = events_dict[sample]

        fill_data = {var: utils.get_feat(events, var, bb_masks[sample]) for var in vars2d}
        # Flatten to 1-D. Unlike `.squeeze()`, `.reshape(-1)` keeps a one-event sample
        # as a length-1 array instead of a 0-d scalar, so `len()` below stays valid.
        weight = events[weight_key].values.reshape(-1)

        # Skip samples with no events.
        if len(weight):
            hist2d.fill(Sample=sample, **fill_data, weight=weight)

    hists2d.append(hist2d)

In [None]:
# Samples to show, one row each; one column per entry of `hists2d`.
# (matplotlib / mplhep are imported in the notebook's top import cell.)
plot_keys = ["Data", "QCD", "TT", "HHbbVV", "X[3000]->H(bb)Y[250](VV)"]
n_cols = len(hists2d)

fig, axs = plt.subplots(
    len(plot_keys),
    n_cols,
    figsize=(10 * n_cols, 8 * len(plot_keys)),
    gridspec_kw={"wspace": 0.25, "hspace": 0.25},
    squeeze=False,  # always a 2-D array of axes, even for a single row or column
)

for row_axes, key in zip(axs, plot_keys):
    for ax, sample_hists in zip(row_axes, hists2d):
        # Slice out this sample's 2-D histogram and draw it.
        hep.hist2dplot(sample_hists[key, ...], cmap="turbo", ax=ax)
        hep.cms.label(
            "Work in Progress", data=True, lumi=round(LUMI[year] * 1e-3), year=year, ax=ax
        )
        ax.set_title(key, y=1.07)
        # NOTE(review): reaches into the private `ax._children` list to find the
        # colorbar of the artist just drawn; may break across matplotlib versions.
        ax._children[0].colorbar.set_label("Events")

plt.savefig(f"{plot_dir}/ControlPlots/{year}/HEM2d.pdf", bbox_inches="tight")
plt.show()

In [None]:
# {var: (bins, label)}
control_plot_vars = {
    "MET_pt": ([40, 0, 320], r"$p^{miss}_T$ (GeV)"),
    # "DijetEta": ([50, -8, 8], r"$\eta^{jj}$"),
    # "DijetPt": ([50, 0, 750], r"$p_T^{jj}$ (GeV)"),
    # "DijetMass": ([50, 500, 3000], r"$m^{jj}$ (GeV)"),
    "bbFatJetPhi": ([40, -3.5, 3.5], r"$\varphi^{bb}$"),
    "bbFatJetEta": ([40, -3, 3], r"$\eta^{bb}$"),
    "bbFatJetPt": ([40, 300, 2300], r"$p^{bb}_T$ (GeV)"),  # TODO: increase bin widths, x max
    # "bbFatJetParticleNetMass": ([50, 0, 300], r"$m^{bb}_{reg}$ (GeV)"),
    # "bbFatJetMsd": ([50, 0, 300], r"$m^{bb}_{msd}$ (GeV)"),
    # "bbFatJetParticleNetMD_Txbb": ([50, 0.8, 1], r"$p^{bb}_{Txbb}$"),
    "VVFatJetPhi": ([40, -3.5, 3.5], r"$\varphi^{VV}$"),
    "VVFatJetEta": ([40, -3, 3], r"$\eta^{VV}$"),
    "VVFatJetPt": ([40, 300, 2300], r"$p^{VV}_T$ (GeV)"),
    # "VVFatJetParticleNetMass": ([50, 0, 300], r"$m^{VV}_{reg}$ (GeV)"),
    # "VVFatJetMsd": ([50, 0, 300], r"$m^{VV}_{msd}$ (GeV)"),
    # "VVFatJetParticleNet_Th4q": ([50, 0, 1], r"Prob($H \to 4q$) vs Prob(QCD) (Non-MD)"),
    # "VVFatJetParTMD_THWW4q": (
    #     [50, 0, 1],
    #     r"Prob($H \to VV \to 4q$) vs Prob(QCD) (Mass-Decorrelated)",
    # ),
    # "VVFatJetParTMD_probT": ([50, 0, 1], r"Prob(Top) (Mass-Decorrelated)"),
    # "bbFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{bb}_T / p_T^{jj}$"),
    # "VVFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{VV}_T / p_T^{jj}$"),
    # "VVFatJetPtOverbbFatJetPt": ([50, 0.4, 2.0], r"$p^{VV}_T / p^{bb}_T$"),
    # "nGoodMuons": ([3, 0, 3], r"# of Muons"),
    # "nGoodElectrons": ([3, 0, 3], r"# of Electrons"),
    # "nGoodJets": ([5, 0, 5], r"# of AK4 B-Jets"),
    # "BDTScore": ([50, 0, 1], r"BDT Score"),
}

hists = postprocessing.control_plots(
    events_dict,
    bb_masks,
    nonres_sig_keys + res_sig_keys,
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    bg_keys=bg_keys,
    sig_scale_dict={"HHbbVV": 1e5, "VBFHHbbVV": 2e6} | {key: 2e4 for key in res_sig_keys},
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    show=True,
)

In [None]:
# Control plots after a pass-like selection: tagger cuts on both jets plus a window
# on the bb-jet regressed mass. Each entry is {variable: [low, high]}.
selection, _ = utils.make_selection(
    {
        "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
        "bbFatJetParticleNetMD_Txbb": [0.98, CUT_MAX_VAL],
        "bbFatJetParticleNetMass": [110, 145],
    },
    events_dict,
    bb_masks,
)
cutstr = "pass_noveto"

# `sig_splits` is not defined anywhere in this notebook, so passing it raised
# NameError. The inclusive control-plot call above omits the argument, so it is
# dropped here as well.
postprocessing.control_plots(
    events_dict,
    bb_masks,
    nonres_sig_keys + res_sig_keys,
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    hists={},
    # bg_keys=bg_keys + list(higgs_samples.keys()),
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    bg_keys=["QCD", "TT", "ST", "V+Jets"],
    sig_scale_dict={key: 10 for key in nonres_sig_keys + res_sig_keys},
    selection=selection,
    cutstr=cutstr,
    show=True,
)

In [None]:
# Control plots in the bb-mass sidebands around the [110, 145] window, for three
# choices of the outer sideband edges (sb1, sb2).
for sb1, sb2 in [[0, 300], [75, 180], [92.5, 162.5]]:
    selection, _ = utils.make_selection(
        {
            "VVFatJetParTMD_THWWvsT": [0.8, CUT_MAX_VAL],
            "bbFatJetParticleNetMD_Txbb": [0.98, CUT_MAX_VAL],
            # Two disjoint ranges: below and above the signal mass window.
            "bbFatJetParticleNetMass": [[sb1, 110], [145, sb2]],
        },
        events_dict,
        bb_masks,
    )
    cutstr = f"sidebands_{sb1}_{sb2}"

    # `sig_splits` is not defined anywhere in this notebook, so passing it raised
    # NameError; the inclusive control-plot call omits the argument, so it is
    # dropped here too.
    # NOTE: "Hbb" in `bg_keys` requires the Higgs-samples cell to have been run.
    postprocessing.control_plots(
        events_dict,
        bb_masks,
        nonres_sig_keys + res_sig_keys,
        control_plot_vars,
        f"{plot_dir}/ControlPlots/{year}/",
        year,
        hists={},
        # bg_keys=bg_keys + list(higgs_samples.keys()),
        bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
        selection=selection,
        cutstr=cutstr,
        show=True,
    )

Overall LP SF

In [None]:
# `OrderedDict` and `tqdm` come from the notebook's top import cell; the in-cell
# re-imports (one of them a duplicate) were removed.

# Event selection for the "lpsf" region, chained onto the existing cutflow.
sel, cf = utils.make_selection(
    selection_regions["lpsf"].cuts, events_dict, bb_masks, prev_cutflow=cutflow
)

# {signal key: {"SF": "value ± unc", **individual uncertainties}}
sf_table = OrderedDict()

# NOTE: only the resonant signals are processed here.
for sig_key in tqdm(res_sig_keys):
    systematics[sig_key] = {}
    # calculate only for current year
    events_dict[sig_key] = postprocessing.postprocess_lpsfs(events_dict[sig_key])
    lp_sf, unc, uncs = postprocessing.get_lpsf(events_dict[sig_key], sel[sig_key])
    # print(f"BDT LP Scale Factor for {sig_key}: {lp_sf:.2f} ± {unc:.2f}")
    # print(uncs)

    # Store the scale factor and its relative uncertainty.
    systematics[sig_key]["lp_sf"] = lp_sf
    systematics[sig_key]["lp_sf_unc"] = unc / lp_sf

    sf_table[sig_key] = {"SF": f"{lp_sf:.2f} ± {unc:.2f}", **uncs}

In [None]:
# Tabulate the scale factors for exactly the signals present in `sf_table`.
# The original indexed by `nonres_sig_keys + res_sig_keys`, but the cell that fills
# `sf_table` only loops over the resonant signals, so the non-resonant keys raised
# KeyError. It also read the column names through the leaked loop variable `sig_key`.
sf_keys = list(sf_table.keys())
sf_df = pd.DataFrame(index=sf_keys)

if sf_keys:
    # Column names are taken from the first entry.
    for col in sf_table[sf_keys[0]]:
        sf_df[col] = [sf_table[skey][col] for skey in sf_keys]

# Copy to the system clipboard for pasting elsewhere (needs a clipboard backend).
sf_df.to_clipboard()
sf_df

Templates

In [None]:
# Selection regions for the templates. Use the notebook's configured `year` rather
# than a hard-coded "2017", so the regions match the year of the loaded samples.
selection_regions = postprocessing.get_res_selection_regions(year, txbb_wp="HP", thww_wp=0.8)
# Drop the fail regions for this quick look.
del selection_regions["fail"], selection_regions["failBlinded"]

In [None]:
# Quick template pass: resonant signals only, the first entry of `res_shape_vars`,
# no weight shifts and no jet shift. The return value is kept in `h`.
# `systematics` and `templates_dir` are handed to get_templates
# (presumably filled / written to by it — TODO confirm in postprocessing).
# NOTE: "Hbb" in `bg_keys` requires the Higgs-samples cell to have been run.
h = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    # nonres_sig_keys + res_sig_keys[:10],
    res_sig_keys,
    selection_regions,
    res_shape_vars[:1],
    systematics,
    templates_dir,
    bg_keys=["QCD", "TT", "V+Jets", "Diboson", "Hbb"],
    plot_dir=f"{plot_dir}/templates/",
    prev_cutflow=cutflow,
    # sig_splits=sig_splits[:2],
    weight_shifts={},
    jshift="",
    plot_shifts=False,
    pass_ylim=70,
    fail_ylim=40000,
    blind_pass=True,
    show=True,
)

In [None]:
# NOTE(review): this cell looks like a call pasted from a command-line script rather
# than notebook code. `get_templates` (unqualified), `args`, `sig_keys`, `shape_vars`,
# `template_dir`, `weight_shifts` and `jshift` are not defined or imported by any
# earlier cell visible in this notebook, so on a fresh kernel this raises NameError.
# Either delete the cell or rewrite it in terms of the notebook's own variables
# (see the `postprocessing.get_templates(...)` call in the previous cell).
get_templates(
    events_dict,
    bb_masks,
    args.year,
    sig_keys,
    selection_regions,
    shape_vars,
    systematics,
    template_dir,
    bg_keys=bg_keys,
    plot_dir=plot_dir,
    prev_cutflow=cutflow,
    # sig_splits=sig_splits,
    weight_shifts=weight_shifts,
    jshift=jshift,
    blind_pass=True if args.resonant else False,
    show=False,
    plot_shifts=args.plot_shifts,
)

In [None]:
# Accumulate templates for the nominal case ("") and every JEC / JMSR shift.
templates = {}

# `res_selection_regions` is not defined anywhere in this notebook; build the regions
# for the configured year with the same helper the configuration cell uses. Built
# once, outside the loop, and independent of the trimmed `selection_regions` above.
template_regions = postprocessing.get_res_selection_regions(year)

for jshift in [""] + jec_shifts + jmsr_shifts:
    print(jshift)
    # `sig_splits` is undefined in this notebook (the other get_templates calls have
    # it commented out), so it is no longer passed.
    # NOTE(review): the other get_templates calls in this notebook also pass
    # `systematics` and a template directory positionally and use a single return
    # value — confirm which signature/return shape is current before relying on this.
    ttemps, tsyst = postprocessing.get_templates(
        events_dict,
        bb_masks,
        year,
        nonres_sig_keys + res_sig_keys,
        template_regions,
        res_shape_vars,
        bg_keys=["QCD", "TT", "V+Jets"],
        # Only make plots for the nominal templates.
        plot_dir=plot_dir if jshift == "" else "",
        prev_cutflow=cutflow,
        weight_shifts=postprocessing.weight_shifts,
        jshift=jshift,
        pass_ylim=7,
        fail_ylim=40000,
        blind_pass=True,
        show=False,
        plot_shifts=False,
    )

    templates = {**templates, **ttemps}
    # Systematics are only taken from the nominal pass.
    if jshift == "":
        systematics[year] = tsyst

In [None]:
# Persist this year's templates as a pickle, and the systematics dictionary as JSON,
# both under `templates_dir`.
templates_path = f"{templates_dir}/{year}_templates.pkl"
with open(templates_path, "wb") as pkl_file:
    pickle.dump(templates, pkl_file)

systematics_path = f"{templates_dir}/systematics.json"
with open(systematics_path, "w") as json_file:
    json.dump(systematics, json_file)

In [None]:
# Load previously saved 2017 templates from a fixed, hard-coded directory. This
# OVERWRITES the `templates` built in the cells above.
# NOTE: unpickling executes arbitrary code — only load files you produced yourself.
with open(f"templates/Apr10//2017_templates.pkl", "rb") as f:
    templates = pickle.load(f)

In [None]:
# Length of the third axis (index 2) of the "pass" template histogram.
len(templates["pass"].axes[2])

In [None]:
# Draw 2-D template histograms for the listed regions and samples into the hists2d
# plot directory.
# NOTE(review): `selection_regions_label` is not defined or imported by any cell
# visible in this notebook, so this raises NameError on a fresh kernel.
# NOTE(review): "X[3000]->H(bb)Y[190](VV)" is not one of the mass points defined in
# `res_mps` above (which has (3000, 250)); presumably it exists in the templates
# loaded from file — verify.
plotting.hist2ds(
    templates,
    f"{plot_dir}/templates/hists2d/",
    regions=["pass", "fail", "passBlinded", "failBlinded"],
    region_labels=selection_regions_label,
    samples=["Data", "TT", "V+Jets", "X[3000]->H(bb)Y[190](VV)"],
    # fail_zlim=5e3,
    # pass_zlim=1.0,
)

In [None]:
# Display the systematics dictionary accumulated by the cells above.
systematics

In [None]:
# Load the saved templates of every year for the current `date` tag.
templates_dict = {}

# Use a dedicated loop variable: iterating with `year` would silently rebind the
# notebook-wide `year` setting to the last entry of `years`, changing what every
# later (or re-run) cell operates on.
for template_year in years:
    # NOTE: unpickling executes arbitrary code — only load files you produced yourself.
    with open(f"templates/{date}/{template_year}_templates.pkl", "rb") as f:
        templates_dict[template_year] = pickle.load(f)

In [None]:
# Load the saved templates of every year from a fixed, hard-coded directory into a
# list ordered like `years`. This rebinds `templates` (a dict in earlier cells).
templates = []
# Use a dedicated loop variable so the notebook-wide `year` setting is not
# overwritten with the last entry of `years`.
for template_year in years:
    # NOTE: unpickling executes arbitrary code — only load files you produced yourself.
    with open(f"templates/Apr7//{template_year}_templates.pkl", "rb") as f:
        templates.append(pickle.load(f))