In [12]:
import utils
import plotting
import postprocessing
import corrections

from utils import CUT_MAX_VAL
from hh_vars import (
    years,
    data_key,
    qcd_key,
    bg_keys,
    samples,
    nonres_sig_keys,
    nonres_samples,
    txbb_wps,
    jec_shifts,
    jmsr_shifts,
)
from postprocessing import nonres_shape_vars

import numpy as np
import pandas as pd
import pickle
from pandas.errors import SettingWithCopyWarning
from hist import Hist

import os
from copy import deepcopy
from inspect import cleandoc
import warnings

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

plt.style.use(hep.style.CMS)
hep.style.use("CMS")
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
plt.rcParams.update({"font.size": 16})

# ignore these because they don't seem to apply
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
%load_ext autoreload
%autoreload 2

In [7]:
# MAIN_DIR = "../../../"
# samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb24"
# signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb24"
samples_dir = "/eos/uscms/store/user/rkansal/bbVV/skimmer/Feb24/"
signal_samples_dir = "/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/"
bdt_data_dir = "/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/bdt_data/"
year = "2017"

date = "23Jun14"
plot_dir = f"../../../plots/PostProcessing/{date}/"
templates_dir = f"templates/{date}"
_ = os.system(f"mkdir -p {plot_dir}")
_ = os.system(f"mkdir -p {plot_dir}/cutflows/")
_ = os.system(f"mkdir -p {plot_dir}/ControlPlots/{year}/")
_ = os.system(f"mkdir -p {plot_dir}/templates/")
_ = os.system(f"mkdir -p {plot_dir}/templates/wshifts")
_ = os.system(f"mkdir -p {plot_dir}/templates/jshifts")
_ = os.system(f"mkdir -p {templates_dir}")

selection_regions = postprocessing.get_nonres_selection_regions(year)

In [5]:
list(np.load(f"{bdt_data_dir}/{year}_bdt_data_order.npy"))

FileNotFoundError: [Errno 2] No such file or directory: '/eos/uscms/store/user/cmantill/bbVV/skimmer/Jun10/bdt_data//2017_bdt_data_order.npy'

In [13]:
BDT_sample_order = nonres_sig_keys
BDT_sample_order += ["QCD", "TT", "ST", "V+Jets", "Diboson", "Data"]

for key in nonres_sig_keys.copy():
    if key not in BDT_sample_order:
        del nonres_samples[key]
        nonres_sig_keys.remove(key)

for key in bg_keys.copy():
    if key not in BDT_sample_order:
        del samples[key]
        bg_keys.remove(key)

In [14]:
nonres_sig_keys

['HHbbVV',
 'ggHH_kl_2p45_kt_1_HHbbVV',
 'ggHH_kl_5_kt_1_HHbbVV',
 'ggHH_kl_0_kt_1_HHbbVV',
 'qqHH_CV_1_C2V_1_kl_1_HHbbVV',
 'qqHH_CV_1_C2V_0_kl_1_HHbbVV',
 'qqHH_CV_1p5_C2V_1_kl_1_HHbbVV',
 'qqHH_CV_1_C2V_1_kl_2_HHbbVV',
 'qqHH_CV_1_C2V_2_kl_1_HHbbVV',
 'qqHH_CV_1_C2V_1_kl_0_HHbbVV',
 'qqHH_CV_0p5_C2V_1_kl_1_HHbbVV',
 'QCD',
 'TT',
 'ST',
 'V+Jets',
 'Diboson',
 'Data']

Load samples

In [16]:
filters = postprocessing.new_filters
systematics = {year: {}}

# save cutflow as pandas table
cutflow = pd.DataFrame(index=list(samples.keys()) + list(nonres_samples.keys()))

# utils.remove_empty_parquets(samples_dir, year)
events_dict = utils.load_samples(signal_samples_dir, nonres_samples, year, filters)
events_dict |= utils.load_samples(samples_dir, samples, year, filters)

utils.add_to_cutflow(events_dict, "BDTPreselection", "weight", cutflow)

print("")
# print weighted sample yields
for sample in events_dict:
    tot_weight = np.sum(events_dict[sample]["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

Loaded GluGluToHHTobbVV_node_cHHH1                       : 164044 entries
Loaded GluGluToHHTobbVV_node_cHHH2p45                    : 312540 entries
Loaded GluGluToHHTobbVV_node_cHHH5                       : 45377 entries
Loaded GluGluToHHTobbVV_node_cHHH0                       : 120480 entries
Loaded VBF_HHTobbVV_CV_1_C2V_1_C3_1                      : 18095 entries


MemoryError: Unable to allocate 1.35 GiB for an array with shape (341, 530332) and data type float64

Scale factors and bb VV assignment

In [None]:
postprocessing.apply_weights(events_dict, year, cutflow)
bb_masks = postprocessing.bb_VV_assignment(events_dict)
# events_dict[sig_key] = postprocessing.postprocess_lpsfs(events_dict[sig_key])
cutflow

In [None]:
postprocessing.load_bdt_preds(
    events_dict, year, f"{samples_dir}/inferences/", BDT_sample_order, jec_jmsr_shifts=True
)

Control plots

In [None]:
# {var: (bins, label)}
control_plot_vars = {
    # "MET_pt": ([50, 0, 300], r"$p^{miss}_T$ (GeV)"),
    # "DijetEta": ([50, -8, 8], r"$\eta^{jj}$"),
    # "DijetPt": ([50, 0, 750], r"$p_T^{jj}$ (GeV)"),
    # "DijetMass": (
    #     # list(range(800, 1400, 100)) + [1400, 1600, 2000, 3000, 4400],
    #     [40, 600, 4500],
    #     r"$m^{jj}$ (GeV)",
    # ),
    # "bbFatJetEta": ([50, -2.4, 2.4], r"$\eta^{bb}$"),
    # "bbFatJetPt": ([50, 300, 1500], r"$p^{bb}_T$ (GeV)"),
    "bbFatJetParticleNetMass": ([20, 50, 250], r"$m^{bb}_{reg}$ (GeV)"),
    # "bbFatJetMsd": ([50, 0, 300], r"$m^{bb}_{msd}$ (GeV)"),
    # "bbFatJetParticleNetMD_Txbb": ([50, 0.8, 1], r"$T^{bb}_{Xbb}$"),
    # "VVFatJetEta": ([50, -2.4, 2.4], r"$\eta^{VV}$"),
    # "VVFatJetPt": ([50, 300, 1500], r"$p^{VV}_T$ (GeV)"),
    # "VVFatJetParticleNetMass": (
    #     # list(range(50, 110, 10)) + list(range(110, 200, 15)) + [200, 220, 250],
    #     [20, 50, 250],
    #     r"$m^{VV}_{reg}$ (GeV)",
    # ),
    # "VVFatJetMsd": ([40, 50, 250], r"$m^{VV}_{msd}$ (GeV)"),
    # "VVFatJetParticleNet_Th4q": ([50, 0, 1], r"Prob($H \to 4q$) vs Prob(QCD) (Non-MD)"),
    # "VVFatJetParTMD_THWW4q": (
    #     [50, 0, 1],
    #     r"Prob($H \to VV \to 4q$) vs Prob(QCD) (Mass-Decorrelated)",
    # ),
    # "VVFatJetParTMD_probT": ([50, 0, 1], r"Prob(Top) (Mass-Decorrelated)"),
    # "VVFatJetParTMD_THWWvsT": (
    #     [50, 0, 1],
    #     r"$T^{VV}_{HWW}$",
    # ),
    # "bbFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{bb}_T / p_T^{jj}$"),
    # "VVFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{VV}_T / p_T^{jj}$"),
    # "VVFatJetPtOverbbFatJetPt": ([50, 0.4, 2.0], r"$p^{VV}_T / p^{bb}_T$"),
    # "nGoodMuons": ([3, 0, 3], r"# of Muons"),
    # "nGoodElectrons": ([3, 0, 3], r"# of Electrons"),
    # "nGoodJets": ([5, 0, 5], r"# of AK4 B-Jets"),
}

hists = postprocessing.control_plots(
    events_dict,
    bb_masks,
    nonres_sig_keys,
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    bg_keys=["QCD", "TT", "ST", "V+Jets", "Diboson"],
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    show=True,
)

In [None]:
cuts = [0.01, 0.1, 0.5, 0.9, 0.99]
bdtvars = ["", "TT", "VJets"]

# for ttcut in [0.01, 0.1, 0.5, 0.9, 0.99]:
#     ttsel, _ = utils.make_selection({"BDTScoreTT": [ttcut, CUT_MAX_VAL]}, events_dict, bb_masks)
#     cutstr = f"tt{ttcut}"

#     hists = postprocessing.control_plots(
#         events_dict,
#         bb_masks,
#         nonres_sig_keys,
#         control_plot_vars,
#         f"{plot_dir}/ControlPlots/{year}/",
#         year,
#         hists={},
#         bg_keys=["QCD", "TT", "ST", "V+Jets", "Diboson"],
#         selection=ttsel,
#         cutstr=cutstr,
#         show=True,
#     )

for var in bdtvars:
    for cut in cuts:
        sel, _ = utils.make_selection({f"BDTScore{var}": [cut, CUT_MAX_VAL]}, events_dict, bb_masks)
        cutstr = f"bdt{var}{cut}"
        if var == "" and cut > 0.01:
            sig_scale = 1
        else:
            sig_scale = 2e5

        hists = postprocessing.control_plots(
            events_dict,
            bb_masks,
            nonres_sig_keys,
            control_plot_vars,
            f"{plot_dir}/ControlPlots/{year}/",
            year,
            hists={},
            bg_keys=["QCD", "TT", "ST", "V+Jets", "Diboson"],
            selection=sel,
            cutstr=cutstr,
            sig_scale_dict={"HHbbVV": sig_scale},
            combine_pdf=False,
            show=True,
        )

Overall BDT SF

In [None]:
postprocessing.lpsfs(
    events_dict,
    bb_masks,
    nonres_sig_keys,
    nonres_samples,
    cutflow,
    selection_regions["lpsf"],
    systematics,
    all_years=False,
)

Templates

In [None]:
h, tsysts = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    nonres_sig_keys,
    selection_regions,
    nonres_shape_vars,
    systematics,
    plot_dir=f"{plot_dir}/templates",
    prev_cutflow=cutflow,
    weight_shifts={},
    jshift="",
    plot_shifts=False,
    show=True,
)

In [None]:
templates = {}

for jshift in [""] + jec_shifts + jmsr_shifts:
    print(jshift)
    ttemps, tsyst = postprocessing.get_templates(
        events_dict,
        bb_masks,
        year,
        selection_regions[year],
        shape_var,
        shape_bins,
        blind_window,
        plot_dir=plot_dir,
        prev_cutflow=cutflow,
        weight_shifts=postprocessing.weight_shifts,
        jshift=jshift,
        show=False,
    )

    templates = {**templates, **ttemps}
    systematics = {**systematics, **tsyst}

In [None]:
systematics

In [None]:
templates_dict = {}

for year in years:
    with open(f"templates/Feb28/{year}_templates.pkl", "rb") as f:
        templates_dict[year] = pickle.load(f)