In [61]:
import utils
import plotting
import postprocessing
import numpy as np
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning
from hh_vars import samples, sig_key, data_key, jecs, jmsr, jec_vars, jmsr_vars
import os

# ignore these because they don't seem to apply
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from PyPDF2 import PdfFileMerger

from copy import deepcopy

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Plot styling: apply the CMS style (via matplotlib and via mplhep), then override
# the default font size on top of it.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
plt.rcParams["font.size"] = 16

# Scalar tick formatter using math text; scientific notation is used for values
# outside the 10^-3 .. 10^3 range.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Output directory handed to postprocessing.control_plots further down.
plot_dir = "../../../plots/BDTPreProcessing/Feb21"
# Create the directory (and any missing parents) without spawning a shell;
# this is a no-op if it already exists and raises OSError if it cannot be created.
os.makedirs(plot_dir, exist_ok=True)

In [67]:
# Input locations. The signal and the remaining samples are read from separately
# configurable directories (currently the same skim).
MAIN_DIR = "../../../"
samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb20"
signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb20"
year = "2017"

# Optional load-time selection, currently disabled (filters = None):
# both jets' Msd >= 50 and at least one jet with Txbb >= 0.8.
# NOTE(review): the two inner lists presumably act as alternatives (one per jet) —
# confirm against utils.load_samples.
# filters = [
#     [
#         ("('ak8FatJetMsd', '0')", ">=", 50),
#         ("('ak8FatJetMsd', '1')", ">=", 50),
#         ("('ak8FatJetParticleNetMD_Txbb', '0')", ">=", 0.8),
#     ],
#     [
#         ("('ak8FatJetMsd', '0')", ">=", 50),
#         ("('ak8FatJetMsd', '1')", ">=", 50),
#         ("('ak8FatJetParticleNetMD_Txbb', '1')", ">=", 0.8),
#     ],
# ]
filters = None

# save cutflow as pandas table (one row per sample key)
cutflow = pd.DataFrame(index=list(samples.keys()))

# utils.remove_empty_parquets(samples_dir, year)

# Load the signal from its own directory first, then every other sample.
events_dict = utils.load_samples(signal_samples_dir, {sig_key: samples[sig_key]}, year, filters)
# Build the non-signal selection with a dict comprehension so that it keeps the
# order of `samples`; `samples.keys() - [sig_key]` would produce a set, whose
# iteration order is not stable from one kernel session to the next.
events_dict |= utils.load_samples(
    samples_dir,
    {key: value for key, value in samples.items() if key != sig_key},
    year,
    filters,
)

# Record the yields right after loading as the first cutflow column.
utils.add_to_cutflow(events_dict, "BDTPreselection", "weight", cutflow)

# print weighted sample yields
for sample, sample_events in events_dict.items():
    tot_weight = np.sum(sample_events["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

Loaded GluGluToHHTobbVV_node_cHHH1                       : 142209 entries
Loaded TTToSemiLeptonic                                  : 518183 entries
Loaded TTToHadronic                                      : 755775 entries
Loaded QCD_HT300to500                                    : 18 entries
Loaded QCD_HT700to1000                                   : 129801 entries
Loaded QCD_HT1000to1500                                  : 78731 entries
Loaded QCD_HT2000toInf                                   : 40152 entries
Loaded QCD_HT1500to2000                                  : 74228 entries
Loaded QCD_HT500to700                                    : 15233 entries
Loaded WW                                                : 827 entries
Loaded ZZ                                                : 1309 entries
Loaded WZ                                                : 2027 entries
Loaded ST_s-channel_4f_hadronicDecays                    : 20243 entries
Loaded ST_tW_top_5f_inclusiveDecays                   

In [70]:
# Apply the event weights for this year and record the effect in `cutflow`.
# The "TriggerEffs" and "QCD SF" columns in the table displayed below presumably
# come from this call, as does the printed QCD_SCALE_FACTOR — confirm in postprocessing.
# NOTE(review): if apply_weights modifies events_dict in place, re-running this cell
# on the same kernel may apply the weights twice — confirm before re-executing.
postprocessing.apply_weights(events_dict, year, cutflow)
# Per-sample masks (indexed by sample name later in this notebook and passed to
# utils.get_feat / control_plots); presumably they flag which fat jet is the bb
# candidate and which the VV candidate — TODO confirm.
bb_masks = postprocessing.bb_VV_assignment(events_dict)
# Bare last expression: renders the cutflow table as the cell output.
cutflow

QCD_SCALE_FACTOR = 0.8718028070925907


In [71]:
# Display the cutflow table (rows: samples; columns: one per recorded selection step).
cutflow

Unnamed: 0,BDTPreselection,TriggerEffs,QCD SF
HHbbVV,2.973773,1.566205,1.566205
QCD,2004249.0,942036.441454,821270.014043
TT,146703.8,88620.85112,88620.85112
ST,9597.525,6076.282295,6076.282295
V+Jets,54290.34,30732.272295,30732.272295
Diboson,852.9929,499.580247,499.580247
Data,947199.0,947199.0,947199.0


In [None]:
# Only the signal sample is passed to postprocess_lpsfs.
# Note that the name `events` is reused as a loop variable in the BDT export cell below.
events = events_dict[sig_key]
# NOTE(review): "lpsfs" presumably stands for Lund-plane scale factors, and the call
# presumably updates `events` in place since its return value is not used — confirm
# in postprocessing. save_all=False is passed explicitly.
postprocessing.postprocess_lpsfs(events, save_all=False)

In [None]:
# Histogram store: passed to postprocessing.control_plots via `hists=` and replaced by
# its return value in the control-plot cell. Re-running this cell resets it to empty.
hists = {}

In [None]:
# Variables drawn in the control plots, as {var: (bins, label)}.
# `bins` is presumably [number of bins, lower edge, upper edge] — TODO confirm against
# postprocessing.control_plots. `label` is the axis label (raw strings, LaTeX math allowed).
control_plot_vars = {
    "MET_pt": ([50, 0, 250], r"$p^{miss}_T$ (GeV)"),
    "DijetEta": ([50, -8, 8], r"$\eta^{jj}$"),
    "DijetPt": ([50, 0, 750], r"$p_T^{jj}$ (GeV)"),
    "DijetMass": ([50, 500, 3000], r"$m^{jj}$ (GeV)"),
    "bbFatJetEta": ([50, -2.4, 2.4], r"$\eta^{bb}$"),
    "bbFatJetPt": ([50, 300, 1300], r"$p^{bb}_T$ (GeV)"),
    "bbFatJetParticleNetMass": ([50, 0, 300], r"$m^{bb}_{reg}$ (GeV)"),
    "bbFatJetMsd": ([50, 0, 300], r"$m^{bb}_{msd}$ (GeV)"),
    "bbFatJetParticleNetMD_Txbb": ([50, 0.8, 1], r"$p^{bb}_{Txbb}$"),
    "VVFatJetEta": ([50, -2.4, 2.4], r"$\eta^{VV}$"),
    "VVFatJetPt": ([50, 300, 1300], r"$p^{VV}_T$ (GeV)"),
    "VVFatJetParticleNetMass": ([50, 0, 300], r"$m^{VV}_{reg}$ (GeV)"),
    "VVFatJetMsd": ([50, 0, 300], r"$m^{VV}_{msd}$ (GeV)"),
    "VVFatJetParticleNet_Th4q": ([50, 0, 1], r"Prob($H \to 4q$) vs Prob(QCD) (Non-MD)"),
    "VVFatJetParTMD_THWW4q": (
        [50, 0, 1],
        r"Prob($H \to VV \to 4q$) vs Prob(QCD) (Mass-Decorrelated)",
    ),
    "VVFatJetParTMD_probT": ([50, 0, 1], r"Prob(Top) (Mass-Decorrelated)"),
    "bbFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{bb}_T / p_T^{jj}$"),
    "VVFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{VV}_T / p_T^{jj}$"),
    "VVFatJetPtOverbbFatJetPt": ([50, 0.4, 2.0], r"$p^{VV}_T / p^{bb}_T$"),
    "nGoodMuons": ([3, 0, 3], r"# of Muons"),
    "nGoodElectrons": ([3, 0, 3], r"# of Electrons"),
    "nGoodJets": ([5, 0, 5], r"# of AK4 B-Jets"),
    # Disabled entry: no BDT score is plotted from this notebook.
    # "BDTScore": ([50, 0, 1], r"BDT Score"),
}

# Produce the control plots for all loaded samples using plot_dir as the output location.
# The existing `hists` dict is passed in and the returned value replaces it.
hists = postprocessing.control_plots(
    events_dict, bb_masks, control_plot_vars, plot_dir, hists=hists
)

In [66]:
import pyarrow as pa
import pyarrow.parquet as pq

# Every sample key defined in `samples` is exported.
BDT_samples = list(samples)

# TODO: Change VV msd to regressed mass?
# Nominal columns saved for every sample (including the per-event weight).
BDT_data_vars = [
    "MET_pt",
    "DijetEta",
    "DijetPt",
    "DijetMass",
    "bbFatJetPt",
    "VVFatJetEta",
    "VVFatJetPt",
    "VVFatJetMsd",
    "VVFatJetParTMD_THWW4q",
    "VVFatJetParTMD_probQCD",
    "VVFatJetParTMD_probHWW3q",
    "VVFatJetParTMD_probHWW4q",
    "VVFatJetParTMD_probT",
    "bbFatJetParticleNetMD_Txbb",
    "bbFatJetPtOverDijetPt",
    "VVFatJetPtOverDijetPt",
    "VVFatJetPtOverbbFatJetPt",
    "finalWeight",
]

# Names of the shifted columns. For each nominal variable, in order: if it is listed in
# jec_vars, one "<var>_<key>_<shift>" entry per key of jecs and per up/down shift; then
# the same for jmsr_vars / jmsr.
jec_jmsr_vars = [
    f"{nominal}_{shift_key}_{direction}"
    for nominal in BDT_data_vars
    for shifted_vars, shift_keys in ((jec_vars, jecs), (jmsr_vars, jmsr))
    if nominal in shifted_vars
    for shift_key in shift_keys
    for direction in ("up", "down")
]

# Despite its name this is a list: one DataFrame per sample, in BDT_samples order.
bdt_events_dict = []

for sample_name in BDT_samples:
    # "Data" only gets the nominal columns; every other sample also gets the shifted ones.
    if sample_name == "Data":
        columns = BDT_data_vars
    else:
        columns = BDT_data_vars + jec_jmsr_vars

    sample_events = events_dict[sample_name]
    sample_mask = bb_masks[sample_name]
    features = {
        column: utils.get_feat(sample_events, column, sample_mask) for column in columns
    }

    sample_frame = pd.DataFrame(features)
    # Tag each row with the sample it came from.
    sample_frame["Dataset"] = sample_name
    bdt_events_dict.append(sample_frame)

# Stack all samples row-wise; columns missing for a sample are filled with NaN by concat.
bdt_events = pd.concat(bdt_events_dict)

# Write the combined table next to the input skims.
table = pa.Table.from_pandas(bdt_events)
pq.write_table(table, f"{samples_dir}/bdt_data.parquet")