In [82]:
import utils
import plotting
import postprocessing
import numpy as np
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning
from hh_vars import samples, sig_key, data_key, jec_shifts, jmsr_shifts, jec_vars, jmsr_vars
import os

# Silence pandas' SettingWithCopyWarning for the whole notebook; the author judged
# these warnings not to apply to the DataFrame assignments made here.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from PyPDF2 import PdfFileMerger

from copy import deepcopy

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Global matplotlib styling: apply the CMS style from mplhep (set both through
# matplotlib's style API and through mplhep's own helper).
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Tick formatter using math text; switches to scientific notation outside 10^-3..10^3.
# Not referenced by the cells visible here.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Set after the style calls so this font size takes precedence over the style's own.
plt.rcParams.update({"font.size": 16})

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory that receives the plots produced by this notebook.
plot_dir = "../../../plots/BDTPreProcessing/Feb21"
# Create the directory (and any missing parents) in pure Python rather than shelling
# out to `mkdir -p`: no shell interpolation of the path, works on any platform, and a
# failure raises an OSError instead of being silently ignored.
os.makedirs(plot_dir, exist_ok=True)

In [None]:
# Locations of the skimmed inputs (signal and background currently share a directory)
# and the data-taking year to load.
MAIN_DIR = "../../../"
samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb20"
signal_samples_dir = f"{MAIN_DIR}/../data/skimmer/Feb20"
year = "2017"

# Both Jet's Msds > 50 & at least one jet with Txbb > 0.8
# filters = [
#     [
#         ("('ak8FatJetMsd', '0')", ">=", 50),
#         ("('ak8FatJetMsd', '1')", ">=", 50),
#         ("('ak8FatJetParticleNetMD_Txbb', '0')", ">=", 0.8),
#     ],
#     [
#         ("('ak8FatJetMsd', '0')", ">=", 50),
#         ("('ak8FatJetMsd', '1')", ">=", 50),
#         ("('ak8FatJetParticleNetMD_Txbb', '1')", ">=", 0.8),
#     ],
# ]
# No load-time filters are applied; the block above is kept as an example of the format.
filters = None

# save cutflow as pandas table (one row per sample)
cutflow = pd.DataFrame(index=list(samples.keys()))

# utils.remove_empty_parquets(samples_dir, year)
# Load the signal first, then every other sample. The non-signal selection iterates
# `samples.items()` so the samples are passed in the order `samples` defines them;
# a set difference on the keys would make that order vary between kernel restarts.
events_dict = utils.load_samples(signal_samples_dir, {sig_key: samples[sig_key]}, year, filters)
events_dict |= utils.load_samples(
    samples_dir, {k: v for k, v in samples.items() if k != sig_key}, year, filters
)

# Record the yields at this stage under the "BDTPreselection" label, using the
# "weight" column.
utils.add_to_cutflow(events_dict, "BDTPreselection", "weight", cutflow)

# print weighted sample yields
for sample in events_dict:
    tot_weight = np.sum(events_dict[sample]["weight"].values)
    print(f"Pre-selection {sample} yield: {tot_weight:.2f}")

In [None]:
# Weighting step for `year`; `cutflow` is passed in as well
# (presumably updated in place — confirm in postprocessing.apply_weights).
postprocessing.apply_weights(events_dict, year, cutflow)
# Per-sample masks (indexed by sample name below) used when extracting jet features.
bb_masks = postprocessing.bb_VV_assignment(events_dict)
# Signal-only post-processing; the return value is discarded here.
_ = postprocessing.postprocess_lpsfs(events_dict[sig_key], save_all=False)
# Last expression: display the cutflow table as the cell output.
cutflow

In [None]:
# Dict passed as `hists=` to postprocessing.control_plots and rebound to its return value.
hists = {}

In [None]:
# Plot every control variable except "BDTScore".
# A filtered copy is built instead of `del`-ing from `postprocessing.control_plot_vars`:
# deleting would mutate the module-level dict, so re-running this cell would raise
# KeyError and any other user of that dict would silently lose the entry.
control_plot_vars = {
    name: spec
    for name, spec in postprocessing.control_plot_vars.items()
    if name != "BDTScore"
}

# Produce the control plots into `plot_dir`; `hists` is passed in and rebound to the
# returned value.
hists = postprocessing.control_plots(
    events_dict, bb_masks, control_plot_vars, plot_dir, hists=hists
)

In [84]:
import pyarrow as pa
import pyarrow.parquet as pq

# Samples written to the BDT input file: every sample defined in `samples`.
BDT_samples = list(samples.keys())


# TODO: Change VV msd to regressed mass?
# Nominal features (plus the event weight) stored for each event.
BDT_data_vars = [
    "MET_pt",
    "DijetEta",
    "DijetPt",
    "DijetMass",
    "bbFatJetPt",
    "VVFatJetEta",
    "VVFatJetPt",
    "VVFatJetMsd",
    "VVFatJetParTMD_THWW4q",
    "VVFatJetParTMD_probQCD",
    "VVFatJetParTMD_probHWW3q",
    "VVFatJetParTMD_probHWW4q",
    "VVFatJetParTMD_probT",
    "bbFatJetParticleNetMD_Txbb",
    "bbFatJetPtOverDijetPt",
    "VVFatJetPtOverDijetPt",
    "VVFatJetPtOverbbFatJetPt",
    "finalWeight",
]

# Shifted copies "<feature>_<shift>" of each feature listed in jec_vars / jmsr_vars.
# For a given feature the JEC shifts are listed before the JMSR shifts.
jec_jmsr_vars = []
for feature in BDT_data_vars:
    if feature in jec_vars:
        jec_jmsr_vars.extend(f"{feature}_{shift}" for shift in jec_shifts)
    if feature in jmsr_vars:
        jec_jmsr_vars.extend(f"{feature}_{shift}" for shift in jmsr_shifts)


# One DataFrame per sample, each tagged with the sample name in a "Dataset" column.
bdt_events_dict = []
for sample_name in BDT_samples:
    # "Data" gets only the nominal features; all other samples also get the shifted ones.
    if sample_name == "Data":
        columns_to_save = BDT_data_vars
    else:
        columns_to_save = BDT_data_vars + jec_jmsr_vars

    sample_events = events_dict[sample_name]
    sample_mask = bb_masks[sample_name]
    features = {
        column: utils.get_feat(sample_events, column, sample_mask)
        for column in columns_to_save
    }
    bdt_events_dict.append(pd.DataFrame(features).assign(Dataset=sample_name))

# Stack all samples row-wise and write them to a single parquet file next to the inputs.
bdt_events = pd.concat(bdt_events_dict, axis=0)

table = pa.Table.from_pandas(bdt_events)
pq.write_table(table, f"{samples_dir}/{year}_bdt_data.parquet")