In [None]:
import numpy as np
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning
from pathlib import Path
import pickle

from HHbbVV.hh_vars import data_key, years
import plotting
import postprocessing
import utils
import TTCalibration

from copy import deepcopy

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import mplhep as hep
from PyPDF2 import PdfMerger

# ignore these because they don't seem to apply
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Select the CMS style from mplhep, once through matplotlib and once through mplhep.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Scalar tick formatter with math-text exponents and power limits of (-3, 3).
# NOTE(review): `formatter` is not referenced again in the visible notebook.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Default font size; a later plotting cell raises it to 24.
plt.rcParams.update({"font.size": 16})

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Output directory used for the figures saved below; created (with parents) if missing.
plot_dir = Path("../../../plots/ttsfs/25Mar26_2017")
plot_dir.mkdir(parents=True, exist_ok=True)

## Load samples

In [None]:
from TTCalibration import (
    bg_samples,
    sig_samples,
    samples,
    top_matched_key,
    top_wmatched_key,
    top_unmatched_key,
)

year = "2017"

# Input directories; the commented-out paths are earlier productions.
# data_dir = "/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Feb28_update_lp/"
data_dir = "/ceph/cms/store/user/rkansal/bbVV/ttsfs/25Mar25DeepAK8/"
# signal_data_dir = "/ceph/cms/store/user/rkansal/bbVV/ttsfs/25Jan30GenSelFix/"
signal_data_dir = "/ceph/cms/store/user/rkansal/bbVV/ttsfs/25Mar25DeepAK8/"

# Load the background samples first, then merge the signal samples into the same dict.
events_dict = postprocessing.load_samples(data_dir, bg_samples, year, hem_cleaning=False)
events_dict.update(
    postprocessing.load_samples(signal_data_dir, sig_samples, year, hem_cleaning=False)
)

# One cutflow row per sample key; the first column is the yield after the selection.
cutflow = pd.DataFrame(index=[*samples])
utils.add_to_cutflow(events_dict, "Selection", "weight", cutflow)
cutflow

In [None]:
# Apply the TTCalibration steps to events_dict and record the yields in the
# cutflow after the first two.
# NOTE(review): the return values are discarded, so these calls presumably
# modify events_dict in place; re-running this cell without reloading the
# samples would then apply them again — confirm.
TTCalibration.normalize_events(events_dict)
utils.add_to_cutflow(events_dict, "Scale", "weight", cutflow)

TTCalibration.fatjet_selection(events_dict)
utils.add_to_cutflow(events_dict, "FatJetSelection", "weight", cutflow)

# No cutflow entry is added after the top matching.
TTCalibration.top_matching(events_dict)
cutflow

In [None]:
# Summed event weights of the three top-matching categories, one per line.
print(
    *(
        np.sum(events_dict[category]["weight"])
        for category in (top_matched_key, top_wmatched_key, top_unmatched_key)
    ),
    sep="\n",
)

## LP SF Processing

In [None]:
# Run the Lund-plane scale-factor steps on the top-matched sample only.
# NOTE(review): nothing returned is kept, and later cells read "lp_sf_*" columns
# from events_dict[top_matched_key], so these calls presumably add their columns
# to `events` in place — confirm.
events = events_dict[top_matched_key]
TTCalibration.lp_sf_processing(events)
TTCalibration.lp_sf_normalization(events)

In [None]:
plt.rcParams.update({"font.size": 24})
fig, ax = plt.subplots(figsize=(12, 12))
# Column 10 of the "lp_sf_lnN" scale factors, in 100 log-spaced bins from 1e-4 to 1e2.
_ = ax.hist(
    events_dict[top_matched_key]["lp_sf_lnN"][10].values,
    bins=np.logspace(-4, 2, 101, base=10),
    histtype="step",
)
ax.set_xscale("log")
# ax.set_yscale("log")
ax.set_xlabel("LP SF")
ax.set_title("Scale factor distribution")
plt.show()

In [None]:
# events = events_dict[top_matched_key]
# sj_matching_unc = (
#     (np.sum(events["lp_sf_unmatched_quarks"]) / (len(events) * 3))
#     # OR of double matched and boundary quarks
#     # >0.1 to avoid floating point errors
#     + (
#         np.sum((events["lp_sf_double_matched_event"] + events["lp_sf_boundary_quarks"]) > 0.1)
#         / (len(events))
#     )
# ).values[0]
# sj_matching_unc

### Testing distortion uncertainty

In [None]:
import uproot
from tqdm import tqdm

package_path = Path("../")

# Compare the Lund-plane density stored in the ratio file with the density of a
# signal sample. Only the last entry of `years` is processed.
for dist_year in tqdm(years[-1:]):
    ratio_path = package_path / f"corrections/lp_ratios/ratio_{dist_year}.root"
    # The histogram is copied into numpy arrays, so the ROOT file can be closed
    # right away. A dedicated name keeps the handle from being shadowed by the
    # pickle file opened further down.
    with uproot.open(ratio_path) as ratio_file:
        # 3D histogram: [subjet_pt, ln(0.8/Delta), ln(kT/GeV)]
        mc_nom = ratio_file["mc_nom"].to_numpy()
        # ratio_nom = ratio_file["ratio_nom"].to_numpy()[0]
    ratio_edges = mc_nom[1:]
    mc_nom = mc_nom[0]

    # Normalise within each slice of the first axis.
    mc_tot = np.sum(mc_nom, axis=(1, 2), keepdims=True)
    mc_density = mc_nom / mc_tot
    # plotting.plot_lund_plane_six(
    #     mc_density, ratio_edges, name=f"{plot_dir}/{dist_year}_MC.pdf", show=True
    # )

    for sig in ["GluGluToHHTobbVV_node_cHHH1", "VBF_HHTobbVV_CV_1_C2V_2_C3_1", "TTToSemiLeptonic"]:
        # Only the 2018 TTToSemiLeptonic comparison is enabled; everything else is skipped.
        if sig != "TTToSemiLeptonic" or dist_year != "2018":
            continue

        sig_hist_path = package_path / f"corrections/lp_ratios/signals/{dist_year}_{sig}.hist"
        # pickle.load can execute arbitrary code: only point this at trusted,
        # project-produced files.
        with sig_hist_path.open("rb") as sig_hist_file:
            sig_lp_hist = pickle.load(sig_hist_file)

        sig_tot = np.sum(sig_lp_hist.values(), axis=(1, 2), keepdims=True)
        sig_density = sig_lp_hist.values() / sig_tot

        # 0s -> 1 in the ratio
        mc_sig_ratio = np.nan_to_num(mc_density / sig_density, nan=1.0)

        # too low stats
        mc_sig_ratio[mc_sig_ratio <= 0.5] = 1.0
        mc_sig_ratio[mc_sig_ratio >= 2] = 1.0

        # mc_sig_ratio = np.clip(mc_sig_ratio, 0.5, 2.0)

        # plotting.plot_lund_plane_six(
        #     sig_density,
        #     ratio_edges,
        #     name=f"{plot_dir}/{dist_year}_{sig}_tt_reweighted.pdf",
        #     show=True,
        # )

        plotting.plot_lund_plane_six(
            mc_sig_ratio,
            ratio_edges,
            name=f"{plot_dir}/{dist_year}_{sig}_ratio_tt_reweighted.pdf",
            log=True,
            show=True,
        )

        # break

In [None]:
# Lund-plane plot of the top MC / signal density ratio.
# NOTE(review): in the visible notebook `top_mc_sig_ratio` is first assigned in the
# cell that follows this one, so on a fresh kernel this cell raises NameError
# unless that cell has been run first.
plotting.plot_lund_plane_six(
    top_mc_sig_ratio,
    ratio_edges,
    name=f"{plot_dir}/tt_ratio.pdf",
    show=True,
)

In [None]:
# Ratio of the 2018 top-reweighting MC density to the TTToSemiLeptonic density.
top_ratio_path = package_path / "corrections/lp_ratios/top_RW_2018_june17_ratio.root"
# The histogram is copied into numpy arrays, so the ROOT file can be closed right away.
with uproot.open(top_ratio_path) as top_ratio_file:
    mc_nom = top_ratio_file["mc_nom"].to_numpy()
ratio_edges = mc_nom[1:]
mc_nom = mc_nom[0]

# Normalise within each slice of the first axis.
mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)
mc_density = mc_nom / mc_tot_pt
plotting.plot_lund_plane_six(
    mc_density, ratio_edges, name=f"{plot_dir}/2018_top_MC.pdf", show=False
)

sig_hist_path = package_path / "corrections/lp_ratios/signals/2018_TTToSemiLeptonic.hist"
# pickle.load can execute arbitrary code: only point this at trusted,
# project-produced files.
with sig_hist_path.open("rb") as sig_hist_file:
    sig_lp_hist = pickle.load(sig_hist_file)

sig_tot_pt = np.sum(sig_lp_hist.values(), axis=(1, 2), keepdims=True)
sig_density = sig_lp_hist.values() / sig_tot_pt

# NaNs (0/0 bins) and exact zeros are both replaced by a ratio of 1.
top_mc_sig_ratio = np.nan_to_num(mc_density / sig_density, nan=1.0)
top_mc_sig_ratio[top_mc_sig_ratio == 0] = 1.0
# Clip the ratio computed in this cell. Clipping `mc_sig_ratio` from the
# distortion-uncertainty cell here would throw away the two lines above.
top_mc_sig_ratio = np.clip(top_mc_sig_ratio, 0.05, 20.0)

In [None]:
# Display the bin contents of the most recently loaded signal Lund-plane histogram.
sig_lp_hist.values()

In [None]:
import hist

# Empty weighted 3D histogram whose axes reuse the three edge arrays in `ratio_edges`.
# NOTE(review): `h` is not referenced again in the visible notebook.
h = hist.Hist(
    hist.axis.Variable(ratio_edges[0], name="subjet_pt", label="Subjet pT [GeV]"),
    hist.axis.Variable(ratio_edges[1], name="logD", label="ln(0.8/Delta)"),
    hist.axis.Variable(ratio_edges[2], name="logkt", label="ln(kT/GeV)"),
    storage=hist.storage.Weight(),
)

In [None]:
# Signal / MC density ratio normalised over the whole histogram (not per slice of
# the first axis), with NaNs set to 1 and the result clipped to [0.5, 2].
# NOTE: this rebinds `mc_tot` and `sig_tot`, which the distortion-uncertainty
# loop earlier in the notebook assigns as keepdims arrays.
mc_tot = np.sum(mc_nom)
sig_tot = sig_lp_hist.sum()
sig_mc_ratio = np.clip(
    np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0
)

In [None]:
# Normalisations within each slice of the first axis.
sig_tot_pt = np.sum(sig_lp_hist.values(), axis=(1, 2), keepdims=True)
mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)

# MC / signal density ratio; NaNs from 0/0 bins become 1.
mc_sig_ratio_pt = np.nan_to_num(
    (mc_nom / mc_tot_pt) / (sig_lp_hist.values() / sig_tot_pt), nan=1.0
)
# Exact zeros are replaced by 1 before the ratio is clipped to [0.5, 2].
mc_sig_ratio_pt = np.clip(np.where(mc_sig_ratio_pt == 0, 1.0, mc_sig_ratio_pt), 0.5, 2.0)
plt.imshow(mc_sig_ratio_pt[0])

In [None]:
# Same per-slice MC / signal ratio as the previous cell, but using `sig_old_lp_hist`.
# NOTE(review): `sig_old_lp_hist` is not assigned anywhere in the visible notebook,
# so this cell depends on leftover kernel state — confirm where it should be loaded.
# NOTE: `sig_tot_pt` is rebound here to the totals of the old histogram.
sig_tot_pt = np.sum(sig_old_lp_hist.values(), axis=(1, 2), keepdims=True)
mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)
mc_sig_old_ratio_pt = np.nan_to_num(
    (mc_nom / mc_tot_pt) / (sig_old_lp_hist.values() / sig_tot_pt), nan=1.0
)
# Exact zeros become 1, then the ratio is clipped to [0.5, 2].
mc_sig_old_ratio_pt[mc_sig_old_ratio_pt == 0] = 1.0
mc_sig_old_ratio_pt = np.clip(mc_sig_old_ratio_pt, 0.5, 2.0)
# Show the first slice along the first axis.
plt.imshow(mc_sig_old_ratio_pt[0])

## Plots

In [None]:
from TTCalibration import plot_samples, bg_colours, plot_vars

# Build the "pre" histograms, then the "post" (lnN) histograms, which take the
# pre histograms as input and also return an uncertainty dict and an error object.
pre_hists = TTCalibration.plot_pre_hists(events_dict, plot_dir, year, show=True)
post_lnN_hists, uncs_lnN_dict, post_lnN_hists_err = TTCalibration.plot_post_hists(
    events_dict, pre_hists, plot_dir, year, show=True
)

# binn=-1 is passed through to bin_sf; the "Truncated Gaussians" cell further down
# uses the same value to index the last bin — presumably the same meaning here.
TTCalibration.bin_sf(
    pre_hists, post_lnN_hists, uncs_lnN_dict, post_lnN_hists_err, plot_dir, binn=-1
)
TTCalibration.chisq_diff(pre_hists, post_lnN_hists, plot_dir, lb=20)

### Same plot

In [None]:
# One pre/post overlay per plotted variable, written to PrePostlnN_<var>.pdf.
for var, var_hist in post_lnN_hists.items():
    plotting.ratioLinePlotPrePost(
        var_hist,
        pre_hists[var],
        plot_samples,
        year,
        bg_colours=bg_colours,
        bg_err=post_lnN_hists_err[var],
        name=f"{plot_dir}/PrePostlnN_{var}.pdf",
    )

## Old stuff

### Post plots with truncated Gaussians (ignore now!)

In [None]:
# Build "post" histograms for the top-matched sample by reweighting each plotted
# variable with the "lp_sf" columns, and collect per-bin uncertainties.
post_hists = {}
post_hists_err = {}
uncs_dict = {}

events = events_dict[top_matched_key]

# `bins` is used as [nbins, low, high]: np.linspace(*bins[1:], bins[0] + 1) gives
# the nbins + 1 bin edges.
for var, (bins, label) in plot_vars.items():
    # if var not in post_hists:
    # One histogram per column of "lp_sf".
    toy_hists = []
    for i in range(events["lp_sf"].shape[1]):
        toy_hists.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events["lp_sf"][i].values,
            )[0]
        )

    # Up/down variations: [up, down] histograms for each pair of columns below.
    sys_up_down = []
    for key in ["lp_sf_sys_up", "lp_sf_sys_down"]:
        sys_up_down.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events[key][0].values,
            )[0]
        )

    np_up_down = []
    for key in ["lp_sf_np_up", "lp_sf_np_down"]:
        np_up_down.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events[key][0].values,
            )[0]
        )

    # NOTE(review): unlike the other weight columns, `events[key].values` below has
    # no `[0]` column selection — confirm these columns are one-dimensional.
    um_up_down = []
    for key in ["lp_sf_unmatched_up", "lp_sf_unmatched_down"]:
        um_up_down.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events[key].values,
            )[0]
        )

    nom_vals = toy_hists[0]  # first column are nominal values

    # One histogram per column of "lp_sf_pt_extrap_vars".
    pt_toy_hists = []
    for i in range(events["lp_sf_pt_extrap_vars"].shape[1]):
        pt_toy_hists.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events["lp_sf_pt_extrap_vars"][i].values,
            )[0]
        )

    # Histogram weighted by "lp_sfs_bl_ratio" times the first "lp_sf_lnN" column.
    b_ratio_hist = np.histogram(
        events[var][0].values.squeeze(),
        np.linspace(*bins[1:], bins[0] + 1),
        weights=events["weight"][0].values
        * events["lp_sfs_bl_ratio"][0].values
        * events["lp_sf_lnN"][0].values,
    )[0]

    # Per-bin absolute uncertainties; most are capped at the nominal yield.
    # NOTE(review): "syst_b_unc" divides by nom_vals, so bins with a zero nominal
    # yield produce NaN/inf there and in the combined norm below.
    uncs = {
        "stat_unc": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc
        "syst_rat_unc": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),
        "np_unc": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),
        "um_unc": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),
        # "syst_sjm_unc": nom_vals * sj_matching_unc,
        "syst_sjpt_unc": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),
        "syst_b_unc": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,
    }

    # uncs = {}

    # for i, shift in enumerate(["up", "down"]):
    #     uncs[shift] = {
    #         "syst_rat_unc": np.clip(sys_up_down[i], 0, 2 * nom_vals),
    #         "np_unc": np.clip(np_up_down[i], 0, 2 * nom_vals),
    #         "um_unc": np.clip(um_up_down[i], 0, 2 * nom_vals),
    #     }

    #     uncs[shift]

    #     for key, val in uncs_symm.items():
    #         if shift == "up":
    #             uncs[shift][key] = nom_vals + val
    #         else:
    #             uncs[shift][key] = nom_vals - val

    uncs_dict[var] = uncs

    # Combine the individual uncertainties in quadrature, bin by bin.
    unc = np.linalg.norm(list(uncs.values()), axis=0)

    # Copy the pre histogram and overwrite only the top-matched row with the
    # reweighted nominal values. `top_matched_key_index` is reused by a later cell.
    thist = deepcopy(pre_hists[var])
    top_matched_key_index = np.where(np.array(list(thist.axes[0])) == top_matched_key)[0][0]
    thist.view(flow=False)[top_matched_key_index, :].value = nom_vals
    post_hists[var] = thist
    post_hists_err[var] = unc


# Plot every post histogram and merge the individual PDFs into PostPlots.pdf.
merger_post_plots = PdfMerger()

for var, var_hist in post_hists.items():
    name = f"{plot_dir}/post_{var}.pdf"
    plotting.ratioLinePlot(
        var_hist,
        plot_samples,
        year,
        bg_colours=bg_colours,
        bg_err=post_hists_err[var],
        name=name,
    )
    merger_post_plots.append(name)

merger_post_plots.write(f"{plot_dir}/PostPlots.pdf")
merger_post_plots.close()

### Truncated Gaussians (ignore again)

In [None]:
# Print the scale factor and uncertainties of one bin (binn = -1: the last bin).
# NOTE: `top_matched_key_index` is the value left behind by the loop in the
# previous cell, so that cell must have been run first.
binn = -1
tvar = "ak8FatJetParTMD_THWW4q"
pre_vals = pre_hists[tvar].view(flow=False)[top_matched_key_index, :].value
nom_vals = post_hists[tvar].view(flow=False)[top_matched_key_index, :].value
unc = post_hists_err[tvar]
# Scale factor = post / pre yield in the chosen bin.
print("SF: ", nom_vals[binn] / pre_vals[binn])
# Individual and combined uncertainties as a percentage of the post yield.
print("Uncs: ", {key: val[binn] / nom_vals[binn] * 100 for key, val in uncs_dict[tvar].items()})
print("Combined: ", unc[binn] / nom_vals[binn] * 100)
# Combined uncertainty relative to the pre yield.
print("Abs: ", unc[binn] / pre_vals[binn])

### Ratio plots

In [None]:
tvar = "ak8FatJetParTMD_THWW4q"

# plt.figure(figsize=(12, 12))
# hists = pre_hists[tvar]
# bg_tot = np.sum(hists[plot_samples, :].values(), axis=0)
# mcdata_ratio = (bg_tot + 1e-5) / hists[data_key, :].values()
# _ = plt.hist(mcdata_ratio - 1, np.linspace(-0.5, 0.5, 10), histtype='step')

# Histogram of per-bin pulls, (MC - Data) / Unc., for the post histogram of `tvar`.
plt.figure(figsize=(12, 12))
hists = post_hists[tvar]
# Total MC is the sum over `plot_samples`; data is the `data_key` row.
bg_tot = np.sum(hists[plot_samples, :].values(), axis=0)
data_tot = hists[data_key, :].values()
unc = post_hists_err[tvar]
# NOTE(review): `mcdata_ratio` is not used in the rest of this cell.
mcdata_ratio = (bg_tot) / data_tot
# The first ten bins are excluded from the pull distribution.
_ = plt.hist(((bg_tot - data_tot) / (unc))[10:], np.linspace(-6.5, 4.5, 23), histtype="step")
plt.xlabel("(MC - Data) / Unc.")
plt.savefig(f"{plot_dir}/pull_hist.pdf")

In [None]:
# Same post plot for `tvar`, drawn with pulls=True and saved under a fixed file name.
plotting.ratioLinePlot(
    post_hists[tvar],
    plot_samples,
    year,
    bg_err=post_hists_err[tvar],
    name=f"{plot_dir}/post_ak8FatJetParTMD_THWW4q_pulls.pdf",
    pulls=True,
)

In [None]:
# For every sample keep only the events whose "tau42" (column 0) is at most 0.3.
cut_dict = {
    sample: sample_events[sample_events["tau42"][0] <= 0.3]
    for sample, sample_events in events_dict.items()
}

In [None]:
# {var: (bins, label)}, with bins given as [nbins, low, high].
# NOTE: this rebinds the `plot_vars` imported from TTCalibration earlier in the
# notebook; the next cell relies on this single-variable version.
plot_vars = {
    "ak8FatJetParTMD_THWW4q": ([20, 0.6, 1], r"ParT $T_{HWW4q}$ MD"),
}

# Histograms of the tau42-cut events, one per plotted variable.
pre_hists_cut = {
    var: utils.singleVarHistNoMask(cut_dict, var, bins, label, weight_key="weight")
    for var, (bins, label) in plot_vars.items()
}

# The notebook imports `PdfMerger` from PyPDF2; `PdfFileMerger` is not in scope.
merger_pre_plots = PdfMerger()

for var, var_hist in pre_hists_cut.items():
    name = f"{plot_dir}/pre_{var}_tau42_cut.pdf"
    plotting.ratioLinePlot(
        var_hist,
        plot_samples,
        year,
        bg_err=None,
        name=name,
    )
    merger_pre_plots.append(name)

# NOTE(review): no merged PDF is written for these plots; the merger is only
# closed so the appended files are released.
merger_pre_plots.close()

In [None]:
# Post (lnN) histograms for the tau42-cut events, using the "lp_sf_lnN" columns.
post_lnN_cut_hists = {}
post_lnN_cut_hists_err = {}
uncs_lnN_cut_dict = {}

events = cut_dict[top_matched_key]

# `bins` is used as [nbins, low, high]: np.linspace(*bins[1:], bins[0] + 1) gives
# the nbins + 1 bin edges.
for var, (bins, label) in plot_vars.items():
    # One histogram per column of "lp_sf_lnN".
    toy_hists = []
    for i in range(events["lp_sf_lnN"].shape[1]):
        toy_hists.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events["lp_sf_lnN"][i].values,
            )[0]
        )

    # [up, down] histograms of the systematic variation.
    sys_up_down = []
    for key in ["lp_sf_sys_up", "lp_sf_sys_down"]:
        sys_up_down.append(
            np.histogram(
                events[var][0].values.squeeze(),
                np.linspace(*bins[1:], bins[0] + 1),
                weights=events["weight"][0].values * events[key][0].values,
            )[0]
        )

    nom_vals = toy_hists[0]  # first column are nominal values

    # NOTE(review): `sj_matching_unc` and `sj_pt_unc` are not assigned anywhere in
    # the visible notebook (the only `sj_matching_unc` computation is commented
    # out), so this cell raises NameError until they are defined — confirm where
    # they should come from.
    uncs = {
        "stat_unc": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc
        "syst_rat_unc": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),
        "syst_sjm_unc": nom_vals * sj_matching_unc,
        "syst_sjpt_unc": nom_vals * sj_pt_unc,
    }

    uncs_lnN_cut_dict[var] = uncs

    # Combine the individual uncertainties in quadrature, bin by bin.
    unc = np.linalg.norm(list(uncs.values()), axis=0)

    # Start from the cut pre histogram, so the binning and the other samples match
    # the cut events, and overwrite only the top-matched row.
    thist = deepcopy(pre_hists_cut[var])
    top_matched_key_index = np.where(np.array(list(thist.axes[0])) == top_matched_key)[0][0]
    thist.view(flow=False)[top_matched_key_index, :].value = nom_vals
    post_lnN_cut_hists[var] = thist

    post_lnN_cut_hists_err[var] = unc


# The notebook imports `PdfMerger` from PyPDF2; `PdfFileMerger` is not in scope.
merger_post_plots = PdfMerger()

for var, var_hist in post_lnN_cut_hists.items():
    name = f"{plot_dir}/postlnN_{var}_cut.pdf"
    plotting.ratioLinePlot(
        var_hist,
        plot_samples,
        year,
        bg_err=post_lnN_cut_hists_err[var],
        name=name,
    )
    merger_post_plots.append(name)

# NOTE(review): no merged PDF is written for these plots; the merger is only
# closed so the appended files are released.
merger_post_plots.close()

In [None]:
# Histogram of "ak8FatJetMass" for all samples in events_dict, with the bin spec
# [20, 125, 225] (used as [nbins, low, high] elsewhere in this notebook).
mass_hist = utils.singleVarHistNoMask(
    events_dict, "ak8FatJetMass", [20, 125, 225], r"$m_{SD}$", weight_key="weight"
)

In [None]:
# Mass plot for the listed samples, with the three top-matching categories
# passed last in the list.
# NOTE(review): the third argument is the plot directory with a trailing slash,
# not a file name — confirm this matches what ratioHistPlot expects.
plotting.ratioHistPlot(
    mass_hist,
    [
        "QCD",
        "Diboson",
        "Single Top",
        "W+Jets",
        top_unmatched_key,
        top_wmatched_key,
        top_matched_key,
    ],
    f"{plot_dir}/",
)