# Parse, combine and interpolate limits

In [2]:
from __future__ import annotations

import os
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import mplhep as hep
import numpy as np
import pandas as pd
from scipy import interpolate
from tqdm import tqdm

from HHbbVV.hh_vars import res_sigs, years, LUMI
from HHbbVV.postprocessing import plotting, utils
from HHbbVV.postprocessing.utils import mxmy
from HHbbVV.resonant import ProcessLimits
from HHbbVV.resonant.ProcessLimits import get_lim

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Base directory, relative to where the notebook runs, under which plots are saved.
MAIN_DIR = "../../../"
# Output directory for the plots written by this notebook; created up front
# (including parents) so the plotting calls can save into it.
plot_dir = Path(f"{MAIN_DIR}/plots/XHY/Limits/25Apr22SigEffs")
plot_dir.mkdir(parents=True, exist_ok=True)

# Directory handed to ProcessLimits.get_limits; combined_limits.csv is also written here.
cards_dir = Path("/eos/uscms/store/user/rkansal/bbVV/cards/25Mar29QCDTF11nTF21")
# Directory holding the per-signal cutflow CSVs read in the signal-efficiency cells.
# NOTE(review): cards_dir is under /eos and templates_dir under /ceph — confirm
# both are reachable from the host this notebook runs on.
templates_dir = Path("/ceph/cms/store/user/rkansal/bbVV/templates/25Feb8XHYFix")
# Alternative cards directory (currently disabled):
# cards_dir = Path("/eos/uscms/store/user/rkansal/bbVV/cards/25Feb19ResUnblinded")

In [5]:
def label_map(key):
    """Return the plot label for a limits-table key.

    ``"50.0"``, ``"Observed"`` and ``"Significance"`` have dedicated labels.
    Any other key is treated as an expected-limit quantile and is inserted
    verbatim in front of ``"% expected exclusion limits (fb)"``.
    """
    dedicated_labels = (
        ("50.0", "Median expected exclusion limits (fb)"),
        ("Observed", "Exclusion limits (fb)"),
        ("Significance", "Local Significance"),
    )
    for known_key, text in dedicated_labels:
        if key == known_key:
            return text

    return f"{key}% expected exclusion limits (fb)"

## Load / process limits

In [6]:
# Load this analysis's limits from the cards directory. The cells below index
# the result by the keys " 2.5", "16.0", "50.0", "84.0", "97.5", "Observed" and
# "Significance", and treat each entry as a 2D array with (mx, my) in the
# first two columns and the value in the third.
limits = ProcessLimits.get_limits(cards_dir)

### Load Amitav's limits

In [7]:
# Directory holding the CSV exports of Amitav's limits.
alimits_path = Path(
    "/uscms/home/ammitra/nobackup/2DAlphabet/fitting/CMSSW_14_1_0_pre4/src/XHYbbWW/limits/"
)
# Maps the keys used in this notebook to the base names of Amitav's CSV files.
key_map = {
    # mine: amitav's
    " 2.5": "limits_Minus2",
    "16.0": "limits_Minus1",
    "50.0": "limits_Expected",
    "84.0": "limits_Plus1",
    "97.5": "limits_Plus2",
    "Observed": "limits_OBSERVED",
    "Significance": "significance",
}
# Every key starts as an empty list and keeps that placeholder if its CSV is missing.
alimits = {mkey: [] for mkey in key_map}

for mkey, akey in key_map.items():
    try:
        # The first CSV column is dropped; the remaining columns are kept as a 2D array.
        alimits[mkey] = pd.read_csv(alimits_path / f"{akey}.csv").values[:, 1:]
    except FileNotFoundError:
        # Only a missing file is tolerated. Parse errors, interrupts, etc. must
        # surface instead of being reported as "not found".
        print(f"{alimits_path}/{akey}.csv not found!")

Minimum median expected limits (this analysis first, then Amitav's)

In [None]:
# Smallest median expected limit (third column of the "50.0" table) in each
# set: `limits` first, then `alimits`.
for _lims in (limits, alimits):
    print(np.min(_lims["50.0"][:, 2]))

Checking excesses: mass points where the observed limit is above the 97.5% expected limit

In [None]:
# Boolean mask of mass points whose observed limit exceeds the 97.5% expected limit.
twosigma = limits["97.5"][:, 2] < limits["Observed"][:, 2]

# Walk the flagged rows by their index in the full tables. The tables are
# assumed to share the same row order, as the mask is applied to each of them.
for row in np.flatnonzero(twosigma):
    mx, my = limits["50.0"][row][:2]
    print(
        f"({mx}, {my}):\t Expected {limits['50.0'][row, 2]}+{limits['97.5'][row, 2]}\t Observed {limits['Observed'][row, 2]:.2f}\t Sign {limits['Significance'][row, 2]:.2f}"
    )

# print(limits["50.0"][twosigma], limits["97.5"][twosigma], limits["Observed"][twosigma])

In [None]:
# Row of Amitav's significance table with the largest value in its third
# column (shown as the cell output).
alimits["Significance"][np.argmax(alimits["Significance"][:, 2])]

## Process and plot

### Boosted alone

In [None]:
# Interpolation grid: 100 x 100 log-spaced points with MX in [900, 3999] and
# MY in [60, mymax].
mymax = 600
mxs = np.logspace(np.log10(900), np.log10(3999), 100, base=10)
mys = np.logspace(np.log10(60), np.log10(mymax), 100, base=10)

xx, yy = np.meshgrid(mxs, mys)

interpolated = {}  # key -> LinearNDInterpolator built from the (mx, my) points
grids = {}  # key -> interpolated values evaluated on (xx, yy)

for key, val in limits.items():
    points, values = val[:, :2], val[:, 2]
    if key == "Significance":
        # Significances are drawn on a linear scale and may be zero. log(0) is
        # -inf, which would blank every triangle touching such a point, so
        # they are interpolated directly rather than in log space.
        interpolated[key] = interpolate.LinearNDInterpolator(points, values)
        grids[key] = interpolated[key](xx, yy)
    else:
        # Limits are drawn on a log scale: interpolate linearly in log space
        # and transform back.
        interpolated[key] = interpolate.LinearNDInterpolator(points, np.log(values))
        grids[key] = np.exp(interpolated[key](xx, yy))

In [None]:
# Draw the interpolated grid of each selected quantity as a colour mesh.
for key, grid in grids.items():
    # Only the significance map is produced while this filter is in place;
    # the limit settings below apply once it is relaxed.
    if key != "Significance":
        continue

    # Linear 0-5 colour scale for significances, log scale for limits.
    vmin, vmax, log = (0, 5, False) if key == "Significance" else (0.05, 1e4, True)

    plotting.colormesh(
        xx,
        yy,
        grid,
        label_map(key),
        f"{plot_dir}/upper{mymax}_mesh_{key}_turbo.pdf",
        vmin=vmin,
        vmax=vmax,
        log=log,
    )

In [9]:
# Scatter plot of the raw (non-interpolated) values of each selected quantity.
for key, val in limits.items():
    # Only the significance plot is produced while this filter is in place.
    if key != "Significance":
        continue

    # Same scale settings as the mesh plots; they are assigned here but not
    # passed to XHYscatter2d.
    vmin, vmax, log = (0, 5, False) if key == "Significance" else (0.05, 1e4, True)

    plotting.XHYscatter2d(val, label_map(key), name=f"{plot_dir}/scatter_{key}.pdf", show=False)

### Check whose expected limit is better

In [None]:
# Collect the mass points where Amitav's median expected limit is lower than
# ours, together with the fractional improvement (lim - alim) / lim.
sb_better = []
alim_med = alimits["50.0"]

for mx, my, lim in limits["50.0"]:
    match = (alim_med[:, 0] == mx) & (alim_med[:, 1] == my)
    if not np.any(match):
        # No counterpart in Amitav's table, so there is nothing to compare.
        # Without this guard the comparison below would reuse the value left
        # over from the previous mass point (or fail on the first one).
        continue

    # Take the matching row explicitly instead of calling float() on a
    # one-element array, which NumPy has deprecated.
    alim = alim_med[match][0, 2]

    if alim < lim:
        pbetter = (lim - alim) / lim
        print(f"Semiboosted better for ({mx}, {my}) by {pbetter * 100:.2f}%")
        sb_better.append([mx, my, pbetter])

# reshape keeps the (n, 3) layout even when no point qualifies.
sb_better = np.array(sb_better).reshape(-1, 3)

In [None]:
# Scatter of our median expected limits, overlaid with the points collected
# in `sb_better`.
_overlay_label = "Median expected exclusion limits (fb)"
_overlay_file = f"{plot_dir}/scatter_overlay.pdf"
plotting.scatter2d_overlay(limits["50.0"], sb_better, _overlay_label, _overlay_file, show=True)

### Combined

In [None]:
# Merge the two sets of limits point by point: for each (MX, MY) with
# MX >= 900, every quantity is taken from whichever set has the lower median
# expected limit at that point.
_limit_keys = (" 2.5", "16.0", "50.0", "84.0", "97.5", "Observed", "Significance")
combined_limits = {key: [] for key in _limit_keys}
alim_med = alimits["50.0"]
blim_med = limits["50.0"]

checked_mxmy = []  # integer (MX, MY) pairs that have already been merged

for mx, my in np.vstack((alim_med, blim_med))[:, :2]:
    mxy = (int(mx), int(my))
    if mx < 900 or mxy in checked_mxmy:
        continue

    amatch, alim = get_lim(alim_med, mxy)
    bmatch, blim = get_lim(blim_med, mxy)

    # A point absent from one table gets an infinite limit there, so it can
    # never win the comparison.
    alim = alim[0, 2] if np.any(amatch) else np.inf
    blim = blim[0, 2] if np.any(bmatch) else np.inf
    a_better = alim < blim
    b_better = blim < alim

    if a_better and my < 200:
        # skipping samples for which 2018 PFNano failed !! :(
        print(f"Skipping {mxy} because of missing PFNano!")
        continue

    if b_better and my > 134.5 + mx * 0.1285:
        print(f"Skipping {mxy} because of missing from Amitav's limits!")
        continue

    use_lims = alimits if a_better else limits

    # NOTE(review): a key missing for this point is only reported; the other
    # keys still receive a row, so the per-key arrays can end up with different
    # lengths. Confirm the downstream cells tolerate that.
    for key, lims in combined_limits.items():
        umatch, lim = get_lim(use_lims[key], mxy)
        if not np.any(umatch):
            print(f"Missing {mxy} for {key}!")
            continue
        lims.append([*mxy, use_lims[key][umatch][0, 2]])

    checked_mxmy.append(mxy)

# Convert the accumulated row lists to arrays for the cells below.
combined_limits = {key: np.array(rows) for key, rows in combined_limits.items()}

In [9]:
# Flatten the combined limits into one table and save it as
# combined_limits.csv in the cards directory. MX / MY are taken from the
# "Observed" table.
combined_df = {
    "MX": combined_limits["Observed"][:, 0],
    "MY": combined_limits["Observed"][:, 1],
}

# NOTE(review): every key other than "Observed" is prefixed with "Expected ",
# so the significance column comes out as "Expected Significance" — confirm
# that this is the intended column name.
for key, val in combined_limits.items():
    column = key if key == "Observed" else f"Expected {key}"
    combined_df[column] = val[:, 2]


pd.DataFrame(combined_df).to_csv(cards_dir / "combined_limits.csv")

In [None]:
# Print the row with the largest significance (third column) in each set:
# `alimits` first, then `limits`.
for _lims in (alimits, limits):
    idx = np.argmax(_lims["Significance"][:, 2])
    print(_lims["Significance"][idx])

Checking excesses in the combined limits: mass points where the observed limit is above the 97.5% expected limit

In [None]:
# Boolean mask of combined mass points whose observed limit exceeds the 97.5%
# expected limit.
twosigma = combined_limits["97.5"][:, 2] < combined_limits["Observed"][:, 2]

# Walk the flagged rows by their index in the full tables. The tables are
# assumed to share the same row order, as the mask is applied to each of them.
for row in np.flatnonzero(twosigma):
    mx, my = combined_limits["50.0"][row][:2]
    print(
        f"({mx}, {my}): Expected {combined_limits['50.0'][row, 2]}+{combined_limits['97.5'][row, 2]}\t Observed {combined_limits['Observed'][row, 2]:.2f}\t Sign {combined_limits['Significance'][row, 2]:.2f}"
    )

In [None]:
# Interpolation grid for the combined limits: 300 x 300 log-spaced points with
# MX in [800, 3999] and MY in [60, 2800].
mxs = np.logspace(np.log10(800), np.log10(3999), 300, base=10)
mys = np.logspace(np.log10(60), np.log10(2800), 300, base=10)
cxx, cyy = np.meshgrid(mxs, mys)

for key, val in combined_limits.items():
    # Only the significance map is produced while this filter is in place;
    # the limit settings below apply once it is relaxed.
    if key != "Significance":
        continue

    if key == "Significance":
        vmin, vmax, log = 0, 5, False
    else:
        vmin, vmax, log = 0.05, 1e4, True

    points, values = val[:, :2], val[:, 2]
    if log:
        # Quantities drawn on a log scale are interpolated linearly in log space.
        interpolated = interpolate.LinearNDInterpolator(points, np.log(values))
        grid = np.exp(interpolated(cxx, cyy))
    else:
        # Linear-scale quantities (the significance) may be zero. log(0) is
        # -inf, which would blank every triangle touching such a point, so
        # they are interpolated directly.
        interpolated = interpolate.LinearNDInterpolator(points, values)
        grid = interpolated(cxx, cyy)

    plotting.colormesh(
        cxx,
        cyy,
        grid,
        label_map(key),
        f"{plot_dir}/combined_mesh_{key}.pdf",
        vmin=vmin,
        vmax=vmax,
        log=log,
        figsize=(12, 8),
        show=True,
    )

In [None]:
# Scatter plot of the raw (non-interpolated) combined values of each selected quantity.
for key, val in combined_limits.items():
    # Only the significance plot is produced while this filter is in place.
    if key != "Significance":
        continue

    # Same scale settings as the mesh plots; they are assigned here but not
    # passed to XHYscatter2d.
    vmin, vmax, log = (0, 5, False) if key == "Significance" else (0.05, 1e4, True)

    _scatter_file = f"{plot_dir}/combined_scatter_{key}.pdf"
    plotting.XHYscatter2d(val, label_map(key), name=_scatter_file, show=True)

## Signal Efficiencies

In [6]:
# Use the first entry of res_sigs as the example signal for the cutflow check
# in the next cell.
sig = res_sigs[0]
# Disabled: display that signal's full 2018 "pass" cutflow table.
# pd.read_csv(templates_dir / sig / "cutflows" / "2018" / "pass_cutflow.csv").to_numpy()

In [None]:
# Last column of the first row of each year's "pass" cutflow for `sig`
# (shown as the cell output).
[
    pd.read_csv(_cutflow_csv).to_numpy()[0, -1]
    for _cutflow_csv in (
        templates_dir / sig / "cutflows" / year / "pass_cutflow.csv" for year in years
    )
]

In [None]:
# Build one [mx, my, efficiency] row per signal from the cutflows in templates_dir.
# The efficiency is the summed per-year yield divided by the summed LUMI / 1000
# (presumably a pb^-1 -> fb^-1 conversion; confirm against hh_vars.LUMI).
tot_lumi = sum(LUMI[year] for year in years) / 1000.0

sig_effs = []
for sig in tqdm(res_sigs):
    mx, my = utils.mxmy(sig)
    try:
        cutflow_dir = templates_dir / sig / "cutflows"
        # Yield = last column of the first row of each year's "pass" cutflow.
        sig_yield = sum(
            pd.read_csv(cutflow_dir / year / "pass_cutflow.csv").to_numpy()[0, -1]
            for year in years
        )
        sig_eff = sig_yield / tot_lumi
        sig_effs.append([mx, my, sig_eff])
    except Exception as err:
        # Best effort: report the signal and the error, then move on to the next one.
        print(f"No cutflows found for {sig}!")
        print(err)

In [None]:
# Scatter plot of the per-signal efficiencies collected in `sig_effs`.
_sig_eff_points = np.array(sig_effs)
_sig_eff_file = f"{plot_dir}/scatter_sig_eff.pdf"
plotting.XHYscatter2d(
    _sig_eff_points, "Signal efficiency in SR Pass", name=_sig_eff_file, show=True
)