In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import hist
import uproot
import warnings
from typing import List, Union, Dict
import utils
from pathlib import Path
from os import listdir
from hh_vars import norm_preserving_weights

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dataclasses import dataclass, field


@dataclass
class Syst:
    """Configuration record describing one systematic uncertainty.

    The class only stores values; nothing here interprets them.
    """

    # sample keys the systematic applies to; ``None`` when not specified
    samples: Union[List[str], None] = None
    # years the systematic applies to; every instance gets its own fresh ``["2017"]`` list
    years: List[str] = field(default_factory=lambda: ["2017"])
    # label for the systematic; ``None`` when not specified
    label: Union[str, None] = None


# Weight variations to normalize: ``normalize_weights`` iterates over these keys and, for each
# one, looks for ``weight_<key>Up`` / ``weight_<key>Down`` columns in the events.
# Only the keys are read by the code in this notebook; the values are empty placeholders.
# The commented-out entries reference ``nonres_sig_keys``, which is defined in a later cell.
weight_shifts = {
    "pileup": [],
    "pileupID": [],
    # "PDFalphaS": Syst(samples=nonres_sig_keys, label="PDF"),
    # "QCDscale": Syst(samples=nonres_sig_keys, label="QCDscale"),
    "ISRPartonShower": [],
    "FSRPartonShower": [],
    "L1EcalPrefiring": [],
    # "top_pt": ["TT"],
}

In [None]:
# events = pd.read_parquet("0-1.parquet")

In [None]:
# NOTE(review): ``year`` is an int here, whereas ``load_samples`` is called with the
# string "2017" below - confirm which form is intended before reusing this variable.
year = 2017
signals = ["hhbbvv", "xhy"]

# Labels with a special role among the samples defined below.
data_key = "Data"
nonres_sig_keys = ["HHbbVV"]
res_sig_keys = ["X[3000]->HY[250]"]

# Written as selector -> label for readability ...
samples = {
    "GluGluToHHTobbVV": "HHbbVV",
    "xhy": "X[3000]->HY[250]",
    "TTToSemiLeptonic": "TT SL",
    "qcd": "QCD",
    "data": "Data",
}
# ... then flipped, so that ``samples`` maps label -> selector (the form ``load_samples`` takes).
samples = dict(zip(samples.values(), samples.keys()))

In [None]:
def get_pickles(pickles_path, year, sample_name):
    """Accumulates all pickles in ``pickles_path`` directory

    Every file in the directory except ``.DS_Store`` is unpickled, indexed by ``year`` and then
    by sample, and the per-file results are merged with coffea's ``accumulate``.

    Args:
        pickles_path: directory containing the pickle files.
        year: key used to index the top level of each unpickled object.
        sample_name: currently ignored - it is replaced by the first key found under ``year``
            in the first file (see the TODO below), and that key is then used for all files.

    Returns:
        The accumulated output for the sample.

    Raises:
        FileNotFoundError: if the directory contains no pickle files.
    """
    from coffea.processor.accumulator import accumulate

    out_pickles = [f for f in listdir(pickles_path) if f != ".DS_Store"]

    # fail with a clear message instead of a bare IndexError on ``out_pickles[0]``
    if not out_pickles:
        raise FileNotFoundError(f"No pickle files found in {pickles_path}")

    # NOTE: ``pickle.load`` executes arbitrary code - only point this at trusted outputs
    file_name = out_pickles[0]
    with open(f"{pickles_path}/{file_name}", "rb") as file:
        # out = pickle.load(file)[year][sample_name]  # TODO: uncomment and delete below
        out = pickle.load(file)[year]
        sample_name = list(out.keys())[0]
        out = out[sample_name]

    # merge the remaining files into the first one
    for file_name in out_pickles[1:]:
        with open(f"{pickles_path}/{file_name}", "rb") as file:
            out_dict = pickle.load(file)[year][sample_name]
            out = accumulate([out, out_dict])

    return out


def normalize_weights(events: pd.DataFrame, totals: Dict, sample: str, isData: bool):
    """Normalize the nominal weight and its variations in place.

    Reads the module-level ``weight_shifts`` (names of the variations) and
    ``norm_preserving_weights`` (variations divided by their own total rather than the nominal).

    Args:
        events (pd.DataFrame): events to modify; columns are added / divided in place.
        totals (Dict): totals keyed by ``np_nominal``, ``np_<variation>`` and ``np_<wkey>``.
        sample (str): sample name, used only in the warning message.
        isData (bool): if True, ``finalWeight`` is set to ``weight`` and nothing else is done.

    Returns:
        None. ``events`` is mutated.
    """
    # data is never reweighted
    if isData:
        events["finalWeight"] = events["weight"]
        return

    # warn when the weights look like they were never scaled by xsec and lumi
    if "weight_noxsec" in events and np.all(events["weight"] == events["weight_noxsec"]):
        warnings.warn(f"{sample} has not been scaled by its xsec and lumi!")

    # trigger efficiencies count as applied when ``weight`` differs from ``weight_noTrigEffs``
    trig_effs_applied = "weight_noTrigEffs" in events and not np.all(
        np.isclose(events["weight"], events["weight_noTrigEffs"], rtol=1e-5)
    )
    nominal_total = totals["np_nominal"]

    if trig_effs_applied:
        # normalize weights with and without trigger efficiencies
        events["finalWeight"] = events["weight"] / nominal_total
        events["weight_noTrigEffs"] /= nominal_total
    else:
        # NOTE(review): no ``finalWeight`` column is created on this path - presumably it is
        # added by a later step; confirm.
        events["weight"] /= nominal_total

    # normalize every up/down variation that is present in the events
    for wvar in weight_shifts:
        if f"weight_{wvar}Up" not in events:
            continue

        keeps_norm = wvar in norm_preserving_weights
        for shift in ("Up", "Down"):
            wlabel = f"{wvar}{shift}"
            # norm-preserving variations use their own total, all others the nominal one
            events[f"weight_{wlabel}"] /= totals[f"np_{wlabel}"] if keeps_norm else nominal_total

    # scale and PDF weights are divided by their own totals
    for wkey in ("scale_weights", "pdf_weights"):
        if wkey in events:
            events[wkey] /= totals[f"np_{wkey}"]


def load_samples(
    data_dir: str,
    samples: Dict[str, str],
    year: str,
    filters: List = None,
    columns: List = None,
    hem_cleaning: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Loads events for each requested sample, with optional filters and column selection.
    Each loaded dataframe has its weights normalized with the totals stored in the sample's
    pickles (``get_pickles`` + ``normalize_weights``).

    Args:
        data_dir (str): path to data directory; samples are looked up in ``data_dir / year``.
        samples (Dict[str, str]): maps output label -> selector string used to pick directories.
        year (str): year.
        filters (List): Optional filters, forwarded to ``pd.read_parquet``.
        columns (List): Optional columns, forwarded to ``pd.read_parquet``.
        hem_cleaning (bool): Whether to call ``utils._hem_cleaning`` when ``year`` is "2018".

    Returns:
        Dict[str, pd.DataFrame]: ``events_dict`` dictionary of events dataframe for each label.
        Labels for which nothing was loaded are absent.

    """
    year_dir = Path(data_dir) / year
    available_dirs = listdir(year_dir)  # every entry under the year directory
    events_dict = {}

    for label, selector in samples.items():
        frames = []  # one dataframe per matching directory

        for sample in available_dirs:
            # skip directories that do not match this label's selector
            if not utils.check_selector(sample, selector):
                continue

            sample_dir = year_dir / sample
            parquet_path = sample_dir / "parquet"
            pickles_path = sample_dir / "pickles"

            if not parquet_path.exists():
                warnings.warn(f"No parquet directory for {sample}!")
                continue

            events = pd.read_parquet(parquet_path, filters=filters, columns=columns)

            if len(events) == 0:
                warnings.warn(f"No events for {sample}!")
                continue

            # normalize the weights using the totals saved alongside the events
            totals = get_pickles(pickles_path, year, sample)["totals"]
            normalize_weights(events, totals, sample, isData=label == data_key)

            if hem_cleaning and year == "2018":
                events = utils._hem_cleaning(sample, events)

            frames.append(events)
            print(f"Loaded {sample: <50}: {len(events)} entries")

        # only labels with at least one loaded dataframe make it into the output
        if frames:
            events_dict[label] = pd.concat(frames)

    return events_dict

In [None]:
# Load every sample configured in ``samples`` for 2017 from the local test outputs
# (the path is relative to this notebook's directory).
events_dict = load_samples("../../../tmp/test_outputs", samples, "2017")

In [None]:
# List the available columns. ``events_dict`` is indexed directly because a bare ``events``
# is not assigned until a later cell, which made this cell fail on a fresh kernel.
list(events_dict["HHbbVV"].columns)

In [None]:
events = events_dict["HHbbVV"]

# Print the sum of every column whose first-level name mentions "weight" / "Weight".
for column in events:
    if any(tag in column[0] for tag in ("weight", "Weight")):
        print(f"{str(column):<50} {np.sum(events[column]):.3f}")

In [None]:
sw_idx = [0, 1, 3, 5, 7, 8, 4]
events = events_dict["TT SL"]

plt.figure(figsize=(12, 12))
plt.rcParams.update({"font.size": 20})

# One step histogram per scale-weight column: column ``i`` of ``scale_weights`` supplies the
# weights, and the corresponding entry of ``sw_idx`` is used for the legend label.
for i, idx in enumerate(sw_idx):
    plt.hist(
        events["ak8FatJetParticleNetMass"][0].values,
        bins=np.arange(60, 260, 20),
        weights=events["scale_weights"][i],
        histtype="step",
        label=f"Scale Weight {idx}",
    )

plt.xlabel("FatJet 1 Regressed Mass (GeV)")
plt.ylabel("Events")
plt.title("2017 ggF TT SL")
plt.legend()
plt.show()

In [None]:
# The title below names the HHbbVV signal, so select that sample explicitly instead of
# reusing whatever ``events`` was left pointing at by a previously executed cell.
# NOTE(review): confirm HHbbVV (and not TT SL) is the sample this plot is meant to show.
events = events_dict["HHbbVV"]

plt.figure(figsize=(12, 12))
plt.rcParams.update({"font.size": 20})

# One step histogram for each of the first 10 PDF-weight columns.
for i in range(10):
    plt.hist(
        events["ak8FatJetParticleNetMass"][0].values,
        np.arange(60, 260, 20),
        histtype="step",
        weights=events["pdf_weights"][i],
    )

# plt.legend()
plt.ylabel("Events")
plt.xlabel("FatJet 1 Regressed Mass (GeV)")
plt.title("2017 ggF HHbbVV PDF Variations")
plt.show()

In [None]:
# expect 31fb * 41fb-1 * 0.58 * 0.24 * (0.67 ** 2) * 2 * 1.8% acceptance
# (same factors as the expression below, which multiplies the 1.8% acceptance as 0.018)
31 * 41 * 0.018 * 0.58 * 0.24 * (0.67**2) * 2

In [None]:
# Display the accumulated pickle output for the 2017 hhbbvv test sample.
# Note: ``get_pickles`` currently replaces the sample-name argument with the first key it
# finds in the first pickle file, so the last argument here does not affect the result.
get_pickles(f"../../../tmp/test_outputs/2017/hhbbvv/pickles", "2017", "GluGluToHHTobbVV_node_cHHH1")

In [None]:
# NOTE(review): unexplained ratio of two hard-coded counts - presumably selected / total
# events for the acceptance quoted above; confirm where the numbers come from.
642 / 35652