In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
features = [
    "sym_Jet0_pt", "sym_Jet1_pt", "sym_Jet2_pt", "sym_Jet3_pt",
    "sym_Jet0_eta", "sym_Jet1_eta", "sym_Jet2_eta", "sym_Jet3_eta",
    "sym_Jet0_phi", "sym_Jet1_phi", "sym_Jet2_phi", "sym_Jet3_phi",  
    "sym_Jet0_m", "sym_Jet1_m", "sym_Jet2_m", "sym_Jet3_m",
]

In [3]:
import numpy as np
from events_data import EventsData
from dataset import SCDatasetInfo

def plot_sr_stats(events, sr_stats, ax, label, **plot_kwargs):
    assert len(events) == len(sr_stats)

    sr_stats_argsort = np.argsort(sr_stats)[::-1]
    weights = events.weights[sr_stats_argsort]
    is_signal = events.is_signal[sr_stats_argsort]
    is_4b = events.is_4b[sr_stats_argsort]

    ax.plot(
        np.cumsum(weights * is_4b) / np.sum(weights * is_4b),
        np.cumsum(weights * is_signal) / np.sum(weights * is_signal),           
        label=label,
        **plot_kwargs,
    )

def get_is_signal(scdinfo: SCDatasetInfo, signal_filename: str):
    # Now show the answer
    is_signals = []
    for file, file_len in zip(scdinfo.files, scdinfo.get_file_lengths()):
        is_signals.append(
            np.full(file_len, True)
            if file.name == signal_filename
            else np.full(file_len, False)
        )
    is_signal = np.concatenate(is_signals)
    return is_signal



def events_from_scdinfo(scdinfo: SCDatasetInfo, features: list, signal_filename: str) -> EventsData:
    df = scdinfo.fetch_data()
    df["signal"] = get_is_signal(scdinfo, signal_filename)
    events = EventsData.from_dataframe(df, features)

    return events

def hist_events_by_labels(events: EventsData, values: np.ndarray, bins, ax, **hist_kwargs):
    assert len(values) == len(events)   
    ax.hist(values[events.is_3b], 
                        bins=bins, histtype="step", label="3b", 
                        weights=events.weights[events.is_3b], 
                        **hist_kwargs)
    ax.hist(values[events.is_bg4b], 
                bins=bins, histtype="step", label="bg4b", 
                weights=events.weights[events.is_bg4b], 
                **hist_kwargs)
    ax.hist(values[events.is_signal], 
                bins=bins, histtype="step", label="signal", 
                weights=events.weights[events.is_signal], 
                **hist_kwargs)

In [6]:
import torch
from fvt_classifier import FvTClassifier
from tst_info import TSTInfo
import yaml
import matplotlib.pyplot as plt

config_filename = "configs/counting_test_v2_base.yml"


config = yaml.safe_load(open(config_filename, "r"))
experiment_name = config["experiment_name"]
n_3b = config["n_3b"]
ratio_4b = config["ratio_4b"]

hashes = TSTInfo.find({
    "experiment_name": experiment_name,
    "n_3b": n_3b,
    "ratio_4b": ratio_4b,
}, return_hparams=True)
hashes, hparams = zip(*hashes)

seeds = np.unique([hp["seed"] for hp in hparams])
signal_ratios = np.unique([hp["signal_ratio"] for hp in hparams])
n_3bs = np.unique([hp["n_3b"] for hp in hparams])

986it [00:00, 1683.87it/s]


In [8]:
import sys
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

verbose = False
show_plots = False
hparam_filter = {
    "experiment_name": experiment_name,
    "n_3b": 100_0000,
}

hashes = TSTInfo.find(hparam_filter)
tst_results = []

result_size = 0
for tstinfo_hash in (pbar := tqdm.tqdm(hashes)):
    tstinfo = TSTInfo.load(tstinfo_hash)
    signal_filename = tstinfo.hparams["signal_filename"]
    seed = tstinfo.hparams["seed"]
    signal_ratio = tstinfo.hparams["signal_ratio"]        

    scdinfo_tst = tstinfo.scdinfo_tst
    events_tst = events_from_scdinfo(scdinfo_tst, features, signal_filename)
    base_fvt_hash = tstinfo.base_fvt_tinfo_hash
    fvt_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{base_fvt_hash}_best.ckpt")
    fvt_model.eval()
    events_tst.set_model_scores(fvt_model)
    CR_fvt_hash = tstinfo.CR_fvt_tinfo_hash
    CR_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{CR_fvt_hash}_best.ckpt")
    CR_model.eval()


    SR_stats = tstinfo.SR_stats
    SR_cut = tstinfo.SR_cut
    CR_cut = tstinfo.CR_cut
    in_SR = SR_stats >= SR_cut
    in_CR = (SR_stats < SR_cut) & (SR_stats >= CR_cut)


    weights_4b = events_tst.weights * events_tst.is_4b
    weights_signal = events_tst.weights * events_tst.is_signal
    
    ratio_4b = tstinfo.hparams["ratio_4b"]
    probs_4b_est = CR_model.predict(events_tst.X_torch).detach().cpu().numpy()[:, 1]
    reweights = ratio_4b * probs_4b_est / ((1 - ratio_4b) * (1 - probs_4b_est))
    events_tst.reweight(
        np.where(events_tst.is_4b, events_tst.weights, events_tst.weights * reweights))
    
    events_SR = events_tst[in_SR]
    events_CR = events_tst[in_CR]
    SR_stats_SR = SR_stats[in_SR]
    SR_stats_CR = SR_stats[in_CR]
    
    tst_results.append({
        "signal_ratio": signal_ratio,
        "seed": seed,
        "events_SR": events_SR,
        "events_CR": events_CR,
        "SR_stats_SR": SR_stats_SR,
        "SR_stats_CR": SR_stats_CR,
        "SR_cut": SR_cut,
        "CR_cut": CR_cut,
        "CR_model_output_CR": probs_4b_est[in_CR],
        "CR_model_output_SR": probs_4b_est[in_SR],
    })
    result_size += (
        events_SR.get_memory_usage() + events_CR.get_memory_usage() +
        SR_stats_SR.nbytes + SR_stats_CR.nbytes
    ) / 1024**2

    pbar.set_postfix({"Result size (MB)": result_size})

987it [00:00, 1615.05it/s]
  0%|          | 0/87 [00:05<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/home/soheuny/HH4bsim/playground/checkpoints/OOa7EOpdDaFN_best.ckpt'

In [132]:
def get_histogram_info(events, values, bins):
    assert len(events) == len(values)

    if isinstance(bins, int):
        bins = np.linspace(np.min(values), np.max(values), bins)
    
    hist_3b, _ = np.histogram(values[events.is_3b], bins=bins, weights=events.weights[events.is_3b])
    hist_bg4b, _ = np.histogram(values[events.is_bg4b], bins=bins, weights=events.weights[events.is_bg4b])
    hist_signal, _ = np.histogram(values[events.is_signal], bins=bins, weights=events.weights[events.is_signal])
    hist_4b, _ = np.histogram(values[events.is_4b], bins=bins, weights=events.weights[events.is_4b])

    std_est = np.sqrt(np.max([hist_4b, hist_3b], axis=0))
    sigma = (hist_4b - hist_3b)  / std_est
    sigma_avg = np.sqrt(np.mean(sigma[~np.isnan(sigma)] ** 2))
    sigma_bg4b = (hist_bg4b - hist_3b)  / std_est
    sigma_avg_bg4b = np.sqrt(np.mean(sigma_bg4b[~np.isnan(sigma_bg4b)] ** 2))

    return {
        "hist_3b": hist_3b,
        "hist_bg4b": hist_bg4b,
        "hist_signal": hist_signal,
        "hist_4b": hist_4b,
        "std_est": std_est,
        "sigma": sigma,
        "sigma_avg": sigma_avg,
        "sigma_bg4b": sigma_bg4b,
        "sigma_avg_bg4b": sigma_avg_bg4b,
    }

In [153]:
tst_results_summary = []

for tst_result in tst_results:
    events_SR = tst_result["events_SR"]
    events_CR = tst_result["events_CR"]
    SR_stats_SR = tst_result["SR_stats_SR"]
    SR_stats_CR = tst_result["SR_stats_CR"]
    SR_cut = tst_result["SR_cut"]
    CR_cut = tst_result["CR_cut"]
    signal_ratio = tst_result["signal_ratio"]
    seed = tst_result["seed"]
    CR_model_output_CR = tst_result["CR_model_output_CR"]
    CR_model_output_SR = tst_result["CR_model_output_SR"]

    nbins = [10, 20, 30, 50, 100, 200]
    for nbin in nbins:
        # hist_info_SR = get_histogram_info(events_SR, SR_stats_SR, nbin)
        # hist_info_CR = get_histogram_info(events_CR, SR_stats_CR, nbin)
        hist_info_SR = get_histogram_info(events_SR, CR_model_output_SR, nbin)
        hist_info_CR = get_histogram_info(events_CR, CR_model_output_CR, nbin)

        tst_results_summary.append({
            "signal_ratio": signal_ratio,
            "seed": seed,
            "nbin": nbin,
            "sigma_avg_SR": hist_info_SR["sigma_avg"],
            "sigma_avg_bg4b_SR": hist_info_SR["sigma_avg_bg4b"],
            "sigma_avg_CR": hist_info_CR["sigma_avg"],
            "sigma_avg_bg4b_CR": hist_info_CR["sigma_avg_bg4b"],
        })

tst_results_summary_df = pd.DataFrame(tst_results_summary)

  sigma = (hist_4b - hist_3b)  / std_est
  sigma_bg4b = (hist_bg4b - hist_3b)  / std_est


In [154]:
from scipy import stats

sig_level = 0.05
z = stats.norm.ppf(1 - sig_level / 2)
tst_results_summary_df["rejected_SR"] = tst_results_summary_df["sigma_avg_SR"] > z
tst_results_summary_df["rejected_bg4b_SR"] = tst_results_summary_df["sigma_avg_bg4b_SR"] > z
tst_results_summary_df["rejected_CR"] = tst_results_summary_df["sigma_avg_CR"] > z
tst_results_summary_df["rejected_bg4b_CR"] = tst_results_summary_df["sigma_avg_bg4b_CR"] > z

tst_results_summary_df.groupby(["signal_ratio", "nbin"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,seed,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
signal_ratio,nbin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,10,49.5,5.494922,5.494922,2.739472,2.739472,0.99,0.99,0.86,0.86
0.0,20,49.5,4.014668,4.014668,2.269251,2.269251,0.99,0.99,0.73,0.73
0.0,30,49.5,3.398778,3.398778,2.047462,2.047462,0.99,0.99,0.56,0.56
0.0,50,49.5,2.797999,2.797999,1.842147,1.842147,0.94,0.94,0.28,0.28
0.0,100,49.5,2.250341,2.250341,1.676919,1.676919,0.71,0.71,0.06,0.06
0.0,200,49.5,1.916343,1.916343,1.576032,1.576032,0.32,0.32,0.0,0.0
0.01,10,49.5,6.126015,4.99563,2.609983,2.795705,1.0,0.98,0.78,0.87
0.01,20,49.5,4.429435,3.677245,2.181474,2.279668,1.0,0.96,0.64,0.73
0.01,30,49.5,3.720859,3.12853,1.987948,2.056921,0.98,0.91,0.52,0.54
0.01,50,49.5,3.034325,2.601065,1.803994,1.847808,0.95,0.84,0.26,0.31


In [136]:
tst_results_summary_df.to_csv(f"tst_results_summary_{experiment_name}.csv", index=False)