In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
features = [
    "sym_Jet0_pt", "sym_Jet1_pt", "sym_Jet2_pt", "sym_Jet3_pt",
    "sym_Jet0_eta", "sym_Jet1_eta", "sym_Jet2_eta", "sym_Jet3_eta",
    "sym_Jet0_phi", "sym_Jet1_phi", "sym_Jet2_phi", "sym_Jet3_phi",  
    "sym_Jet0_m", "sym_Jet1_m", "sym_Jet2_m", "sym_Jet3_m",
]

In [3]:
import numpy as np
from events_data import EventsData
from dataset import SCDatasetInfo

def plot_sr_stats(events, sr_stats, ax, label, **plot_kwargs):
    assert len(events) == len(sr_stats)

    sr_stats_argsort = np.argsort(sr_stats)[::-1]
    weights = events.weights[sr_stats_argsort]
    is_signal = events.is_signal[sr_stats_argsort]
    is_4b = events.is_4b[sr_stats_argsort]

    ax.plot(
        np.cumsum(weights * is_4b) / np.sum(weights * is_4b),
        np.cumsum(weights * is_signal) / np.sum(weights * is_signal),           
        label=label,
        **plot_kwargs,
    )

def get_is_signal(scdinfo: SCDatasetInfo, signal_filename: str):
    # Now show the answer
    is_signals = []
    for file, file_len in zip(scdinfo.files, scdinfo.get_file_lengths()):
        is_signals.append(
            np.full(file_len, True)
            if file.name == signal_filename
            else np.full(file_len, False)
        )
    is_signal = np.concatenate(is_signals)
    return is_signal



def events_from_scdinfo(scdinfo: SCDatasetInfo, features: list, signal_filename: str) -> EventsData:
    df = scdinfo.fetch_data()
    df["signal"] = get_is_signal(scdinfo, signal_filename)
    events = EventsData.from_dataframe(df, features)

    return events

def hist_events_by_labels(events: EventsData, values: np.ndarray, bins, ax, **hist_kwargs):
    assert len(values) == len(events)   
    ax.hist(values[events.is_3b], 
                        bins=bins, histtype="step", label="3b", 
                        weights=events.weights[events.is_3b], 
                        **hist_kwargs)
    ax.hist(values[events.is_bg4b], 
                bins=bins, histtype="step", label="bg4b", 
                weights=events.weights[events.is_bg4b], 
                **hist_kwargs)
    ax.hist(values[events.is_signal], 
                bins=bins, histtype="step", label="signal", 
                weights=events.weights[events.is_signal], 
                **hist_kwargs)

In [21]:
import torch
from fvt_classifier import FvTClassifier
from tst_info import TSTInfo
import yaml
import matplotlib.pyplot as plt

config_filename = "configs/counting_test_v2_base.yml"


config = yaml.safe_load(open(config_filename, "r"))
experiment_name = config["experiment_name"]
n_3b = config["n_3b"]
ratio_4b = config["ratio_4b"]

hashes, hparams = TSTInfo.find({
    "experiment_name": experiment_name,
    "n_3b": n_3b,
    "ratio_4b": ratio_4b,
}, return_hparams=True)

seeds = np.unique([hp["seed"] for hp in hparams])
signal_ratios = np.unique([hp["signal_ratio"] for hp in hparams])
n_3bs = np.unique([hp["n_3b"] for hp in hparams])

tst_results = []
result_size = 0

909it [00:00, 1122.88it/s]


In [64]:
import sys
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

seed_lim = 52
verbose = False
show_plots = False
hparam_filter = {
    "experiment_name": lambda x: x in ["counting_test_v1", "counting_test_v2"],
    "n_3b": 100_0000,
    "seed": lambda x: x <= seed_lim
}

hashes = TSTInfo.find(hparam_filter)

for tstinfo_hash in (pbar := tqdm.tqdm(hashes)):

    if tstinfo_hash in [r["hash"] for r in tst_results]:
        continue

    tstinfo = TSTInfo.load(tstinfo_hash)
    signal_filename = tstinfo.hparams["signal_filename"]
    seed = tstinfo.hparams["seed"]
    signal_ratio = tstinfo.hparams["signal_ratio"]        
    experiment_name = tstinfo.hparams["experiment_name"]
    
    initialize_with_fvt = True if experiment_name == "counting_test_v2" else False

    scdinfo_tst = tstinfo.scdinfo_tst
    events_tst = events_from_scdinfo(scdinfo_tst, features, signal_filename)
    base_fvt_hash = tstinfo.base_fvt_tinfo_hash
    fvt_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{base_fvt_hash}_best.ckpt")
    fvt_model.eval()
    events_tst.set_model_scores(fvt_model)
    CR_fvt_hash = tstinfo.CR_fvt_tinfo_hash
    CR_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{CR_fvt_hash}_best.ckpt")
    CR_model.eval()


    SR_stats = tstinfo.SR_stats
    SR_cut = tstinfo.SR_cut
    CR_cut = tstinfo.CR_cut
    in_SR = SR_stats >= SR_cut
    in_CR = (SR_stats < SR_cut) & (SR_stats >= CR_cut)


    weights_4b = events_tst.weights * events_tst.is_4b
    weights_signal = events_tst.weights * events_tst.is_signal
    
    ratio_4b = tstinfo.hparams["ratio_4b"]
    probs_4b_est = CR_model.predict(events_tst.X_torch).detach().cpu().numpy()[:, 1]
    reweights = ratio_4b * probs_4b_est / ((1 - ratio_4b) * (1 - probs_4b_est))
    events_tst.reweight(
        np.where(events_tst.is_4b, events_tst.weights, events_tst.weights * reweights))
    
    events_SR = events_tst[in_SR]
    events_CR = events_tst[in_CR]
    SR_stats_SR = SR_stats[in_SR]
    SR_stats_CR = SR_stats[in_CR]
    
    tst_results.append({
        "signal_ratio": signal_ratio,
        "seed": seed,
        "events_SR": events_SR,
        "events_CR": events_CR,
        "SR_stats_SR": SR_stats_SR,
        "SR_stats_CR": SR_stats_CR,
        "SR_cut": SR_cut,
        "CR_cut": CR_cut,
        "CR_model_output_CR": probs_4b_est[in_CR],
        "CR_model_output_SR": probs_4b_est[in_SR],
        "initialize_with_fvt": initialize_with_fvt,
        "hash": tstinfo_hash,
    })
    result_size += (
        events_SR.get_memory_usage() + events_CR.get_memory_usage() +
        SR_stats_SR.nbytes + SR_stats_CR.nbytes
    ) / 1024**2

    pbar.set_postfix({"Result size (MB)": result_size})

1064it [00:01, 891.77it/s]
100%|██████████| 315/315 [08:23<00:00,  1.60s/it, Result size (MB)=1.92e+4]


In [65]:
def get_histogram_info(events, values, bins):
    assert len(events) == len(values)

    if isinstance(bins, int):
        bins = np.linspace(np.min(values), np.max(values), bins)
    
    hist_3b, _ = np.histogram(values[events.is_3b], bins=bins, weights=events.weights[events.is_3b])
    hist_bg4b, _ = np.histogram(values[events.is_bg4b], bins=bins, weights=events.weights[events.is_bg4b])
    hist_signal, _ = np.histogram(values[events.is_signal], bins=bins, weights=events.weights[events.is_signal])
    hist_4b, _ = np.histogram(values[events.is_4b], bins=bins, weights=events.weights[events.is_4b])

    std_est = np.sqrt(np.max([hist_4b, hist_3b], axis=0))
    sigma = (hist_4b - hist_3b)  / std_est
    sigma_avg = np.sqrt(np.mean(sigma[~np.isnan(sigma)] ** 2))
    sigma_bg4b = (hist_bg4b - hist_3b)  / std_est
    sigma_avg_bg4b = np.sqrt(np.mean(sigma_bg4b[~np.isnan(sigma_bg4b)] ** 2))

    return {
        "hist_3b": hist_3b,
        "hist_bg4b": hist_bg4b,
        "hist_signal": hist_signal,
        "hist_4b": hist_4b,
        "std_est": std_est,
        "sigma": sigma,
        "sigma_avg": sigma_avg,
        "sigma_bg4b": sigma_bg4b,
        "sigma_avg_bg4b": sigma_avg_bg4b,
    }

In [66]:
tst_results_summary = []

for tst_result in tst_results:
    events_SR = tst_result["events_SR"]
    events_CR = tst_result["events_CR"]
    SR_stats_SR = tst_result["SR_stats_SR"]
    SR_stats_CR = tst_result["SR_stats_CR"]
    SR_cut = tst_result["SR_cut"]
    CR_cut = tst_result["CR_cut"]
    signal_ratio = tst_result["signal_ratio"]
    seed = tst_result["seed"]
    CR_model_output_CR = tst_result["CR_model_output_CR"]
    CR_model_output_SR = tst_result["CR_model_output_SR"]
    initialize_with_fvt = tst_result["initialize_with_fvt"]

    nbins = [10, 20, 30, 50, 100, 200]
    for nbin in nbins:
        # hist_info_SR = get_histogram_info(events_SR, SR_stats_SR, nbin)
        # hist_info_CR = get_histogram_info(events_CR, SR_stats_CR, nbin)
        hist_info_SR = get_histogram_info(events_SR, CR_model_output_SR, nbin)
        hist_info_CR = get_histogram_info(events_CR, CR_model_output_CR, nbin)

        tst_results_summary.append({
            "signal_ratio": signal_ratio,
            "seed": seed,
            "nbin": nbin,
            "sigma_avg_SR": hist_info_SR["sigma_avg"],
            "sigma_avg_bg4b_SR": hist_info_SR["sigma_avg_bg4b"],
            "sigma_avg_CR": hist_info_CR["sigma_avg"],
            "sigma_avg_bg4b_CR": hist_info_CR["sigma_avg_bg4b"],
            "initialize_with_fvt": initialize_with_fvt,
        })

tst_results_summary_df = pd.DataFrame(tst_results_summary)

  sigma = (hist_4b - hist_3b)  / std_est
  sigma_bg4b = (hist_bg4b - hist_3b)  / std_est


In [67]:
from scipy import stats

sig_level = 0.05
z = stats.norm.ppf(1 - sig_level / 2)
tst_results_summary_df["rejected_SR"] = tst_results_summary_df["sigma_avg_SR"] > z
tst_results_summary_df["rejected_bg4b_SR"] = tst_results_summary_df["sigma_avg_bg4b_SR"] > z
tst_results_summary_df["rejected_CR"] = tst_results_summary_df["sigma_avg_CR"] > z
tst_results_summary_df["rejected_bg4b_CR"] = tst_results_summary_df["sigma_avg_bg4b_CR"] > z

# tst_results_summary_df.groupby(["signal_ratio", "nbin"]).mean()

In [68]:
tst_results_summary_df[tst_results_summary_df["seed"] <= seed_lim].groupby([
    "nbin", 
    "signal_ratio", 
    "initialize_with_fvt"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
nbin,signal_ratio,initialize_with_fvt,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10,0.0,False,26.0,5.664643,5.664643,2.667531,2.667531,0.981132,0.981132,0.830189,0.830189
10,0.0,True,26.5,2.528026,2.528026,2.310283,2.310283,0.75,0.75,0.692308,0.692308
10,0.01,False,26.0,5.915642,4.836887,2.538666,2.710492,1.0,0.962264,0.792453,0.849057
10,0.01,True,26.5,2.757207,2.541246,1.948655,2.226118,0.730769,0.634615,0.442308,0.615385
10,0.02,False,26.0,10.613182,4.630083,2.616848,2.735898,1.0,0.962264,0.811321,0.849057
10,0.02,True,26.5,5.88418,2.932129,2.312423,2.426271,0.980769,0.788462,0.673077,0.653846
20,0.0,False,26.0,4.103483,4.103483,2.211467,2.211467,0.981132,0.981132,0.679245,0.679245
20,0.0,True,26.5,2.109835,2.109835,1.916719,1.916719,0.576923,0.576923,0.326923,0.326923
20,0.01,False,26.0,4.291609,3.575574,2.134815,2.218271,1.0,0.943396,0.584906,0.698113
20,0.01,True,26.5,2.184119,2.044622,1.781971,1.930849,0.538462,0.480769,0.384615,0.442308


In [72]:
tst_results_summary_df.to_csv(f"tst_results_summary_{experiment_name}.csv", index=False)