In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
os.chdir('/home/export/soheuny/SRFinder/soheun')

In [2]:
features = [
    "sym_Jet0_pt", "sym_Jet1_pt", "sym_Jet2_pt", "sym_Jet3_pt",
    "sym_Jet0_eta", "sym_Jet1_eta", "sym_Jet2_eta", "sym_Jet3_eta",
    "sym_Jet0_phi", "sym_Jet1_phi", "sym_Jet2_phi", "sym_Jet3_phi",  
    "sym_Jet0_m", "sym_Jet1_m", "sym_Jet2_m", "sym_Jet3_m",
]

In [3]:
import numpy as np
from events_data import EventsData
from dataset import SCDatasetInfo

def plot_sr_stats(events, sr_stats, ax, label, **plot_kwargs):
    assert len(events) == len(sr_stats)

    sr_stats_argsort = np.argsort(sr_stats)[::-1]
    weights = events.weights[sr_stats_argsort]
    is_signal = events.is_signal[sr_stats_argsort]
    is_4b = events.is_4b[sr_stats_argsort]

    ax.plot(
        np.cumsum(weights * is_4b) / np.sum(weights * is_4b),
        np.cumsum(weights * is_signal) / np.sum(weights * is_signal),           
        label=label,
        **plot_kwargs,
    )

def get_is_signal(scdinfo: SCDatasetInfo, signal_filename: str):
    # Now show the answer
    is_signals = []
    for file, file_len in zip(scdinfo.files, scdinfo.get_file_lengths()):
        is_signals.append(
            np.full(file_len, True)
            if file.name == signal_filename
            else np.full(file_len, False)
        )
    is_signal = np.concatenate(is_signals)
    return is_signal



def events_from_scdinfo(scdinfo: SCDatasetInfo, features: list, signal_filename: str) -> EventsData:
    df = scdinfo.fetch_data()
    df["signal"] = get_is_signal(scdinfo, signal_filename)
    events = EventsData.from_dataframe(df, features)

    return events

def hist_events_by_labels(events: EventsData, values: np.ndarray, bins, ax, **hist_kwargs):
    assert len(values) == len(events)   
    ax.hist(values[events.is_3b], 
                        bins=bins, histtype="step", label="3b", 
                        weights=events.weights[events.is_3b], 
                        **hist_kwargs)
    ax.hist(values[events.is_bg4b], 
                bins=bins, histtype="step", label="bg4b", 
                weights=events.weights[events.is_bg4b], 
                **hist_kwargs)
    ax.hist(values[events.is_signal], 
                bins=bins, histtype="step", label="signal", 
                weights=events.weights[events.is_signal], 
                **hist_kwargs)

In [4]:
import torch
from fvt_classifier import FvTClassifier
from tst_info import TSTInfo
import yaml
import matplotlib.pyplot as plt

config_filename = "./configs/counting_test_v2_base.yml"


config = yaml.safe_load(open(config_filename, "r"))
experiment_name = config["experiment_name"]
n_3b = config["n_3b"]
ratio_4b = config["ratio_4b"]

hashes, hparams = TSTInfo.find({
    "experiment_name": experiment_name,
    "n_3b": n_3b,
    "ratio_4b": ratio_4b,
}, return_hparams=True)

seeds = np.unique([hp["seed"] for hp in hparams])
signal_ratios = np.unique([hp["signal_ratio"] for hp in hparams])
n_3bs = np.unique([hp["n_3b"] for hp in hparams])

tst_results = []
result_size = 0

1500it [00:18, 80.69it/s]


In [5]:
def get_histogram_info(events, values, bins):
    assert len(events) == len(values)

    if isinstance(bins, int):
        bins = np.geomspace(np.min(values), np.max(values), bins)
    
    hist_3b, _ = np.histogram(values[events.is_3b], bins=bins, weights=events.weights[events.is_3b])
    hist_bg4b, _ = np.histogram(values[events.is_bg4b], bins=bins, weights=events.weights[events.is_bg4b])
    hist_signal, _ = np.histogram(values[events.is_signal], bins=bins, weights=events.weights[events.is_signal])
    hist_4b, _ = np.histogram(values[events.is_4b], bins=bins, weights=events.weights[events.is_4b])

    std_est = np.sqrt((hist_3b + hist_4b) / 2)
    is_sampled = std_est > 0
    sigma = (hist_4b - hist_3b)[is_sampled]  / std_est[is_sampled]
    sigma_avg = np.sqrt(np.mean(sigma**2))
    sigma_bg4b = (hist_bg4b - hist_3b)[is_sampled] / std_est[is_sampled]
    sigma_avg_bg4b = np.sqrt(np.mean(sigma_bg4b**2))
    df = np.sum(is_sampled)

    return {
        "hist_3b": hist_3b,
        "hist_bg4b": hist_bg4b,
        "hist_signal": hist_signal,
        "hist_4b": hist_4b,
        "std_est": std_est,
        "sigma": sigma,
        "sigma_avg": sigma_avg,
        "sigma_bg4b": sigma_bg4b,
        "sigma_avg_bg4b": sigma_avg_bg4b,
        "df": df,
    }

In [6]:
import sys
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

verbose = False
show_plots = False
hparam_filter = {
    "experiment_name": lambda x: x in ["counting_test_v2"],
    "n_3b": 100_0000,
    "seed": lambda x: x < 5,
    # "signal_ratio": lambda x:  x in [0.0, 0.02],
}

hashes = TSTInfo.find(hparam_filter)
tst_results_summary = []

for tstinfo_hash in (pbar := tqdm.tqdm(hashes)):

    if tstinfo_hash in [r["hash"] for r in tst_results]:
        continue

    tstinfo = TSTInfo.load(tstinfo_hash)
    signal_filename = tstinfo.hparams["signal_filename"]
    seed = tstinfo.hparams["seed"]
    signal_ratio = tstinfo.hparams["signal_ratio"]        
    experiment_name = tstinfo.hparams["experiment_name"]
    
    initialize_with_fvt = True if experiment_name == "counting_test_v2" else False

    scdinfo_tst = tstinfo.scdinfo_tst
    events_tst = events_from_scdinfo(scdinfo_tst, features, signal_filename)
    events_tst_2 = events_tst.clone()
    base_fvt_hash = tstinfo.base_fvt_tinfo_hash
    fvt_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{base_fvt_hash}_best.ckpt")
    fvt_model.eval()
    events_tst.set_model_scores(fvt_model)
    CR_fvt_hash = tstinfo.CR_fvt_tinfo_hash
    CR_model = FvTClassifier.load_from_checkpoint(f"./checkpoints/{CR_fvt_hash}_best.ckpt")
    CR_model.eval()

    SR_stats = tstinfo.SR_stats
    SR_cut = tstinfo.SR_cut
    CR_cut = tstinfo.CR_cut
    in_SR = SR_stats >= SR_cut
    in_CR = (SR_stats < SR_cut) & (SR_stats >= CR_cut)

    weights_4b = events_tst.weights * events_tst.is_4b
    weights_signal = events_tst.weights * events_tst.is_signal
    
    ratio_4b = tstinfo.hparams["ratio_4b"]
    probs_4b_est = CR_model.predict(events_tst.X_torch).detach().cpu().numpy()[:, 1]
    reweights = ratio_4b * probs_4b_est / ((1 - ratio_4b) * (1 - probs_4b_est))
    events_tst.reweight(
        np.where(events_tst.is_4b, events_tst.weights, events_tst.weights * reweights))
    
    probs_4b_est_base = fvt_model.predict(events_tst_2.X_torch).detach().cpu().numpy()[:, 1]
    reweights_base = ratio_4b * probs_4b_est_base / ((1 - ratio_4b) * (1 - probs_4b_est_base))
    events_tst_2.reweight(
        np.where(events_tst_2.is_4b, events_tst_2.weights, events_tst_2.weights * reweights_base))
    
    events_SR = events_tst[in_SR]
    events_CR = events_tst[in_CR]
    SR_stats_SR = SR_stats[in_SR]
    SR_stats_CR = SR_stats[in_CR]

    events_SR_2 = events_tst_2[in_SR]
    events_CR_2 = events_tst_2[in_CR]
    SR_stats_SR_2 = SR_stats[in_SR]
    SR_stats_CR_2 = SR_stats[in_CR]
    
    # tst_results.append({
    #     "signal_ratio": signal_ratio,
    #     "seed": seed,
    #     "events_SR": events_SR,
    #     "events_CR": events_CR,
    #     "SR_stats_SR": SR_stats_SR,
    #     "SR_stats_CR": SR_stats_CR,
    #     "SR_cut": SR_cut,
    #     "CR_cut": CR_cut,
    #     "CR_model_output_CR": probs_4b_est[in_CR],
    #     "CR_model_output_SR": probs_4b_est[in_SR],
    #     "initialize_with_fvt": initialize_with_fvt,
    #     "hash": tstinfo_hash,
    # })
    # result_size += (
    #     events_SR.get_memory_usage() + events_CR.get_memory_usage() +
    #     SR_stats_SR.nbytes + SR_stats_CR.nbytes
    # ) / 1024**2

    # pbar.set_postfix({"Result size (MB)": result_size})

    nbins = [2, 3, 4, 5, 6, 7, 8, 9]
    for nbin in nbins:
        hist_info_SR = get_histogram_info(events_SR, SR_stats_SR, nbin)
        hist_info_CR = get_histogram_info(events_CR, SR_stats_CR, nbin)

        hist_info_SR_base = get_histogram_info(events_SR_2, SR_stats_SR_2, nbin)
        hist_info_CR_base = get_histogram_info(events_CR_2, SR_stats_CR_2, nbin)

        # print("sigma_avg_SR: ", hist_info_SR["sigma_avg"], "sigma_avg_CR: ", hist_info_CR["sigma_avg"])
        
        tst_results_summary.append({
            "signal_ratio": signal_ratio,
            "seed": seed,
            "nbin": nbin,
            "sigma_avg_SR": hist_info_SR["sigma_avg"],
            "sigma_avg_bg4b_SR": hist_info_SR["sigma_avg_bg4b"],
            "sigma_avg_CR": hist_info_CR["sigma_avg"],
            "sigma_avg_bg4b_CR": hist_info_CR["sigma_avg_bg4b"],
            "initialize_with_fvt": initialize_with_fvt,
            "df_SR": hist_info_SR["df"],
            "df_CR": hist_info_CR["df"],
            "reweight": "CR"
        })
        
        tst_results_summary.append({
            "signal_ratio": signal_ratio,
            "seed": seed,
            "nbin": nbin,
            "sigma_avg_SR": hist_info_SR_base["sigma_avg"],
            "sigma_avg_bg4b_SR": hist_info_SR_base["sigma_avg_bg4b"],
            "sigma_avg_CR": hist_info_CR_base["sigma_avg"],
            "sigma_avg_bg4b_CR": hist_info_CR_base["sigma_avg_bg4b"],
            "initialize_with_fvt": initialize_with_fvt,
            "df_SR": hist_info_SR_base["df"],
            "df_CR": hist_info_CR_base["df"],
            "reweight": "base"
        })


tst_results_summary_df = pd.DataFrame(tst_results_summary)

0it [00:00, ?it/s]

1500it [00:03, 429.07it/s]
 13%|█▎        | 2/15 [00:52<05:43, 26.45s/it]


KeyboardInterrupt: 

In [41]:
from scipy import stats

deg_free = 100
np.sqrt(stats.chi2.ppf(0.95, deg_free) / deg_free)

1.1150879490157002

In [38]:
sig_level = 0.05
tst_results_summary_df["rejected_SR"] = tst_results_summary_df["sigma_avg_SR"]**2 > stats.chi2.ppf(1-sig_level, tst_results_summary_df["df_SR"]) / tst_results_summary_df["df_SR"]
tst_results_summary_df["rejected_bg4b_SR"] = tst_results_summary_df["sigma_avg_bg4b_SR"]**2 > stats.chi2.ppf(1-sig_level, tst_results_summary_df["df_SR"]) / tst_results_summary_df["df_SR"]
tst_results_summary_df["rejected_CR"] = tst_results_summary_df["sigma_avg_CR"]**2 > stats.chi2.ppf(1-sig_level, tst_results_summary_df["df_CR"]) / tst_results_summary_df["df_CR"]
tst_results_summary_df["rejected_bg4b_CR"] = tst_results_summary_df["sigma_avg_bg4b_CR"]**2 > stats.chi2.ppf(1-sig_level, tst_results_summary_df["df_CR"]) / tst_results_summary_df["df_CR"]

In [40]:
tst_results_summary_df.groupby(["signal_ratio", "nbin", "reweight"]).get_group((0.0, 5, "base"))

Unnamed: 0,signal_ratio,seed,nbin,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,initialize_with_fvt,df_SR,df_CR,reweight,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
7,0.0,1,5,1.496819,1.496819,1.121609,1.121609,True,4,4,base,False,False,False,False
55,0.0,2,5,1.98714,1.98714,0.877386,0.877386,True,4,4,base,True,True,False,False
103,0.0,3,5,1.672958,1.672958,3.285331,3.285331,True,4,4,base,True,True,True,True
151,0.0,4,5,3.102129,3.102129,2.366776,2.366776,True,4,4,base,True,True,True,True
199,0.0,0,5,2.736025,2.736025,1.680138,1.680138,True,4,4,base,True,True,True,True


In [28]:
tst_results_summary_df.groupby(["signal_ratio", "nbin", "reweight"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,initialize_with_fvt,df_SR,df_CR,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
signal_ratio,nbin,reweight,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,2,CR,2.0,3.915376,3.915376,4.165312,4.165312,1.0,1.0,1.0,0.8,0.8,0.6,0.6
0.0,2,base,2.0,1.535162,1.535162,2.480238,2.480238,1.0,1.0,1.0,0.4,0.4,0.4,0.4
0.0,3,CR,2.0,3.477,3.477,3.982292,3.982292,1.0,2.0,2.0,0.8,0.8,0.8,0.8
0.0,3,base,2.0,2.137756,2.137756,2.226119,2.226119,1.0,2.0,2.0,0.6,0.6,0.6,0.6
0.0,4,CR,2.0,3.080222,3.080222,3.603301,3.603301,1.0,3.0,3.0,0.8,0.8,1.0,1.0
0.0,4,base,2.0,1.981003,1.981003,1.901981,1.901981,1.0,3.0,3.0,0.6,0.6,0.4,0.4
0.0,5,CR,2.0,3.088061,3.088061,3.193247,3.193247,1.0,4.0,4.0,0.8,0.8,0.8,0.8
0.0,5,base,2.0,2.278172,2.278172,1.893736,1.893736,1.0,4.0,4.0,0.8,0.8,0.4,0.4
0.0,6,CR,2.0,2.804148,2.804148,2.795829,2.795829,1.0,5.0,5.0,0.8,0.8,0.6,0.6
0.0,6,base,2.0,2.080733,2.080733,1.736979,1.736979,1.0,5.0,5.0,0.8,0.8,0.4,0.4


In [45]:
df2 = pd.read_csv("counting_test_v2.tsv", sep="\t")
df2["rejected_SR"] = df2["sigma_avg_SR"]**2 > stats.chi2.ppf(1-sig_level, df2["nbin"]) / df2["nbin"]
df2["rejected_bg4b_SR"] = df2["sigma_avg_bg4b_SR"]**2 > stats.chi2.ppf(1-sig_level, df2["nbin"]) / df2["nbin"]
df2["rejected_CR"] = df2["sigma_avg_CR"]**2 > stats.chi2.ppf(1-sig_level, df2["nbin"]) / df2["nbin"]
df2["rejected_bg4b_CR"] = df2["sigma_avg_bg4b_CR"]**2 > stats.chi2.ppf(1-sig_level, df2["nbin"]) / df2["nbin"]

In [46]:
df2.groupby(["signal_ratio", "nbin", "initialize_with_fvt"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
signal_ratio,nbin,initialize_with_fvt,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,10,False,49.5,4.467074,4.467074,4.595683,4.595683,0.97,0.97,1.0,1.0
0.0,10,True,49.5,2.524912,2.524912,2.15914,2.15914,0.94,0.94,0.94,0.94
0.0,20,False,49.5,3.390271,3.390271,3.40587,3.40587,1.0,1.0,1.0,1.0
0.0,20,True,49.5,2.19058,2.19058,1.906788,1.906788,0.98,0.98,0.97,0.97
0.0,30,False,49.5,2.951894,2.951894,2.902679,2.902679,1.0,1.0,1.0,1.0
0.0,30,True,49.5,2.046155,2.046155,1.785553,1.785553,0.99,0.99,1.0,1.0
0.0,50,False,49.5,2.545473,2.545473,2.461721,2.461721,1.0,1.0,1.0,1.0
0.0,50,True,49.5,1.920204,1.920204,1.716421,1.716421,1.0,1.0,1.0,1.0
0.0,100,False,49.5,2.173712,2.173712,2.051494,2.051494,1.0,1.0,1.0,1.0
0.0,100,True,49.5,1.805822,1.805822,1.628994,1.628994,1.0,1.0,1.0,1.0


In [None]:
# tst_results_summary = []

# for tst_result in tst_results:
#     events_SR = tst_result["events_SR"]
#     events_CR = tst_result["events_CR"]
#     SR_stats_SR = tst_result["SR_stats_SR"]
#     SR_stats_CR = tst_result["SR_stats_CR"]
#     SR_cut = tst_result["SR_cut"]
#     CR_cut = tst_result["CR_cut"]
#     signal_ratio = tst_result["signal_ratio"]
#     seed = tst_result["seed"]
#     CR_model_output_CR = tst_result["CR_model_output_CR"]
#     CR_model_output_SR = tst_result["CR_model_output_SR"]
#     initialize_with_fvt = tst_result["initialize_with_fvt"]

#     nbins = [10, 20, 30, 50, 100, 200]
#     for nbin in nbins:
#         # hist_info_SR = get_histogram_info(events_SR, SR_stats_SR, nbin)
#         # hist_info_CR = get_histogram_info(events_CR, SR_stats_CR, nbin)
#         hist_info_SR = get_histogram_info(events_SR, CR_model_output_SR, nbin)
#         hist_info_CR = get_histogram_info(events_CR, CR_model_output_CR, nbin)

#         tst_results_summary.append({
#             "signal_ratio": signal_ratio,
#             "seed": seed,
#             "nbin": nbin,
#             "sigma_avg_SR": hist_info_SR["sigma_avg"],
#             "sigma_avg_bg4b_SR": hist_info_SR["sigma_avg_bg4b"],
#             "sigma_avg_CR": hist_info_CR["sigma_avg"],
#             "sigma_avg_bg4b_CR": hist_info_CR["sigma_avg_bg4b"],
#             "initialize_with_fvt": initialize_with_fvt,
#         })

# tst_results_summary_df = pd.DataFrame(tst_results_summary)

  sigma = (hist_4b - hist_3b)  / std_est
  sigma_bg4b = (hist_bg4b - hist_3b)  / std_est


In [42]:
from scipy import stats

sig_level = 0.05
z = stats.norm.ppf(1 - sig_level / 2)
tst_results_summary_df["rejected_SR"] = tst_results_summary_df["sigma_avg_SR"] > z
tst_results_summary_df["rejected_bg4b_SR"] = tst_results_summary_df["sigma_avg_bg4b_SR"] > z
tst_results_summary_df["rejected_CR"] = tst_results_summary_df["sigma_avg_CR"] > z
tst_results_summary_df["rejected_bg4b_CR"] = tst_results_summary_df["sigma_avg_bg4b_CR"] > z

# tst_results_summary_df.groupby(["signal_ratio", "nbin"]).mean()

In [43]:
tst_results_summary_df.groupby([
    "nbin", 
    "signal_ratio", 
    "initialize_with_fvt"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seed,sigma_avg_SR,sigma_avg_bg4b_SR,sigma_avg_CR,sigma_avg_bg4b_CR,rejected_SR,rejected_bg4b_SR,rejected_CR,rejected_bg4b_CR
nbin,signal_ratio,initialize_with_fvt,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10,0.0,False,0.0,5.168311,5.168311,4.468959,4.468959,1.0,1.0,1.0,1.0
10,0.01,False,0.0,7.654525,6.034191,5.259844,4.903814,1.0,1.0,1.0,1.0
20,0.0,False,0.0,3.794281,3.794281,3.299375,3.299375,1.0,1.0,1.0,1.0
20,0.01,False,0.0,5.338385,4.223228,3.819751,3.58524,1.0,1.0,1.0,1.0
30,0.0,False,0.0,3.239454,3.239454,2.843111,2.843111,1.0,1.0,1.0,1.0
30,0.01,False,0.0,4.450324,3.563998,3.238995,3.060959,1.0,1.0,1.0,1.0
50,0.0,False,0.0,2.680188,2.680188,2.471611,2.471611,1.0,1.0,1.0,1.0
50,0.01,False,0.0,3.599793,2.957475,2.645609,2.510746,1.0,1.0,1.0,1.0
100,0.0,False,0.0,2.114945,2.114945,2.162473,2.162473,1.0,1.0,1.0,1.0
100,0.01,False,0.0,2.78904,2.362969,2.065068,1.982297,1.0,1.0,1.0,1.0


In [32]:
tst_results_summary_df[["signal_ratio", "nbin", "initialize_with_fvt", "seed"]].values


from itertools import product
all_vs = list(product([0.0, 0.01, 0.02], nbins, [True, False], seeds))

# print([v for v in all_vs if v not in ])


[]


In [40]:
hello = tst_results_summary_df[["signal_ratio", "nbin", "initialize_with_fvt", "seed"]].values.tolist()
hello = [tuple(h) for h in hello]
for v in all_vs:
    if v not in hello:
        print(v)
        

(0.0, 10, False, 0)
(0.0, 20, False, 0)
(0.0, 30, False, 0)
(0.0, 50, False, 0)
(0.0, 100, False, 0)
(0.0, 200, False, 0)
(0.01, 10, False, 0)
(0.01, 20, False, 0)
(0.01, 30, False, 0)
(0.01, 50, False, 0)
(0.01, 100, False, 0)
(0.01, 200, False, 0)
