First, we obtain the MVA score edges for given signal efficiency

In [1]:
import time
import numpy as np
import pickle
import awkward as ak
import dask_awkward as dak
from distributed import Client
import pandas as pd

from typing import Tuple, List, Dict
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.pyplot as plt
import matplotlib
plt.style.use(hep.style.CMS)
import numba 

@numba.njit(parallel=True)
def customROC_curve_AN(label, pred, weight):
    """
    generates signal and background efficiency consistent with the AN,
    as described by Fig 4.6 of Dmitry's PhD thesis
    """
    # we assume sigmoid output with labels 0 = background, 1 = signal
    thresholds = np.linspace(start=0,stop=1, num=10000) 
    # thresholds = np.linspace(start=0,stop=1, num=500)
    effBkg_total = -99*np.ones_like(thresholds) 
    effSig_total = -99*np.ones_like(thresholds) 
    # for ix in range(len(thresholds)):
    for ix in numba.prange(len(thresholds)):
        threshold = thresholds[ix]
        # get FP and TP
        positive_filter = (pred >= threshold)
        falsePositive_filter = positive_filter & (label == 0)
        FP = np.sum(weight[falsePositive_filter])#  FP = false positive
        truePositive_filter = positive_filter & (label == 1)
        TP = np.sum(weight[truePositive_filter])#  TP = true positive
        

        # get TN and FN
        negative_filter = (pred < threshold) # just picked negative to be <=
        trueNegative_filter = negative_filter & (label == 0)
        TN = np.sum(weight[trueNegative_filter])#  TN = true negative
        falseNegative_filter = negative_filter & (label == 1)
        FN = np.sum(weight[falseNegative_filter])#  FN = false negative
     


        # effBkg = TN / (TN + FP) # Dmitry PhD thesis definition
        # effSig = FN / (FN + TP) # Dmitry PhD thesis definition
        effBkg = FP / (TN + FP) # AN-19-124 ggH Cat definition
        effSig = TP / (FN + TP) # AN-19-124 ggH Cat definition
        effBkg_total[ix] = effBkg
        effSig_total[ix] = effSig
        

        # print(f"effSig: {effSig}")
        # print(f"FN: {FN}")
        # print(f"TP: {TP}")
        # print(f"threshold: {threshold}")
        
        # sanity check
        # assert ((np.sum(positive_filter) + np.sum(negative_filter)) == len(pred))
        # total_yield = FP + TP + FN + TN
        # assert(np.isclose(total_yield, np.sum(weight)))
        # print(f"total_yield: {total_yield}")
        # print(f"np.sum(weight): {np.sum(weight)}")
    print(f"np.sum(effBkg_total ==-99) : {np.sum(effBkg_total ==-99)}")
    print(f"np.sum(effSig_total ==-99) : {np.sum(effSig_total ==-99)}")
    print(f"np.any(np.isnan(effBkg_total)) : {np.any(np.isnan(effBkg_total))}")
    print(f"np.any(np.isnan(effSig_total)) : {np.any(np.isnan(effSig_total))}")
    # effBkg_total[np.isnan(effBkg_total)] = 1
    # effSig_total[np.isnan(effSig_total)] = 1
    return (effBkg_total, effSig_total, thresholds)

def findBDT_binVals_basedOnSigEff(thres, eff_sigs, sigEffBinEdges):
    """
    we expect len(sigEffBinEdges) == number of categories + 1,
    we expect sshape of thres == shape of eff_sigs
    """
    bin_vals = []
    for target_eff in sigEffBinEdges:
        # print(f"target_eff: {target_eff}")
        for ix in range(1,len(eff_sigs)):
            # print(f"ix: {ix}")
            ith_sigEff = eff_sigs[ix]
            sigEff_oneb4 = eff_sigs[ix-1]
            thres_oneb4 = thres[ix-1]
            thres_current = thres[ix]
            
            if target_eff >= ith_sigEff:
                # print(f"target_eff: {target_eff}")
                # print(f"ith_sigEff: {ith_sigEff}")
                # print(f"sigEff_oneb4: {sigEff_oneb4}")
                # print(f"thres_current: {thres_current}")
                # print(f"thres_oneb4: {thres_oneb4}")
                bin_vals.append(thres_current)
                break
        # print(f"-----------------------------------------")
    return bin_vals


def getMVA_values(initial_load_path: str):
    # year = "2016preVFP"
    # stage2_out_name = "BDT_WgtON_original_AN_BDT_Sept27"
    
    # load_path = f"/work/users/yun79/stage2_output/{stage2_out_name}/ggh/{year}/processed_events_bkgMC*.parquet"
    load_path = f"{initial_load_path}/processed_events_bkgMC*.parquet"
    
    processed_events = ak.from_parquet(load_path)
    nan_filter = ~np.isnan(ak.to_numpy(processed_events.wgt_nominal_total))
    processed_events = processed_events[nan_filter]
    region = processed_events.h_peak | processed_events.h_sidebands
    # DNN_score_raw is a sigmoid output
    bkg = (ak.to_numpy(processed_events.BDT_score[region])) # eval
    bkg_wgt = (ak.to_numpy(processed_events.wgt_nominal_total[region]))
    bkg_label = 0*np.ones_like(bkg)
    
    # we only have ggH and VBF for signal, which I hope is enough
    load_path = f"{initial_load_path}/processed_events_sigMC*.parquet"
    
    
    processed_events = ak.from_parquet(load_path)
    region = processed_events.h_peak | processed_events.h_sidebands
    sig = (ak.to_numpy(processed_events.BDT_score[region])) # eval
    sig_wgt = (ak.to_numpy(processed_events.wgt_nominal_total[region]))
    sig_label = np.ones_like(sig)
    
    pred = np.concatenate((bkg,sig), axis=0)
    wgt_total = np.concatenate((bkg_wgt,sig_wgt), axis=0)
    label = np.concatenate((bkg_label,sig_label), axis=0)

    eff_bkg, eff_sig, thresholds = customROC_curve_AN(label, pred, wgt_total)
    # print(f"eff_sig: {eff_sig}")
    # print to debug start ------------------------------------
    # plt.figure()  
    # plt.plot(eff_sig, eff_bkg, label="Stage2 ROC Curve (Eval)")
    
    # # plt.vlines(eff_sig, 0, eff_bkg, linestyle="dashed")
    # plt.vlines(np.linspace(0,1,11), 0, 1, linestyle="dashed", color="grey")
    # plt.hlines(np.logspace(-4,0,5), 0, 1, linestyle="dashed", color="grey")
    # # plt.hlines(eff_bkg, 0, eff_sig, linestyle="dashed")
    # plt.xlim([0.0, 1.0])
    # # plt.ylim([0.0, 1.0])
    # plt.xlabel('Signal eff')
    # plt.yscale("log")
    # plt.ylim([0.0001, 1.0])
    # plt.ylabel('Background eff')
    # plt.title(f'ROC curve for ggH BDT {year}')
    # # plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # plt.legend()
    # plt.show()
    
    # plt.clf()
    # print to debug end ------------------------------------

    sigEffBinEdges = np.arange(0.99, 0.0, step=-0.01) # from 0.99 to 0.01 
    bin_vals = findBDT_binVals_basedOnSigEff(thresholds, eff_sig, sigEffBinEdges)
    print(bin_vals) 
    df = pd.DataFrame(columns=["year", "sig_eff","BDT_threshold"])
    df["sig_eff"] = sigEffBinEdges
    df["BDT_threshold"] = bin_vals
    df["year"] = year
    df.to_csv(f"BDT_threshold_{year}.csv")

In [2]:
years = [
    "2016preVFP",
    "2016postVFP",
    "2017",
    "2018",
]
stage2_out_name = "BDT_WgtON_original_AN_BDT_Sept27"
for year in years:
    load_path = f"/work/users/yun79/stage2_output/{stage2_out_name}/ggh/{year}"
    getMVA_values(load_path)

np.sum(effBkg_total ==-99) : 0
np.sum(effSig_total ==-99) : 0
np.any(np.isnan(effBkg_total)) : False
np.any(np.isnan(effSig_total)) : False
[0.0335033503350335, 0.04070407040704071, 0.04590459045904591, 0.0503050305030503, 0.054405440544054405, 0.058005800580058005, 0.06140614061406141, 0.06460646064606461, 0.06780678067806781, 0.0707070707070707, 0.07350735073507352, 0.07630763076307631, 0.07910791079107911, 0.0817081708170817, 0.08410841084108411, 0.08650865086508651, 0.08890889088908892, 0.09130913091309131, 0.09350935093509351, 0.09580958095809582, 0.09800980098009801, 0.10021002100210022, 0.10241024102410241, 0.10461046104610461, 0.10671067106710672, 0.10881088108810881, 0.11101110111011102, 0.11311131113111311, 0.11521152115211522, 0.11731173117311731, 0.11941194119411942, 0.12141214121412142, 0.12351235123512351, 0.1255125512551255, 0.1275127512751275, 0.12951295129512952, 0.13151315131513153, 0.1335133513351335, 0.13561356135613561, 0.13761376137613762, 0.13971397139713973, 0.1

In [12]:
years = [
    "2016preVFP",
    "2016postVFP",
    "2017",
    "2018",
]

sigEffBinEdges = np.arange(0.99, 0.0, step=-0.01) # from 0.99 to 0.01 
for sig_eff in sigEffBinEdges:
    for year in years:
        print(f"year: {year}")
        
        df = pd.read_csv(f"BDT_threshold_{year}.csv")
        bool_filter = np.isclose(df["sig_eff"], sig_eff)
        if np.sum(np.isclose(df["sig_eff"], sig_eff)) < 1:
            print("ERROR!")
        threshold = df["BDT_threshold"][bool_filter].values
        print(f"threshold: {threshold}")
        print("----------------------------------------------")
        


year: 2016preVFP
----------------------------------------------
threshold: [0.03350335]
year: 2016postVFP
----------------------------------------------
threshold: [0.03390339]
year: 2017
----------------------------------------------
threshold: [0.03310331]
year: 2018
----------------------------------------------
threshold: [0.03320332]
year: 2016preVFP
----------------------------------------------
threshold: [0.04070407]
year: 2016postVFP
----------------------------------------------
threshold: [0.04070407]
year: 2017
----------------------------------------------
threshold: [0.04020402]
year: 2018
----------------------------------------------
threshold: [0.04030403]
year: 2016preVFP
----------------------------------------------
threshold: [0.04590459]
year: 2016postVFP
----------------------------------------------
threshold: [0.0460046]
year: 2017
----------------------------------------------
threshold: [0.04540454]
year: 2018
----------------------------------------------
th

In [15]:
# import awkward as ak
# import copy

# A = ak.Array([1,2,3])
# B = ak.ones_like(A )
# A_zip = ak.zip({
#     "A" : A,
#     "B" : B
# })
# B_zip = copy.deepcopy(A_zip)
# C_zip = ak.concatenate([A_zip, B_zip])


In [1]:
from fitting import separateNfit
import pandas as pd
import numpy as np

years = [
    "2016preVFP",
    "2016postVFP",
    "2017",
    "2018",
]
stage2_out_name = "BDT_WgtON_original_AN_BDT_Sept27"
load_paths = {}
BDT_thresholds = {}
sig_eff = 0.45
for year in years:
    load_path = f"/work/users/yun79/stage2_output/{stage2_out_name}/ggh/{year}"
    load_paths[year] = load_path
    BDT_df = pd.read_csv(f"BDT_threshold_{year}.csv")
    bool_filter = np.isclose(BDT_df["sig_eff"], sig_eff)
    threshold = BDT_df["BDT_threshold"][bool_filter].values
    print(f"{year} threshold: {threshold}")
    BDT_thresholds[year] = [0, threshold, 1.0]
    
separateNfit(load_paths, BDT_thresholds)

2016preVFP threshold: [0.16951695]
2016postVFP threshold: [0.16791679]
2017 threshold: [0.16891689]
2018 threshold: [0.16921692]
separateNfit load_path: /work/users/yun79/stage2_output/BDT_WgtON_original_AN_BDT_Sept27/ggh/2016preVFP
BDT_score :[0.104, 0.125, 0.0841, 0.128, 0.0917, ..., 0.21, 0.0698, 0.27, 0.438, 0.212]
ak.max(BDT_score) :0.9089231491088867
ak.min(BDT_score) :0.00261423597112298
ak.sum(subCat_idx==-1): 0
ak.min(subCat_idx): 0.0
ak.max(subCat_idx): 1.0
BDT_score :[0.0888, 0.239, 0.0344, 0.0721, 0.0425, ..., 0.0882, 0.0828, 0.146, 0.189]
ak.max(BDT_score) :0.8770337104797363
ak.min(BDT_score) :0.0007012422429397702
ak.sum(subCat_idx==-1): 0
ak.min(subCat_idx): 0.0
ak.max(subCat_idx): 1.0
separateNfit load_path: /work/users/yun79/stage2_output/BDT_WgtON_original_AN_BDT_Sept27/ggh/2016postVFP
BDT_score :[0.216, 0.0831, 0.2, 0.179, 0.0811, ..., 0.157, 0.093, 0.209, 0.188, 0.194]
ak.max(BDT_score) :0.8894630670547485
ak.min(BDT_score) :0.0025116296019405127
ak.sum(subCat_idx=

Info in <Minuit2>: MnSeedGenerator Computing seed using NumericalGradient calculator
Info in <Minuit2>: MnSeedGenerator Initial state: FCN =       109393127.9 Edm =       39428.76175 NCalls =     17
Info in <Minuit2>: MnSeedGenerator Initial state  
  Minimum value : 109393127.9
  Edm           : 39428.76175
  Internal parameters:	[     0.6727023301  -0.001684318905    -0.1409596604]	
  Internal gradient  :	[      211186.8196      417998332.7     -227650.6606]	
  Internal covariance matrix:
[[  9.0800984e-08              0              0]
 [              0  8.5713212e-13              0]
 [              0              0   7.534956e-08]]]
Info in <Minuit2>: VariableMetricBuilder Start iterating until Edm is < 0.001 with call limit = 1500
Info in <Minuit2>: VariableMetricBuilder    0 - FCN =       109393127.9 Edm =       39428.76175 NCalls =     17
Info in <Minuit2>: VariableMetricBuilder    1 - FCN =       109392121.9 Edm =     0.03225962518 NCalls =     33
Info in <Minuit2>: VariableMet

[#1] INFO:Fitting -- RooAbsPdf::fitTo(subCat0_BWZ_Redux_over_subCat0_BWZ_Redux_Int[mh_ggh]) fixing normalization set for coefficient determination to observables in data
[#1] INFO:Fitting -- using CPU computation library compiled with -mavx2
[#1] INFO:Fitting -- RooAddition::defaultErrorLevel(nll_subCat0_BWZ_Redux_over_subCat0_BWZ_Redux_Int[mh_ggh]_subCat0_rooHist_BWZRedux) Summation contains a RooNLLVar, using its error level
[#1] INFO:Minimization -- RooAbsMinimizerFcn::setOptimizeConst: activating const optimization
Minuit2Minimizer: Minimize with max-calls 1500 convergence for edm < 1 strategy 1
[#1] INFO:NumericIntegration -- RooRealIntegral::init(subCat0_BWZ_Redux_Int[mh_ggh]) using numeric integrator RooIntegrator1D to calculate Int(mh_ggh)
Minuit2Minimizer : Invalid minimum - status = 3
FVAL  = 1.09392e+08
Edm   = 0.33901
Nfcn  = 77
[#1] INFO:Minimization -- RooAbsMinimizerFcn::setOptimizeConst: deactivating const optimization
[#1] INFO:Fitting -- RooAbsPdf::fitTo(subCat0_BWZ_R

Info in <Minuit2>: MnSeedGenerator Computing seed using NumericalGradient calculator
Info in <Minuit2>: MnSeedGenerator Initial state: FCN =       1135.377418 Edm =        4.08084681 NCalls =     23
Info in <Minuit2>: MnSeedGenerator Initial state  
  Minimum value : 1135.377418
  Edm           : 4.08084681
  Internal parameters:	[   -0.03900989327     -1.307670934     -1.286628773     -1.167552078    -0.9284592379    -0.1167780589]	
  Internal gradient  :	[      13.27880773      -64.6269692     -117.0159267    -0.2472649363     -4.295215754      48.07702751]	
  Internal covariance matrix:
[[  0.00083659957              0              0              0              0              0]
 [              0  0.00017944326              0              0              0              0]
 [              0              0  0.00028736631              0              0              0]
 [              0              0              0   0.0049417531              0              0]
 [              0          

[#1] INFO:NumericIntegration -- RooRealIntegral::init(subCat0_BWZ_Redux_Int[mh_ggh]) using numeric integrator RooIntegrator1D to calculate Int(mh_ggh)
[#1] INFO:NumericIntegration -- RooRealIntegral::init(subCat1_BWZ_Redux_Int[mh_ggh]) using numeric integrator RooIntegrator1D to calculate Int(mh_ggh)


Info in <TCanvas::Print>: pdf file ./quick_plots//bkgMC_plot_subCat0.pdf has been created
Info in <TCanvas::Print>: pdf file ./quick_plots//bkgMC_plot_subCat1.pdf has been created
Info in <TCanvas::Print>: pdf file ./quick_plots//sigMC_plot_subCat0.pdf has been created
Info in <TCanvas::Print>: pdf file ./quick_plots//sigMC_plot_subCat1.pdf has been created


[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::data_cat0_ggh_norm
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing dataset data_cat0_ggh
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::mh_ggh
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::bkg_cat0_ggh_pdf_norm
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooModZPdf::bkg_cat0_ggh_pdf
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::BWZ_Redux_a_coeff_subCat0
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::BWZ_Redux_b_coeff_subCat0
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::BWZ_Redux_c_coeff_subCat0
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooRealVar::ggH_cat0_ggh_pdf_norm
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing RooCrystalBall::ggH_cat0_ggh_pdf
[#1] INFO:ObjectHandling -- RooWorkspace::import(w) importing Ro

In [2]:
%%bash
combineCards.py datacard_cat0_ggh.txt datacard_cat1_ggh.txt >  datacard_comb_sig_all_ggh_test.txt

In [3]:
%%bash
text2workspace.py -m 125 datacard_comb_sig_all_ggh_test.txt
combine -M Significance -d datacard_comb_sig_all_ggh_test.root -m 125 -n _signif_all_ggh --cminDefaultMinimizerStrategy=1 -t -1 --toysFrequentist --expectSignal 1 --X-rtd FITTER_NEWER_GIVE_UP --X-rtd FITTER_BOUND --setParameters pdf_index_ggh=0 --cminRunAllDiscreteCombinations --setParameterRanges r=-10,10 --X-rtd MINIMIZER_freezeDisassociatedParams --cminDefaultMinimizerTolerance 0.01 --X-rtd MINIMIZER_MaxCalls=9999999 --X-rtd FAST_VERTICAL_MORPH

 <<< Combine >>> 
 <<< v10.0.1 >>>
>>> Random number generator seed is 123456
>>> Method used is Significance
Set Range of Parameter r To : (-10,10)

 -- Significance -- 
Significance: 0.164338
Done in 0.01 min (cpu), 0.01 min (real)


In [16]:
import subprocess
ans = subprocess.run(["combineCards.py", "datacard_cat0_ggh.txt", "datacard_cat1_ggh.txt"], capture_output=True)
with open("datacard_comb_sig_all_ggh_test.txt", "w") as text_file:
     subprocess.call(["combineCards.py", "datacard_cat0_ggh.txt", "datacard_cat1_ggh.txt"], stdout=text_file)

In [14]:
(str(ans.stdout))

"b'Combination of datacard_cat0_ggh.txt  datacard_cat1_ggh.txt\\nimax 2 number of bins\\njmax 1 number of processes minus 1\\nkmax 65 number of nuisance parameters\\n----------------------------------------------------------------------------------------------------------------------------------\\nshapes bkg       ch1       workspace/workspace_bkg_cat0_ggh.root w:bkg_cat0_ggh_pdf\\nshapes data_obs  ch1       workspace/workspace_bkg_cat0_ggh.root w:data_cat0_ggh\\nshapes ggH_hmm   ch1       workspace/workspace_sig_cat0_ggh.root w:ggH_cat0_ggh_pdf\\nshapes bkg       ch2       workspace/workspace_bkg_cat1_ggh.root w:bkg_cat1_ggh_pdf\\nshapes data_obs  ch2       workspace/workspace_bkg_cat1_ggh.root w:data_cat1_ggh\\nshapes ggH_hmm   ch2       workspace/workspace_sig_cat1_ggh.root w:ggH_cat1_ggh_pdf\\n----------------------------------------------------------------------------------------------------------------------------------\\nbin          ch1          ch2        \\nobservation  -1   