In [1]:
import time
import numpy as np
import pickle
import awkward as ak
import dask_awkward as dak
from distributed import Client
from omegaconf import OmegaConf

from typing import Tuple, List, Dict
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import ROOT as rt

In [2]:
"""
Let's try VBF DNN
"""
# Load the DNN score and nominal event weight of the 2018 VBF background MC,
# signal MC and data samples.
STAGE2_DIR = "/work/users/yun79/stage2_output/test/vbf/2018"


def load_dnn_scores(pattern):
    """Load the DNN scores and nominal weights stored in parquet file(s).

    Parameters
    ----------
    pattern : str
        File name (wildcards allowed) relative to ``STAGE2_DIR``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(scores, weights)``: the ``DNN_score`` and ``wgt_nominal_total``
        fields of the events whose ``h_peak`` field is non-zero, as contiguous
        float64 arrays.
    """
    events = ak.from_parquet(f"{STAGE2_DIR}/{pattern}")
    in_region = events.h_peak != 0
    # These arrays are later passed to TH1D.FillN, which takes buffers of
    # doubles; the conversion is a no-op when the data already is float64.
    scores = np.ascontiguousarray(
        ak.to_numpy(events.DNN_score[in_region]), dtype=np.float64
    )
    weights = np.ascontiguousarray(
        ak.to_numpy(events.wgt_nominal_total[in_region]), dtype=np.float64
    )
    return scores, weights


bkg, bkg_wgt = load_dnn_scores("processed_events_bkgMC*.parquet")
sig, sig_wgt = load_dnn_scores("processed_events_sigMC*.parquet")
print(bkg)
print(sig)
data, data_wgt = load_dnn_scores("processed_events_data.parquet")

[0.16962133 0.02123687 0.29009721 ... 0.04277927 0.57950991 0.11430977]
[0.18167675 0.51368499 0.45377615 ... 0.41674855 0.31256554 0.59521174]


In [3]:
# -----------------------------------------------------
# Bin edges of the DNN-score histograms, hard-coded here rather than imported.
# The binning is meant to hold 0.6 signal per bin.
# -----------------------------------------------------
bin_edges = np.array(
    [
        0, 0.07, 0.432, 0.71, 0.926, 1.114, 1.28,
        1.428, 1.564, 1.686, 1.798, 1.9, 2.0, 2.8,
    ],
    dtype=np.float64,
)
# Number of edges, i.e. number of bins + 1.
print(bin_edges.size)

14


In [4]:
# -----------------------------------------------------
# Book one TH1D per sample on the shared variable binning, then fill each
# with the DNN scores weighted by the per-event weights.
# -----------------------------------------------------
n_bins = len(bin_edges) - 1

hist_VBF = rt.TH1D("VBF", "VBF", n_bins, bin_edges)
hist_DY = rt.TH1D("DY", "DY", n_bins, bin_edges)
hist_data = rt.TH1D("data", "data", n_bins, bin_edges)

hist_VBF.FillN(len(sig), sig, sig_wgt)
hist_DY.FillN(len(bkg), bkg, bkg_wgt)
hist_data.FillN(len(data), data, data_wgt)

In [5]:
# Print the signal (VBF) content of every bin followed by the total, to check
# the target of at least 0.6 signal per bin -- which the binning does not meet.
nbins = hist_VBF.GetNbinsX()
wgt_sum = 0
for bin_number in range(1, nbins + 1):  # ROOT bins are 1-based (0 is underflow)
    bin_wgt = hist_VBF.GetBinContent(bin_number)
    print(bin_wgt)
    wgt_sum = wgt_sum + bin_wgt
print(f"wgt_sum: {wgt_sum}")

0.29830400924523065
8.908799236111392
11.540740661473247
0.6980243330720733
0.055237724858792026
0.009677575648620103
0.0006450808345128772
5.28984625973183e-05
0.0
0.0
0.0
0.0
0.0
wgt_sum: 21.511481519706468


In [6]:
# Weighted yield of each TH1D.
for sample_label, sample_hist in (
    ("hist_VBF", hist_VBF),
    ("hist_DY", hist_DY),
    ("hist_data", hist_data),
):
    print(f"{sample_label}: {sample_hist.GetSumOfWeights()}")


hist_VBF: 21.511481519706468
hist_DY: 4449.234056240582
hist_data: 4330.0


In [7]:
# -----------------------------------------------------
# Convert the TH1Ds to RooDataHists over an observable whose name matches the
# channel name used in the datacard.
# NOTE: this rebinds hist_VBF / hist_DY / hist_data from TH1D to RooDataHist,
# so re-running this cell requires re-running the TH1D cell first.
# -----------------------------------------------------
dnnScore_name = "vbf_SR_2018"
dnnScore = rt.RooRealVar(
    dnnScore_name,
    dnnScore_name,
    0.5,
    np.min(bin_edges),
    np.max(bin_edges),
)
hist_VBF = rt.RooDataHist(
    hist_VBF.GetName(), hist_VBF.GetName(), rt.RooArgSet(dnnScore), hist_VBF
)
hist_DY = rt.RooDataHist(
    hist_DY.GetName(), hist_DY.GetName(), rt.RooArgSet(dnnScore), hist_DY
)
hist_data = rt.RooDataHist(
    hist_data.GetName(), hist_data.GetName(), rt.RooArgSet(dnnScore), hist_data
)


In [8]:
# Cross-check: the RooDataHist yields should match the TH1D sums of weights.
for sample_label, sample_hist in (
    ("hist_VBF", hist_VBF),
    ("hist_DY", hist_DY),
    ("hist_data", hist_data),
):
    print(f"{sample_label}: {sample_hist.sumEntries()}")

hist_VBF: 21.511481519706464
hist_DY: 4449.234056240582
hist_data: 4330.0


In [9]:
# Store the three RooDataHists in a RooWorkspace named "workspace" inside
# workspace_vbf.root, which is the file/workspace the datacard points to.
fout = rt.TFile("./workspace_vbf.root", "RECREATE")
try:
    wout = rt.RooWorkspace("workspace", "workspace")

    wout.Import(hist_VBF)
    wout.Import(hist_DY)
    wout.Import(hist_data)
    wout.Print()
    # Written while fout is open, so the workspace ends up in that file.
    wout.Write()
finally:
    # Close the file so it is finalized on disk. Without this, combine has to
    # recover the keys when reading it ("Info in <TFile::Recover>" in its log).
    fout.Close()

[#1] INFO:ObjectHandling -- RooWorkspace::import(workspace) importing dataset VBF
[#1] INFO:ObjectHandling -- RooWorkspace::import(workspace) importing RooRealVar::vbf_SR_2018
[#1] INFO:ObjectHandling -- RooWorkspace::import(workspace) importing dataset DY
[#1] INFO:ObjectHandling -- RooWorkspace::import(workspace) importing dataset data

RooWorkspace(workspace) workspace contents

variables
---------
(vbf_SR_2018)

datasets
--------
RooDataHist::VBF(vbf_SR_2018)
RooDataHist::DY(vbf_SR_2018)
RooDataHist::data(vbf_SR_2018)



In [14]:
%%writefile card.txt
# Combine datacard for a single channel (vbf_SR_2018) with two processes,
# DY and VBF, whose shapes are read from the workspace in workspace_vbf.root.
# imax / jmax / kmax: number of channels / processes minus one / nuisance
# parameters (per the Combine datacard conventions); "*" lets combine count them.
imax 1
jmax *
kmax *
---------------
# shapes <process> <channel> <file> <workspace>:<object>
shapes DY vbf_SR_2018 workspace_vbf.root workspace:DY
shapes VBF vbf_SR_2018 workspace_vbf.root workspace:VBF
shapes data_obs vbf_SR_2018 workspace_vbf.root workspace:data

---------------
bin                                      vbf_SR_2018         
# observation -1: take the observed yield from the data_obs shape.
observation                              -1              
---------------
bin                                      vbf_SR_2018          vbf_SR_2018                            
process                                  DY                   VBF                           
# process index: <= 0 marks a signal process (VBF), > 0 a background (DY).
process                                  1                    -1                                     
# rate -1: take the expected yield of each process from its shape.
rate                                     -1                   -1    
---------------               
# lnN (log-normal) normalisation uncertainties, same value for both processes.
lumi_uncor2018       lnN                  1.015                1.015                            
lumi_xyfac           lnN                  1.02                 1.02                          
lumi_len             lnN                  1.002                1.002      

Overwriting card.txt


In [18]:
# original command
# Runs combine's Significance method on card.txt with a signal of strength 1
# injected (--expectSignal 1) and "-t -1 --toysFrequentist" (per the Combine
# documentation, -t -1 requests an Asimov dataset instead of random toys).
# NOTE(review): "--setParameters pdf_index=0" names a parameter that is not in
# the workspace variable list printed by this command; it and
# MINIMIZER_freezeDisassociatedParams are presumably leftovers from a
# multi-pdf setup -- confirm.
! combine -M Significance -d card.txt -m 125 -n _signif_vbf  --cminDefaultMinimizerStrategy=0 --expectSignal 1  --setParameters pdf_index=0  --setParameterRanges r=-10,10 --X-rtd MINIMIZER_freezeDisassociatedParams  --X-rtd MINIMIZER_MaxCalls=9999999 --verbose 3 --toysFrequentist -t -1


>>> Random number generator seed is 123456
>>> Method used is Significance
Turning on runtime-define MINIMIZER_freezeDisassociatedParams
Setting runtime-define MINIMIZER_MaxCalls to 9999999
Info in <TFile::Recover>: workspace_vbf.root, recovered key TProcessID:ProcessID0 at address 236
Info in <TFile::Recover>: workspace_vbf.root, recovered key RooWorkspace:workspace at address 391
Will make a binned dataset
Observables: ['vbf_SR_2018']
Will use category 'CMS_channel' to identify the 1 channels
Creating pdfs for individual modes (1): Creating RooAddPdf pdf_binvbf_SR_2018 with 2 elements
.   done.
Importing combined pdf model_s
Importing combined pdf model_b

RooWorkspace(w) w contents

variables
---------
(CMS_channel,MH,lumi_len,lumi_len_In,lumi_uncor2018,lumi_uncor2018_In,lumi_xyfac,lumi_xyfac_In,r,vbf_SR_2018)

p.d.f.s
-------
RooSimultaneousOpt::model_b[ indexCat=CMS_channel vbf_SR_2018=pdf_binvbf_SR_2018_bonly extraConstraints=() channelMasks=() ] = 0
  RooProdPdf::pdf_binvbf_SR_

In [22]:
# new command
# Same Significance run as the original command, with three changes: minimizer
# strategy 1 instead of 0, no "--setParameters pdf_index=0", and no
# "--X-rtd MINIMIZER_freezeDisassociatedParams".
! combine -M Significance -d card.txt -m 125 -n _signif_vbf  --cminDefaultMinimizerStrategy=1 --expectSignal 1  --setParameterRanges r=-10,10 --X-rtd MINIMIZER_MaxCalls=9999999 --verbose 3 --toysFrequentist -t -1


>>> Random number generator seed is 123456
>>> Method used is Significance
Setting runtime-define MINIMIZER_MaxCalls to 9999999
Info in <TFile::Recover>: workspace_vbf.root, recovered key TProcessID:ProcessID0 at address 236
Info in <TFile::Recover>: workspace_vbf.root, recovered key RooWorkspace:workspace at address 391
Will make a binned dataset
Observables: ['vbf_SR_2018']
Will use category 'CMS_channel' to identify the 1 channels
Creating pdfs for individual modes (1): Creating RooAddPdf pdf_binvbf_SR_2018 with 2 elements
.   done.
Importing combined pdf model_s
Importing combined pdf model_b

RooWorkspace(w) w contents

variables
---------
(CMS_channel,MH,lumi_len,lumi_len_In,lumi_uncor2018,lumi_uncor2018_In,lumi_xyfac,lumi_xyfac_In,r,vbf_SR_2018)

p.d.f.s
-------
RooSimultaneousOpt::model_b[ indexCat=CMS_channel vbf_SR_2018=pdf_binvbf_SR_2018_bonly extraConstraints=() channelMasks=() ] = 0
  RooProdPdf::pdf_binvbf_SR_2018_bonly[ lumi_uncor2018_Pdf * lumi_xyfac_Pdf * lumi_len_Pdf