In [1]:
import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
sys.path
sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [4]:
! ls ../eos/May10_LP_2017/

[34mGluGluHToWW_Pt-200ToInf_M-125[m[m
[34mHWminusJ_HToWW_M-125[m[m
[34mHWplusJ_HToWW_M-125[m[m
[34mHZJ_HToWW_M-125[m[m
[34mTTToSemiLeptonic[m[m
[34mVBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil[m[m
[34mttHToNonbb_M125[m[m


In [57]:
import json
import pickle as pkl
import warnings
from typing import List

import numpy as np
import scipy
from hist import Hist

def get_finetuned_score(data, model_path):
    import onnx
    import onnxruntime as ort

    input_dict = {
        "highlevel": data.loc[:, "fj_ParT_hidNeuron000":"fj_ParT_hidNeuron127"].values.astype("float32"),
    }

    onnx_model = onnx.load(model_path)
    onnx.checker.check_model(onnx_model)

    ort_sess = ort.InferenceSession(
        model_path,
        providers=["AzureExecutionProvider"],
    )
    outputs = ort_sess.run(None, input_dict)

    return scipy.special.softmax(outputs[0], axis=1)[:, 0]

In [78]:
parquets = glob.glob("../eos/May10_LP_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/*_mu.parquet")
df = pd.read_parquet(parquets)

# df = pd.read_parquet("/Users/fmokhtar/Downloads/0-2_ele.parquet")

for key in df:
    if "LP" in key:
        print(key)

LP_pfcand0_pt
LP_pfcand0_eta
LP_pfcand0_phi
LP_pfcand0_energy
LP_pfcand1_pt
LP_pfcand1_eta
LP_pfcand1_phi
LP_pfcand1_energy
LP_pfcand2_pt
LP_pfcand2_eta
LP_pfcand2_phi
LP_pfcand2_energy
LP_pfcand3_pt
LP_pfcand3_eta
LP_pfcand3_phi
LP_pfcand3_energy
LP_pfcand4_pt
LP_pfcand4_eta
LP_pfcand4_phi
LP_pfcand4_energy
LP_pfcand5_pt
LP_pfcand5_eta
LP_pfcand5_phi
LP_pfcand5_energy
LP_pfcand6_pt
LP_pfcand6_eta
LP_pfcand6_phi
LP_pfcand6_energy
LP_pfcand7_pt
LP_pfcand7_eta
LP_pfcand7_phi
LP_pfcand7_energy
LP_pfcand8_pt
LP_pfcand8_eta
LP_pfcand8_phi
LP_pfcand8_energy
LP_pfcand9_pt
LP_pfcand9_eta
LP_pfcand9_phi
LP_pfcand9_energy
LP_pfcand10_pt
LP_pfcand10_eta
LP_pfcand10_phi
LP_pfcand10_energy
LP_pfcand11_pt
LP_pfcand11_eta
LP_pfcand11_phi
LP_pfcand11_energy
LP_pfcand12_pt
LP_pfcand12_eta
LP_pfcand12_phi
LP_pfcand12_energy
LP_pfcand13_pt
LP_pfcand13_eta
LP_pfcand13_phi
LP_pfcand13_energy
LP_pfcand14_pt
LP_pfcand14_eta
LP_pfcand14_phi
LP_pfcand14_energy
LP_pfcand15_pt
LP_pfcand15_eta
LP_pfcand15_phi
LP_

In [79]:
df["THWW"] = get_finetuned_score(df, "../../weaver-core-dev/experiments_finetuning/v35_26/model.onnx")

In [80]:
df["THWW"]

0       0.894638
1       0.179884
2       0.031345
3       0.385313
4       0.985560
          ...   
7335    0.618085
7336    0.881122
7337    0.848084
7338    0.212529
7339    0.917673
Name: THWW, Length: 7340, dtype: float32

## Retrieve the pfcands, genquark, and ak8 jet info

In [81]:
pts = df.loc[:,df.columns.str.contains("LP_pfcand") & df.columns.str.contains("pt")].values
etas = df.loc[:,df.columns.str.contains("LP_pfcand") & df.columns.str.contains("eta")].values
phis = df.loc[:,df.columns.str.contains("LP_pfcand") & df.columns.str.contains("phi")].values
energys = df.loc[:,df.columns.str.contains("LP_pfcand") & df.columns.str.contains("energy")].values

pf_cands = np.stack([pts,etas,phis,energys], axis=-1)[:500]
pf_cands.shape

(500, 150, 4)

In [82]:
etas = df.loc[:,df.columns.str.contains("LP_quark") & df.columns.str.contains("eta")].values
phis = df.loc[:,df.columns.str.contains("LP_quark") & df.columns.str.contains("phi")].values

gen_parts_eta_phi = np.stack([etas,phis], axis=-1)[:500]
gen_parts_eta_phi.shape

(500, 2, 2)

In [83]:
ak8_jets = np.stack([df["LP_fj_pt"].values, df["LP_fj_eta"].values, df["LP_fj_phi"].values, df["LP_fj_energy"].values], axis=-1)[:500]
ak8_jets.shape  ## Define the tagger and the cut

(500, 4)

## Initiate a LundReweigher

In [84]:
import sys, os
sys.path.insert(0, '')
sys.path.append("LundReweighting")
sys.path.append('LundReweighting/utils')
from LundReweighting.utils.LundReweighter import *
from LundReweighting.utils.Utils import *
""" An example how to use the Lund Plane reweighting code """

######################## Setup 

def get_tagger_SF(WP):
        
    #Input file 
    f_ratio_name = 'LundReweighting/data/ratio_2018.root'
    f_ratio = ROOT.TFile.Open(f_ratio_name)

    #nominal data/MC Lund plane ratio (3d histogram)
    h_ratio = f_ratio.Get("ratio_nom")
    #systematic variations
    h_ratio_sys_up = f_ratio.Get("ratio_sys_tot_up")
    h_ratio_sys_down = f_ratio.Get("ratio_sys_tot_down")
    #MC ratio of b to light quarks
    b_light_ratio = f_ratio.Get("h_bl_ratio")


    #directory of pt extrapolation fits
    f_ratio.cd('pt_extrap')
    rdir = ROOT.gDirectory #get the present working directory and give it to rdir

    #directory of pt extrapolation fits
    f_ratio.cd('pt_extrap')
    rdir = ROOT.gDirectory #get the present working directory and give it to rdir

    #Main class for reweighting utilities
    LP_rw = LundReweighter(pt_extrap_dir = rdir)

    max_evts = len(HWWJets_tagger_score_lvqq)

    # score = getattr(d, tag_obs)[:max_evts]
    # score_cut = ((HWWJets_tagger_score_lvqq >= 0.8) & (HWWJets_tagger_score_lvqq <= 0.975))



    #Number of toys for statistical and pt extrapolation uncertainties
    nToys = 100
    #Noise vectors used to to generate the toys
    #NOTE the same vector has to be used for the whole sample/signal file for the toys to be consistent 
    rand_noise = np.random.normal(size = (nToys, h_ratio.GetNbinsX(), h_ratio.GetNbinsY(), h_ratio.GetNbinsZ()))
    pt_rand_noise = np.random.normal(size = (nToys, h_ratio.GetNbinsY(), h_ratio.GetNbinsZ(), 3))


    ################### Compute reweighting factors
    B_PDG_ID = 5


    LP_weights = []
    LP_weights_sys_up = []
    LP_weights_sys_down = []
    stat_smeared_weights = []
    pt_smeared_weights = []
    b_weights_up = []
    b_weights_down = []
    bad_matches = []

    weights_nom = []
    
#     msksel=[]
    for i,cands in enumerate(pf_cands):

        #Get the subjets, splittings and checking matching based on PF candidates in the jet and gen-level quarks        
#         try:
        subjets, splittings, bad_match, deltaRs = LP_rw.get_splittings_and_matching(cands, gen_parts_eta_phi[i], ak8_jets[i])
#         msksel.append(True)      
#         except IndexError:
#             print(bad_match, i)
#             msksel.append(False)
#             continue
            
        # print(bad_match)
        # print(deltaRs)
        #Gets the nominal LP reweighting factor for this event and statistical + pt extrapolation toys
        LP_weight, stat_smeared_weight, pt_smeared_weight = LP_rw.reweight_lund_plane(h_rw = h_ratio, subjets = subjets, splittings = splittings,
                rand_noise = rand_noise, pt_rand_noise = pt_rand_noise, )
        #Now get systematic variations
        LP_weight_sys_up,_,_ = LP_rw.reweight_lund_plane(h_rw = h_ratio_sys_up, subjets = subjets, splittings = splittings)
        LP_weight_sys_down,_,_ = LP_rw.reweight_lund_plane(h_rw = h_ratio_sys_down, subjets = subjets, splittings = splittings)

        LP_weights.append(LP_weight)
        stat_smeared_weights.append(stat_smeared_weight)
        pt_smeared_weights.append(pt_smeared_weight)

        LP_weights_sys_up.append(LP_weight_sys_up)
        LP_weights_sys_down.append(LP_weight_sys_down)
        bad_matches.append(bad_match)

    #Nominal event weights of the MC, assume every event is weight '1' for this example
#     msksel = np.array(msksel)

#     max_evts = msksel.sum()
    max_evts = 500
    weights_nom = np.ones(max_evts)
    
    if len(tagger_cut)>1:
        score_cut = (HWWJets_tagger_score_lvqq > tagger_cut[0]) & (HWWJets_tagger_score_lvqq < tagger_cut[1])   
    else:
        score_cut = (HWWJets_tagger_score_lvqq > tagger_cut[0])
    
    score_cut = score_cut[:max_evts]
    
    
    ############### Normalize weights to preserve normalization of the MC sample

    #The nominal Lund Plane correction event weights
    LP_weights = LP_rw.normalize_weights(LP_weights) * weights_nom 

    #Toy variations for stat and pt uncertainties
    stat_smeared_weights = LP_rw.normalize_weights(stat_smeared_weights) * weights_nom.reshape(max_evts, 1)
    pt_smeared_weights = LP_rw.normalize_weights(pt_smeared_weights) * weights_nom.reshape(max_evts,1)

    #Systematic up/down variations
    LP_weights_sys_up = LP_rw.normalize_weights(LP_weights_sys_up) * weights_nom
    LP_weights_sys_down = LP_rw.normalize_weights(LP_weights_sys_down) * weights_nom

    ############### Compute efficiences and uncertainties


    #Efficiency of the cut in nominal MC
    eff_nom = np.average(score_cut, weights = weights_nom) #TODO

    #Efficiency of the cut after the Lund Plane reweighting
    eff_rw = np.average(score_cut, weights = LP_weights)

    #Nominal 'scale factor'
    SF = eff_rw / eff_nom

    print("Nominal efficiency %.3f, Corrected efficiency %.3f, SF (corrected / nom) %.3f" % (eff_nom, eff_rw, SF))

    #NOTE, better to use corrected efficiency computed separately for each sample rather than a single 'SF'


    #Compute efficiency for each of the stat/pt toys
    eff_toys = []
    pt_eff_toys = []
    for i in range(nToys):
        eff = np.average(score_cut, weights = stat_smeared_weights[:,i])
        eff_toys.append(eff)

        eff1 = np.average(score_cut, weights = pt_smeared_weights[:,i])
        pt_eff_toys.append(eff1)

    #Compute stat and pt uncertainty based on variation in the toys
    toys_mean = np.mean(eff_toys)
    toys_std = np.std(eff_toys)
    pt_toys_mean = np.mean(pt_eff_toys)
    pt_toys_std = np.std(pt_eff_toys)

    eff_stat_unc = (abs(toys_mean - eff_rw)  + toys_std) 
    eff_pt_unc = (abs(pt_toys_mean - eff_rw) + pt_toys_std)

    print("Stat variation toys eff. avg %.3f, std dev %.3f" % (toys_mean, toys_std))
    print("Pt variation toys eff. avg %.3f, std dev %.3f" % (pt_toys_mean, pt_toys_std))


    #Compute difference in efficiency due to weight variations as uncertainty
    def get_uncs(score_cut, weights_up, weights_down, eff_baseline):
        eff_up =  np.average(score_cut, weights = weights_up)
        eff_down =  np.average(score_cut, weights = weights_down)

        unc_up = eff_up - eff_baseline
        unc_down = eff_down - eff_baseline 
        return unc_up, unc_down


    #Compute efficiency of systematic variations
    sys_unc_up, sys_unc_down = get_uncs(score_cut, LP_weights_sys_up, LP_weights_sys_down, eff_rw)
    # b_unc_up, b_unc_down = get_uncs(score_cut, b_weights_up, b_weights_down, eff_rw)


    #matching uncertainty, taken as a fractional uncertainty on efficiency
    bad_match_frac = np.mean(bad_matches)
    bad_match_unc = bad_match_frac * eff_rw


    ############ Results
    print("\n\nCalibrated efficiency  is %.3f +/- %.3f  (stat) +/- %.3f (pt) +/- %.3f/%.3f (sys)+/- %.3f (matching)  \n\n"  % 
            (eff_rw, eff_stat_unc, eff_pt_unc, sys_unc_up, sys_unc_down, bad_match_unc))

    #next compute the uncertainty about SFs

    #Efficiency of the cut in nominal MC
    eff_nom = np.average(score_cut, weights = weights_nom) #TODO

    #Efficiency of the cut after the Lund Plane reweighting
    eff_rw = np.average(score_cut, weights = LP_weights)

    #Nominal 'scale factor'
    print("Now perform SFs information")
    SF = eff_rw / eff_nom

    print("SF (corrected / nom) %.3f" % (SF))

    #propagate statistical and pt extrapolation uncertainties to SF
    SF_stat_unc = (abs(toys_mean - eff_rw)  + toys_std) /eff_nom
    SF_pt_unc = (abs(pt_toys_mean - eff_rw) + pt_toys_std) /eff_nom

    #propagate systemetic uncertainty to SF
    eff_sys_up =  np.average(score_cut, weights = LP_weights_sys_up)
    eff_sys_down =  np.average(score_cut, weights = LP_weights_sys_down)

    sys_unc_up = abs(eff_rw - eff_sys_up)
    sys_unc_down = abs(eff_rw - eff_sys_down)

    SF_sys_unc_up = sys_unc_up/eff_nom
    SF_sys_unc_down = sys_unc_down/eff_nom

    #calculate bad matching uncertainty directly
    SF_match_unc = bad_match_frac * SF

    print("\n\nSF is %.3f +/-%.3f(stat) +/-%.5f(pt) +%.3f/-%.3f(sys) +/-%.3f(match) \n\n"  % (SF, SF_stat_unc, SF_pt_unc, sys_unc_up, sys_unc_down, SF_match_unc))
    f_ratio.Close()
    
    return LP_weights, LP_weights_sys_up, LP_weights_sys_down

## Define the tagger and the cut

In [85]:
HWWJets_tagger_score_lvqq = df["THWW"]
tagger_cut = [0.925]

LP_weights, LP_weights_sys_up, LP_weights_sys_down = get_tagger_SF(tagger_cut)

Nominal efficiency 0.168, Corrected efficiency 0.151, SF (corrected / nom) 0.897
Stat variation toys eff. avg 0.155, std dev 0.017
Pt variation toys eff. avg 0.151, std dev 0.001


Calibrated efficiency  is 0.151 +/- 0.021  (stat) +/- 0.001 (pt) +/- -0.002/0.001 (sys)+/- 0.150 (matching)  


Now perform SFs information
SF (corrected / nom) 0.897


SF is 0.897 +/-0.125(stat) +/-0.00413(pt) +0.002/-0.001(sys) +/-0.895(match) 


