# Compute the LateX cutflow tables
- Will load the pkl files that contain the cutflows and the sumgenweight
- Will scale the events by the cross section
- Will save the yields in a dictionnary called ```cutflows -> Dict()```
- Will make the LateX table using the function ```make_composition_table()```

In [1]:
import glob
import json
import os
import pickle as pkl
import sys

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.parquet as pq
import yaml
from scipy.special import softmax
from sklearn.metrics import auc, roc_curve

sys.path
sys.path.append("../python/")

import utils

plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 20})

pd.options.mode.chained_assignment = None

import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
sys.path
sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# get lumi
with open("../fileset/luminosity.json") as f:
    luminosity = json.load(f)
    
luminosity

{'ele': {'Run2': 137640.0,
  '2016APV': 19492.72,
  '2016': 16809.96,
  '2017': 41476.02,
  '2018': 59816.23},
 'mu': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'lep': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'had': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96}}

In [4]:
def get_lumi(years, channels):
    lum_ = 0
    for year in years:
        lum = 0
        for ch in channels:
            lum += luminosity[ch][year] / 1000.0

        lum_ += lum / len(channels)    
    return lum_

# Read cutflows from pkl

In [5]:
def get_sum_sumgenweight(pkl_files, year, sample):
    """Load and sum the sumgenweight of each pkl file."""
    
    sum_sumgenweight = 0
    for ifile in pkl_files:
        with open(ifile, "rb") as f:
            metadata = pkl.load(f)            
        sum_sumgenweight = sum_sumgenweight + metadata[sample][year]["sumgenweight"]

    return sum_sumgenweight


def get_xsecweight(pkl_files, year, ch, sample, is_data):
    
    if not is_data:
        # find xsection
        f = open("../fileset/xsec_pfnano.json")
        xsec = json.load(f)
        f.close()
        try:
            xsec = eval(str((xsec[sample])))
        except ValueError:
            print(f"sample {sample} doesn't have xsecs defined in xsec_pfnano.json so will skip it")
            return None

        # get overall weighting of events.. each event has a genweight...
        # sumgenweight sums over events in a chunk... sum_sumgenweight sums over chunks
        xsec_weight = (xsec * luminosity[ch][year]) / get_sum_sumgenweight(pkl_files, year, sample)
    else:
        xsec_weight = 1
    return xsec_weight

def get_cutflow(pkl_files, year, ch, sample, is_data):
    """
    Get cutflow from metadata but multiply by xsec-weight
    """
    xsec_weight = get_xsecweight(pkl_files, year, ch, sample, is_data)
        
    cuts = [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
    ]
        
    if year == "2018":
        cuts += ["HEMCleaning"]
        
    evyield = dict.fromkeys(cuts, 0)
    for ik, pkl_file in enumerate(pkl_files):
        with open(pkl_file, "rb") as f:
            metadata = pkl.load(f)
            
        cutflows = metadata[sample][year]["cutflows"][ch]

        for key in evyield.keys():

            if key == "sumgenweight":
                evyield[key] += metadata[sample][year][key] * xsec_weight
            else:
                evyield[key] += cutflows[key] * xsec_weight        
    return evyield

In [6]:
! ls ../eos/July18_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl

../eos/July18_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl


In [6]:
with open("../eos/Oct10_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl", "rb") as f:
    metadata = pkl.load(f)
metadata

{'WJetsToLNu_HT-100To200': {'mc': 149,
  '2016APV': {'sumgenweight': 1492951.0,
   'sumlheweight': {0: 1551137.2,
    1: 1551799.4,
    2: 1541534.8,
    3: 1491997.6,
    4: 1492951.0,
    5: 1483308.9,
    6: 1443469.5,
    7: 1444666.2,
    8: 1435536.2},
   'sumpdfweight': {0: 1492951.0,
    1: 1494297.1,
    2: 1492237.6,
    3: 1494463.2,
    4: 1493804.0,
    5: 1494341.9,
    6: 1493538.8,
    7: 1494321.6,
    8: 1493813.9,
    9: 1493034.2,
    10: 1492469.5,
    11: 1491638.0,
    12: 1493345.8,
    13: 1493301.8,
    14: 1492837.2,
    15: 1493383.9,
    16: 1492520.0,
    17: 1492682.9,
    18: 1492891.5,
    19: 1492741.8,
    20: 1493447.0,
    21: 1493083.9,
    22: 1493360.1,
    23: 1492864.5,
    24: 1493688.5,
    25: 1493356.0,
    26: 1493390.6,
    27: 1493141.5,
    28: 1493313.1,
    29: 1494101.6,
    30: 1493643.0,
    31: 1494384.6,
    32: 1493172.6,
    33: 1494015.9,
    34: 1493571.5,
    35: 1492720.9,
    36: 1492159.4,
    37: 1493126.9,
    38: 14935

# Adding a cut from the parquets

In [7]:
### This is your configuration. specefy which channels, years, samples, and directory of pkl files to use.
channels = [
    "ele", 
    "mu",
]
years = [
    "2018", 
    "2017",
    "2016", 
    "2016APV",
]

samples = [
    "ggF", 
    "VBF",  
    "WH",
    "ZH",    
    "ttH",
    "WJetsLNu",
    "TTbar",
    "SingleTop",
    "Diboson",
    "EWKvjets",
    "DYJets",
    "WZQQ",
    "Data",
]

samples_dir = {
    "2016":    "../eos/Oct10_hww_2016",
    "2016APV": "../eos/Oct10_hww_2016APV",    
    "2017":    "../eos/Oct10_hww_2017",    
    "2018":    "../eos/Oct10_hww_2018",    
}

In [34]:
df["xsecweight"]

1       0.000583
2       0.000583
10      0.000583
12      0.000583
19      0.000583
          ...   
5623    0.000583
5624    0.000583
5625    0.000583
5633    0.000583
5635    0.000583
Name: xsecweight, Length: 1870, dtype: float64

In [35]:
df["nominal"]

1       0.000618
2       0.000394
10      0.000625
12      0.000560
19      0.000607
          ...   
5623    0.000608
5624    0.000327
5625    0.000582
5633    0.000600
5635    0.000625
Name: nominal, Length: 1870, dtype: float64

In [8]:
cutflows = {}
for year in years:
#     if year != "2016":
#         continue
    print(f"Processing year {year}")
    
    cutflows[year] = {}
    
    for ch in channels:
#         if ch != "mu": 
#             continue
        
        print(f"  {ch} channel")
        cutflows[year][ch] = {}

        condor_dir = os.listdir(samples_dir[year])

        for sample in condor_dir:

            # first: check if the sample is in one of combine_samples_by_name
            sample_to_use = None
            for key in utils.combine_samples_by_name:
                if key in sample:
                    sample_to_use = utils.combine_samples_by_name[key]
                    break

            # second: if not, combine under common label
            if sample_to_use is None:
                for key in utils.combine_samples:
                    if key in sample:
                        sample_to_use = utils.combine_samples[key]
                        break
                    else:
                        sample_to_use = sample

            if sample_to_use not in samples:
                continue

            is_data = False
            if sample_to_use == "Data":
                is_data = True

            out_files = f"{samples_dir[year]}/{sample}/outfiles/"
            pkl_files = glob.glob(f"{out_files}/*.pkl")

            if len(pkl_files) == 0:
                continue

            parquet_files = glob.glob(f"{out_files}/*_{ch}.parquet")
            
            try:
                data = pd.read_parquet(parquet_files)
            except pyarrow.lib.ArrowInvalid:
                # empty parquet because no event passed selection
#                 print(f"No parquet file for {sample}")
                continue

            if len(data) == 0:
#                 print(f"Hi, No parquet file for {sample}")
                continue
    
            if sample_to_use not in cutflows[year][ch].keys():
                cutflows[year][ch][sample_to_use] = get_cutflow(pkl_files, year, ch, sample, is_data)
            else:
                temp = get_cutflow(pkl_files, year, ch, sample, is_data)
                for key in cutflows[year][ch][sample_to_use]:
                    cutflows[year][ch][sample_to_use][key] += temp[key]
            
    print(f"------------------------------------------")

Processing year 2018
  ele channel
  mu channel
------------------------------------------
Processing year 2017
  ele channel
  mu channel
------------------------------------------
Processing year 2016
  ele channel
  mu channel
------------------------------------------
Processing year 2016APV
  ele channel
  mu channel
------------------------------------------


In [9]:
samples = cutflows["2017"]["ele"].keys()  # samples
samples

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'SingleTop', 'ggF', 'DYJets', 'Data', 'Diboson', 'WZQQ', 'ttH', 'ZH'])

In [10]:
from make_stacked_hists import make_events_dict

presel = {
        "mu": {
#             "fj_mass": "fj_mass>40",
#             "THWW>0.75": "THWW>0.750",
        },
        "ele": {
#             "fj_mass": "fj_mass>40",
#             "THWW>0.75": "THWW>0.750",
        },
}

THWW_path = "../../weaver-core-dev/experiments_finetuning/v35_30/model.onnx"

events_dict = make_events_dict(years, channels, samples_dir, samples, presel, THWW_path)

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:Will fill the VBF dataframe with the remaining 1601 events
INFO:root:tot event weight 25.613311877416795 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 57 events
INFO:root:tot event weight 126.30860134877776 

INFO:root:Finding EWKWminus_WToLNu samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 2069 events
INFO:root:tot event weight 881.62056583208 

INFO:root:Finding EWKZ_ZToNuNu samples and should combine them under EWKvjets
INFO:root:Finding fake_2018_mu_EWK_SF_Up.parquet samples and should combine them under EWKvjets
INFO:root:No parquet file for fake_2018_mu_EWK_SF_Up.parquet
INFO:root:Finding HWminusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remainin

INFO:root:tot event weight 175.1663824111345 

INFO:root:Finding WJetsToLNu_HT-2500ToInf samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 62435 events
INFO:root:tot event weight 22.96116749615988 

INFO:root:Finding ttHToNonbb_M125 samples and should combine them under ttH
INFO:root:Will fill the ttH dataframe with the remaining 5879 events
INFO:root:tot event weight 35.14049157748235 

INFO:root:Finding ZJetsToQQ_HT-800toInf samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 512 events
INFO:root:tot event weight 61.299457382969194 

INFO:root:Finding EGamma_Run2018C samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 15261 events
INFO:root:tot event weight 15261.0 

INFO:root:Finding EGamma_Run2018D samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 69925 events
INFO:root:tot eve

INFO:root:Finding WW samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 624 events
INFO:root:tot event weight 661.3145558679895 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-250To400 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 165755 events
INFO:root:tot event weight 1317.4916547823514 

INFO:root:Finding ST_t-channel_antitop_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 19318 events
INFO:root:tot event weight 667.5982563083757 

INFO:root:Finding TTTo2L2Nu samples and should combine them under TTbar
INFO:root:Will fill the TTbar dataframe with the remaining 109127 events
INFO:root:tot event weight 5088.6265446193775 

INFO:root:Finding EWKZ_ZToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 5 events
INFO:root:tot event weight 

INFO:root:Will fill the SingleTop dataframe with the remaining 4036 events
INFO:root:tot event weight 914.3763829556749 

INFO:root:Finding GluGluHToWWToLNuQQ_M-125_TuneCP5_13TeV_powheg_jhugen751_pythia8 samples and should combine them under ggF
INFO:root:Will fill the ggF dataframe with the remaining 1 events
INFO:root:tot event weight 0.6687053205537946 

INFO:root:Finding GluGluHToWW_Pt-200ToInf_M-125 samples and should combine them under ggF
INFO:root:Will fill the ggF dataframe with the remaining 4809 events
INFO:root:tot event weight 37.83235012135305 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-650ToInf samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 233145 events
INFO:root:tot event weight 92.58058359541592 

INFO:root:Finding WJetsToQQ_HT-200to400 samples and should combine them under WZQQ
INFO:root:Finding ST_tW_antitop_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop d

INFO:root:tot event weight 5.4639525667359825 

INFO:root:Finding WZ samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 596 events
INFO:root:tot event weight 160.23113492875405 

INFO:root:Finding SingleMuon_Run2017F samples and should combine them under Data
INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:Will fill the VBF dataframe with the remaining 2576 events
INFO:root:tot event weight 24.855114197840102 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 105 events
INFO:root:tot event weight 166.74539815030852 

INFO:root:Finding fake_2017_mu_EWK_SF_Down.parquet samples and should combine them under EWKvjets
INFO:root:No parquet file for fake_2017_mu_EWK_SF_Down.parquet
INFO:root:Finding fake_2017_mu_EWK_SF_Up.parquet samples and should combine them under EWKvj

INFO:root:Will fill the WZQQ dataframe with the remaining 1367 events
INFO:root:tot event weight 136.3794313711509 

INFO:root:Finding fake_2017_ele_EWK_SF_Down.parquet samples and should combine them under EWKvjets
INFO:root:No parquet file for fake_2017_ele_EWK_SF_Down.parquet
INFO:root:Finding ZJetsToQQ_HT-600to800 samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 1041 events
INFO:root:tot event weight 132.78213443900563 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-400To650 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 43875 events
INFO:root:tot event weight 226.85021133916075 

INFO:root:Finding EWKWminus_WToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 112 events
INFO:root:tot event weight 8.71855588160276 

INFO:root:Finding WJetsToLNu_HT-600To800 samples and should combine them under WJetsLNu
INFO:root:Will

INFO:root:Will fill the WH dataframe with the remaining 1347 events
INFO:root:tot event weight 1.9896903853553265 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-100To250 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 27781 events
INFO:root:tot event weight 1148.671322447633 

INFO:root:Finding fake_2016_mu_EWK_SF_Up.parquet samples and should combine them under EWKvjets
INFO:root:No parquet file for fake_2016_mu_EWK_SF_Up.parquet
INFO:root:Finding EWKWplus_WToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 50 events
INFO:root:tot event weight 4.161994572838769 

INFO:root:Finding ST_s-channel_4f_leptonDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 4763 events
INFO:root:tot event weight 14.907830141865544 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-50To100 samples and should combine them under DYJets

INFO:root:tot event weight 0.5588687162742549 

INFO:root:Finding WJetsToLNu_HT-1200To2500 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 89401 events
INFO:root:tot event weight 1076.6771149769274 

INFO:root:Finding EWKZ_ZToLL samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 159 events
INFO:root:tot event weight 32.31952476474904 

INFO:root:Finding WJetsToLNu_HT-200To400 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 5457 events
INFO:root:tot event weight 2846.3848881619906 

INFO:root:Finding ST_tW_top_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 1822 events
INFO:root:tot event weight 364.6139761173978 

INFO:root:Finding GluGluHToWWToLNuQQ_M-125_TuneCP5_13TeV_powheg_jhugen751_pythia8 samples and should combine them un

INFO:root:tot event weight 40.84631532402763 

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:Will fill the VBF dataframe with the remaining 488 events
INFO:root:tot event weight 6.874058267640764 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 8 events
INFO:root:tot event weight 13.2223336270851 

INFO:root:Finding EWKWminus_WToLNu samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 1002 events
INFO:root:tot event weight 264.37031496015913 

INFO:root:Finding EWKZ_ZToNuNu samples and should combine them under EWKvjets
INFO:root:Finding HWminusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remaining 1463 events
INFO:root:tot event weight 1.1336575193702283 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples 

INFO:root:Finding EWKWminus_WToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 32 events
INFO:root:tot event weight 2.314554269009178 

INFO:root:Finding WJetsToLNu_HT-600To800 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 32368 events
INFO:root:tot event weight 4167.464159502781 

INFO:root:Finding SingleElectron_Run2016C_HIPM samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 5696 events
INFO:root:tot event weight 5696.0 

INFO:root:Finding WJetsToQQ_HT-600to800 samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 364 events
INFO:root:tot event weight 51.24802932637182 

INFO:root:Finding WJetsToLNu_HT-2500ToInf samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 27093 events
INFO:root:tot event weight 6.60

INFO:root:Will fill the DYJets dataframe with the remaining 2058 events
INFO:root:tot event weight 254.93569869711422 

INFO:root:Finding WW samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 1332 events
INFO:root:tot event weight 205.77201547716868 

INFO:root:Finding SingleMuon_Run2016D_HIPM samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 11331 events
INFO:root:tot event weight 11331.0 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-250To400 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 86587 events
INFO:root:tot event weight 415.5494831059824 

INFO:root:Finding ST_t-channel_antitop_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 6161 events
INFO:root:tot event weight 194.08207416077585 

INFO:root:Finding TTTo2L2Nu samples and should combin

# Add the cut to the cutflow dict

In [11]:
presel = {
        "mu": {
            "fj_mass": "fj_mass>40",
            "THWW>0.75": "fj_mass>40 & THWW>0.75",
        },
        "ele": {
            "fj_mass": "fj_mass>40",            
            "THWW>0.75": "fj_mass>40 & THWW>0.75",
        },
}

In [12]:
for ch in channels:
    for cut, sel in list(presel[ch].items()):
        for sample in samples:
            for year in years:

                df = events_dict[year][ch][sample]
                df = df.query(sel)
                
                w = df["nominal"]

                cutflows[year][ch][sample][cut] = w.sum()

In [40]:
cutflows["2016APV"]["mu"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32148205.541996155,
 'Trigger': 5052102.989037976,
 'METFilters': 5049012.531204127,
 'OneLep': 5031057.850622146,
 'NoTaus': 4271447.470579825,
 'AtLeastOneFatJet': 461978.1002925532,
 'CandidateJetpT': 196124.03115179407,
 'LepInJet': 68289.06508691012,
 'JetLepOverlap': 26003.119346261694,
 'dPhiJetMET': 18333.833322064966,
 'MET': 16980.930607238028,
 'fj_mass': 19896.861487391365,
 'THWW>0.75': 560.3813816501639}

In [41]:
cutflows["2016APV"]["ele"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32241758.10101272,
 'Trigger': 4454643.38131529,
 'METFilters': 4451955.991847101,
 'OneLep': 3062734.5054372367,
 'NoTaus': 3062734.5054372367,
 'AtLeastOneFatJet': 364780.748501086,
 'CandidateJetpT': 153895.86376889877,
 'LepInJet': 58938.59730123174,
 'JetLepOverlap': 19206.882023214886,
 'dPhiJetMET': 13362.535289163337,
 'MET': 12293.663142455729,
 'fj_mass': 14729.953784965503,
 'THWW>0.75': 323.25912091618096}

# Combine different channels

In [13]:
common_cuts = cutflows["2018"]["mu"]["WJetsLNu"]
common_cuts

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774368.475905173,
 'METFilters': 15767447.50578202,
 'OneLep': 15713782.312337536,
 'NoTaus': 13365615.02871292,
 'AtLeastOneFatJet': 1419078.1158154933,
 'CandidateJetpT': 638926.6511643255,
 'LepInJet': 226595.34688035544,
 'JetLepOverlap': 85525.34383794866,
 'dPhiJetMET': 59904.21890436672,
 'MET': 55956.918262700114,
 'HEMCleaning': 53829.03321016653,
 'fj_mass': 65069.24460716746,
 'THWW>0.75': 1822.2158269649399}

In [14]:
def combine_channels(cutflows):

    # combine both channels
    cutflows_new = {}
    for year in cutflows.keys():
        cutflows_new[year] = {}
        cutflows_new[year]["lep"] = {}
        
        for ch in ["mu", "ele"]:
            for sample in cutflows[year][ch]:
                                
                if sample not in cutflows_new[year]["lep"]:
                    cutflows_new[year]["lep"][sample] = {}
                
                for cut in cutflows[year][ch][sample]:
                    
                    if (year != "2018") and (cut == "HEMCleaning"):
                        continue
                    
                    if cut not in cutflows_new[year]["lep"][sample]:
                        cutflows_new[year]["lep"][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new[year]["lep"][sample][cut] += cutflows[year][ch][sample][cut]
        cutflows[year] = {**cutflows[year], **cutflows_new[year]}
        
    return cutflows

In [15]:
cutflows = combine_channels(cutflows)

In [16]:
cutflows["2018"].keys()

dict_keys(['ele', 'mu', 'lep'])

In [17]:
cutflows["2018"]["ele"]["WJetsLNu"]

{'sumgenweight': 98938496.94524625,
 'Trigger': 13451747.990687694,
 'METFilters': 13445769.865167841,
 'OneLep': 10365734.270532267,
 'NoTaus': 10365734.270532267,
 'AtLeastOneFatJet': 1175706.8480802262,
 'CandidateJetpT': 527598.3350759504,
 'LepInJet': 203266.60352170584,
 'JetLepOverlap': 67482.88251702799,
 'dPhiJetMET': 46441.958592916984,
 'MET': 43162.660704319554,
 'HEMCleaning': 41485.858620482926,
 'fj_mass': 51498.8021733948,
 'THWW>0.75': 1300.756305407815}

In [18]:
cutflows["2018"]["mu"]["WJetsLNu"]

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774368.475905173,
 'METFilters': 15767447.50578202,
 'OneLep': 15713782.312337536,
 'NoTaus': 13365615.02871292,
 'AtLeastOneFatJet': 1419078.1158154933,
 'CandidateJetpT': 638926.6511643255,
 'LepInJet': 226595.34688035544,
 'JetLepOverlap': 85525.34383794866,
 'dPhiJetMET': 59904.21890436672,
 'MET': 55956.918262700114,
 'HEMCleaning': 53829.03321016653,
 'fj_mass': 65069.24460716746,
 'THWW>0.75': 1822.2158269649399}

In [19]:
cutflows["2018"]["lep"]["WJetsLNu"]

{'sumgenweight': 197820309.9053882,
 'Trigger': 29226116.466592867,
 'METFilters': 29213217.37094986,
 'OneLep': 26079516.582869805,
 'NoTaus': 23731349.299245186,
 'AtLeastOneFatJet': 2594784.9638957195,
 'CandidateJetpT': 1166524.986240276,
 'LepInJet': 429861.9504020613,
 'JetLepOverlap': 153008.22635497665,
 'dPhiJetMET': 106346.1774972837,
 'MET': 99119.57896701967,
 'HEMCleaning': 95314.89183064946,
 'fj_mass': 116568.04678056226,
 'THWW>0.75': 3122.972132372755}

# Combine different years

In [20]:
def combine_years(cutflows):
    """Will remove the HEM cleaning cutflow from 2018 first."""
    
    whatever_year = list(cutflows.keys())[0]
    channels = cutflows[whatever_year].keys()
    
    # combine all years
    cutflows_new = {}
    cutflows_new["Run2"] = {}
    
    for ch in channels:
        cutflows_new["Run2"][ch] = {}
        
        for year in cutflows:
            for sample in cutflows[year][ch]:
                
                if sample not in cutflows_new["Run2"][ch]:
                    cutflows_new["Run2"][ch][sample] = {}

                for cut in cutflows[year][ch][sample]:
                    if "HEM" in cut:
                        continue
                    if cut not in cutflows_new["Run2"][ch][sample]:
                        cutflows_new["Run2"][ch][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new["Run2"][ch][sample][cut] += cutflows[year][ch][sample][cut]

    cutflows = {**cutflows, **cutflows_new}

    return cutflows

In [21]:
cutflows = combine_years(cutflows)

In [22]:
cutflows["2016"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'Data', 'SingleTop', 'ggF', 'DYJets', 'Diboson', 'WZQQ', 'ttH', 'ZH'])

In [23]:
cutflows.keys()

dict_keys(['2018', '2017', '2016', '2016APV', 'Run2'])

In [24]:
cutflows["Run2"].keys()

dict_keys(['ele', 'mu', 'lep'])

# Combine non-dominant backgrounds

In [25]:
# combine non-dominant backgrounds under others
dominant_bkgs = ["WJetsLNu", "TTbar"]
signals = ["ggF", "VBF", "WH", "ZH", "ttH"]

for year in cutflows:
    for ch in cutflows[year]:
        cutflows[year][ch]["Others"] = dict.fromkeys(cutflows[year][ch]["WJetsLNu"], 0)
        for sample in cutflows[year][ch]:
            if sample == "Data":
                continue
            if sample not in signals+dominant_bkgs:
                for cut in cutflows[year][ch][sample]:
                    cutflows[year][ch]["Others"][cut] += cutflows[year][ch][sample][cut]

In [26]:
cutflows["2018"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'SingleTop', 'ggF', 'DYJets', 'Data', 'Diboson', 'WZQQ', 'ttH', 'ZH', 'Others'])

In [27]:
cutflows["2018"]["lep"]["Others"]

{'sumgenweight': 701905777.1893944,
 'Trigger': 106776300.4785971,
 'METFilters': 106744779.68028218,
 'OneLep': 69889276.65752879,
 'NoTaus': 65050915.302088186,
 'AtLeastOneFatJet': 1747597.8010711675,
 'CandidateJetpT': 813340.0718076346,
 'LepInJet': 395296.1751882167,
 'JetLepOverlap': 151354.71308578423,
 'dPhiJetMET': 90838.14744899893,
 'MET': 78830.04211136982,
 'HEMCleaning': 75405.24645365497,
 'fj_mass': 62865.87271783278,
 'THWW>0.75': 1103.411013839698}

# LateX cutflow table

In [28]:
cuts = [
    "sumgenweight",
    "Trigger",
    "METFilters",
    "OneLep",        
    "NoTaus",
    "AtLeastOneFatJet",
    "CandidateJetpT",
    "LepInJet",
    "JetLepOverlap",
    "dPhiJetMET",
    "MET",
    "HEMCleaning",
]

for cut in presel["mu"]:
    cuts += [cut]

In [29]:
cut_to_label = {
    "sumgenweight": "sumgenweight",        
    "HEMCleaning": "HEMCleaning",    
    "Trigger": "Trigger",
    "METFilters": "METFilters",
    "OneLep": "n Leptons = 1",
    "NoTaus": "n Taus = 0",
    "AtLeastOneFatJet": r"n FatJets $>=$ 1",
    "CandidateJetpT": r"j $p_T > 250$GeV",
    "LepInJet": r"$\Delta R(j, \ell) < 0.8$",
    "JetLepOverlap": r"$\Delta R(j, \ell) > 0.03$",
    "dPhiJetMET": r"$\Delta \phi(\mathrm{MET}, j)<1.57$",
    "MET": r"$\mathrm{MET}>20$",
    
    "None": "None",

    "fj_mass": r"j $\mathrm{softdrop} > 40$GeV",
    
    "THWW>0.75": r"$\ensuremath{T_{\text{HWW}}^{\ell\nu qq}} > 0.75$",
} 


In [30]:
parquet_to_latex = {
    "WJetsLNu": "$\PW(\Pell\PGn)$+",
    "TTbar": "\\ttbar",
    "Others": "Other MC",

    "ggF": "ggF",
    "VBF": "VBF",
    "WH": "WH",
    "ZH": "ZH",    
    "ttH": "$t\\bar{t}H$",    
    
    "Data": "Data",
}

def make_latex_cutflow_table(cutflows_dict, year, ch, add_data=False, add_sumgenweight=False):
    """Will use the cutflows dictionary to make the LateX table we have in the AN."""
    
    samples_bkg = ["WJetsLNu", "TTbar", "Others"]
    samples_sig = ["ggF","VBF", "WH", "ZH", "ttH"]

    ### backgrounds
    headers = [parquet_to_latex[s] for s in samples_bkg]
    
    textabular = f"l{'r'*len(headers)}"
    textabular += "|r"
    
    texheader = "\\textbf{Inclusive Selection}" + " & " + " & ".join(headers) + " & Total MC "
    if add_data:
        textabular += "|r"
        texheader += "& Data "
    texheader += "\\\\"
    texdata = "\\hline\n"
    
    data = dict()
    
    for cut in cuts: 
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        if not add_sumgenweight and cut == "sumgenweight":
            continue
    
        data[cut] = []

        for sample in samples_bkg:            
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
            
        totalmc = 0
        for sample in (samples_bkg + samples_sig):
            totalmc += round(cutflows_dict[year][ch][sample][cut])
            
        data[cut].append(totalmc)
        
        if add_data:
            data[cut].append(round(cutflows_dict[year][ch]["Data"][cut]))

    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"
        
    texdata += "\\hline\n"    

    ### signal
    headers2 = [parquet_to_latex[s] for s in samples_sig]
    texheader2 = " & " + " & ".join(headers2) + "\\\\"
    texdata2 = "\\hline\n"

    textabular2 = f"l{'r'*len(headers2)}"
    
    data = dict()
    for cut in cuts:
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        data[cut] = []

        for sample in samples_sig:
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
        
    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata2 += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"    

    # make table
    print("\\begin{table}[!htp]")
    print("\\begin{center}")
    
    print("\\begin{tabular}{"+textabular+"}")
    print(texheader)
    print(texdata,end="")
    print("\\end{tabular}")

    print("\\begin{tabular}{"+textabular2+"}")
    print(texheader2)
    print(texdata2,end="")
    print("\\end{tabular}")
    
    
    if ch == "lep":
        print("\\caption{Event yield of " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")        
    else:
        print("\\caption{Event yield of " + ch + " channel " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")
        
    print("\\label{sel-tab-cutflow" + year + "}")
    print("\\end{center}")
    print("\\end{table}")    

In [31]:
make_latex_cutflow_table(cutflows, "2018", "lep", add_data=True, add_sumgenweight=True)

\begin{table}[!htp]
\begin{center}
\begin{tabular}{lrrr|r|r}
\textbf{Inclusive Selection} & $\PW(\Pell\PGn)$+ & \ttbar & Other MC & Total MC & Data \\
\hline
sumgenweight & 197820310 & 99476871 & 701905777 & 999935499 & 2276244440 \\
Trigger & 29226116 & 10530590 & 106776300 & 146591899 & 1174738710 \\
METFilters & 29213217 & 10524986 & 106744780 & 146541854 & 1173476723 \\
n Leptons = 1 & 26079517 & 8801799 & 69889277 & 104819686 & 656865738 \\
n Taus = 0 & 23731349 & 7031178 & 65050915 & 95857257 & 632124064 \\
n FatJets $>=$ 1 & 2594785 & 1264275 & 1747598 & 5610184 & 6978889 \\
j $p_T > 250$GeV & 1166525 & 642813 & 813340 & 2624530 & 2825338 \\
$\Delta R(j, \ell) < 0.8$ & 429862 & 283922 & 395296 & 1110154 & 1392226 \\
$\Delta R(j, \ell) > 0.03$ & 153008 & 245912 & 151355 & 551261 & 640994 \\
$\Delta \phi(\mathrm{MET}, j)<1.57$ & 106346 & 155609 & 90838 & 353469 & 359014 \\
$\mathrm{MET}>20$ & 99120 & 146617 & 78830 & 325195 & 326733 \\
HEMCleaning & 95315 & 140575 & 75405 & 311842