# Compute the LateX cutflow tables
- Will load the pkl files that contain the cutflows and the sumgenweight
- Will scale the events by the cross section
- Will save the yields in a dictionnary called ```cutflows -> Dict()```
- Will make the LateX table using the function ```make_composition_table()```

In [1]:
import glob
import json
import os
import pickle as pkl
import sys

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.parquet as pq
import yaml
from scipy.special import softmax
from sklearn.metrics import auc, roc_curve

sys.path
sys.path.append("../python/")

import utils

plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 20})

pd.options.mode.chained_assignment = None

import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
sys.path
sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# get lumi
with open("../fileset/luminosity.json") as f:
    luminosity = json.load(f)
    
luminosity

{'ele': {'Run2': 137640.0,
  '2016APV': 19492.72,
  '2016': 16809.96,
  '2017': 41476.02,
  '2018': 59816.23},
 'mu': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'lep': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'had': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96}}

In [4]:
def get_lumi(years, channels):
    lum_ = 0
    for year in years:
        lum = 0
        for ch in channels:
            lum += luminosity[ch][year] / 1000.0

        lum_ += lum / len(channels)    
    return lum_

# Read cutflows from pkl

In [5]:
def get_sum_sumgenweight(pkl_files, year, sample):
    """Load and sum the sumgenweight of each pkl file."""
    
    sum_sumgenweight = 0
    for ifile in pkl_files:
        with open(ifile, "rb") as f:
            metadata = pkl.load(f)            
        sum_sumgenweight = sum_sumgenweight + metadata[sample][year]["sumgenweight"]

    return sum_sumgenweight


def get_xsecweight(pkl_files, year, ch, sample, is_data):
    
    if not is_data:
        # find xsection
        f = open("../fileset/xsec_pfnano.json")
        xsec = json.load(f)
        f.close()
        try:
            xsec = eval(str((xsec[sample])))
        except ValueError:
            print(f"sample {sample} doesn't have xsecs defined in xsec_pfnano.json so will skip it")
            return None

        # get overall weighting of events.. each event has a genweight...
        # sumgenweight sums over events in a chunk... sum_sumgenweight sums over chunks
        xsec_weight = (xsec * luminosity[ch][year]) / get_sum_sumgenweight(pkl_files, year, sample)
    else:
        xsec_weight = 1
    return xsec_weight

def get_cutflow(pkl_files, year, ch, sample, is_data):
    """
    Get cutflow from metadata but multiply by xsec-weight
    """
    xsec_weight = get_xsecweight(pkl_files, year, ch, sample, is_data)
        
    cuts = [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
    ]
        
    if year == "2018":
        cuts += ["HEMCleaning"]
        
    evyield = dict.fromkeys(cuts, 0)
    for ik, pkl_file in enumerate(pkl_files):
        with open(pkl_file, "rb") as f:
            metadata = pkl.load(f)
            
        cutflows = metadata[sample][year]["cutflows"][ch]

        for key in evyield.keys():

            if key == "sumgenweight":
                evyield[key] += metadata[sample][year][key] * xsec_weight
            else:
                evyield[key] += cutflows[key] * xsec_weight        
    return evyield

In [7]:
! ls ../eos/July18_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl

../eos/July18_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl


In [6]:
with open("../eos/July18_hww_2016APV/WJetsToLNu_HT-100To200/outfiles/0-10.pkl", "rb") as f:
    metadata = pkl.load(f)
metadata

{'WJetsToLNu_HT-100To200': {'mc': 149,
  '2016APV': {'sumgenweight': 1492951.0,
   'sumlheweight': {0: 1551137.2,
    1: 1551799.4,
    2: 1541534.8,
    3: 1491997.6,
    4: 1492951.0,
    5: 1483308.9,
    6: 1443469.5,
    7: 1444666.2,
    8: 1435536.2},
   'sumpdfweight': {},
   'cutflows': {'ele': {'Trigger': 198848.0,
     'METFilters': 198767.0,
     'OneLep': 136324.0,
     'NoTaus': 136324.0,
     'AtLeastOneFatJet': 1854.0,
     'CandidateJetpT': 53.0,
     'LepInJet': 38.0,
     'JetLepOverlap': 5.0,
     'dPhiJetMET': 1.0,
     'MET': 0.0},
    'mu': {'Trigger': 228311.0,
     'METFilters': 228203.0,
     'OneLep': 227602.0,
     'NoTaus': 192965.0,
     'AtLeastOneFatJet': 2406.0,
     'CandidateJetpT': 86.0,
     'LepInJet': 67.0,
     'JetLepOverlap': 9.0,
     'dPhiJetMET': 3.0,
     'MET': 3.0}}}}}

# Adding a cut from the parquets

In [10]:
### This is your configuration. specefy which channels, years, samples, and directory of pkl files to use.
channels = [
    "ele", 
    "mu",
]
years = [
    "2018", 
    "2017", 
    "2016", 
    "2016APV",
]

samples = [
    "ggF", 
    "VBF",  
    "WH",
    "ZH",    
    "ttH",
    "WJetsLNu",
    "TTbar",
    "SingleTop",
    "Diboson",
    "EWKvjets",
    "DYJets",
    "WZQQ",
    "Data",
]

samples_dir = {
    "2016":    "../eos/July18_hww_2016",
    "2016APV": "../eos/July18_hww_2016APV",    
    "2017":    "../eos/July18_hww_2017",    
    "2018":    "../eos/July18_hww_2018",    
}

In [11]:
cutflows = {}
for year in years:
#     if year != "2016":
#         continue
    print(f"Processing year {year}")
    
    cutflows[year] = {}
    
    for ch in channels:
#         if ch != "mu": 
#             continue
        
        print(f"  {ch} channel")
        cutflows[year][ch] = {}

        condor_dir = os.listdir(samples_dir[year])

        for sample in condor_dir:

            # first: check if the sample is in one of combine_samples_by_name
            sample_to_use = None
            for key in utils.combine_samples_by_name:
                if key in sample:
                    sample_to_use = utils.combine_samples_by_name[key]
                    break

            # second: if not, combine under common label
            if sample_to_use is None:
                for key in utils.combine_samples:
                    if key in sample:
                        sample_to_use = utils.combine_samples[key]
                        break
                    else:
                        sample_to_use = sample

            if sample_to_use not in samples:
                continue

            is_data = False
            if sample_to_use == "Data":
                is_data = True

            out_files = f"{samples_dir[year]}/{sample}/outfiles/"
            pkl_files = glob.glob(f"{out_files}/*.pkl")

            if len(pkl_files) == 0:
                continue

            parquet_files = glob.glob(f"{out_files}/*_{ch}.parquet")
            
            try:
                data = pd.read_parquet(parquet_files)
            except pyarrow.lib.ArrowInvalid:
                # empty parquet because no event passed selection
#                 print(f"No parquet file for {sample}")
                continue

            if len(data) == 0:
#                 print(f"Hi, No parquet file for {sample}")
                continue
    
            if sample_to_use not in cutflows[year][ch].keys():
                cutflows[year][ch][sample_to_use] = get_cutflow(pkl_files, year, ch, sample, is_data)
            else:
                temp = get_cutflow(pkl_files, year, ch, sample, is_data)
                for key in cutflows[year][ch][sample_to_use]:
                    cutflows[year][ch][sample_to_use][key] += temp[key]
            
    print(f"------------------------------------------")

Processing year 2018
  ele channel
  mu channel
------------------------------------------
Processing year 2017
  ele channel
  mu channel
------------------------------------------
Processing year 2016
  ele channel
  mu channel
------------------------------------------
Processing year 2016APV
  ele channel
  mu channel
------------------------------------------


In [12]:
samples = cutflows["2017"]["ele"].keys()  # samples
samples

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'SingleTop', 'ggF', 'DYJets', 'Data', 'Diboson', 'WZQQ', 'ttH', 'ZH'])

In [13]:
from make_stacked_hists import make_events_dict

presel = {
        "mu": {
#             "fj_mass": "fj_mass>40",
#             "THWW>0.50": "THWW>0.50",
        },
        "ele": {
#             "fj_mass": "fj_mass>40",            
#             "THWW>0.50": "THWW>0.50",
        },
}

events_dict = {}
for year in years:
    
    out = make_events_dict([year], channels, samples_dir[year], samples, presel)
    events_dict = {**events_dict, **out}

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:Will fill the VBF dataframe with the remaining 1601 events
INFO:root:tot event weight 21.480661415230607 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 57 events
INFO:root:tot event weight 126.30860134877776 

INFO:root:Finding EWKWminus_WToLNu samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 2069 events
INFO:root:tot event weight 881.62056583208 

INFO:root:Finding EWKZ_ZToNuNu samples and should combine them under EWKvjets
INFO:root:Finding HWminusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remaining 5590 events
INFO:root:tot event weight 3.9852053537803513 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine them under WJetsLNu
INFO:r

INFO:root:Will fill the WZQQ dataframe with the remaining 512 events
INFO:root:tot event weight 61.299457382969194 

INFO:root:Finding EGamma_Run2018C samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 15261 events
INFO:root:tot event weight 15261.0 

INFO:root:Finding EGamma_Run2018D samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 69925 events
INFO:root:tot event weight 69925.0 

INFO:root:Finding EWKWplus_WToLNu samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 1792 events
INFO:root:tot event weight 924.9605700855368 

INFO:root:Finding EGamma_Run2018B samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 16456 events
INFO:root:tot event weight 16456.0 

INFO:root:Finding GluGluZH_HToWW_M-125_TuneCP5_13TeV-powheg-pythia8 samples and should combine them under ZH
INFO:root:Will fi

INFO:root:Will fill the EWKvjets dataframe with the remaining 5 events
INFO:root:tot event weight 15.992647360645298 

INFO:root:Finding ZJetsToQQ_HT-400to600 samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 99 events
INFO:root:tot event weight 72.87026412553513 

INFO:root:Finding ZZ samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 160 events
INFO:root:tot event weight 41.78763915568997 

INFO:root:Finding TTToHadronic samples and should combine them under TTbar
INFO:root:Will fill the TTbar dataframe with the remaining 3327 events
INFO:root:tot event weight 731.6849923754332 

INFO:root:Finding WJetsToQQ_HT-800toInf samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 1172 events
INFO:root:tot event weight 201.79581481330467 

INFO:root:Finding fake_2018_ele_EWK_SF_Down.parquet samples and should combine them under EWKvjets
INFO

INFO:root:Finding SingleElectron_Run2017E samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 22746 events
INFO:root:tot event weight 22746.0 

INFO:root:Finding SingleElectron_Run2017B samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 9060 events
INFO:root:tot event weight 9060.0 

INFO:root:Finding HWplusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remaining 4843 events
INFO:root:tot event weight 5.688995408383446 

INFO:root:Finding SingleElectron_Run2017C samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 24152 events
INFO:root:tot event weight 24152.0 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-100To250 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 62810 events
INFO:root:tot event weight 3399.2619351270473 

INFO:root:Fi

INFO:root:Finding EWKZ_ZToNuNu samples and should combine them under EWKvjets
INFO:root:Finding HWminusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remaining 5565 events
INFO:root:tot event weight 3.7958832790061012 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 158739 events
INFO:root:tot event weight 8953.5317042282 

INFO:root:Finding TTToSemiLeptonic samples and should combine them under TTbar
INFO:root:Will fill the TTbar dataframe with the remaining 412828 events
INFO:root:tot event weight 48576.268338104644 

INFO:root:Finding ST_t-channel_top_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 38837 events
INFO:root:tot event weight 1096.1648296450883 

INFO:root:Finding ST_s-channel_4f_hadronicDecays samples and should combine them under Sin

INFO:root:tot event weight 23.03249598421183 

INFO:root:Finding ttHToNonbb_M125 samples and should combine them under ttH
INFO:root:Will fill the ttH dataframe with the remaining 14875 events
INFO:root:tot event weight 20.172903133687527 

INFO:root:Finding ZJetsToQQ_HT-800toInf samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 2027 events
INFO:root:tot event weight 165.25894263454106 

INFO:root:Finding SingleElectron_Run2017F samples and should combine them under Data
INFO:root:Finding EWKWplus_WToLNu samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 1845 events
INFO:root:tot event weight 774.2621943863786 

INFO:root:Finding GluGluZH_HToWW_M-125_TuneCP5_13TeV-powheg-pythia8 samples and should combine them under ZH
INFO:root:Will fill the ZH dataframe with the remaining 13423 events
INFO:root:tot event weight 0.8472199737523365 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-0To

INFO:root:Will fill the SingleTop dataframe with the remaining 3809 events
INFO:root:tot event weight 114.26193779625198 

INFO:root:Finding TTTo2L2Nu samples and should combine them under TTbar
INFO:root:Will fill the TTbar dataframe with the remaining 65972 events
INFO:root:tot event weight 1828.3703992840324 

INFO:root:Finding EWKZ_ZToQQ samples and should combine them under EWKvjets
INFO:root:Finding ZJetsToQQ_HT-400to600 samples and should combine them under WZQQ
INFO:root:Will fill the WZQQ dataframe with the remaining 39 events
INFO:root:tot event weight 15.069807115564199 

INFO:root:Finding ZZ samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 85 events
INFO:root:tot event weight 21.929873740996786 

INFO:root:Finding TTToHadronic samples and should combine them under TTbar
INFO:root:Will fill the TTbar dataframe with the remaining 725 events
INFO:root:tot event weight 47.5887870420303 

INFO:root:Finding WJetsToQQ_HT-80

INFO:root:Finding WJetsToQQ_HT-200to400 samples and should combine them under WZQQ
INFO:root:Finding ST_tW_antitop_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 2019 events
INFO:root:tot event weight 404.42972080656295 

INFO:root:Finding ZJetsToQQ_HT-200to400 samples and should combine them under WZQQ
INFO:root:Finding HWplusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:Will fill the WH dataframe with the remaining 2016 events
INFO:root:tot event weight 2.9262792840480456 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-100To250 samples and should combine them under DYJets
INFO:root:Will fill the DYJets dataframe with the remaining 18496 events
INFO:root:tot event weight 816.5094844977515 

INFO:root:Finding EWKWplus_WToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 109 events
INFO:root:tot event weight 8.088623282737373 

I

INFO:root:tot event weight 822.2145174147009 

INFO:root:Finding EWKZ_ZToLL samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 497 events
INFO:root:tot event weight 109.10698672727986 

INFO:root:Finding WJetsToLNu_HT-200To400 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 5558 events
INFO:root:tot event weight 2737.574617160356 

INFO:root:Finding ST_tW_top_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:Will fill the SingleTop dataframe with the remaining 1520 events
INFO:root:tot event weight 397.39890045914007 

INFO:root:Finding GluGluHToWW_Pt-200ToInf_M-125 samples and should combine them under ggF
INFO:root:Will fill the ggF dataframe with the remaining 2389 events
INFO:root:tot event weight 16.47649554872515 

INFO:root:Finding DYJetsToLL_LHEFilterPtZ-650ToInf samples and should combine them under DYJets
INFO:root:Will fill the DYJ

INFO:root:tot event weight 6597.546151187164 

INFO:root:Finding SingleMuon_Run2016C_HIPM samples and should combine them under Data
INFO:root:Finding HZJ_HToWW_M-125 samples and should combine them under ZH
INFO:root:Will fill the ZH dataframe with the remaining 4220 events
INFO:root:tot event weight 2.2890368768238423 

INFO:root:Finding WZ samples and should combine them under Diboson
INFO:root:Will fill the Diboson dataframe with the remaining 687 events
INFO:root:tot event weight 74.07226614054493 

INFO:root:Finding SingleElectron_Run2016B_ver2_HIPM samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 13459 events
INFO:root:tot event weight 13459.0 

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:Will fill the VBF dataframe with the remaining 860 events
INFO:root:tot event weight 9.831317405229466 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine

INFO:root:Will fill the DYJets dataframe with the remaining 27428 events
INFO:root:tot event weight 109.32226570780152 

INFO:root:Finding SingleMuon_Run2016B_ver2_HIPM samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 15726 events
INFO:root:tot event weight 15726.0 

INFO:root:Finding SingleMuon_Run2016E_HIPM samples and should combine them under Data
INFO:root:Will fill the Data dataframe with the remaining 10634 events
INFO:root:tot event weight 10634.0 

INFO:root:Finding EWKWminus_WToQQ samples and should combine them under EWKvjets
INFO:root:Will fill the EWKvjets dataframe with the remaining 61 events
INFO:root:tot event weight 4.368036418348015 

INFO:root:Finding WJetsToLNu_HT-600To800 samples and should combine them under WJetsLNu
INFO:root:Will fill the WJetsLNu dataframe with the remaining 47533 events
INFO:root:tot event weight 6055.73584856738 

INFO:root:Finding SingleElectron_Run2016C_HIPM samples and should combine the

In [14]:
# for top pt reweighting
for year in years:
    for ch in channels:
        events_dict[year][ch]["TTbar"]["event_weight"] *= events_dict[year][ch]["TTbar"]["top_reweighting"]

# Add the cut to the curflow dict

In [15]:
presel = {
        "mu": {
            "fj_mass": "fj_mass>40",
            "THWW>0.75": "fj_mass>40 & THWW>0.75",
        },
        "ele": {
            "fj_mass": "fj_mass>40",            
            "THWW>0.75": "fj_mass>40 & THWW>0.75",
        },
}

In [16]:
for ch in channels:
    for cut, sel in list(presel[ch].items()):
        for sample in samples:
            for year in years:

                df = events_dict[year][ch][sample]
                df = df.query(sel)
                
                w = df["event_weight"]

                cutflows[year][ch][sample][cut] = w.sum()

In [17]:
cutflows["2016APV"]["mu"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32148205.541996155,
 'Trigger': 5052102.989037976,
 'METFilters': 5049012.531204127,
 'OneLep': 5031057.850622146,
 'NoTaus': 4271447.470579825,
 'AtLeastOneFatJet': 461978.1002925532,
 'CandidateJetpT': 196124.03115179407,
 'LepInJet': 68289.06508691012,
 'JetLepOverlap': 26003.119346261694,
 'dPhiJetMET': 18333.833322064966,
 'MET': 16980.930607238028,
 'fj_mass': 19896.861487391365,
 'THWW>0.75': 560.3813816501639}

In [18]:
cutflows["2016APV"]["ele"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32241758.10101272,
 'Trigger': 4454643.38131529,
 'METFilters': 4451955.991847101,
 'OneLep': 3062734.5054372367,
 'NoTaus': 3062734.5054372367,
 'AtLeastOneFatJet': 364780.748501086,
 'CandidateJetpT': 153895.86376889877,
 'LepInJet': 58938.59730123174,
 'JetLepOverlap': 19206.882023214886,
 'dPhiJetMET': 13362.535289163337,
 'MET': 12293.663142455729,
 'fj_mass': 14729.953784965503,
 'THWW>0.75': 323.25912091618096}

# Combine different channels

In [19]:
common_cuts = cutflows["2018"]["mu"]["WJetsLNu"]
common_cuts

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774784.39789099,
 'METFilters': 15767863.42776784,
 'OneLep': 15714197.698371835,
 'NoTaus': 13365974.316493511,
 'AtLeastOneFatJet': 1419110.9803375734,
 'CandidateJetpT': 638940.285300165,
 'LepInJet': 226599.5724394703,
 'JetLepOverlap': 85527.28589072316,
 'dPhiJetMET': 59905.32513231152,
 'MET': 55958.024490644915,
 'HEMCleaning': 53830.13943811133,
 'fj_mass': 65514.012967767514,
 'THWW>0.75': 1832.973790582876}

In [20]:
def combine_channels(cutflows):

    # combine both channels
    cutflows_new = {}
    for year in cutflows.keys():
        cutflows_new[year] = {}
        cutflows_new[year]["lep"] = {}
        
        for ch in ["mu", "ele"]:
            for sample in cutflows[year][ch]:
                                
                if sample not in cutflows_new[year]["lep"]:
                    cutflows_new[year]["lep"][sample] = {}
                
                for cut in cutflows[year][ch][sample]:
                    
                    if (year != "2018") and (cut == "HEMCleaning"):
                        continue
                    
                    if cut not in cutflows_new[year]["lep"][sample]:
                        cutflows_new[year]["lep"][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new[year]["lep"][sample][cut] += cutflows[year][ch][sample][cut]
        cutflows[year] = {**cutflows[year], **cutflows_new[year]}
        
    return cutflows

In [21]:
cutflows = combine_channels(cutflows)

In [22]:
cutflows["2018"].keys()

dict_keys(['ele', 'mu', 'lep'])

In [23]:
cutflows["2018"]["ele"]["WJetsLNu"]

{'sumgenweight': 98938496.94524625,
 'Trigger': 13452100.702478617,
 'METFilters': 13446122.536571965,
 'OneLep': 10365996.09355191,
 'NoTaus': 10365996.09355191,
 'AtLeastOneFatJet': 1175735.568559565,
 'CandidateJetpT': 527612.6926960349,
 'LepInJet': 203272.06535077497,
 'JetLepOverlap': 67484.52509185502,
 'dPhiJetMET': 46443.27286250808,
 'MET': 43163.8827737948,
 'HEMCleaning': 41487.080689958166,
 'fj_mass': 51503.24272169137,
 'THWW>0.75': 1300.904911979545}

In [24]:
cutflows["2018"]["mu"]["WJetsLNu"]

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774784.39789099,
 'METFilters': 15767863.42776784,
 'OneLep': 15714197.698371835,
 'NoTaus': 13365974.316493511,
 'AtLeastOneFatJet': 1419110.9803375734,
 'CandidateJetpT': 638940.285300165,
 'LepInJet': 226599.5724394703,
 'JetLepOverlap': 85527.28589072316,
 'dPhiJetMET': 59905.32513231152,
 'MET': 55958.024490644915,
 'HEMCleaning': 53830.13943811133,
 'fj_mass': 65514.012967767514,
 'THWW>0.75': 1832.973790582876}

In [25]:
cutflows["2018"]["lep"]["WJetsLNu"]

{'sumgenweight': 197820309.9053882,
 'Trigger': 29226885.10036961,
 'METFilters': 29213985.964339804,
 'OneLep': 26080193.791923746,
 'NoTaus': 23731970.410045423,
 'AtLeastOneFatJet': 2594846.548897139,
 'CandidateJetpT': 1166552.9779961999,
 'LepInJet': 429871.63779024524,
 'JetLepOverlap': 153011.81098257817,
 'dPhiJetMET': 106348.5979948196,
 'MET': 99121.90726443971,
 'HEMCleaning': 95317.22012806949,
 'fj_mass': 117017.25568945889,
 'THWW>0.75': 3133.878702562421}

# Combine different years

In [26]:
def combine_years(cutflows):
    """Will remove the HEM cleaning cutflow from 2018 first."""
    
    whatever_year = list(cutflows.keys())[0]
    channels = cutflows[whatever_year].keys()
    
    # combine all years
    cutflows_new = {}
    cutflows_new["Run2"] = {}
    
    for ch in channels:
        cutflows_new["Run2"][ch] = {}
        
        for year in cutflows:
            for sample in cutflows[year][ch]:
                
                if sample not in cutflows_new["Run2"][ch]:
                    cutflows_new["Run2"][ch][sample] = {}

                for cut in cutflows[year][ch][sample]:
                    if "HEM" in cut:
                        continue
                    if cut not in cutflows_new["Run2"][ch][sample]:
                        cutflows_new["Run2"][ch][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new["Run2"][ch][sample][cut] += cutflows[year][ch][sample][cut]

    cutflows = {**cutflows, **cutflows_new}

    return cutflows

In [27]:
cutflows = combine_years(cutflows)

In [28]:
cutflows["2016"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'Data', 'SingleTop', 'ggF', 'DYJets', 'Diboson', 'WZQQ', 'ttH', 'ZH'])

In [29]:
cutflows.keys()

dict_keys(['2018', '2017', '2016', '2016APV', 'Run2'])

In [30]:
cutflows["Run2"].keys()

dict_keys(['ele', 'mu', 'lep'])

# Combine non-dominant backgrounds

In [31]:
# combine non-dominant backgrounds under others
dominant_bkgs = ["WJetsLNu", "TTbar"]
signals = ["ggF", "VH", "WH", "ZH", "ttH"]

for year in cutflows:
    for ch in cutflows[year]:
        cutflows[year][ch]["Others"] = dict.fromkeys(cutflows[year][ch]["WJetsLNu"], 0)
        for sample in cutflows[year][ch]:
            if sample == "Data":
                continue
            if sample not in signals+dominant_bkgs:
                for cut in cutflows[year][ch][sample]:
                    cutflows[year][ch]["Others"][cut] += cutflows[year][ch][sample][cut]

In [32]:
cutflows["2018"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'EWKvjets', 'WH', 'TTbar', 'SingleTop', 'ggF', 'DYJets', 'Data', 'Diboson', 'WZQQ', 'ttH', 'ZH', 'Others'])

In [33]:
cutflows["2018"]["lep"]["Others"]

{'sumgenweight': 702099098.9089419,
 'Trigger': 106791477.11714289,
 'METFilters': 106759941.87585033,
 'OneLep': 69901443.1543528,
 'NoTaus': 65061347.06311686,
 'AtLeastOneFatJet': 1748491.671447163,
 'CandidateJetpT': 813748.1257520039,
 'LepInJet': 395530.47309093684,
 'JetLepOverlap': 151568.63428125813,
 'dPhiJetMET': 90992.31029109999,
 'MET': 78970.44810435054,
 'HEMCleaning': 75539.22162458888,
 'fj_mass': 63127.5839288049,
 'THWW>0.75': 1144.1071443704818}

# LateX cutflow table

In [34]:
cuts = [
    "sumgenweight",
    "Trigger",
    "METFilters",
    "OneLep",        
    "NoTaus",
    "AtLeastOneFatJet",
    "CandidateJetpT",
    "LepInJet",
    "JetLepOverlap",
    "dPhiJetMET",
    "MET",
    "HEMCleaning",
]

for cut in presel["mu"]:
    cuts += [cut]

In [35]:
cut_to_label = {
    "sumgenweight": "sumgenweight",        
    "HEMCleaning": "HEMCleaning",    
    "Trigger": "Trigger",
    "METFilters": "METFilters",
    "OneLep": "n Leptons = 1",
    "NoTaus": "n Taus = 0",
    "AtLeastOneFatJet": r"n FatJets $>=$ 1",
    "CandidateJetpT": r"j $p_T > 250$GeV",
    "LepInJet": r"$\Delta R(j, \ell) < 0.8$",
    "JetLepOverlap": r"$\Delta R(j, \ell) > 0.03$",
    "dPhiJetMET": r"$\Delta \phi(\mathrm{MET}, j)<1.57$",
    "MET": r"$\mathrm{MET}>20$",
    
    "None": "None",

    "fj_mass": r"j $\mathrm{softdrop} > 40$GeV",
    
    "THWW>0.75": r"$\ensuremath{T_{\text{HWW}}^{\ell\nu qq}} > 0.75$",
} 


In [36]:
parquet_to_latex = {
    "WJetsLNu": "$\PW(\Pell\PGn)$+",
    "TTbar": "\\ttbar",
    "Others": "Other MC",

    "ggF": "ggF",
    "VBF": "VBF",
    "WH": "WH",
    "ZH": "ZH",    
    "ttH": "$t\\bar{t}H$",    
    
    "Data": "Data",
}

def make_latex_cutflow_table(cutflows_dict, year, ch, add_data=False, add_sumgenweight=False):
    """Will use the cutflows dictionary to make the LateX table we have in the AN."""
    
    samples_bkg = ["WJetsLNu", "TTbar", "Others"]
    samples_sig = ["ggF","VBF", "WH", "ZH", "ttH"]

    ### backgrounds
    headers = [parquet_to_latex[s] for s in samples_bkg]
    
    textabular = f"l{'r'*len(headers)}"
    textabular += "|r"
    
    texheader = "\\textbf{Inclusive Selection}" + " & " + " & ".join(headers) + " & Total MC "
    if add_data:
        textabular += "|r"
        texheader += "& Data "
    texheader += "\\\\"
    texdata = "\\hline\n"
    
    data = dict()
    
    for cut in cuts: 
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        if not add_sumgenweight and cut == "sumgenweight":
            continue
    
        data[cut] = []

        for sample in samples_bkg:            
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
            
        totalmc = 0
        for sample in (samples_bkg + samples_sig):
            totalmc += round(cutflows_dict[year][ch][sample][cut])
            
        data[cut].append(totalmc)
        
        if add_data:
            data[cut].append(round(cutflows_dict[year][ch]["Data"][cut]))

    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"
        
    texdata += "\\hline\n"    

    ### signal
    headers2 = [parquet_to_latex[s] for s in samples_sig]
    texheader2 = " & " + " & ".join(headers2) + "\\\\"
    texdata2 = "\\hline\n"

    textabular2 = f"l{'r'*len(headers2)}"
    
    data = dict()
    for cut in cuts:
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        data[cut] = []

        for sample in samples_sig:
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
        
    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata2 += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"    

    # make table
    print("\\begin{table}[!htp]")
    print("\\begin{center}")
    
    print("\\begin{tabular}{"+textabular+"}")
    print(texheader)
    print(texdata,end="")
    print("\\end{tabular}")

    print("\\begin{tabular}{"+textabular2+"}")
    print(texheader2)
    print(texdata2,end="")
    print("\\end{tabular}")
    
    
    if ch == "lep":
        print("\\caption{Event yield of " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")        
    else:
        print("\\caption{Event yield of " + ch + " channel " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")
        
    print("\\label{sel-tab-cutflow" + year + "}")
    print("\\end{center}")
    print("\\end{table}")    

In [40]:
make_latex_cutflow_table(cutflows, "2016APV", "lep", add_data=True, add_sumgenweight=True)

\begin{table}[!htp]
\begin{center}
\begin{tabular}{lrrr|r|r}
\textbf{Inclusive Selection} & $\PW(\Pell\PGn)$+ & \ttbar & Other MC & Total MC & Data \\
\hline
sumgenweight & 64389964 & 32379446 & 238896943 & 335730197 & 1115429449 \\
Trigger & 9506746 & 3438165 & 35081911 & 48032426 & 477596975 \\
METFilters & 9500969 & 3435561 & 35064258 & 48006386 & 477443016 \\
n Leptons = 1 & 8093792 & 2749063 & 21898912 & 32745995 & 199445779 \\
n Taus = 0 & 7334182 & 2175550 & 20353121 & 29866216 & 192225714 \\
n FatJets $>=$ 1 & 826759 & 398619 & 566831 & 1792970 & 2154589 \\
j $p_T > 250$GeV & 350020 & 194438 & 249445 & 794309 & 852790 \\
$\Delta R(j, \ell) < 0.8$ & 127228 & 85929 & 121113 & 334482 & 419181 \\
$\Delta R(j, \ell) > 0.03$ & 45210 & 74701 & 48439 & 168543 & 193512 \\
$\Delta \phi(\mathrm{MET}, j)<1.57$ & 31696 & 47437 & 29132 & 108395 & 106868 \\
$\mathrm{MET}>20$ & 29275 & 44443 & 24439 & 98276 & 95135 \\
j $\mathrm{softdrop} > 40$GeV & 34627 & 35924 & 19434 & 90061 & 81158 \\
$\e