# Compute the LateX cutflow tables
- Will load the pkl files that contain the cutflows and the sumgenweight
- Will scale the events by the cross section
- Will save the yields in a dictionnary called ```cutflows -> Dict()```
- Will make the LateX table using the function ```make_composition_table()```

In [1]:
import glob
import json
import os
import pickle as pkl
import sys

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.parquet as pq
import yaml
from scipy.special import softmax
from sklearn.metrics import auc, roc_curve

sys.path
sys.path.append("../python/")

import utils

plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 20})

pd.options.mode.chained_assignment = None

import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
sys.path
sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# get lumi
with open("../fileset/luminosity.json") as f:
    luminosity = json.load(f)
    
luminosity

{'ele': {'Run2': 137640.0,
  '2016APV': 19492.72,
  '2016': 16809.96,
  '2017': 41476.02,
  '2018': 59816.23},
 'mu': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'lep': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'had': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96}}

In [4]:
def get_lumi(years, channels):
    lum_ = 0
    for year in years:
        lum = 0
        for ch in channels:
            lum += luminosity[ch][year] / 1000.0

        lum_ += lum / len(channels)    
    return lum_

# Read cutflows from pkl

In [5]:
def get_sum_sumgenweight(pkl_files, year, sample):
    """Load and sum the sumgenweight of each pkl file."""
    
    sum_sumgenweight = 0
    for ifile in pkl_files:
        with open(ifile, "rb") as f:
            metadata = pkl.load(f)            
        sum_sumgenweight = sum_sumgenweight + metadata[sample][year]["sumgenweight"]

    return sum_sumgenweight


def get_xsecweight(pkl_files, year, ch, sample, is_data):
    
    if not is_data:
        # find xsection
        f = open("../fileset/xsec_pfnano.json")
        xsec = json.load(f)
        f.close()
        try:
            xsec = eval(str((xsec[sample])))
        except ValueError:
            print(f"sample {sample} doesn't have xsecs defined in xsec_pfnano.json so will skip it")
            return None

        # get overall weighting of events.. each event has a genweight...
        # sumgenweight sums over events in a chunk... sum_sumgenweight sums over chunks
        xsec_weight = (xsec * luminosity[ch][year]) / get_sum_sumgenweight(pkl_files, year, sample)
    else:
        xsec_weight = 1
    return xsec_weight

def get_cutflow(pkl_files, year, ch, sample, is_data):
    """
    Get cutflow from metadata but multiply by xsec-weight
    """
    xsec_weight = get_xsecweight(pkl_files, year, ch, sample, is_data)
        
    cuts = [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
    ]
        
    if year == "2018":
        cuts += ["HEMCleaning"]
        
    evyield = dict.fromkeys(cuts, 0)
    for ik, pkl_file in enumerate(pkl_files):
        with open(pkl_file, "rb") as f:
            metadata = pkl.load(f)
            
        cutflows = metadata[sample][year]["cutflows"][ch]

        for key in evyield.keys():

            if key == "sumgenweight":
                evyield[key] += metadata[sample][year][key] * xsec_weight
            else:
                evyield[key] += cutflows[key] * xsec_weight        
    return evyield

In [6]:
! ls ../eos/May31_hww_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl

../eos/May31_hww_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl


In [7]:
with open("../eos/May31_hww_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl", "rb") as f:
    metadata = pkl.load(f)
metadata

{'WJetsToLNu_HT-100To200': {'mc': 89,
  '2018': {'sumgenweight': 860209.0,
   'sumlheweight': {0: 893759.06,
    1: 894174.7,
    2: 888283.2,
    3: 859625.5,
    4: 860209.0,
    5: 854675.94,
    6: 831615.4,
    7: 832338.2,
    8: 827100.1},
   'sumpdfweight': {},
   'cutflows': {'ele': {'Trigger': 112333.0,
     'METFilters': 112304.0,
     'OneLep': 86555.0,
     'NoTaus': 86555.0,
     'AtLeastOneFatJet': 1048.0,
     'CandidateJetpT': 43.0,
     'LepInJet': 27.0,
     'JetLepOverlap': 8.0,
     'dPhiJetMET': 1.0,
     'MET': 1.0,
     'HEMCleaning': 1.0},
    'mu': {'Trigger': 133416.0,
     'METFilters': 133390.0,
     'OneLep': 133079.0,
     'NoTaus': 112979.0,
     'AtLeastOneFatJet': 1338.0,
     'CandidateJetpT': 75.0,
     'LepInJet': 51.0,
     'JetLepOverlap': 11.0,
     'dPhiJetMET': 1.0,
     'MET': 1.0,
     'HEMCleaning': 1.0}}}}}

# Adding a cut from the parquets

In [8]:
### This is your configuration. specefy which channels, years, samples, and directory of pkl files to use.
channels = ["ele", "mu"]
years = ["2018", "2017", "2016", "2016APV"]

samples = [
    "ggF", 
    "WH",
    "ZH",
    "VBF",
    "ttH",
    "DYJets",
    "WJetsLNu",
    "WZQQ",
    "TTbar",
    "SingleTop",
    "Diboson",
    "Data"
]

samples_dir = {
    "2016":    "../eos/May31_hww_2016",
    "2016APV": "../eos/May31_hww_2016APV",    
    "2017":    "../eos/May31_hww_2017",    
    "2018":    "../eos/May31_hww_2018",    
}

In [9]:
cutflows = {}
for year in years:
#     if year != "2016":
#         continue
    print(f"Processing year {year}")
    
    cutflows[year] = {}
    
    for ch in channels:
#         if ch != "mu": 
#             continue
        
        print(f"  {ch} channel")
        cutflows[year][ch] = {}

        condor_dir = os.listdir(samples_dir[year])

        for sample in condor_dir:

            # first: check if the sample is in one of combine_samples_by_name
            sample_to_use = None
            for key in utils.combine_samples_by_name:
                if key in sample:
                    sample_to_use = utils.combine_samples_by_name[key]
                    break

            # second: if not, combine under common label
            if sample_to_use is None:
                for key in utils.combine_samples:
                    if key in sample:
                        sample_to_use = utils.combine_samples[key]
                        break
                    else:
                        sample_to_use = sample

            if sample_to_use not in samples:
                continue

            is_data = False
            if sample_to_use == "Data":
                is_data = True

            out_files = f"{samples_dir[year]}/{sample}/outfiles/"
            pkl_files = glob.glob(f"{out_files}/*.pkl")

            if len(pkl_files) == 0:
                continue

            parquet_files = glob.glob(f"{out_files}/*_{ch}.parquet")
            
            try:
                data = pd.read_parquet(parquet_files)
            except pyarrow.lib.ArrowInvalid:
                # empty parquet because no event passed selection
#                 print(f"No parquet file for {sample}")
                continue

            if len(data) == 0:
#                 print(f"Hi, No parquet file for {sample}")
                continue
    
            if sample_to_use not in cutflows[year][ch].keys():
                cutflows[year][ch][sample_to_use] = get_cutflow(pkl_files, year, ch, sample, is_data)
            else:
                temp = get_cutflow(pkl_files, year, ch, sample, is_data)
                for key in cutflows[year][ch][sample_to_use]:
                    cutflows[year][ch][sample_to_use][key] += temp[key]
            
    print(f"------------------------------------------")

Processing year 2018
  ele channel
  mu channel
------------------------------------------
Processing year 2017
  ele channel
  mu channel
------------------------------------------
Processing year 2016
  ele channel
  mu channel
------------------------------------------
Processing year 2016APV
  ele channel
  mu channel
------------------------------------------


In [10]:
samples = cutflows["2017"]["ele"].keys()  # samples
samples

dict_keys(['VBF', 'WJetsLNu', 'WH', 'TTbar', 'SingleTop', 'ggF', 'Data', 'Diboson', 'ttH', 'ZH'])

In [11]:
from make_stacked_hists import make_events_dict

presel = {
        "mu": {
            "THWW>0.50": "THWW>0.50",
        },
        "ele": {
            "THWW>0.50": "THWW>0.50",
        },
}

events_dict = {}
for year in years:
    
    out = make_events_dict([year], channels, samples_dir[year], samples, presel)
    events_dict = {**events_dict, **out}

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 1601 events
INFO:root:Will fill the VBF dataframe with the remaining 774 events
INFO:root:tot event weight 10.139928859599209 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 57 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 3 events
INFO:root:tot event weight 6.352390849732743 

INFO:root:Finding HWminusJ_HToWW_M-125 samples and should combine them under WH
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 5590 events
INFO:root:Will fill the WH dataframe with the remaining 2444 events
INFO:root:tot event weight 1.700025567778488 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine th

INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 46191 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 3720 events
INFO:root:tot event weight 1858.5412726705074 

INFO:root:Finding HZJ_HToWW_M-125 samples and should combine them under ZH
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 9318 events
INFO:root:Will fill the ZH dataframe with the remaining 3847 events
INFO:root:tot event weight 2.5978015630275664 

INFO:root:Finding WZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 704 events
INFO:root:Will fill the Diboson dataframe with the remaining 37 events
INFO:root:tot event weight 12.690625015672772 

INFO:root:Finding SingleMuon_Run2018B samples and should combine them under Data
INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:r

INFO:root:Finding GluGluZH_HToWW_M-125_TuneCP5_13TeV-powheg-pythia8 samples and should combine them under ZH
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 17325 events
INFO:root:Will fill the ZH dataframe with the remaining 8737 events
INFO:root:tot event weight 0.11378015752722213 

INFO:root:Finding SingleMuon_Run2018C samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 18506 events
INFO:root:Will fill the Data dataframe with the remaining 1020 events
INFO:root:tot event weight 1020.0 

INFO:root:Finding SingleMuon_Run2018D samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 85292 events
INFO:root:Will fill the Data dataframe with the remaining 4689 events
INFO:root:tot event weight 4689.0 

INFO:root:Finding WJetsToLNu_HT-400To600 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 59086 e

INFO:root:tot event weight 4.596566146139161 

INFO:root:Finding TTTo2L2Nu samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 137227 events
INFO:root:Will fill the TTbar dataframe with the remaining 2822 events
INFO:root:tot event weight 97.38897248323246 

INFO:root:Finding ZZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 272 events
INFO:root:Will fill the Diboson dataframe with the remaining 9 events
INFO:root:tot event weight 2.2206353133864756 

INFO:root:Finding TTToHadronic samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 662 events
INFO:root:Will fill the TTbar dataframe with the remaining 41 events
INFO:root:tot event weight 6.2015633430221415 

INFO:root:Finding WJetsToLNu_HT-600To800 samples and should combine them unde

INFO:root:Applying THWW>0.50 selection on 19040 events
INFO:root:Will fill the SingleTop dataframe with the remaining 307 events
INFO:root:tot event weight 0.9981815621110091 

INFO:root:Finding SingleMuon_Run2017D samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 13398 events
INFO:root:Will fill the Data dataframe with the remaining 693 events
INFO:root:tot event weight 693.0 

INFO:root:Finding SingleMuon_Run2017E samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 29299 events
INFO:root:Will fill the Data dataframe with the remaining 1594 events
INFO:root:tot event weight 1594.0 

INFO:root:Finding SingleMuon_Run2017B samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 14712 events
INFO:root:Will fill the Data dataframe with the remaining 771 events
INFO:root:tot event weight 771.0 

INFO:root:Finding WW samples and should combine them under Diboson
INFO:root:---> Using already store

INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 1726 events
INFO:root:Will fill the SingleTop dataframe with the remaining 84 events
INFO:root:tot event weight 16.845322570594167 

INFO:root:Finding SingleElectron_Run2016H samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 19607 events
INFO:root:Will fill the Data dataframe with the remaining 900 events
INFO:root:tot event weight 900.0 

INFO:root:Finding GluGluHToWW_Pt-200ToInf_M-125 samples and should combine them under ggF
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 2111 events
INFO:root:Will fill the ggF dataframe with the remaining 991 events
INFO:root:tot event weight 7.120795865441716 

INFO:root:Finding SingleElectron_Run2016F samples and should combine them under Data
INFO:root:Applying THWW>0.50 selection on 936 events
INFO:root:Will fill the Data dataframe with the remaining 49 events
INFO:root:tot even

INFO:root:Applying THWW>0.50 selection on 92 events
INFO:root:Will fill the SingleTop dataframe with the remaining 1 events
INFO:root:tot event weight 0.022461527357381817 

INFO:root:Finding WJetsToLNu_HT-1200To2500 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 91268 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 5966 events
INFO:root:tot event weight 73.29637065870591 

INFO:root:Finding WJetsToLNu_HT-200To400 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 5582 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 284 events
INFO:root:tot event weight 156.39884352372158 

INFO:root:Finding ST_tW_top_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 1861 events
I

INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 57860 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 2361 events
INFO:root:tot event weight 32.22916528499102 

INFO:root:Finding WJetsToLNu_HT-200To400 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 5558 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 193 events
INFO:root:tot event weight 96.39810946922861 

INFO:root:Finding ST_tW_top_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 1520 events
INFO:root:Will fill the SingleTop dataframe with the remaining 75 events
INFO:root:tot event weight 19.371259805935996 

INFO:root:Finding GluGluHToWW_Pt-200ToInf_M-125 samples and should combine them under ggF
INFO:root:---> Using already stored event weight
INFO:r

INFO:root:Will fill the WH dataframe with the remaining 1044 events
INFO:root:tot event weight 0.7760342405824139 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 77194 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 6548 events
INFO:root:tot event weight 337.91161202618457 

INFO:root:Finding TTToSemiLeptonic samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 400063 events
INFO:root:Will fill the TTbar dataframe with the remaining 13476 events
INFO:root:tot event weight 702.2993890355266 

INFO:root:Finding ST_t-channel_top_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 14713 events
INFO:root:Will fill the SingleTop dataframe with the remaini

INFO:root:Finding HZJ_HToWW_M-125 samples and should combine them under ZH
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 5636 events
INFO:root:Will fill the ZH dataframe with the remaining 2848 events
INFO:root:tot event weight 1.2484352460366552 

INFO:root:Finding WZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.50 selection on 476 events
INFO:root:Will fill the Diboson dataframe with the remaining 69 events
INFO:root:tot event weight 7.369328647100005 

INFO:root:Finding SingleElectron_Run2016B_ver2_HIPM samples and should combine them under Data


In [12]:
for ch in channels:
    for cut, sel in list(presel[ch].items()):
        for sample in samples:
            for year in years:
                try:
                    df = events_dict[year][ch][sample]
                    df = df.query(sel)
                    w = df["event_weight"]
                except:
                    print(year, ch, sample, "is not present so will fill with 0")
                    w = df["event_weight"]

                cutflows[year][ch][sample][cut] = w.sum()

In [13]:
cutflows["2016APV"]["mu"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32148205.541996155,
 'Trigger': 5052102.989037976,
 'METFilters': 5049012.531204127,
 'OneLep': 5031057.850622146,
 'NoTaus': 4271447.470579825,
 'AtLeastOneFatJet': 461978.1002925532,
 'CandidateJetpT': 196124.03115179407,
 'LepInJet': 68289.06508691012,
 'JetLepOverlap': 26003.119346261694,
 'dPhiJetMET': 18333.833322064966,
 'MET': 16980.930607238028,
 'THWW>0.50': 1956.787905235017}

In [14]:
cutflows["2016APV"]["ele"]["WJetsLNu"]     # take a quick look

{'sumgenweight': 32241758.10101272,
 'Trigger': 4454643.38131529,
 'METFilters': 4451955.991847101,
 'OneLep': 3062734.5054372367,
 'NoTaus': 3062734.5054372367,
 'AtLeastOneFatJet': 364780.748501086,
 'CandidateJetpT': 153895.86376889877,
 'LepInJet': 58938.59730123174,
 'JetLepOverlap': 19206.882023214886,
 'dPhiJetMET': 13362.535289163337,
 'MET': 12293.663142455729,
 'THWW>0.50': 1087.679157780676}

# Combine different channels

In [15]:
common_cuts = cutflows["2018"]["mu"]["WJetsLNu"]
common_cuts

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774784.39789099,
 'METFilters': 15767863.42776784,
 'OneLep': 15714197.698371835,
 'NoTaus': 13365974.316493511,
 'AtLeastOneFatJet': 1419110.9803375734,
 'CandidateJetpT': 638940.285300165,
 'LepInJet': 226599.5724394703,
 'JetLepOverlap': 85527.28589072316,
 'dPhiJetMET': 59905.32513231152,
 'MET': 55958.024490644915,
 'HEMCleaning': 53830.13943811133,
 'THWW>0.50': 6439.42214502947}

In [16]:
def combine_channels(cutflows):

    # combine both channels
    cutflows_new = {}
    for year in cutflows.keys():
        cutflows_new[year] = {}
        cutflows_new[year]["lep"] = {}
        
        for ch in ["mu", "ele"]:
            for sample in cutflows[year][ch]:
                                
                if sample not in cutflows_new[year]["lep"]:
                    cutflows_new[year]["lep"][sample] = {}
                
                for cut in cutflows[year][ch][sample]:
                    
                    if (year != "2018") and (cut == "HEMCleaning"):
                        continue
                    
                    if cut not in cutflows_new[year]["lep"][sample]:
                        cutflows_new[year]["lep"][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new[year]["lep"][sample][cut] += cutflows[year][ch][sample][cut]
        cutflows[year] = {**cutflows[year], **cutflows_new[year]}
        
    return cutflows

In [17]:
cutflows = combine_channels(cutflows)

In [18]:
cutflows["2018"].keys()

dict_keys(['ele', 'mu', 'lep'])

In [19]:
cutflows["2018"]["ele"]["WJetsLNu"]

{'sumgenweight': 98938496.94524625,
 'Trigger': 13452100.702478617,
 'METFilters': 13446122.536571965,
 'OneLep': 10365996.09355191,
 'NoTaus': 10365996.09355191,
 'AtLeastOneFatJet': 1175735.568559565,
 'CandidateJetpT': 527612.6926960349,
 'LepInJet': 203272.06535077497,
 'JetLepOverlap': 67484.52509185502,
 'dPhiJetMET': 46443.27286250808,
 'MET': 43163.8827737948,
 'HEMCleaning': 41487.080689958166,
 'THWW>0.50': 4220.261398463306}

In [20]:
cutflows["2018"]["mu"]["WJetsLNu"]

{'sumgenweight': 98881812.96014196,
 'Trigger': 15774784.39789099,
 'METFilters': 15767863.42776784,
 'OneLep': 15714197.698371835,
 'NoTaus': 13365974.316493511,
 'AtLeastOneFatJet': 1419110.9803375734,
 'CandidateJetpT': 638940.285300165,
 'LepInJet': 226599.5724394703,
 'JetLepOverlap': 85527.28589072316,
 'dPhiJetMET': 59905.32513231152,
 'MET': 55958.024490644915,
 'HEMCleaning': 53830.13943811133,
 'THWW>0.50': 6439.42214502947}

In [21]:
cutflows["2018"]["lep"]["WJetsLNu"]

{'sumgenweight': 197820309.9053882,
 'Trigger': 29226885.10036961,
 'METFilters': 29213985.964339804,
 'OneLep': 26080193.791923746,
 'NoTaus': 23731970.410045423,
 'AtLeastOneFatJet': 2594846.548897139,
 'CandidateJetpT': 1166552.9779961999,
 'LepInJet': 429871.63779024524,
 'JetLepOverlap': 153011.81098257817,
 'dPhiJetMET': 106348.5979948196,
 'MET': 99121.90726443971,
 'HEMCleaning': 95317.22012806949,
 'THWW>0.50': 10659.683543492776}

# Combine different years

In [22]:
def combine_years(cutflows):
    """Will remove the HEM cleaning cutflow from 2018 first."""
    
    whatever_year = list(cutflows.keys())[0]
    channels = cutflows[whatever_year].keys()
    
    # combine all years
    cutflows_new = {}
    cutflows_new["Run2"] = {}
    
    for ch in channels:
        cutflows_new["Run2"][ch] = {}
        
        for year in cutflows:
            for sample in cutflows[year][ch]:
                
                if sample not in cutflows_new["Run2"][ch]:
                    cutflows_new["Run2"][ch][sample] = {}

                for cut in cutflows[year][ch][sample]:
                    if "HEM" in cut:
                        continue
                    if cut not in cutflows_new["Run2"][ch][sample]:
                        cutflows_new["Run2"][ch][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new["Run2"][ch][sample][cut] += cutflows[year][ch][sample][cut]

    cutflows = {**cutflows, **cutflows_new}

    return cutflows

In [23]:
cutflows = combine_years(cutflows)

In [24]:
cutflows["2016"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'WH', 'TTbar', 'Data', 'SingleTop', 'ggF', 'Diboson', 'ttH', 'ZH'])

In [25]:
cutflows.keys()

dict_keys(['2018', '2017', '2016', '2016APV', 'Run2'])

In [26]:
cutflows["Run2"].keys()

dict_keys(['ele', 'mu', 'lep'])

# Combine non-dominant backgrounds

In [27]:
# combine non-dominant backgrounds under others
dominant_bkgs = ["WJetsLNu", "TTbar"]
signals = ["ggF", "VH", "WH", "ZH", "ttH"]

for year in cutflows:
    for ch in cutflows[year]:
        cutflows[year][ch]["Others"] = dict.fromkeys(cutflows[year][ch]["WJetsLNu"], 0)
        for sample in cutflows[year][ch]:
            if sample == "Data":
                continue
            if sample not in signals+dominant_bkgs:
                for cut in cutflows[year][ch][sample]:
                    cutflows[year][ch]["Others"][cut] += cutflows[year][ch][sample][cut]

In [28]:
cutflows["2018"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'WH', 'TTbar', 'SingleTop', 'ggF', 'Data', 'Diboson', 'ttH', 'ZH', 'Others'])

In [29]:
cutflows["2018"]["lep"]["Others"]

{'sumgenweight': 103288174.06980732,
 'Trigger': 7360679.539628845,
 'METFilters': 7357757.191103395,
 'OneLep': 6136757.41099368,
 'NoTaus': 5316573.641202256,
 'AtLeastOneFatJet': 394505.280369409,
 'CandidateJetpT': 196489.79830350704,
 'LepInJet': 78448.07078503745,
 'JetLepOverlap': 38007.13455693741,
 'dPhiJetMET': 25207.602441683914,
 'MET': 23642.381395779146,
 'HEMCleaning': 22754.01652855556,
 'THWW>0.50': 1179.1867931694264}

# LateX cutflow table

In [30]:
cuts = [
    "sumgenweight",
    "Trigger",
    "METFilters",
    "OneLep",        
    "NoTaus",
    "AtLeastOneFatJet",
    "CandidateJetpT",
    "LepInJet",
    "JetLepOverlap",
    "dPhiJetMET",
    "MET",
    "HEMCleaning",   
]

for cut in presel["mu"]:
    cuts += [cut]

In [31]:
cut_to_label = {
    "sumgenweight": "sumgenweight",        
    "HEMCleaning": "HEMCleaning",    
    "Trigger": "Trigger",
    "METFilters": "METFilters",
    "OneLep": "n Leptons = 1",
    "NoTaus": "n Taus = 0",
    "AtLeastOneFatJet": r"n FatJets $>=$ 1",
    "CandidateJetpT": r"j $p_T > 250$GeV",
    "LepInJet": r"$\Delta R(j, \ell) < 0.8$",
    "JetLepOverlap": r"$\Delta R(j, \ell) > 0.03$",
    "dPhiJetMET": r"$\Delta \phi(\mathrm{MET}, j)<1.57$",
    "MET": r"$\mathrm{MET}>20$",
    
    "None": "None",

    "THWW>0.50": r"$\ensuremath{T_{\text{HWW}}^{\ell\nu qq}} > 0.50$",
} 


In [34]:
parquet_to_latex = {
    "WJetsLNu": "$\PW(\Pell\PGn)$+",
    "TTbar": "\\ttbar",
    "Others": "Other MC",

    "ggF": "ggF",
    "VBF": "VBF",
    "WH": "WH",
    "ZH": "ZH",    
    "ttH": "$t\\bar{t}H$",    
    
    "Data": "Data",
}

def make_latex_cutflow_table(cutflows_dict, year, ch, add_data=False, add_sumgenweight=False):
    """Will use the cutflows dictionary to make the LateX table we have in the AN."""
    
    samples_bkg = ["WJetsLNu", "TTbar", "Others"]
    samples_sig = ["ggF","VBF", "WH", "ZH", "ttH"]

    ### backgrounds
    headers = [parquet_to_latex[s] for s in samples_bkg]
    
    textabular = f"l{'r'*len(headers)}"
    textabular += "|r"
    
    texheader = "\\textbf{Inclusive Selection}" + " & " + " & ".join(headers) + " & Total MC "
    if add_data:
        textabular += "|r"
        texheader += "& Data "
    texheader += "\\\\"
    texdata = "\\hline\n"
    
    data = dict()
    
    for cut in cuts: 
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        if not add_sumgenweight and cut == "sumgenweight":
            continue
    
        data[cut] = []

        for sample in samples_bkg:            
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
            
        totalmc = 0
        for sample in (samples_bkg + samples_sig):
            totalmc += round(cutflows_dict[year][ch][sample][cut])
            
        data[cut].append(totalmc)
        
        if add_data:
            data[cut].append(round(cutflows_dict[year][ch]["Data"][cut]))

    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"
        
    texdata += "\\hline\n"    

    ### signal
    headers2 = [parquet_to_latex[s] for s in samples_sig]
    texheader2 = " & " + " & ".join(headers2) + "\\\\"
    texdata2 = "\\hline\n"

    textabular2 = f"l{'r'*len(headers2)}"
    
    data = dict()
    for cut in cuts:
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        data[cut] = []

        for sample in samples_sig:
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
        
    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata2 += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"    

    # make table
    print("\\begin{table}[!htp]")
    print("\\begin{center}")
    
    print("\\begin{tabular}{"+textabular+"}")
    print(texheader)
    print(texdata,end="")
    print("\\end{tabular}")

    print("\\begin{tabular}{"+textabular2+"}")
    print(texheader2)
    print(texdata2,end="")
    print("\\end{tabular}")
    
    
    if ch == "lep":
        print("\\caption{Event yield of " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")        
    else:
        print("\\caption{Event yield of " + ch + " channel " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")
        
    print("\\label{sel-tab-cutflow" + year + "}")
    print("\\end{center}")
    print("\\end{table}")    

In [39]:
make_latex_cutflow_table(cutflows, "2018", "lep", add_data=True, add_sumgenweight=True)

\begin{table}[!htp]
\begin{center}
\begin{tabular}{lrrr|r|r}
\textbf{Inclusive Selection} & $\PW(\Pell\PGn)$+ & \ttbar & Other MC & Total MC & Data \\
\hline
sumgenweight & 197820310 & 99476871 & 103288174 & 400775883 & 2276244440 \\
Trigger & 29226885 & 10530862 & 7360680 & 47135028 & 1174738710 \\
METFilters & 29213986 & 10525258 & 7357757 & 47113588 & 1173476723 \\
n Leptons = 1 & 26080194 & 8802028 & 6136757 & 41032249 & 656865738 \\
n Taus = 0 & 23731970 & 7031360 & 5316574 & 36090550 & 632124064 \\
n FatJets $>=$ 1 & 2594847 & 1264303 & 394505 & 4256058 & 6978889 \\
j $p_T > 250$GeV & 1166553 & 642826 & 196490 & 2007188 & 2825338 \\
$\Delta R(j, \ell) < 0.8$ & 429872 & 283927 & 78448 & 792939 & 1392226 \\
$\Delta R(j, \ell) > 0.03$ & 153012 & 245916 & 38007 & 437557 & 640994 \\
$\Delta \phi(\mathrm{MET}, j)<1.57$ & 106349 & 155612 & 25208 & 287591 & 359014 \\
$\mathrm{MET}>20$ & 99122 & 146620 & 23642 & 269777 & 326733 \\
HEMCleaning & 95317 & 140577 & 22754 & 258971 & 298219 \\
