# Compute the LateX cutflow tables
- Will load the pkl files that contain the cutflows and the sumgenweight
- Will scale the events by the cross section
- Will save the yields in a dictionnary called ```cutflows -> Dict()```
- Will make the LateX table using the function ```make_composition_table()```

In [1]:
import glob
import json
import os
import pickle as pkl
import sys

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.parquet as pq
import yaml
from scipy.special import softmax
from sklearn.metrics import auc, roc_curve

sys.path
sys.path.append("../python/")

import utils

plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 20})

pd.options.mode.chained_assignment = None

import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
sys.path
sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# get lumi
with open("../fileset/luminosity.json") as f:
    luminosity = json.load(f)
    
luminosity

{'ele': {'Run2': 137640.0,
  '2016APV': 19492.72,
  '2016': 16809.96,
  '2017': 41476.02,
  '2018': 59816.23},
 'mu': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'lep': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'had': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96}}

In [4]:
def get_lumi(years, channels):
    lum_ = 0
    for year in years:
        lum = 0
        for ch in channels:
            lum += luminosity[ch][year] / 1000.0

        lum_ += lum / len(channels)    
    return lum_

# Read cutflows from pkl

In [5]:
"""
For ../eos/Dec7_2016

cuts["mu"] += [
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "LepIso",
        "LepMiniIso",
        "OneCandidateJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMETCut",
]
cuts["ele"] += [
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "LepIso",
        "OneCandidateJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMETCut",
]
"""

'\nFor ../eos/Dec7_2016\n\ncuts["mu"] += [\n        "Trigger",\n        "METFilters",\n        "OneLep",\n        "NoTaus",\n        "LepIso",\n        "LepMiniIso",\n        "OneCandidateJet",\n        "CandidateJetpT",\n        "LepInJet",\n        "JetLepOverlap",\n        "dPhiJetMETCut",\n]\ncuts["ele"] += [\n        "Trigger",\n        "METFilters",\n        "OneLep",\n        "NoTaus",\n        "LepIso",\n        "OneCandidateJet",\n        "CandidateJetpT",\n        "LepInJet",\n        "JetLepOverlap",\n        "dPhiJetMETCut",\n]\n'

In [6]:
def get_sum_sumgenweight(pkl_files, year, sample):
    """Load and sum the sumgenweight of each pkl file."""
    
    sum_sumgenweight = 0
    for ifile in pkl_files:
        with open(ifile, "rb") as f:
            metadata = pkl.load(f)            
        sum_sumgenweight = sum_sumgenweight + metadata[sample][year]["sumgenweight"]

    return sum_sumgenweight


def get_xsecweight(pkl_files, year, ch, sample, is_data):
    
    if not is_data:
        # find xsection
        f = open("../fileset/xsec_pfnano.json")
        xsec = json.load(f)
        f.close()
        try:
            xsec = eval(str((xsec[sample])))
        except ValueError:
            print(f"sample {sample} doesn't have xsecs defined in xsec_pfnano.json so will skip it")
            return None

        # get overall weighting of events.. each event has a genweight...
        # sumgenweight sums over events in a chunk... sum_sumgenweight sums over chunks
        xsec_weight = (xsec * luminosity[ch][year]) / get_sum_sumgenweight(pkl_files, year, sample)
    else:
        xsec_weight = 1
    return xsec_weight

def get_cutflow(pkl_files, year, ch, sample, is_data):
    """
    Get cutflow from metadata but multiply by xsec-weight
    """
    xsec_weight = get_xsecweight(pkl_files, year, ch, sample, is_data)

    cuts = {
        "mu": ["sumgenweight"],
        "ele": ["sumgenweight"],
    }
        
    cuts["mu"] += [
        "Trigger",
        "METFilters",
        "OneLep",
        "LepMiniIso",        
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",        
    ]
    cuts["ele"] += [
        "Trigger",
        "METFilters",
        "OneLep",
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
    ]
        
    if year == "2018":
        cuts["mu"] += ["HEMCleaning"]
        cuts["ele"] += ["HEMCleaning"]
        
    evyield = dict.fromkeys(cuts[ch], 0)
    for ik, pkl_file in enumerate(pkl_files):
        with open(pkl_file, "rb") as f:
            metadata = pkl.load(f)
            
        cutflows = metadata[sample][year]["cutflows"][ch]

        for key in evyield.keys():

            if key == "sumgenweight":
                evyield[key] += metadata[sample][year][key] * xsec_weight
            else:
                
#                 try:
                evyield[key] += cutflows[key] * xsec_weight
#                 except KeyError:
#                     evyield[key] += cutflows["AtLeastOneFatJet"] * xsec_weight            
    return evyield

In [7]:
! ls ../eos/Feb9_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl

../eos/Feb9_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl


In [8]:
with open("../eos/Feb9_2018/QCD_Pt_170to300/outfiles/0-10.pkl", "rb") as f:
    metadata = pkl.load(f)
metadata

{'QCD_Pt_170to300': {'mc': 48,
  '2018': {'sumgenweight': 485000.0,
   'sumlheweight': {},
   'sumpdfweight': {},
   'cutflows': {'ele': {'Trigger': 2652.0,
     'METFilters': 2644.0,
     'OneLep': 55.0,
     'NoTaus': 55.0,
     'AtLeastOneFatJet': 45.0,
     'CandidateJetpT': 22.0,
     'LepInJet': 15.0,
     'JetLepOverlap': 4.0,
     'dPhiJetMET': 2.0,
     'MET': 2.0,
     'HEMCleaning': 2.0,
     'single_weight_pileup': 2.0},
    'mu': {'Trigger': 1452.0,
     'METFilters': 1450.0,
     'OneLep': 458.0,
     'LepMiniIso': 46.0,
     'NoTaus': 43.0,
     'AtLeastOneFatJet': 29.0,
     'CandidateJetpT': 11.0,
     'LepInJet': 5.0,
     'JetLepOverlap': 3.0,
     'dPhiJetMET': 3.0,
     'MET': 3.0,
     'HEMCleaning': 3.0,
     'single_weight_pileup': 3.0}}}}}

In [9]:
with open("../eos/Feb9_2018/WJetsToLNu_HT-100To200/outfiles/0-10.pkl", "rb") as f:
    metadata = pkl.load(f)
metadata

{'WJetsToLNu_HT-100To200': {'mc': 89,
  '2018': {'sumgenweight': 860209.0,
   'sumlheweight': {0: 893759.06,
    1: 894174.7,
    2: 888283.2,
    3: 859625.5,
    4: 860209.0,
    5: 854675.94,
    6: 831615.4,
    7: 832338.2,
    8: 827100.1},
   'sumpdfweight': {},
   'cutflows': {'ele': {'Trigger': 112451.0,
     'METFilters': 112422.0,
     'OneLep': 84815.0,
     'NoTaus': 84815.0,
     'AtLeastOneFatJet': 1026.0,
     'CandidateJetpT': 30.0,
     'LepInJet': 19.0,
     'JetLepOverlap': 7.0,
     'dPhiJetMET': 1.0,
     'MET': 1.0,
     'HEMCleaning': 1.0,
     'single_weight_pileup': 1.0},
    'mu': {'Trigger': 155966.0,
     'METFilters': 155920.0,
     'OneLep': 129309.0,
     'LepMiniIso': 128996.0,
     'NoTaus': 109515.0,
     'AtLeastOneFatJet': 1301.0,
     'CandidateJetpT': 58.0,
     'LepInJet': 39.0,
     'JetLepOverlap': 10.0,
     'dPhiJetMET': 1.0,
     'MET': 1.0,
     'HEMCleaning': 1.0,
     'single_weight_pileup': 1.0}}}}}

# Adding a cut from the parquets

In [10]:
### This is your configuration. specefy which channels, years, samples, and directory of pkl files to use.
channels = ["ele", "mu"]
years = ["2018", "2017", "2016", "2016APV"]

samples = [
    "ggF", 
    "VH",
    "VBF",
    "ttH",
    "QCD",
    "DYJets",
    "WJetsLNu",
    "WZQQ",
    "TTbar",
    "SingleTop",
    "Diboson",
    "Data"
]

samples_dir = {
    "2016":    "../eos/Feb9_2016",
    "2016APV": "../eos/Feb9_2016APV",    
    "2017":    "../eos/Feb9_2017",    
    "2018":    "../eos/Feb9_2018",    
}

In [11]:
cutflows = {}
for year in years:
#     if year != "2016":
#         continue
    print(f"Processing year {year}")
    
    cutflows[year] = {}
    
    for ch in channels:
#         if ch != "mu": 
#             continue
        
        print(f"  {ch} channel")
        cutflows[year][ch] = {}

        condor_dir = os.listdir(samples_dir[year])

        for sample in condor_dir:

            # first: check if the sample is in one of combine_samples_by_name
            sample_to_use = None
            for key in utils.combine_samples_by_name:
                if key in sample:
                    sample_to_use = utils.combine_samples_by_name[key]
                    break

            # second: if not, combine under common label
            if sample_to_use is None:
                for key in utils.combine_samples:
                    if key in sample:
                        sample_to_use = utils.combine_samples[key]
                        break
                    else:
                        sample_to_use = sample

            if sample_to_use not in samples:
                continue

            is_data = False
            if sample_to_use == "Data":
                is_data = True

            out_files = f"{samples_dir[year]}/{sample}/outfiles/"
            pkl_files = glob.glob(f"{out_files}/*.pkl")

            if len(pkl_files) == 0:
                continue

            parquet_files = glob.glob(f"{out_files}/*_{ch}.parquet")
            
            try:
                data = pd.read_parquet(parquet_files)
            except pyarrow.lib.ArrowInvalid:
                # empty parquet because no event passed selection
#                 print(f"No parquet file for {sample}")
                continue

            if len(data) == 0:
#                 print(f"Hi, No parquet file for {sample}")
                continue
    
            if sample_to_use not in cutflows[year][ch].keys():
                cutflows[year][ch][sample_to_use] = get_cutflow(pkl_files, year, ch, sample, is_data)
            else:
                temp = get_cutflow(pkl_files, year, ch, sample, is_data)
                for key in cutflows[year][ch][sample_to_use]:
                    cutflows[year][ch][sample_to_use][key] += temp[key]
            
    print(f"------------------------------------------")

Processing year 2018
  ele channel
  mu channel
------------------------------------------
Processing year 2017
  ele channel
  mu channel
------------------------------------------
Processing year 2016
  ele channel
  mu channel
------------------------------------------
Processing year 2016APV
  ele channel
  mu channel
------------------------------------------


In [12]:
samples = cutflows["2017"]["ele"].keys()  # samples
samples

dict_keys(['VBF', 'WJetsLNu', 'TTbar', 'SingleTop', 'ggF', 'Data', 'QCD', 'Diboson', 'ttH'])

In [13]:
from make_stacked_hists import make_events_dict

presel = {
        "mu": {
            "THWW>0.80": "THWW>0.80",
#             "msoftdrop": "fj_mass>40",
#             "met": "met_pt>35", 0.2
        },
        "ele": {
            "THWW>0.80": "THWW>0.80",
#             "msoftdrop": "fj_mass>40",
#             "met": "met_pt>55",
#             "lepmiso": "(lep_pt<120) | ( (lep_pt>120) & (lep_misolation<0.2))",
        },
}

events_dict = {}
for year in years:
    
    out = make_events_dict([year], channels, samples_dir[year], samples, presel)
    events_dict = {**events_dict, **out}

INFO:root:Finding VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil samples and should combine them under VBF
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1514 events
INFO:root:Will fill the VBF dataframe with the remaining 472 events
INFO:root:tot event weight 6.175103436284349 

INFO:root:Finding WJetsToLNu_HT-100To200 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 34 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 1 events
INFO:root:tot event weight 2.1619583307401005 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 159607 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 2498 events
INFO:root:tot event weight 137.70089602705804 

INFO:root:Finding TTToSemiLeptonic samples and should

INFO:root:Will fill the ttH dataframe with the remaining 488 events
INFO:root:tot event weight 2.8781466082373344 

INFO:root:Finding EGamma_Run2018C samples and should combine them under Data
INFO:root:Applying THWW>0.80 selection on 14770 events
INFO:root:Will fill the Data dataframe with the remaining 156 events
INFO:root:tot event weight 156.0 

INFO:root:Finding EGamma_Run2018D samples and should combine them under Data
INFO:root:Applying THWW>0.80 selection on 67956 events
INFO:root:Will fill the Data dataframe with the remaining 791 events
INFO:root:tot event weight 791.0 

INFO:root:Finding QCD_Pt_800to1000 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 963 events
INFO:root:Will fill the QCD dataframe with the remaining 3 events
INFO:root:tot event weight 0.232886603917689 

INFO:root:Finding EGamma_Run2018B samples and should combine them under Data
INFO:root:Applying THWW>0.80 selection on 1

INFO:root:Applying THWW>0.80 selection on 192 events
INFO:root:Will fill the QCD dataframe with the remaining 1 events
INFO:root:tot event weight 0.00010456539048648485 

INFO:root:Finding ZZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 129 events
INFO:root:Will fill the Diboson dataframe with the remaining 5 events
INFO:root:tot event weight 1.3102336386393223 

INFO:root:Finding TTToHadronic samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1386 events
INFO:root:Will fill the TTbar dataframe with the remaining 16 events
INFO:root:tot event weight 3.593248243292888 

INFO:root:Finding QCD_Pt_1000to1400 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1111 events
INFO:root:Will fill the QCD dataframe with the remaining 4 events
I

INFO:root:Applying THWW>0.80 selection on 4741 events
INFO:root:Will fill the ggF dataframe with the remaining 1483 events
INFO:root:tot event weight 11.827494166377425 

INFO:root:Finding ST_tW_antitop_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 4009 events
INFO:root:Will fill the SingleTop dataframe with the remaining 56 events
INFO:root:tot event weight 12.288184758124583 

INFO:root:Finding SingleElectron_Run2017E samples and should combine them under Data
INFO:root:Applying THWW>0.80 selection on 22127 events
INFO:root:Will fill the Data dataframe with the remaining 263 events
INFO:root:tot event weight 263.0 

INFO:root:Finding SingleElectron_Run2017B samples and should combine them under Data
INFO:root:Applying THWW>0.80 selection on 8739 events
INFO:root:Will fill the Data dataframe with the remaining 67 events
INFO:root:tot event weight 67.0 

INFO:root:Finding QCD

INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 79 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 0 events
INFO:root:tot event weight 0.0 

INFO:root:Finding WJetsToLNu_HT-800To1200 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 148852 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 3064 events
INFO:root:tot event weight 167.51474469176915 

INFO:root:Finding TTToSemiLeptonic samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 375217 events
INFO:root:Will fill the TTbar dataframe with the remaining 2917 events
INFO:root:tot event weight 335.03825668522484 

INFO:root:Finding ST_t-channel_top_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0

INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 107655 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 2230 events
INFO:root:tot event weight 261.29979531791184 

INFO:root:Finding WJetsToLNu_HT-2500ToInf samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 56498 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 472 events
INFO:root:tot event weight 0.1619133996101132 

INFO:root:Finding ttHToNonbb_M125 samples and should combine them under ttH
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 13697 events
INFO:root:Will fill the ttH dataframe with the remaining 1298 events
INFO:root:tot event weight 2.0599319914736407 

INFO:root:Finding SingleElectron_Run2017F samples and should combine them under Data
INFO:root:Finding QCD_Pt_800to1000 samples and should combine them under QCD


INFO:root:Will fill the SingleTop dataframe with the remaining 9 events
INFO:root:tot event weight 0.23765211133433672 

INFO:root:Finding TTTo2L2Nu samples and should combine them under TTbar
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 64280 events
INFO:root:Will fill the TTbar dataframe with the remaining 274 events
INFO:root:tot event weight 7.512167038693731 

INFO:root:Finding QCD_Pt_2400to3200 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 229 events
INFO:root:Will fill the QCD dataframe with the remaining 0 events
INFO:root:tot event weight 0.0 

INFO:root:Finding ZZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 60 events
INFO:root:Will fill the Diboson dataframe with the remaining 1 events
INFO:root:tot event weight 0.2957430045866518 

INFO:root:Finding TTT

INFO:root:tot event weight 5.744043692268485 

INFO:root:Finding SingleElectron_Run2016H samples and should combine them under Data
INFO:root:Finding GluGluHToWW_Pt-200ToInf_M-125 samples and should combine them under ggF
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 3295 events
INFO:root:Will fill the ggF dataframe with the remaining 1053 events
INFO:root:tot event weight 7.417915915042469 

INFO:root:Finding SingleElectron_Run2016F samples and should combine them under Data
INFO:root:Finding ST_tW_antitop_5f_inclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1861 events
INFO:root:Will fill the SingleTop dataframe with the remaining 29 events
INFO:root:tot event weight 5.384468570158478 

INFO:root:Finding QCD_Pt_120to170 samples and should combine them under QCD
INFO:root:Finding QCD_Pt_3200toInf samples and should combine them under QCD

INFO:root:Finding ST_t-channel_top_4f_InclusiveDecays samples and should combine them under SingleTop
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 9112 events
INFO:root:Will fill the SingleTop dataframe with the remaining 22 events
INFO:root:tot event weight 0.6646921522407591 

INFO:root:Finding ST_s-channel_4f_hadronicDecays samples and should combine them under SingleTop
INFO:root:Finding WJetsToLNu_HT-1200To2500 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 57858 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 470 events
INFO:root:tot event weight 6.3750345344318795 

INFO:root:Finding WJetsToLNu_HT-200To400 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 5086 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 31 events

INFO:root:Will fill the QCD dataframe with the remaining 1 events
INFO:root:tot event weight 0.01561145482948878 

INFO:root:Finding WJetsToLNu_HT-400To600 samples and should combine them under WJetsLNu
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 12611 events
INFO:root:Will fill the WJetsLNu dataframe with the remaining 178 events
INFO:root:tot event weight 89.95992412964699 

INFO:root:Finding QCD_Pt_470to600 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1703 events
INFO:root:Will fill the QCD dataframe with the remaining 6 events
INFO:root:tot event weight 1.0970122905496826 

INFO:root:Finding SingleMuon_Run2016C_HIPM samples and should combine them under Data
INFO:root:Finding WZ samples and should combine them under Diboson
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 555 events
INFO:root:Will fill the D

INFO:root:tot event weight 0.8404716685275341 

INFO:root:Finding QCD_Pt_1000to1400 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1127 events
INFO:root:Will fill the QCD dataframe with the remaining 4 events
INFO:root:tot event weight 0.026831479647143438 

INFO:root:Finding SingleElectron_Run2016D_HIPM samples and should combine them under Data
INFO:root:Finding QCD_Pt_600to800 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 2859 events
INFO:root:Will fill the QCD dataframe with the remaining 20 events
INFO:root:tot event weight 1.005848000853743 

INFO:root:Finding QCD_Pt_300to470 samples and should combine them under QCD
INFO:root:---> Using already stored event weight
INFO:root:Applying THWW>0.80 selection on 1405 events
INFO:root:Will fill the QCD dataframe with the remaining 6 events
INFO:root:tot event weight 

In [14]:
# presel = {
#         "THWW>0.80": "THWW>0.80",
# #             "msoftdrop": "fj_mass>40",
# #             "met": "met_pt>55",
# #             "lepmiso": "(lep_pt<120) | ( (lep_pt>120) & (lep_misolation<0.2))",
# }

In [15]:
# for cut, sel in list(presel.items()):    
#     for sample in samples:

#         for year in years:
#             for ch in channels:
      
#                 try:
#                     df = events_dict[year][ch][sample]
#                     df = df.query(sel)
#                     w = df["event_weight"]
#                 except:
#                     print(year, ch, sample, "is not present so will fill with 0")
#                     w = df["event_weight"]
                    
#                 cutflows[year][ch][sample][cut] = w.sum()

In [16]:
for ch in channels:
    for cut, sel in list(presel[ch].items()):
        for sample in samples:
            for year in years:
                try:
                    df = events_dict[year][ch][sample]
                    df = df.query(sel)
                    w = df["event_weight"]
                except:
                    print(year, ch, sample, "is not present so will fill with 0")
                    w = df["event_weight"]

                cutflows[year][ch][sample][cut] = w.sum()

In [17]:
cutflows["2016APV"]["mu"]["QCD"]     # take a quick look

{'sumgenweight': 2158841073.6961017,
 'Trigger': 7733027.8571640495,
 'METFilters': 7717476.40305799,
 'OneLep': 2528471.2102144607,
 'LepMiniIso': 223137.04624347284,
 'NoTaus': 200312.06735418417,
 'AtLeastOneFatJet': 136192.2108293256,
 'CandidateJetpT': 52934.95056206322,
 'LepInJet': 30920.008817828053,
 'JetLepOverlap': 17344.944586178673,
 'dPhiJetMET': 11616.677124383454,
 'MET': 9717.0370157677,
 'THWW>0.80': 111.14058583976653}

In [18]:
cutflows["2016APV"]["ele"]["QCD"]     # take a quick look

{'sumgenweight': 2165123387.235826,
 'Trigger': 28180136.286444254,
 'METFilters': 28140394.99885865,
 'OneLep': 328760.50845948554,
 'NoTaus': 328760.50845948554,
 'AtLeastOneFatJet': 289704.74895183905,
 'CandidateJetpT': 145730.687351548,
 'LepInJet': 114936.89782142005,
 'JetLepOverlap': 44327.886420873285,
 'dPhiJetMET': 19018.86084559247,
 'MET': 14124.286054968414,
 'THWW>0.80': 93.77285944786469}

In [19]:
cutflows["2018"]["ele"]["ggF"]

{'sumgenweight': 6028.3346903316005,
 'Trigger': 603.6899350730905,
 'METFilters': 602.881734104983,
 'OneLep': 388.9135434738342,
 'NoTaus': 388.9135434738342,
 'AtLeastOneFatJet': 318.12961390885613,
 'CandidateJetpT': 170.6872193838514,
 'LepInJet': 97.61137662573327,
 'JetLepOverlap': 90.85626405647837,
 'dPhiJetMET': 61.90578161681452,
 'MET': 56.56200506649324,
 'HEMCleaning': 54.4630950896176,
 'THWW>0.80': 15.83365808263488}

In [20]:
cutflows["2018"]["ele"]["TTbar"]

{'sumgenweight': 49752687.64857001,
 'Trigger': 4999747.120986915,
 'METFilters': 4997120.90736503,
 'OneLep': 3427834.7618229645,
 'NoTaus': 3427834.7618229645,
 'AtLeastOneFatJet': 621213.3788457776,
 'CandidateJetpT': 302332.23598755227,
 'LepInJet': 136434.24986663507,
 'JetLepOverlap': 116772.86275007836,
 'dPhiJetMET': 72395.2649972906,
 'MET': 68160.81426400038,
 'HEMCleaning': 65382.089800065885,
 'THWW>0.80': 464.6288476418568}

# Combine different channels

In [21]:
common_cuts = cutflows["2018"]["mu"]["VBF"]
common_cuts

{'sumgenweight': 48316.581150264006,
 'Trigger': 4634.032851408792,
 'METFilters': 4630.388062311652,
 'OneLep': 3302.3299191332926,
 'LepMiniIso': 3267.9804752004325,
 'NoTaus': 2470.6669092784737,
 'AtLeastOneFatJet': 230.57294769595737,
 'CandidateJetpT': 102.0553100822256,
 'LepInJet': 61.2270060421994,
 'JetLepOverlap': 57.679855037776875,
 'dPhiJetMET': 42.988729949437804,
 'MET': 39.53664274744095,
 'HEMCleaning': 37.63394639219362,
 'THWW>0.80': 10.27211516133846}

In [22]:
def combine_channels(cutflows):
    """
    Must add lepminiso cutflow to electron channel.
    Will add to extra keys to the channels,
        1. `ele_new`: which contains the mini-isolation label to match the mu channel (the yield doesn't change)
        2. `lep`: which is the sum of `ele_new` and `mu`
    """
    
    for year in cutflows.keys():
        cutflows[year]["ele_new"] = {}   

        for sample in cutflows[year]["ele"].keys():
            cutflows[year]["ele_new"][sample] = {}

            for cut in common_cuts:
                if (year != "2018") and (cut == "HEMCleaning"):
                    continue
                    
                if cut != "LepMiniIso":
                    cutflows[year]["ele_new"][sample][cut] = cutflows[year]["ele"][sample][cut]
                else:
                    cutflows[year]["ele_new"][sample][cut] = cutflows[year]["ele"][sample]["OneLep"]

    # combine both channels
    cutflows_new = {}
    for year in cutflows.keys():
        cutflows_new[year] = {}
        cutflows_new[year]["lep"] = {}
        
        for ch in ["mu", "ele_new"]:
            for sample in cutflows[year][ch]:
                                
                if sample not in cutflows_new[year]["lep"]:
                    cutflows_new[year]["lep"][sample] = {}
                
                for cut in cutflows[year][ch][sample]:
                    
                    if (year != "2018") and (cut == "HEMCleaning"):
                        continue
                    
                    if cut not in cutflows_new[year]["lep"][sample]:
                        cutflows_new[year]["lep"][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new[year]["lep"][sample][cut] += cutflows[year][ch][sample][cut]
        cutflows[year] = {**cutflows[year], **cutflows_new[year]}
        
    return cutflows

In [23]:
cutflows = combine_channels(cutflows)

In [24]:
cutflows["2018"].keys()

dict_keys(['ele', 'mu', 'ele_new', 'lep'])

In [25]:
cutflows["2018"]["ele_new"]["WJetsLNu"]

{'sumgenweight': 98938496.94524625,
 'Trigger': 13494656.17486616,
 'METFilters': 13488618.890384968,
 'OneLep': 10159409.905364677,
 'LepMiniIso': 10159409.905364677,
 'NoTaus': 10159409.905364677,
 'AtLeastOneFatJet': 1156730.6194095358,
 'CandidateJetpT': 495338.20674019103,
 'LepInJet': 192104.8620934339,
 'JetLepOverlap': 64323.362198248855,
 'dPhiJetMET': 44303.27213056914,
 'MET': 41197.55827468004,
 'HEMCleaning': 39627.042510522035,
 'THWW>0.80': 862.958461907012}

In [26]:
cutflows["2018"]["mu"]["WJetsLNu"]

{'sumgenweight': 98881812.96014196,
 'Trigger': 18344533.58998877,
 'METFilters': 18335526.912493113,
 'OneLep': 15259681.07624155,
 'LepMiniIso': 15201162.017077586,
 'NoTaus': 12926093.249765638,
 'AtLeastOneFatJet': 1364620.0240112403,
 'CandidateJetpT': 583489.2706716096,
 'LepInJet': 211377.56713725123,
 'JetLepOverlap': 79562.04863680585,
 'dPhiJetMET': 55806.28999208973,
 'MET': 52121.20126645196,
 'HEMCleaning': 50165.97622332939,
 'THWW>0.80': 1232.8578706813782}

In [27]:
cutflows["2018"]["lep"]["WJetsLNu"]

{'sumgenweight': 197820309.9053882,
 'Trigger': 31839189.76485493,
 'METFilters': 31824145.80287808,
 'OneLep': 25419090.981606226,
 'LepMiniIso': 25360571.922442265,
 'NoTaus': 23085503.155130316,
 'AtLeastOneFatJet': 2521350.6434207764,
 'CandidateJetpT': 1078827.4774118005,
 'LepInJet': 403482.4292306851,
 'JetLepOverlap': 143885.4108350547,
 'dPhiJetMET': 100109.56212265887,
 'MET': 93318.75954113199,
 'HEMCleaning': 89793.01873385142,
 'THWW>0.80': 2095.81633258839}

# Combine different years

In [28]:
def combine_years(cutflows):
    """Will remove the HEM cleaning cutflow from 2018 first."""
    
    whatever_year = list(cutflows.keys())[0]
    channels = cutflows[whatever_year].keys()
    
    # combine all years
    cutflows_new = {}
    cutflows_new["Run2"] = {}
    
    for ch in channels:
        cutflows_new["Run2"][ch] = {}
        
        for year in cutflows:
            for sample in cutflows[year][ch]:
                
                if sample not in cutflows_new["Run2"][ch]:
                    cutflows_new["Run2"][ch][sample] = {}

                for cut in cutflows[year][ch][sample]:
                    if "HEM" in cut:
                        continue
                    if cut not in cutflows_new["Run2"][ch][sample]:
                        cutflows_new["Run2"][ch][sample][cut] = cutflows[year][ch][sample][cut]
                    else:
                        cutflows_new["Run2"][ch][sample][cut] += cutflows[year][ch][sample][cut]

    cutflows = {**cutflows, **cutflows_new}

    return cutflows

In [29]:
cutflows = combine_years(cutflows)

In [30]:
cutflows["2016"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'TTbar', 'Data', 'SingleTop', 'ggF', 'QCD', 'Diboson', 'ttH'])

In [31]:
cutflows.keys()

dict_keys(['2018', '2017', '2016', '2016APV', 'Run2'])

In [32]:
cutflows["Run2"].keys()

dict_keys(['ele', 'mu', 'ele_new', 'lep'])

# Combine non-dominant backgrounds

In [33]:
# combine non-dominant backgrounds under others
dominant_bkgs = ["WJetsLNu", "QCD", "TTbar"]
signals = ["ggF", "VH"]

for year in cutflows:
    for ch in cutflows[year]:
        cutflows[year][ch]["Others"] = dict.fromkeys(cutflows[year][ch]["WJetsLNu"], 0)
        for sample in cutflows[year][ch]:
            if sample == "Data":
                continue
            if sample not in signals+dominant_bkgs:
                for cut in cutflows[year][ch][sample]:
                    cutflows[year][ch]["Others"][cut] += cutflows[year][ch][sample][cut]

In [34]:
cutflows["2018"]["ele"].keys()

dict_keys(['VBF', 'WJetsLNu', 'TTbar', 'SingleTop', 'ggF', 'QCD', 'Data', 'Diboson', 'ttH', 'Others'])

In [35]:
cutflows["2018"]["lep"]["Others"]

{'sumgenweight': 103338427.99260004,
 'Trigger': 8465163.68069785,
 'METFilters': 8461578.098172793,
 'OneLep': 6025103.119154213,
 'LepMiniIso': 5994959.048241838,
 'NoTaus': 5196687.329703328,
 'AtLeastOneFatJet': 383662.5043456247,
 'CandidateJetpT': 182760.08095323652,
 'LepInJet': 73955.20521748498,
 'JetLepOverlap': 35739.36283193218,
 'dPhiJetMET': 23761.14152413553,
 'MET': 22296.456684631317,
 'HEMCleaning': 21362.67512062594,
 'THWW>0.80': 392.48907897466495}

# LateX cutflow table

In [36]:
cuts = {
    "mu": [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",
        "LepMiniIso",        
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
        "HEMCleaning",      
    ],
    "ele": [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",        
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
        "HEMCleaning",   
    ],  
    "lep": [
        "sumgenweight",
        "Trigger",
        "METFilters",
        "OneLep",
        "LepMiniIso",        
        "NoTaus",
        "AtLeastOneFatJet",
        "CandidateJetpT",
        "LepInJet",
        "JetLepOverlap",
        "dPhiJetMET",
        "MET",
        "HEMCleaning",           
    ],
}

for ch in cuts:
    if ch == "lep":
        for cut in presel["mu"]:        
            cuts[ch] += [cut]
    else:
        for cut in presel[ch]:
            cuts[ch] += [cut]

In [37]:
cut_to_label = {
    "sumgenweight": "sumgenweight",        
    "HEMCleaning": "HEMCleaning",    
    "Trigger": "Trigger",
    "METFilters": "METFilters",
    "OneLep": "n Leptons = 1",
    "NoTaus": "n Taus = 0",
    "LepIso": r"$\ell$ relative isolation",
    "LepMiniIso": r"$\ell$ mini-isolation",    
    "AtLeastOneFatJet": r"n FatJets $>=$ 1",
    "CandidateJetpT": r"j $p_T > 250$GeV",
    "LepInJet": r"$\Delta R(j, \ell) < 0.8$",
    "JetLepOverlap": r"$\Delta R(j, \ell) > 0.03$",
    "dPhiJetMET": r"$\Delta \phi(\mathrm{MET}, j)<1.57$",
    "MET": r"$\mathrm{MET}>20$",
    
    "None": "None",

    "THWW>0.80": r"$\ensuremath{T_{\text{HWW}}^{\ell\nu qq}} > 0.80$",
} 


In [38]:
parquet_to_latex = {
    "WJetsLNu": "$\PW(\Pell\PGn)$+",
    "QCD": "QCD",
#     "DYJets": "$\PZ(\Pell\Pell)$+jets",
    "TTbar": "\\ttbar",
    "Others": "Other MC",

    "ggF": "ggF",
    "VBF": "VBF",
#     "VH": "VH",
#     "ttH": "$t\\bar{t}H$",    
    
    "Data": "Data",
}

def make_latex_cutflow_table(cutflows_dict, year, ch, add_data=False, add_sumgenweight=False):
    """Will use the cutflows dictionary to make the LateX table we have in the AN."""
    
    samples_bkg = ["WJetsLNu","QCD","TTbar","Others"]
    samples_sig = ["ggF","VBF"]

    ### backgrounds
    headers = [parquet_to_latex[s] for s in samples_bkg]
    
    textabular = f"l{'r'*len(headers)}"
    textabular += "|r"
    
    texheader = "\\textbf{Inclusive Selection}" + " & " + " & ".join(headers) + " & Total MC "
    if add_data:
        textabular += "|r"
        texheader += "& Data "
    texheader += "\\\\"
    texdata = "\\hline\n"
    
    data = dict()
    
    for cut in cuts[ch]: 
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        if not add_sumgenweight and cut == "sumgenweight":
            continue
    
        data[cut] = []

        for sample in samples_bkg:            
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
            
        totalmc = 0
        for sample in (samples_bkg + samples_sig):
            totalmc += round(cutflows_dict[year][ch][sample][cut])
            
        data[cut].append(totalmc)
        
        if add_data:
            data[cut].append(round(cutflows_dict[year][ch]["Data"][cut]))

    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"
        
    texdata += "\\hline\n"    

    ### signal
    headers2 = [parquet_to_latex[s] for s in samples_sig]    
    texheader2 = " & " + " & ".join(headers2) + "\\\\"
    texdata2 = "\\hline\n"

    data = dict()
    for cut in cuts[ch]:
        if (year != "2018") and (cut == "HEMCleaning"):
            continue
            
        data[cut] = []

        for sample in samples_sig:
            data[cut].append(round(cutflows_dict[year][ch][sample][cut]))
        
    for label in data:
        if label == "z":
            texdata += "\\hline\n"
        texdata2 += f"{cut_to_label[label]} & {' & '.join(map(str,data[label]))} \\\\\n"    

    # make table
    print("\\begin{table}[!htp]")
    print("\\begin{center}")
    
    print("\\begin{tabular}{"+textabular+"}")
    print(texheader)
    print(texdata,end="")
    print(texheader2)
    print(texdata2,end="")
    print("\\end{tabular}")
    
    
    if ch == "lep":
        print("\\caption{Event yield of " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")        
    else:
        print("\\caption{Event yield of " + ch + " channel " + year + " Monte Carlo samples normalized to " + str(round(get_lumi([year], [ch]))) + "\\fbinv.}")

    print("\\label{sel-tab-cutflow" + year + "}")
    print("\\end{center}")
    print("\\end{table}")    

In [39]:
make_latex_cutflow_table(cutflows, "2016", "lep", add_data=True, add_sumgenweight=True)

\begin{table}[!htp]
\begin{center}
\begin{tabular}{lrrrr|r|r}
\textbf{Inclusive Selection} & $\PW(\Pell\PGn)$+ & QCD & \ttbar & Other MC & Total MC & Data \\
\hline
sumgenweight & 34495755 & 3734374445 & 27964378 & 29129527 & 3825994666 & 615115660 \\
Trigger & 5263985 & 31071425 & 3527493 & 2445700 & 42311189 & 444762865 \\
METFilters & 5260487 & 31026586 & 3524911 & 2444314 & 42258881 & 444581778 \\
n Leptons = 1 & 3831194 & 2499006 & 2412703 & 1673192 & 10417793 & 173524490 \\
$\ell$ mini-isolation & 3823434 & 486256 & 2383167 & 1663548 & 8358085 & 167806496 \\
n Taus = 0 & 3666706 & 467539 & 1905815 & 1442753 & 7484236 & 161818889 \\
n FatJets $>=$ 1 & 684900 & 372635 & 346909 & 108946 & 1513707 & 1808735 \\
j $p_T > 250$GeV & 299584 & 174361 & 167639 & 51106 & 692848 & 719861 \\
$\Delta R(j, \ell) < 0.8$ & 108796 & 127704 & 73800 & 20231 & 330625 & 358519 \\
$\Delta R(j, \ell) > 0.03$ & 39183 & 53931 & 64386 & 9819 & 167406 & 160727 \\
$\Delta \phi(\mathrm{MET}, j)<1.57$ & 27589 &