This notebook exports the cutflow data for each sample as YAML files, in roughly the same format as used on HEPData. It will need a few more iterations, but it is a good starting point.

In [64]:
import uproot
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Image
import yaml


In [65]:
# Directory containing the histogram ROOT files read by this notebook.
hist_dir = '/data/hnl/histograms/v9p3_histograms'

# glob returns its matches in arbitrary order; sort them so that the order in
# which samples are processed (and later dumped) is the same on every run.
hist_files_all = sorted(glob.glob(f'{hist_dir}/histograms_fullrun2_*'))
# Only the files whose name carries the '10G_10mm' tag (the benchmark subset).
hist_files_10_10 = sorted(glob.glob(f'{hist_dir}/histograms_fullrun2_10G_10mm_*'))

# Names of the cutflow histograms that are read from every file.
cutflow_types = [
    'CutFlow',
    'CutFlow_LNC_raw_counts',
    'CutFlow_LNV_raw_counts',
    'CutFlow_weighted_one_hnl_majorana',
    'CutFlow_weighted_one_hnl_majorana_LNV_only',
    'CutFlow_weighted_one_hnl_majorana_LNC_only',
    'CutFlow_weighted_one_hnl_dirac',
    'CutFlow_weighted_majorana_limit_ih',
    'CutFlow_weighted_majorana_limit_nh',
    'CutFlow_weighted_dirac_limit_ih',
    'CutFlow_weighted_dirac_limit_nh',
]
# cutflow_bins_mu_mumu = ['all', 'trigger', '4-filter', 'medium prompt muon', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'mumu DV', 'cosmic veto', 'lepton pt', '2-medium-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]
# cutflow_bins_e_emu = ['all', 'trigger', '4-filter', 'medium prompt electron', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'emu DV', 'cosmic veto', 'lepton pt', 'medium-veryveryloose-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]
# cutflow_bins_mu_mue = ['all', 'trigger', '4-filter', 'medium prompt muon', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'emu DV', 'cosmic veto', 'lepton pt', 'medium-veryveryloose-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]
# cutflow_bins_e_ee = ['all', 'trigger', '4-filter', 'medium prompt electron', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'emu DV', 'cosmic veto', 'lepton pt', 'mat. veto', '2-veryveryloose-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]
# cutflow_bins_mu_ee = ['all', 'trigger', '4-filter', 'medium prompt muon', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'emu DV', 'cosmic veto', 'lepton pt', 'mat. veto', '2-veryveryloose-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]
# cutflow_bins_e_mumu = ['all', 'trigger', '4-filter', 'medium prompt electron', 'plep overlap', 'DV', 'fiducial', '2-track DV', 'OS DV', 'mumu DV', 'cosmic veto', 'lepton pt', '2-medium-lepton DV', 'trig match', 'm_lll', 'B-hadron veto', 'Z mass veto', 'm_HNL', ]

In [66]:
samples = {}
# Loops over the 10G_10mm benchmark files; iterate over hist_files_all
# instead to process every available sample.
for root_path in hist_files_10_10:
    # The sample name is whatever follows the common file-name prefix,
    # with the '.root' extension removed.
    sample_name = root_path.split('histograms_fullrun2_')[1].replace('.root', '')
    cutflow_hists = uproot.open(root_path)['nominal']['VSI_LeptonsMod']['CutFlow']
    per_sample = {}
    for hist_name in cutflow_types:
        hist = cutflow_hists[hist_name]
        bin_contents = hist.numpy()[0]
        per_sample[hist_name] = bin_contents.tolist()
        if hist_name != "CutFlow":
            continue
        # The 'CutFlow' histogram additionally supplies the sqrt(N) error
        # column and the bin labels.
        per_sample['stat_err'] = np.sqrt(bin_contents).tolist()
        bin_labels = hist.xlabels
        # annoying feature of cutflows, second to last bin is empty
        bin_labels.insert(-1, 'empty')
        per_sample['labels'] = bin_labels
    samples[sample_name] = per_sample


In [67]:
# print(yaml.dump(samples['10G_10mm_eeu'], sort_keys=False))

In [68]:
# Common row labels applied to every cutflow table (22 rows).
cutflow_labels = ['All', 'Trigger', 'PV', '4-filter', 'Medium prompt lep.', 'P.lep. overlap', 'DV', 'Fiducial', '2-track DV', 'OS DV', 'Lep. type DV', 'Cosmic veto', 'Lepton pt', 'Mat. veto', 'Lep. qual. DV', 'Trig. match', 'm_{lll}', 'B-hadron veto', 'Z mass veto', 'm_{HNL}', 'empty', 'truth matched']
# Row position of the material-veto cut in the common layout (13).
matveto_row = cutflow_labels.index('Mat. veto')

# One DataFrame per sample, with one column per entry of the sample's dict.
dataframes = {}
for sample_name, columns in samples.items():
    frame = pd.DataFrame()
    for column_name, values in columns.items():
        frame[column_name] = values

    # Samples other than the '...uee' / '...eee' channels are one row short:
    # an all-empty row is inserted where 'Mat. veto' sits so that every table
    # lines up with cutflow_labels.  The test is on the channel suffix rather
    # than on the full '10G_10mm_*' name so that it also holds when other
    # samples are loaded (presumably named '<mass>_<lifetime>_<channel>';
    # verify against the file names).
    if not sample_name.endswith(('uee', 'eee')):
        placeholder = pd.DataFrame(columns=frame.columns)
        placeholder.loc[0] = [None] * len(frame.columns)
        frame = pd.concat(
            [frame[:matveto_row], placeholder, frame[matveto_row:]]
        ).reset_index(drop=True)

    # Replace the labels read from the histogram by the common ones.
    frame['labels'] = cutflow_labels
    dataframes[sample_name] = frame


In [69]:
# Cutflow histogram whose yields fill the table.  Any key of
# model_descriptions below can be used instead.
model = 'CutFlow_weighted_one_hnl_majorana'

# Caption wording for each weighted cutflow, so that the caption always names
# the model that was actually tabulated.
# NOTE(review): 'ih' is taken from the original caption (inverted hierarchy);
# 'nh' is read as normal hierarchy, LNV/LNC are kept as abbreviations -- confirm.
model_descriptions = {
    'CutFlow_weighted_one_hnl_majorana': 'one-HNL Majorana',
    'CutFlow_weighted_one_hnl_majorana_LNV_only': 'one-HNL Majorana (LNV only)',
    'CutFlow_weighted_one_hnl_majorana_LNC_only': 'one-HNL Majorana (LNC only)',
    'CutFlow_weighted_one_hnl_dirac': 'one-HNL Dirac',
    'CutFlow_weighted_majorana_limit_ih': 'Majorana limit inverted-hierarchy',
    'CutFlow_weighted_majorana_limit_nh': 'Majorana limit normal-hierarchy',
    'CutFlow_weighted_dirac_limit_ih': 'Dirac limit inverted-hierarchy',
    'CutFlow_weighted_dirac_limit_nh': 'Dirac limit normal-hierarchy',
}
# Unknown names fall back to the histogram name, with '_' escaped for LaTeX.
model_description = model_descriptions.get(model, model.replace('_', r'\_'))

# Plain-text substitutions applied to the LaTeX produced by to_latex().
# Raw strings keep the backslashes literal without relying on Python's
# deprecated handling of unknown escape sequences.
replacements = {
    r'm\_\{HNL\}': r'$m_{\mathrm{HNL}}$',
    r'm\_\{lll\}': r'$m_{\mathrm{lll}}$',
    'veryveryloose': 'VVL',
    'truth matched': r'\hline truth matched',
    'nan': '--',
    'uuu': r'$\mu\mu\mu$',
}


def _yield_table(selection_sample, channels):
    """Return a frame with the selection labels plus one `model` yield column per channel.

    selection_sample: channel whose 'labels' column provides the 'Selection' column.
    channels: channel names; each becomes a column read from dataframes['10G_10mm_<channel>'].
    """
    table = pd.DataFrame()
    table['Selection'] = dataframes[f'10G_10mm_{selection_sample}']['labels']
    for channel in channels:
        table[channel] = dataframes[f'10G_10mm_{channel}'][model]
    return table


# Per-prompt-lepton tables (only used by the commented-out electron table).
prompt_el_df = _yield_table('eee', ['eee', 'eeu', 'euu'])
prompt_mu_df = _yield_table('uuu', ['uuu', 'uue', 'uee'])

# Combined table with all six channels; this is the one that gets printed.
df = _yield_table('uuu', ['uuu', 'uue', 'uee', 'eee', 'eeu', 'euu'])

# Only the first 20 rows are tabulated.
latex_table = df[:20].to_latex(index=False, float_format="{:0.3f}".format, na_rep='--')
for old, new in replacements.items():
    latex_table = latex_table.replace(old, new)

# \label is placed after \caption: inside a float, a label that precedes the
# caption does not refer to the table number.
# NOTE(review): caption and label still say "prompt muon" although the table
# holds all six channels -- confirm the intended wording.
print(rf'''
\begin{{table}}[!htbp]
\centering % center the table

{latex_table}

\caption{{Cutflow for prompt muon samples. 
Weighted number of expected events based on the {model_description} model.}}
\label{{tab:cutflow_prompt_muon}}
\end{{table}}
'''
)

# prompt_el_latex_table = prompt_el_df[:20].to_latex(index=False, float_format="{:0.2f}".format, na_rep='--')
# for i, j in replacements.items():
#     prompt_el_latex_table = prompt_el_latex_table.replace(i, j)

# print(f'''
# \\begin{{table}}[!htbp]
# \centering % center the table
# {prompt_el_latex_table}

# \caption{{Cutflow for prompt electron samples. 
# Weighted number of expected events based on Majorana limit inverted-hierarchy model.}}
# \label{{tab:cutflow_prompt_electron}}

# \end{{table}}
# '''
# )


\begin{table}[!htbp]
\centering % center the table
\label{tab:cutflow_prompt_muon}

\begin{tabular}{lrrrrrr}
\toprule
          Selection &    $\mu\mu\mu$ &     uue &   uee &    eee &     eeu &   euu \\
\midrule
                All & 60.055 & 174.441 & 2.788 & 60.847 & 177.425 & 2.800 \\
            Trigger & 23.315 &  64.974 & 0.994 & 17.626 &  54.470 & 0.884 \\
                 PV & 23.315 &  64.974 & 0.994 & 17.626 &  54.470 & 0.884 \\
           4-filter & 14.939 &  39.843 & 0.547 &  9.940 &  32.845 & 0.573 \\
 Medium prompt lep. & 14.063 &  37.306 & 0.511 &  8.112 &  26.816 & 0.462 \\
     P.lep. overlap & 14.030 &  37.261 & 0.511 &  8.085 &  26.691 & 0.459 \\
                 DV &  5.113 &  12.053 & 0.148 &  2.350 &   9.149 & 0.179 \\
           Fiducial &  4.814 &  11.356 & 0.143 &  2.186 &   8.377 & 0.165 \\
         2-track DV &  4.659 &  10.874 & 0.135 &  2.043 &   8.070 & 0.160 \\
              OS DV &  4.634 &  10.767 & 0.133 &  2.016 &   8.009 & 0.159 \\
       Lep. type 

In [70]:
samples = {}
hist_files_data = glob.glob('/data/hnl/v8_histograms/jul29_unblind_SR_v8p0_histograms/fullrun2_histograms_mc_*.root')
for root_path in hist_files_data:
    # The channel name is the file-name part after the common prefix,
    # with the '.root' extension removed.
    channel_name = root_path.split('fullrun2_histograms_mc_')[1].replace('.root', '')
    cutflow_hist = uproot.open(root_path)['VSI_LeptonsMod']['CutFlow']['CutFlow']
    bin_labels = cutflow_hist.xlabels
    # Two extra labels are appended after the ones read from the histogram.
    bin_labels.extend(['empty', 'truth matched'])
    samples[channel_name] = {
        'CutFlow': cutflow_hist.numpy()[0].tolist(),
        'labels': bin_labels,
    }

# Common row labels applied to every table (22 rows).
row_labels = ['All', 'Trigger', 'PV', '4-filter', 'Medium prompt lep.', 'P.lep. overlap', 'DV', 'Fiducial', '2-track DV', 'OS DV', 'Lep. type DV', 'Cosmic veto', 'Lepton pt', 'Mat. veto', 'Lep. qual. DV', 'Trig. match', 'm_{lll}', 'B-hadron veto', 'Z mass veto', 'm_{HNL}', 'empty', 'truth matched']

dataframes_data = {}
for channel_name, columns in samples.items():
    frame = pd.DataFrame()
    for column_name in columns:
        frame[column_name] = columns[column_name]
    # Every channel except 'uee' and 'eee' gets an all-empty row inserted at
    # position 13, the slot of 'Mat. veto' in row_labels.
    if channel_name not in ('uee', 'eee'):
        filler = pd.DataFrame(columns=frame.columns)
        filler.loc[0] = [None] * len(frame.columns)
        frame = pd.concat([frame[:13], filler, frame[13:]]).reset_index(drop=True)
    frame['labels'] = row_labels
    dataframes_data[channel_name] = frame

# Side-by-side table: selection labels plus the 'CutFlow' column of each channel.
df_data = pd.DataFrame()
df_data['Selection'] = dataframes_data['uuu']['labels']
for channel_name in ('uuu', 'uue', 'uee', 'eee', 'eeu', 'euu'):
    df_data[channel_name] = dataframes_data[channel_name]['CutFlow']

df_data

Unnamed: 0,Selection,uuu,uue,uee,eee,eeu,euu
0,All,101921425.0,101921425.0,101921425.0,101629496.0,101921425.0,101921425.0
1,Trigger,22002921.0,22002921.0,22002921.0,21924697.0,22002921.0,22002921.0
2,PV,22002921.0,22002921.0,22002921.0,21924697.0,22002921.0,22002921.0
3,4-filter,16294834.0,16294834.0,16294834.0,16236924.0,16294834.0,16294834.0
4,Medium prompt lep.,8023702.0,8023702.0,8023702.0,6427469.0,6449805.0,6449805.0
5,P.lep. overlap,6259871.0,6259871.0,6259871.0,4664256.0,4680330.0,4680330.0
6,DV,6259871.0,6259871.0,6259871.0,4664256.0,4680330.0,4680330.0
7,Fiducial,3205961.0,3205961.0,3205961.0,2437219.0,2445529.0,2445529.0
8,2-track DV,1975904.0,1975904.0,1975904.0,1508527.0,1513732.0,1513732.0
9,OS DV,1166717.0,1166717.0,1166717.0,921697.0,924865.0,924865.0


In [71]:

# Plain-text substitutions applied to the LaTeX produced by to_latex().
# Raw strings keep the backslashes literal without relying on Python's
# deprecated handling of unknown escape sequences.
replacements = {
    r'm\_\{HNL\}': r'$m_{\mathrm{HNL}}$',
    r'm\_\{lll\}': r'$m_{\mathrm{lll}}$',
    'veryveryloose': 'VVL',
    'truth matched': r'\hline truth matched',
    'nan': '--',
    'NAN': '--',
    'uuu': r'$\mu\mu\mu$',
    # Shortens one-digit exponents: '1.019E+08' -> '1.019e+8'.
    'E+0': 'e+',
}

# Only the first 20 rows are tabulated, in scientific notation.
latex_table = df_data[:20].to_latex(index=False, float_format="{:.3E}".format, na_rep='--')
for old, new in replacements.items():
    latex_table = latex_table.replace(old, new)

# The caption and label used to be copies of the signal table's ("prompt muon
# samples", "weighted ... Majorana limit" and the same \label), which
# mis-describes the 'CutFlow' counts in df_data and defines the label twice.
# NOTE(review): df_data is described as data here, following its name and the
# "Data selection" HEPData table -- confirm.
# \label is placed after \caption so that it refers to the table number.
print(rf'''
\begin{{table}}[!htbp]
\centering % center the table

{latex_table}

\caption{{Cutflow for data. 
Number of data events passing each selection; the six columns apply the selections of the six signal channels.}}
\label{{tab:cutflow_data}}
\end{{table}}
'''
)


\begin{table}[!htbp]
\centering % center the table
\label{tab:cutflow_prompt_muon}

\begin{tabular}{lrrrrrr}
\toprule
          Selection &       $\mu\mu\mu$ &       uue &       uee &       eee &       eeu &       euu \\
\midrule
                All & 1.019e+8 & 1.019e+8 & 1.019e+8 & 1.016e+8 & 1.019e+8 & 1.019e+8 \\
            Trigger & 2.200e+7 & 2.200e+7 & 2.200e+7 & 2.192e+7 & 2.200e+7 & 2.200e+7 \\
                 PV & 2.200e+7 & 2.200e+7 & 2.200e+7 & 2.192e+7 & 2.200e+7 & 2.200e+7 \\
           4-filter & 1.629e+7 & 1.629e+7 & 1.629e+7 & 1.624e+7 & 1.629e+7 & 1.629e+7 \\
 Medium prompt lep. & 8.024e+6 & 8.024e+6 & 8.024e+6 & 6.427e+6 & 6.450e+6 & 6.450e+6 \\
     P.lep. overlap & 6.260e+6 & 6.260e+6 & 6.260e+6 & 4.664e+6 & 4.680e+6 & 4.680e+6 \\
                 DV & 6.260e+6 & 6.260e+6 & 6.260e+6 & 4.664e+6 & 4.680e+6 & 4.680e+6 \\
           Fiducial & 3.206e+6 & 3.206e+6 & 3.206e+6 & 2.437e+6 & 2.446e+6 & 2.446e+6 \\
         2-track DV & 1.976e+6 & 1.976e+6 & 1.976e+6 & 1.

In [87]:
from hepdata_lib import Submission, Variable, Table

sub = Submission()
# Single place that defines where the HEPData files are written.
outdir = "./cutflows/"

# Channels exported as dependent variables, in column order.
channels = ['uuu', 'uue', 'uee', 'eee', 'eeu', 'euu']
# Number of leading cutflow rows that are exported.
n_rows = 20

# Cutflow histogram name -> title of the HEPData table built from it.
models = {
    'CutFlow': 'Raw Event Counts',
    'CutFlow_weighted_one_hnl_majorana': 'One HNL Majorana',
    'CutFlow_weighted_one_hnl_dirac': 'One HNL Dirac',
    'CutFlow_weighted_majorana_limit_ih': 'Majorana limit IH',
    'CutFlow_weighted_majorana_limit_nh': 'Majorana limit NH',
    'CutFlow_weighted_dirac_limit_ih': 'Dirac limit IH',
    'CutFlow_weighted_dirac_limit_nh': 'Dirac limit NH',
}


def _cutflow_table(title, description, selection_labels, yields_by_channel):
    """Build one HEPData table: 'Selection' labels plus one 'Events' column per channel.

    title: table name.
    description: text stored as the table description.
    selection_labels: values of the independent 'Selection' variable.
    yields_by_channel: mapping channel name -> values of that channel's column.
    """
    selection = Variable("Selection", is_independent=True, is_binned=False)
    selection.values = selection_labels

    table = Table(title)
    table.add_variable(selection)
    table.description = description
    table.location = 'Data corresponds to Table XXXXXX'
    table.keywords['phrases'] = ['cutflow', 'efficiency', 'selection']

    for channel, yields in yields_by_channel.items():
        events = Variable(channel, is_independent=False, is_binned=False, units='Events')
        events.values = yields
        table.add_variable(events)
    return table


# One table per entry of `models`, every column taken from the 10G_10mm sample.
# NOTE(review): the description says "weighted" for every entry, including the
# 'CutFlow' one titled 'Raw Event Counts' -- confirm the wording.
for model, title in models.items():
    table = _cutflow_table(
        title,
        f'This cutflow for six simulated signal channels shows the weighted number of expected events. Each column uses the 10 mm, 10 GeV sample. The weights applied correspond to the model: {title}',
        dataframes['10G_10mm_eee']['labels'][:n_rows],
        {channel: dataframes[f'10G_10mm_{channel}'][model][:n_rows] for channel in channels},
    )
    # NOTE(review): create_files(..., remove_old=True) at the end presumably
    # recreates outdir -- confirm whether this per-table write is still needed.
    table.write_output(outdir)
    sub.add_table(table)

do_data = True
if do_data:
    table = _cutflow_table(
        "Data selection",
        'This cutflow shows the number of data events passing each of the six selections, which correspond to the six different signal channels for Majorana HNLs.',
        df_data['Selection'][:n_rows],
        {channel: df_data[channel][:n_rows] for channel in channels},
    )
    table.write_output(outdir)
    sub.add_table(table)


sub.create_files(outdir, remove_old=True)
