In [1]:
import uproot
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os #for looping over files in a directory
import math
import json
import glob

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.integer,)):
            return int(obj)
        elif isinstance(obj, (np.floating,)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return super().default(obj)

def load_json_file(file_path):
    """
    Loads JSON data from a file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict or list: A Python dictionary or list representing the JSON data, or None if an error occurs.
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{file_path}'")
        return None
    except Exception as e:
         print(f"An unexpected error occurred: {e}")
         return None

In [53]:
# some cuts, like mindphijetmet, nbjet=0, nlepton=0, already applied at preselection level for now
SRcuts={'met_met': 200,
        'mTGammaMet': 50,
        'j1_pt': 150,
        'ph_pt': 10,
        'met_signif': 25}

def getemptyresults():
    results={}
    for b in ['TT', 'TL', 'LT', 'LL']:
        results[b] = {'real': {'nevents': 0,
                               'sumweights': 0},
                      'fake': {'nevents': 0,
                               'sumweights': 0},
                      'other': {'nevents': 0,
                                'sumweights': 0},
                      'data': 0
                     }
    return results

def ABCDresults(data,mask,isMC):
    masks={}
    masks['TT'] = (data['ph_select_tightID']==1) & (data['ph_select_tightIso']==1) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['TL'] = (data['ph_select_tightID']==1) & (data['ph_select_tightIso']==0) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['LT'] = (data['ph_select_tightID']==0) & (data['ph_select_tightIso']==1) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['LL'] = (data['ph_select_tightID']==0) & (data['ph_select_tightIso']==0) & ((data['ph_isEM'] & 0x45fc01)==0)

    if isMC:
        real_mask = (data['ph_truthprompt'] == 1)
        fake_mask = (data['ph_truthJFP']    == 1)

    results=getemptyresults()
    for b in ['TT', 'TL', 'LT', 'LL']:
        if isMC:
            results[b]['real']['nevents']     = np.sum(mask & masks[b] & real_mask)
            results[b]['real']['sumweights']  = np.sum(data['weight_total'][mask & masks[b] & real_mask])
            results[b]['fake']['nevents']     = np.sum(mask & masks[b] & fake_mask)
            results[b]['fake']['sumweights']  = np.sum(data['weight_total'][mask & masks[b] & fake_mask])
            results[b]['other']['nevents']    = np.sum(mask & masks[b] & ~real_mask & ~fake_mask)
            results[b]['other']['sumweights'] = np.sum(data['weight_total'][mask & masks[b] & ~real_mask & ~fake_mask])
        else:
            results[b]['data'] = np.sum(mask & masks[b])
    return results

def dumpjson(data,isMC):

    SR_mask = \
    (data['met_met']         >  SRcuts['met_met']*1000.   ) & \
    (data['j1_pt']           >  SRcuts['j1_pt']*1000.     ) & \
    (data['ph_pt']           >  SRcuts['ph_pt']*1000.     ) & \
    (data['met_signif']      >  SRcuts['met_signif']      ) & \
    (data['nBTagJets']       == 0                         ) & \
    (data['mindPhiJetMet']   >  0.4                       ) & \
    (data['nElectrons']      == 0                         ) & \
    (data['nMuons']          == 0                         ) & \
    (data['mTGammaMet']      <  SRcuts['mTGammaMet']*1000.) & \
    (data['mindPhiGammaJet'] > 1.5                        ) & \
    (data['nTau20_baseline'] == 0                         )
    

    return {'SR': {'bin_1': ABCDresults(data,SR_mask,isMC)}}

In [63]:
base_path = "/data/mhance/SUSY/ntuples/v3"

# Iterate over subdirectories and files
for root, _, files in os.walk(base_path):
    for file in files:
        if not file.endswith('.root'): continue
        print(file)
        filepath = os.path.join(root, file)
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_Wtaunugamma.root": continue
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_data_2018.root": continue
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_Znunu_CVetoBVeto.root": continue
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_N2_220_N1_200_HH.root": continue
        #print(filepath)
        with uproot.open(filepath) as f:
            if 'picontuple' in f:
                tree = f['picontuple']
                # Extract the data
                data = tree.arrays(library="np")
                #data['met_signif'] = data['met_met']/data['ph_pt']

                results=dumpjson(data,"data_" not in filepath)
                print(json.dumps(results, indent=4, cls=NumpyEncoder))
                with open("ABCD_results/"+file.replace(".root","_ABCD.json"),'w') as jf:
                    json.dump(results, jf, indent=4, cls=NumpyEncoder)

output_Sh_2211_Ztautau_HH_maxHTpTV2_CVetoBVeto.root
{
    "SR": {
        "bin_1": {
            "TT": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "other": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "TL": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "other": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "LT": {
                "real": {
                    "nevents": 0,
         

In [64]:
totalresults=getemptyresults()
sample_max={}
sample_max['TL']=[0,'']
sample_max['LT']=[0,'']
sample_max['TT']=[0,'']
sample_max['LL']=[0,'']

samples=[]

for fp in glob.glob("ABCD_results/*.json"):
    if "gammajet" in fp: continue
    if "jetjet" in fp: continue
    if "N2" in fp: continue
        
    data = load_json_file(fp)

    sample_tag = fp.replace("_results/output_","").replace("ABCD","").replace(".json","")[:-1]
    samples.append(sample_tag)
    
    for b in ['TT', 'TL', 'LT', 'LL']:
        totalresults[b]['data'] += data["SR"]["bin_1"][b]["data"]
        totalresults[b]['real']['sumweights'] += data["SR"]["bin_1"][b]["real"]["sumweights"]
        totalresults[b]['fake']['sumweights'] += data["SR"]["bin_1"][b]["fake"]["sumweights"]
        totalresults[b]['other']['sumweights'] += data["SR"]["bin_1"][b]["other"]["sumweights"]

        if sample_max[b][0] < data["SR"]["bin_1"][b]["real"]["sumweights"]:
            sample_max[b][0] = data["SR"]["bin_1"][b]["real"]["sumweights"]
            sample_max[b][1] = sample_tag
print(json.dumps(totalresults,indent=4,cls=NumpyEncoder))

print("Most contributing samples:")
for b in ['TT', 'TL', 'LT', 'LL']:
    print(f"{b}: {sample_max[b][1][:-1]}")

mcs_data=load_json_file(f"ABCD_results/output_{sample_max['TT'][1]}_ABCD.json")
print(json.dumps(mcs_data,indent=4,cls=NumpyEncoder))

{
    "TT": {
        "real": {
            "nevents": 0,
            "sumweights": 3.208922228543088
        },
        "fake": {
            "nevents": 0,
            "sumweights": 0.23391352500766516
        },
        "other": {
            "nevents": 0,
            "sumweights": 3.06911040641603
        },
        "data": 6
    },
    "TL": {
        "real": {
            "nevents": 0,
            "sumweights": 1.554581651231274
        },
        "fake": {
            "nevents": 0,
            "sumweights": 1.6892036180797732
        },
        "other": {
            "nevents": 0,
            "sumweights": 1.5312866900640074
        },
        "data": 7
    },
    "LT": {
        "real": {
            "nevents": 0,
            "sumweights": 0.6613824581727386
        },
        "fake": {
            "nevents": 0,
            "sumweights": 0.16816351178567857
        },
        "other": {
            "nevents": 0,
            "sumweights": 0.8772523483785335
        },
        "da

In [74]:
#N_LL = totalresults['LL']['data']-totalresults['LL']['real']['sumweights']
#N_TL = totalresults['TL']['data']-totalresults['TL']['real']['sumweights']
#N_LT = totalresults['LT']['data']-totalresults['LT']['real']['sumweights']

N={}
for b in ['TT', 'TL', 'LT', 'LL']:
    N[b] = totalresults[b]['data']-totalresults[b]['real']['sumweights']
    print(f"{b}: {totalresults[b]['data']:.1f}   {totalresults[b]['real']['sumweights']:.1f}")

if N['LL']>0:
    N_TT_bkg = N['TL']*N['LT']/N['LL']
else:
    N_TT_bkg = 0
print(f"N_TT_bkg = ({N['TL']:.1f}*{N['LT']:.1f})/({N['LL']:.1f}) = {N_TT_bkg:.1f}, N_TT_fake={totalresults['TT']['fake']['sumweights']:.1f}, N_TT_other={totalresults['TT']['other']['sumweights']:.1f}, and N_TT_real={totalresults['TT']['real']['sumweights']:.1f}")

TT: 6.0   3.2
TL: 7.0   1.6
LT: 6.0   0.7
LL: 5.0   0.1
N_TT_bkg = (5.4*5.3)/(4.9) = 6.0, N_TT_fake=0.2, N_TT_other=3.1, and N_TT_real=3.2


Quick function that will test closure for any single sample.

In [71]:
def sampleABCD(sample,debug=False):
    sresults=None
    if isinstance(sample,str):
        sresults=load_json_file(f"ABCD_results/output_{sample}_ABCD.json")["SR"]["bin_1"]
        #print(json.dumps(results,indent=4,cls=NumpyEncoder))
    elif isinstance(sample,dict):
        sresults=sample
    else:
        print("Must provide either valid sample string or dictionary of results.")
        return None

    N_all={}
    for b in ['TT', 'TL', 'LT', 'LL']:
        N_all[b] = sresults[b]['real']['sumweights']+sresults[b]['fake']['sumweights']

    num_TL = (N_all['TL']-sresults['TL']['real']['sumweights'])
    num_LT = (N_all['LT']-sresults['LT']['real']['sumweights'])
    den_LL = (N_all['LL']-sresults['LL']['real']['sumweights'])
    N_TT_fake_est = 0.
    if den_LL > 0:
        N_TT_fake_est = num_TL*num_LT/den_LL

    if debug and den_LL > 0 and sresults['TT']['fake']['sumweights']>0:
        print(f"{sample:52s} {sresults['TT']['real']['sumweights']:6.1f}  {N_TT_fake_est:6.1f}  {sresults['TT']['fake']['sumweights']:6.1f}  {sresults['TT']['other']['sumweights']:6.1f}   {(N_TT_fake_est-sresults['TT']['fake']['sumweights'])/sresults['TT']['fake']['sumweights']:6.1f}")
    elif debug:
        print(f"{sample:52s} {sresults['TT']['real']['sumweights']:6.1f}  {N_TT_fake_est:6.1f}  {sresults['TT']['fake']['sumweights']:6.1f}  {sresults['TT']['other']['sumweights']:6.1f}")
        
    return N_TT_fake_est

In [72]:
for s in samples:
    est=sampleABCD(s,False)
    if est>0.:
        sampleABCD(s,True)

Wtaunu_H_CFilterBVeto                                   0.0     0.0    -0.2     0.1
Znunu_BFilter                                           0.0     0.0     0.0     0.0     71.8
Wtaunu_H_CVetoBVeto                                     0.0     0.0     0.2     0.0     -0.9
Sh_2211_WqqZvv                                          0.0     0.1     0.0     0.0     10.8
Sh_2211_WlvWqq                                          0.0     0.0     0.0     0.0     -0.1
Znunugamma                                              0.7     0.0     0.0     0.0     -0.7
Wtaunugamma                                             1.7     0.0     0.0     0.0      0.1
Wtaunu_H_BFilter                                        0.0     0.1     0.0     0.0     14.4
Znunu_CVetoBVeto                                        0.0     0.9     0.1     0.3      8.8


This seems to be working.  To do:
* Update cutflow comparisons to target the 0L-low SR and reproduce Ren's numbers.  Then we can actually provide numbers for SR's in data.