In [24]:
import uproot
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os #for looping over files in a directory
import math
import json
import glob

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.integer,)):
            return int(obj)
        elif isinstance(obj, (np.floating,)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return super().default(obj)

def load_json_file(file_path):
    """
    Loads JSON data from a file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict or list: A Python dictionary or list representing the JSON data, or None if an error occurs.
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{file_path}'")
        return None
    except Exception as e:
         print(f"An unexpected error occurred: {e}")
         return None

In [25]:
# some cuts, like mindphijetmet, nbjet=0, nlepton=0, already applied at preselection level for now
SRcuts={'met_met': 250,
        'mTGammaMet': 50,
        'j1_pt': 150,
        'ph_pt': 10,
        'met_signif': 40}

def getemptyresults():
    results={}
    for b in ['TT', 'TL', 'LT', 'LL']:
        results[b] = {'real': {'nevents': 0,
                               'sumweights': 0},
                      'fake': {'nevents': 0,
                               'sumweights': 0},
                      'data': 0
                     }
    return results

def ABCDresults(data,mask,isMC):
    masks={}
    masks['TT'] = (data['ph_select_tightID']==1) & (data['ph_select_tightIso']==1) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['TL'] = (data['ph_select_tightID']==1) & (data['ph_select_tightIso']==0) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['LT'] = (data['ph_select_tightID']==0) & (data['ph_select_tightIso']==1) & ((data['ph_isEM'] & 0x45fc01)==0)
    masks['LL'] = (data['ph_select_tightID']==0) & (data['ph_select_tightIso']==0) & ((data['ph_isEM'] & 0x45fc01)==0)

    if isMC:
        real_mask = (data['ph_truthprompt'] == 1)
        fake_mask = (data['ph_truthJFP'] == 1)

    results=getemptyresults()
    for b in ['TT', 'TL', 'LT', 'LL']:
        if isMC:
            results[b]['real']['nevents']    = np.sum(mask & masks[b] & real_mask)
            results[b]['real']['sumweights'] = np.sum(data['weight_total'][mask & masks[b] & real_mask])
            results[b]['fake']['nevents']    = np.sum(mask & masks[b] & fake_mask)
            results[b]['fake']['sumweights'] = np.sum(data['weight_total'][mask & masks[b] & fake_mask])
        else:
            results[b]['data'] = np.sum(mask & masks[b])
    return results

def dumpjson(data,isMC):

    SR_mask = \
    (data['met_met']>SRcuts['met_met']*1000.) & \
    (data['mTGammaMet']<SRcuts['mTGammaMet']*1000.) & \
    (data['j1_pt']>SRcuts['j1_pt']*1000.) & \
    (data['ph_pt']>SRcuts['ph_pt']*1000.) & \
    (data['met_signif']>SRcuts['met_signif'])

    return {'SR': {'bin_1': ABCDresults(data,SR_mask,isMC)}}

In [26]:
base_path = "/data/mhance/SUSY/ntuples/v3"

# Iterate over subdirectories and files
for root, _, files in os.walk(base_path):
    for file in files:
        if not file.endswith('.root'): continue
        print(file)
        filepath = os.path.join(root, file)
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_data_2018.root": continue
        #if filepath != "/data/mhance/SUSY/ntuples/v3/output_Znunu_CVetoBVeto.root": continue
        #print(filepath)
        with uproot.open(filepath) as f:
            if 'picontuple' in f:
                tree = f['picontuple']
                # Extract the data
                data = tree.arrays(library="np")
                data['met_signif'] = data['met_met']/data['ph_pt']

                results=dumpjson(data,"data_" not in filepath)
                print(json.dumps(results, indent=4, cls=NumpyEncoder))
                with open("ABCD_results/"+file.replace(".root","_ABCD.json"),'w') as jf:
                    json.dump(results, jf, indent=4, cls=NumpyEncoder)

output_Sh_2211_Ztautau_HH_maxHTpTV2_CVetoBVeto.root
{
    "SR": {
        "bin_1": {
            "TT": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "TL": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "LT": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "LL": {
          

In [55]:
totalresults=getemptyresults()
sample_max={}
sample_max['TL']=[0,'']
sample_max['LT']=[0,'']
sample_max['TT']=[0,'']
sample_max['LL']=[0,'']

samples=[]

for fp in glob.glob("ABCD_results/*.json"):
    if "gammajet" in fp: continue
    if "jetjet" in fp: continue
    if "N2" in fp: continue
        
    data = load_json_file(fp)

    sample_tag = fp.replace("_results/output_","").replace("ABCD","").replace(".json","")[:-1]
    samples.append(sample_tag)
    
    for b in ['TT', 'TL', 'LT', 'LL']:
        totalresults[b]['data'] += data["SR"]["bin_1"][b]["data"]
        totalresults[b]['real']['sumweights'] += data["SR"]["bin_1"][b]["real"]["sumweights"]
        totalresults[b]['fake']['sumweights'] += data["SR"]["bin_1"][b]["fake"]["sumweights"]

        if sample_max[b][0] < data["SR"]["bin_1"][b]["real"]["sumweights"]:
            sample_max[b][0] = data["SR"]["bin_1"][b]["real"]["sumweights"]
            sample_max[b][1] = sample_tag
print(json.dumps(totalresults,indent=4,cls=NumpyEncoder))

print("Most contributing samples:")
for b in ['TT', 'TL', 'LT', 'LL']:
    print(f"{b}: {sample_max[b][1][:-1]}")

mcs_data=load_json_file(f"ABCD_results/output_{sample_max['TT'][1]}_ABCD.json")
print(json.dumps(mcs_data,indent=4,cls=NumpyEncoder))

mcs_data=load_json_file(f"ABCD_results/output_Wmunu_ABCD.json")
print(json.dumps(mcs_data,indent=4,cls=NumpyEncoder))

{
    "TT": {
        "real": {
            "nevents": 0,
            "sumweights": 25.041041341377422
        },
        "fake": {
            "nevents": 0,
            "sumweights": 5.870354173704982
        },
        "data": 46
    },
    "TL": {
        "real": {
            "nevents": 0,
            "sumweights": 9.282687488943338
        },
        "fake": {
            "nevents": 0,
            "sumweights": 30.533763958024792
        },
        "data": 42
    },
    "LT": {
        "real": {
            "nevents": 0,
            "sumweights": 5.166242744657211
        },
        "fake": {
            "nevents": 0,
            "sumweights": 15.257648225524463
        },
        "data": 38
    },
    "LL": {
        "real": {
            "nevents": 0,
            "sumweights": 2.147067476529628
        },
        "fake": {
            "nevents": 0,
            "sumweights": 36.91562483552843
        },
        "data": 29
    }
}
Most contributing samples:
TT: Wtaunugamm
TL: Wtau

In [56]:
N_LL = totalresults['LL']['data']-totalresults['LL']['real']['sumweights']
N_TL = totalresults['TL']['data']-totalresults['TL']['real']['sumweights']
N_LT = totalresults['LT']['data']-totalresults['LT']['real']['sumweights']

N_TT_bkg = N_TL*N_LT/N_LL

print(f"N_TT_bkg = ({N_TL:.1f}*{N_LT:.1f})/({N_LL:.1f}) = {N_TT_bkg:.1f}, with N_TT={totalresults['TT']['data']} and N_TT_fake={totalresults['TT']['fake']['sumweights']:.1f} and N_TT_real={totalresults['TT']['real']['sumweights']:.1f}")

N_TT_bkg = (32.7*32.8)/(26.9) = 40.0, with N_TT=46 and N_TT_fake=5.9 and N_TT_real=25.0


Quick function that will test closure for any single sample.

In [75]:
def sampleABCD(sample,debug=False):
    results=None
    if isinstance(sample,str):
        results=load_json_file(f"ABCD_results/output_{sample}_ABCD.json")["SR"]["bin_1"]
        #print(json.dumps(results,indent=4,cls=NumpyEncoder))
    elif isinstance(sample,dict):
        results=sample
    else:
        print("Must provide either valid sample string or dictionary of results.")
        return None

    N_all={}
    for b in ['TT', 'TL', 'LT', 'LL']:
        N_all[b] = results[b]['real']['sumweights']+results[b]['fake']['sumweights']

    num_TL = (N_all['TL']-results['TL']['real']['sumweights'])
    num_LT = (N_all['LT']-results['LT']['real']['sumweights'])
    den_LL = (N_all['LL']-results['LL']['real']['sumweights'])
    N_TT_fake_est = 0.
    if den_LL > 0:
        N_TT_fake_est = num_TL*num_LT/den_LL

    if debug and den_LL > 0 and results['TT']['fake']['sumweights']>0:
        print(f"{sample:42s} {N_TT_fake_est:.3f}  {results['TT']['fake']['sumweights']:.3f}  {(N_TT_fake_est-results['TT']['fake']['sumweights'])/results['TT']['fake']['sumweights']:.3f}")
    return N_TT_fake_est

In [76]:
for s in samples:
    sampleABCD(s,True)

Wenu_BFilter                               0.000  0.040  -1.000
Wenugamma                                  0.000  0.004  -1.000
Znunu_BFilter                              0.038  0.063  -0.405
Sh_2212_lvvv                               0.000  0.083  -1.000
Wtaunu_H_CVetoBVeto                        10.022  1.088  8.212
Sh_2211_WqqZvv                             0.027  0.167  -0.841
Znunu_CFilterBVeto                         1.080  0.969  0.115
Wenu_CVetoBVeto                            0.124  0.162  -0.234
Sh_2211_WlvWqq                             -0.024  0.310  -1.076
Znunugamma                                 0.056  0.041  0.378
Wtaunugamma                                0.078  0.090  -0.136
Sh_2211_WlvZqq                             0.130  0.152  -0.146
Sh_2211_Ztautau_LH_maxHTpTV2_CVetoBVeto    0.000  0.032  -1.000
Wmunugamma                                 0.000  0.013  -1.000
Sh_2212_llvvjj_ss                          0.000  0.009  -1.000
Wtaunu_L_CFilterBVeto                    

This seems to be working.  To do:
* Update cutflow comparisons to target the 0L-low SR and reproduce Ren's numbers.  Then we can actually provide numbers for SR's in data.