In [1]:
import uproot
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os #for looping over files in a directory
import math
import json
import glob

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to plain Python values.

    numpy integers become ``int``, numpy floats become ``float`` and
    ``ndarray`` objects become (nested) lists.  Every other object is handed
    to ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    # (numpy type, converter) pairs, checked in order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda arr: arr.tolist()),
    )

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``."""
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

def load_json_file(file_path):
    """
    Read a JSON document from disk.

    This is a best-effort loader: any failure is reported with ``print`` and
    turned into a ``None`` return value instead of an exception.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict or list: The decoded JSON content, or None if the file is
        missing, is not valid JSON, or any other error occurs.
    """
    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except FileNotFoundError:
        message = f"Error: File not found at '{file_path}'"
    except json.JSONDecodeError:
        message = f"Error: Invalid JSON format in '{file_path}'"
    except Exception as e:
        message = f"An unexpected error occurred: {e}"

    # Only reached when one of the handlers above ran.
    print(message)
    return None

In [55]:
# Selection thresholds.  Some cuts (mindphijetmet, nbjet=0, nlepton=0) are
# already applied at preselection level for now.
# dumpjson() reads 'met_met', 'j1_pt' and 'ph_pt' and multiplies each by 1000
# before comparing; the 'mTGammaMet' and 'met_signif' thresholds are written
# as literals inside dumpjson() rather than read from here.
SRcuts = dict(
    met_met=200,
    mTGammaMet=50,
    j1_pt=150,
    ph_pt=10,
    met_signif=25,
)

def getemptyresults():
    """Return a zero-initialised ABCD results structure.

    The result has one entry per bin ('TT', 'TL', 'LT', 'LL').  Each entry
    holds 'real', 'fake' and 'other' sub-dicts with 'nevents' and
    'sumweights' set to 0, plus a 'data' counter set to 0.  Every call builds
    fresh dicts, so callers can update the result in place.
    """
    def _zero_counts():
        return {'nevents': 0, 'sumweights': 0}

    return {
        b: {'real': _zero_counts(),
            'fake': _zero_counts(),
            'other': _zero_counts(),
            'data': 0}
        for b in ('TT', 'TL', 'LT', 'LL')
    }

def ABCDresults(data,mask,isMC):
    """Count the events selected by ``mask`` in each of the four ABCD bins.

    Bin names are two letters: the first reflects ``ph_select_tightID``
    (T when it equals 1, L when it equals 0) and the second reflects
    ``ph_select_tightIso`` in the same way.  Every bin also requires
    ``(ph_isEM & 0x45fc01) == 0``.

    Args:
        data: mapping from branch name to array; the branches read here are
            'ph_select_tightID', 'ph_select_tightIso', 'ph_isEM' and, for MC,
            'ph_truthprompt', 'ph_truthJFP' and 'weight_total'.
        mask: boolean array selecting the region of interest.
        isMC: if true, fill the 'real' / 'fake' / 'other' counts and weight
            sums; otherwise fill only the 'data' count.

    Returns:
        dict shaped like ``getemptyresults()`` with the relevant fields filled.
    """
    bins = ['TT', 'TL', 'LT', 'LL']

    # NOTE(review): 0x45fc01 is presumably the set of isEM bits that must
    # still pass for the relaxed ("loose") photon ID -- confirm the bit mask.
    isem_ok = (data['ph_isEM'] & 0x45fc01) == 0
    id_pass = data['ph_select_tightID'] == 1
    id_fail = data['ph_select_tightID'] == 0
    iso_pass = data['ph_select_tightIso'] == 1
    iso_fail = data['ph_select_tightIso'] == 0

    masks = {
        'TT': id_pass & iso_pass & isem_ok,
        'TL': id_pass & iso_fail & isem_ok,
        'LT': id_fail & iso_pass & isem_ok,
        'LL': id_fail & iso_fail & isem_ok,
    }

    results = getemptyresults()

    if not isMC:
        # Data: plain event counts, no truth information or weights.
        for b in bins:
            results[b]['data'] = np.sum(mask & masks[b])
        return results

    # MC: split by truth category; 'other' is whatever is neither real nor fake.
    is_real = data['ph_truthprompt'] == 1
    is_fake = data['ph_truthJFP'] == 1
    truth = {
        'real': is_real,
        'fake': is_fake,
        'other': ~is_real & ~is_fake,
    }
    weights = data['weight_total']

    for b in bins:
        in_bin = mask & masks[b]
        for category, truth_mask in truth.items():
            selected = in_bin & truth_mask
            results[b][category]['nevents'] = np.sum(selected)
            results[b][category]['sumweights'] = np.sum(weights[selected])
    return results

def dumpjson(data,isMC):
    """Build the region masks and return the ABCD counts for each region.

    A common '0L' preselection is combined with region-specific cuts to form
    three signal regions (low / mid / high ``mTGammaMet``) and one validation
    region (mid ``mTGammaMet``).  Each mask is passed to ``ABCDresults``.

    Args:
        data: mapping from branch name to array, as produced by
            ``tree.arrays(library="np")`` in this notebook.
        isMC: forwarded unchanged to ``ABCDresults``.

    Returns:
        dict of the form ``{'SR': {name: results, ...}, 'VR': {name: results}}``.
    """
    # Thresholds are multiplied by 1000 before comparison
    # (presumably GeV -> MeV; confirm against the ntuple units).
    scale = 1000.

    presel_0L = (
        (data['met_met'] > SRcuts['met_met']*scale)
        & (data['j1_pt'] > SRcuts['j1_pt']*scale)
        & (data['ph_pt'] > SRcuts['ph_pt']*scale)
        & (data['nBTagJets'] == 0)
        & (data['mindPhiJetMet'] > 0.4)
        & (data['nElectrons'] == 0)
        & (data['nMuons'] == 0)
        & (data['nTau20_baseline'] == 0)
    )

    mT = data['mTGammaMet']
    met_signif = data['met_signif']

    # Building blocks shared by several regions.
    mT_low = mT < 50.*scale
    mT_mid = (mT > 50*scale) & (mT < 115*scale)
    mT_high = mT > 115*scale
    photon_away_from_jets = data['mindPhiGammaJet'] > 1.5
    photon_away_from_j1 = data['dPhiGammaJ1'] > 1.5

    # Note: the low-mT signal region has no dPhiGammaJ1 requirement.
    selections = {
        'SR': {
            '0L-mT-low': presel_0L & mT_low & (met_signif > 25) & photon_away_from_jets,
            '0L-mT-mid': presel_0L & mT_mid & (met_signif > 20) & photon_away_from_jets & photon_away_from_j1,
            '0L-mT-hgh': presel_0L & mT_high & (met_signif > 15) & photon_away_from_jets & photon_away_from_j1,
        },
        'VR': {
            '0L-mT-mid': presel_0L & mT_mid & (data['dPhiGammaMet'] > 2.0),
        },
    }

    return {
        region_type: {
            region_name: ABCDresults(data, region_mask, isMC)
            for region_name, region_mask in named_masks.items()
        }
        for region_type, named_masks in selections.items()
    }

In [56]:
# Walk the ntuple directory, run the ABCD counting on every ROOT file that
# contains a 'picontuple' tree, and write one JSON summary per input file.
base_path = "/data/mhance/SUSY/ntuples/v3"
output_dir = "ABCD_results"

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Iterate over subdirectories and files
for root, _, files in os.walk(base_path):
    for file in files:
        if not file.endswith('.root'): continue
        print(file)
        filepath = os.path.join(root, file)
        # To debug a single sample, add e.g.
        #   if file != "output_data_2018.root": continue
        with uproot.open(filepath) as f:
            if 'picontuple' in f:
                tree = f['picontuple']
                # Extract the data (all branches, as a dict of numpy arrays)
                data = tree.arrays(library="np")

                # Decide data vs MC from the file name only, so a directory
                # whose name happens to contain "data_" cannot mislabel MC.
                isMC = "data_" not in file
                results=dumpjson(data,isMC)

                # Swap only the trailing extension; str.replace would touch
                # every ".root" occurrence in the name.
                json_name = file[:-len(".root")] + "_ABCD.json"
                with open(os.path.join(output_dir, json_name),'w') as jf:
                    json.dump(results, jf, indent=4, cls=NumpyEncoder)

output_Sh_2211_Ztautau_HH_maxHTpTV2_CVetoBVeto.root
{
    "SR": {
        "0L-mT-low": {
            "TT": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "other": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "TL": {
                "real": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "fake": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "other": {
                    "nevents": 0,
                    "sumweights": 0.0
                },
                "data": 0
            },
            "LT": {
                "real": {
                    "nevents": 0,
     

In [57]:
def getfakeestimate(regiontype="SR",regionname="0L-mT-low",debug=False):
    """Sum the per-sample ABCD results for one region over all JSON files.

    Reads every ``ABCD_results/*.json`` file except those whose name contains
    "gammajet", "jetjet" or "N2", and adds up the 'data' counts and the
    'real' / 'fake' / 'other' event counts and weight sums per ABCD bin.
    Files that cannot be loaded are skipped (``load_json_file`` prints why).

    Args:
        regiontype: top-level key in each JSON file (e.g. "SR" or "VR").
        regionname: region key below ``regiontype`` (e.g. "0L-mT-low").
        debug: if true, print the summed results, the sample with the largest
            'real' weight sum in each bin, and the full JSON of the sample
            that leads in the TT bin.

    Returns:
        dict shaped like ``getemptyresults()`` holding the summed values.

    Raises:
        KeyError: if a loaded file has no ``regiontype`` / ``regionname`` entry.
    """
    bins = ['TT', 'TL', 'LT', 'LL']
    categories = ['real', 'fake', 'other']

    totalresults=getemptyresults()
    # Per bin: [largest 'real' weight sum so far, sample tag, path of its JSON].
    sample_max = {b: [0, '', None] for b in bins}

    # Sorted so the floating-point sums do not depend on directory order.
    for fp in sorted(glob.glob("ABCD_results/*.json")):
        fname = os.path.basename(fp)
        if "gammajet" in fname: continue
        if "jetjet" in fname: continue
        if "N2" in fname: continue

        data = load_json_file(fp)
        if data is None:
            # Unreadable file: already reported by load_json_file.
            continue

        # "output_<tag>_ABCD.json" -> "<tag>"
        sample_tag = fname[:-len(".json")]
        if sample_tag.startswith("output_"):
            sample_tag = sample_tag[len("output_"):]
        if sample_tag.endswith("_ABCD"):
            sample_tag = sample_tag[:-len("_ABCD")]

        region=data[regiontype][regionname]

        for b in bins:
            totalresults[b]['data'] += region[b]["data"]
            for cat in categories:
                totalresults[b][cat]['nevents'] += region[b][cat]["nevents"]
                totalresults[b][cat]['sumweights'] += region[b][cat]["sumweights"]

            if sample_max[b][0] < region[b]["real"]["sumweights"]:
                sample_max[b] = [region[b]["real"]["sumweights"], sample_tag, fp]

    if debug:
        print(json.dumps(totalresults,indent=4,cls=NumpyEncoder))

        print("Most contributing samples:")
        for b in bins:
            # The tag is already trimmed; print it in full.
            print(f"{b}: {sample_max[b][1]}")

        # Only reload when some sample actually had a positive prompt yield in TT.
        top_path = sample_max['TT'][2]
        if top_path is not None:
            mcs_data=load_json_file(top_path)
            print(json.dumps(mcs_data,indent=4,cls=NumpyEncoder))

    return totalresults

In [58]:
# Region family to evaluate; switch to "SR" (with a matching region name)
# to run on a signal region instead.
regiontype = "VR"
# The TT data yield is hidden only when looking at a signal region.
blindTT = regiontype == "SR"
totalresults = getfakeestimate(regiontype=regiontype, regionname="0L-mT-mid", debug=False)

In [59]:
# Per-bin yield used in the ABCD formula: data minus the MC 'real' weight sum.
# NOTE(review): the 'other' MC category is not subtracted here but is added
# back to the TT prediction below -- confirm this is the intended treatment.
N = {b: totalresults[b]['data']-totalresults[b]['real']['sumweights']
     for b in ['TT', 'TL', 'LT', 'LL']}

for b in ['TT', 'TL', 'LT', 'LL']:
    if blindTT and b == 'TT':
        continue  # keep the TT data yield hidden while blinded
    print(f"{b}: {totalresults[b]['data']:6.1f}   {totalresults[b]['real']['sumweights']:6.1f}")

# Data-driven fake estimate in TT: N_TL * N_LT / N_LL, or 0 when N_LL is not positive.
N_TT_bkg_DDfake = N['TL']*N['LT']/N['LL'] if N['LL']>0 else 0

# MC components in the TT bin.
N_TT_bkg_real = totalresults['TT']['real']['sumweights']
N_TT_bkg_other = totalresults['TT']['other']['sumweights']
N_TT_bkg_MCfake = totalresults['TT']['fake']['sumweights']

# Total predictions: same real and other terms, fake term from MC or from data.
N_TT_bkg_MC = N_TT_bkg_MCfake + N_TT_bkg_real + N_TT_bkg_other
N_TT_bkg_DD = N_TT_bkg_DDfake + N_TT_bkg_real + N_TT_bkg_other

print(f"N_TT_bkg = ({N['TL']:.1f}*{N['LT']:.1f})/({N['LL']:.1f}) = {N_TT_bkg_DDfake:.1f}, N_TT_fake={N_TT_bkg_MCfake:.1f}, N_TT_other={N_TT_bkg_other:.1f}, and N_TT_real={N_TT_bkg_real:.1f}")

if not blindTT:
    print(f"Total data in TT region is {totalresults['TT']['data']:.1f}.")
    print(f"DD background prediction: {N_TT_bkg_real:5.1f} (real) + {N_TT_bkg_DDfake:.1f} (fake) + {N_TT_bkg_other:.1f} (other) = {N_TT_bkg_DD:.1f}")
    print(f"MC background prediction: {N_TT_bkg_real:5.1f} (real) + {N_TT_bkg_MCfake:.1f} (fake) + {N_TT_bkg_other:.1f} (other) = {N_TT_bkg_MC:.1f}")

TT: 1746.0    262.9
TL: 2292.0    134.8
LT: 1208.0     40.1
LL: 1966.0     26.0
N_TT_bkg = (2157.2*1167.9)/(1940.0) = 1298.6, N_TT_fake=140.4, N_TT_other=508.7, and N_TT_real=262.9
Total data in TT region is 1746.0.
DD background prediction: 262.9 (real) + 1298.6 (fake) + 508.7 (other) = 2070.2
MC background prediction: 262.9 (real) + 140.4 (fake) + 508.7 (other) = 912.0


Quick function that will test closure for any single sample.

In [61]:
def sampleABCD(sample,debug=False,regiontype="VR",regionname="0L-mT-mid"):
    """Run the ABCD fake estimate on a single sample (closure test).

    Args:
        sample: either a sample tag (str), in which case
            ``ABCD_results/output_<sample>_ABCD.json`` is loaded and the
            ``regiontype`` / ``regionname`` entry is used, or a dict of
            per-bin results shaped like one region entry of that file.
        debug: if true, print one table row with the TT 'real' weight sum,
            the ABCD estimate, the TT 'fake' and 'other' weight sums and,
            when both the LL denominator and the TT 'fake' sum are positive,
            the relative difference (estimate - fake) / fake.
        regiontype: region family to read when ``sample`` is a string.
        regionname: region name to read when ``sample`` is a string.

    Returns:
        float: the estimate ``TL * LT / LL`` built from the non-'real' part
        of each bin, or 0.0 when the LL denominator is not positive.
        None if ``sample`` has an unsupported type or its file cannot be loaded.
    """
    if isinstance(sample,str):
        label = sample
        loaded = load_json_file(f"ABCD_results/output_{sample}_ABCD.json")
        if loaded is None:
            # load_json_file already printed the reason.
            return None
        sresults = loaded[regiontype][regionname]
    elif isinstance(sample,dict):
        # A dict has no name and cannot be formatted with ':52s'.
        label = "<results dict>"
        sresults = sample
    else:
        print("Must provide either valid sample string or dictionary of results.")
        return None

    # Pseudo-data per bin: real + fake MC, mirroring the data-driven procedure.
    N_all={}
    for b in ['TT', 'TL', 'LT', 'LL']:
        N_all[b] = sresults[b]['real']['sumweights']+sresults[b]['fake']['sumweights']

    # Subtract the real component again, as done for data.
    num_TL = (N_all['TL']-sresults['TL']['real']['sumweights'])
    num_LT = (N_all['LT']-sresults['LT']['real']['sumweights'])
    den_LL = (N_all['LL']-sresults['LL']['real']['sumweights'])
    N_TT_fake_est = 0.
    if den_LL > 0:
        N_TT_fake_est = num_TL*num_LT/den_LL

    if debug:
        tt = sresults['TT']
        mc_fake = tt['fake']['sumweights']
        row = f"{label:52s} {tt['real']['sumweights']:6.1f}  {N_TT_fake_est:6.1f}  {mc_fake:6.1f}  {tt['other']['sumweights']:6.1f}"
        if den_LL > 0 and mc_fake > 0:
            # Relative non-closure, only defined for a positive MC fake yield.
            row += f"   {(N_TT_fake_est-mc_fake)/mc_fake:6.1f}"
        print(row)

    return N_TT_fake_est

In [67]:
# Build the sample list here: no earlier cell defines a global `samples`
# (getfakeestimate only used a local list), so this cell must not rely on it.
# Same file selection as getfakeestimate.
samples = []
for fp in sorted(glob.glob("ABCD_results/*.json")):
    fname = os.path.basename(fp)
    if any(veto in fname for veto in ("gammajet", "jetjet", "N2")):
        continue
    if fname.startswith("output_") and fname.endswith("_ABCD.json"):
        samples.append(fname[len("output_"):-len("_ABCD.json")])

# Single quotes inside the f-string keep the header valid before Python 3.12.
print(f"{'Sample':52s} {'Prompt':6s}    {'ABCD':6s} {'MC fakes':6s}  {'EFP/other':6s}   {'(ABCD-MC)/MC':6s}")
for s in samples:
    # First pass is silent; only samples with a positive estimate get a row.
    est=sampleABCD(s,False)
    if est is not None and est>0.:
        sampleABCD(s,True)

Sample                                               Prompt    ABCD   MC fakes  EFP/Other   (ABCD-MC)/MC
Ztautaugamma                                            0.6     0.0    -0.0     0.0
Wtaunu_L_BFilter                                        0.0     0.3     0.1     0.9      1.9
Sh_2211_ZbbZvv                                          0.0     0.0     0.0     0.0      0.2
Wtaunu_H_CFilterBVeto                                   0.0     3.5     2.2    11.1      0.6
Sh_2212_lllv                                            0.0     0.0     0.0     0.0     -0.9
Wenugamma                                              16.9     0.2     0.1     1.0      0.1
Sh_2212_llvv_ss                                         0.0     0.0     0.0     0.0     -0.9
Znunu_BFilter                                          -0.0     1.9     2.9     6.4     -0.3
Sh_2211_Ztautau_LL_maxHTpTV2_CFilterBVeto               0.0     0.0     0.0     0.0     -0.4
Zmumugamma                                              0.7     0.0

This seems to be working.  To do:
* Implement some other regions