In [1]:
# Imports and notebook-wide setup; the warning filters at the bottom of this cell silence coffea warnings for now
from __future__ import annotations

import os
import json
import time
from pathlib import Path

import awkward as ak
import dask
import dask_awkward as dak
import parse
import atlas_schema

import coffea

from atlas_schema.methods import behavior as as_behavior
from atlas_schema.schema import NtupleSchema
from coffea import processor
from coffea.analysis_tools import PackedSelection
from coffea.dataset_tools import apply_to_fileset
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from dask_jobqueue.htcondor import HTCondorCluster
from dask.distributed import LocalCluster
from matplotlib import pyplot as plt
import hist.dask as had

# Compiled `parse` template for file names ending in `_ANALYSIS.root`.
# A successful match exposes the named fields username, dsid, process,
# campaign and version.
# NOTE(review): not referenced by any other cell visible here.
fname_pattern = parse.compile(
    "user.{username:w}.{dsid:d}.{process:S}.{campaign:w}.v{version:.1f}_ANALYSIS.root"
)

# Colour assigned to each process label, in the order the labels are listed.
# The values look like matplotlib colour specs (presumably for plotting; the
# mapping is not used in the cells visible here).
# Spare colours not yet assigned: 'slategrey', 'blueviolet', 'crimson'
colors_dict = dict(
    [
        ("Znunu", "b"),
        ("Wenu", "g"),
        ("Wmunu", "r"),
        ("Wtaunu_L", "c"),
        ("Wtaunu_H", "m"),
        ("Znunugamma", "y"),
        ("Wmunugamma", "k"),
        ("Wenugamma", "brown"),
        ("Wtaunugamma", "pink"),
        ("N2_100_N1_97_WB_signal", "rosybrown"),
        ("Fake/Nonprompt", "lime"),
    ]
)


import warnings
warnings.filterwarnings("ignore", module="coffea.*")
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore')


Now define the coffea processor that applies the event selection and counts photons in the ABCD regions.

In [2]:
class MyProcessor(processor.ProcessorABC):
    """Event preselection plus ABCD photon counting for one dataset.

    ``process`` applies a signal-region-like event selection with the MET cut
    inverted, keeps the events that contain at least one baseline photon, and
    counts those events in the four tight-ID / tight-isolation regions, split
    by the truth type of the first baseline photon.
    """

    # Event-level thresholds, compared directly against the input branches.
    # The 1.0e3 factor suggests the inputs are in MeV -- TODO confirm.
    MET_MAX = 250 * 1.0e3
    LEADING_JET_PT_MIN = 100 * 1.0e3
    MIN_DPHI_JET_MET = 0.4

    # Photon baseline requirements.
    PHOTON_PT_MIN = 10000
    PHOTON_ISEM_MASK = 0x45FC01  # isEM bits that must all be unset
    # |eta| acceptance: below GAP_LOW, or between GAP_HIGH and MAX.
    PHOTON_ABS_ETA_GAP_LOW = 1.37
    PHOTON_ABS_ETA_GAP_HIGH = 1.52
    PHOTON_ABS_ETA_MAX = 2.37

    def __init__(self):
        # can define histograms here
        pass

    def _photon_preselection(self, photons):
        """Return the per-photon baseline mask for a jagged photon collection."""
        abs_eta = abs(photons.eta)
        in_acceptance = (abs_eta < self.PHOTON_ABS_ETA_GAP_LOW) | (
            (abs_eta > self.PHOTON_ABS_ETA_GAP_HIGH)
            & (abs_eta < self.PHOTON_ABS_ETA_MAX)
        )
        return (
            (photons.pt > self.PHOTON_PT_MIN)
            & (photons.select_baseline == 1)
            & ((photons.isEM & self.PHOTON_ISEM_MASK) == 0)
            & in_acceptance
            & (photons.select_or_dR02Ph == 1)
        )

    def process(self, events):
        """Select events and build the lazy result dictionary.

        Parameters
        ----------
        events
            Event collection handed in by coffea; ``events.metadata["dataset"]``
            must be set.

        Returns
        -------
        dict
            ``{<dataset>: {"entries"}, "presel": {"total", "met", "njets"},
            "ABCD": {"A_true", ..., "D_fake"}}``.
        """
        ## TODO: remove this temporary fix when https://github.com/scikit-hep/vector/issues/498 is resolved
        met_dict = {field: events.met[field] for field in events.met.fields}
        met_dict["pt"] = dak.zeros_like(events.met.met)
        met_dict["eta"] = dak.zeros_like(events.met.met)
        events["met"] = dak.zip(met_dict, with_name="MissingET", behavior=as_behavior)

        dataset = events.metadata["dataset"]

        print(f"processing {len(events)} events for {dataset}")
        # Event weights are not applied yet; sketch kept for reference:
        # xs = events.metadata["xs"]
        # lum = events.metadata["luminosity"]
        # process = events.metadata["process"]
        # genFiltEff = events.metadata["genFiltEff"]
        # evt_count = ak.num(events, axis=0).compute()
        # weights = (xs * genFiltEff * lum / evt_count) * np.ones(evt_count)

        leptons = ak.concatenate((events.el, events.mu), axis=1)

        # here are some selection cuts for something that looks like the signal region.
        # the only thing that's different is the MET requirement, which I inverted to be
        # met<250 instead of met>250, to make sure we don't accidentally unblind the SR
        # and to give us some more stats while we study MC samples.
        selections = {
            "met": (events.met.met < self.MET_MAX),
            "lepton_veto": (ak.sum(leptons.pt, axis=1) == 0),
            "leading_jet_pt": (ak.firsts(events.jet.pt) > self.LEADING_JET_PT_MIN),
            "min_dphi_jet_met": (
                ak.min(abs(events.met.delta_phi(events.jet)), axis=1)
                > self.MIN_DPHI_JET_MET
            ),
            "bjet_veto": (ak.sum(events.jet.btag_select, axis=1) == 0),
            "vgamma_overlap": (events["in"]["vgamma_overlap_7"] == 1),
        }

        selection = PackedSelection()
        selection.add_multiple(selections)

        # Name every cut explicitly so the result does not depend on how a
        # no-argument all() is interpreted.
        SR = selection.all(*selections)
        presel_events = events[SR]

        # this selects events with at least one baseline photon
        has_baseline_photon = ak.any(
            self._photon_preselection(presel_events.ph), axis=1
        )
        ph_presel_data = presel_events[has_baseline_photon]

        # Recompute the mask on the reduced sample.  argmax over a boolean mask
        # gives the index of the first preselected photon (which should be the
        # leading preselected photon); keepdims keeps it usable as an index.
        ph_preselection = self._photon_preselection(ph_presel_data.ph)
        indices = ak.argmax(ph_preselection, axis=1, keepdims=True)
        lead_photon = ph_presel_data.ph[indices]

        # per-event flags of that photon
        ph_tight = ak.firsts(lead_photon.select_tightID) == 1
        ph_iso = ak.firsts(lead_photon.select_tightIso) == 1
        truth_type = ak.firsts(lead_photon.truthType)
        ph_truth = (truth_type != 0) & (truth_type != 16)

        # ABCD regions: tight/non-tight ID crossed with isolated/non-isolated.
        region_masks = {
            "A": ph_tight & ~ph_iso,
            "B": ~ph_tight & ~ph_iso,
            "C": ph_tight & ph_iso,
            "D": ~ph_tight & ph_iso,
        }
        lead_pt = lead_photon.pt
        abcd = {}
        for truth_label, truth_mask in (("true", ph_truth), ("fake", ~ph_truth)):
            for region, region_mask in region_masks.items():
                abcd[f"{region}_{truth_label}"] = ak.num(
                    lead_pt[region_mask & truth_mask][:, 0], axis=0
                )

        return {
            dataset: {
                "entries": ak.num(events, axis=0)
            },
            "presel": {
                "total": ak.num(presel_events, axis=0),
                "met": presel_events.met.met,
                "njets": ak.num(presel_events.jet.pt, axis=1)
            },
            "ABCD": abcd,
        }

    def postprocess(self, accumulator):
        """Return the accumulator unchanged; no post-processing is needed."""
        # Returning the accumulator (instead of None) keeps the results intact
        # for any caller that uses the return value of postprocess.
        return accumulator

In [4]:
# Run MyProcessor over the chosen fileset on a dask cluster and time the run.
start_time = time.time()

my_processor = MyProcessor()

# load in a bunch of datasets
# (use "af_v2_2_mc.json" instead of the one-file list for the full samples)
dataset_runnable = json.loads(Path("af_v2_2_mc_onefile.json").read_text())

cluster = None
dataset_to_run = None

# Set to True to run every dataset on HTCondor; otherwise only `datasettag`
# is processed on a local cluster.
can_submit_to_condor = False
datasettag = 'Znunugamma'

if can_submit_to_condor:
    # To facilitate usage with HTCondor
    cluster = HTCondorCluster(
        log_directory=Path.cwd() / ".condor_logs" / "cutflows_v2",
        cores=4,
        memory="4GB",
        disk="2GB",
    )
    cluster.scale(jobs=100)

    # if we're running over all samples, ensure that here
    dataset_to_run = dataset_runnable
else:
    # Fail early, with the list of valid tags, before starting a cluster.
    if datasettag not in dataset_runnable:
        raise KeyError(
            f"dataset tag {datasettag!r} not found; available: {sorted(dataset_runnable)}"
        )
    cluster = LocalCluster()
    dataset_to_run = {datasettag: dataset_runnable[datasettag]}


client = Client(cluster)

print("Applying to fileset")
out = apply_to_fileset(
    my_processor,
    dataset_to_run,
    schemaclass=NtupleSchema,
)

print("Beginning of dask.compute()")

# Add progress bar for dask.  Using it as a context manager removes the
# callback again afterwards, so re-running this cell does not stack up
# additional progress bars.
pbar = ProgressBar()
with pbar:
    (computed,) = dask.compute(out)
end_time = time.time()

print("Execution time: ", end_time - start_time)
print("Finished dask.compute")

Applying to fileset
processing 508801 events for Znunugamma
Beginning of dask.compute()
[########################################] | 100% Completed | 206.11 ms


Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


Execution time:  573.7915887832642
Finished dask.compute


In [5]:
# Dump the full result, then only the preselection summary of the dataset
# selected by `datasettag` in the run cell (avoids a second hard-coded name).
print(computed)
print(computed[datasettag]['presel'])

{'Znunugamma': {'Znunugamma': {'entries': 508801}, 'presel': {'total': np.int64(116881), 'met': <Array [2.26e+05, 2.08e+05, ..., 2.35e+05] type='116881 * float32[parameter...'>, 'njets': <Array [2, 2, 7, 2, 3, 4, 5, 2, ..., 2, 2, 3, 2, 2, 4, 3] type='116881 * int64'>}, 'ABCD': {'A_true': np.int64(24384), 'B_true': np.int64(2221), 'C_true': np.int64(65414), 'D_true': np.int64(3464), 'A_fake': np.int64(598), 'B_fake': np.int64(858), 'C_fake': np.int64(163), 'D_fake': np.int64(166)}}}
{'total': np.int64(116881), 'met': <Array [2.26e+05, 2.08e+05, ..., 2.35e+05] type='116881 * float32[parameter...'>, 'njets': <Array [2, 2, 7, 2, 3, 4, 5, 2, ..., 2, 2, 3, 2, 2, 4, 3] type='116881 * int64'>}
