In [1]:
from servicex import ProgressBarFormat
from servicex import query, dataset, deliver
import os
import yaml
# BEARER_TOKEN_FILE has to be unset for now.
# pop() with a default keeps this cell re-runnable: `del` raises KeyError when
# the variable is not set (fresh environment, or a second run of the cell).
os.environ.pop('BEARER_TOKEN_FILE', None)

In [2]:
# Input files for the ServiceX request. Alternative sources are kept here,
# commented out, so they can be switched in by hand.
filelist = [
    # "root://xcache.cmsaf-dev.flatiron.hollandhpc.org//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/70000/92DF66B4-2A9E-4A49-8B43-A1B2F4B24379.root"#,  # noqa: E501
    # "root://xcache//store/data/Run2018A/DoubleMuon/NANOAOD/02Apr2020-v1/30000/0555868D-6B32-D249-9ED1-6B9A6AABDAF7.root"
    # "root://cmsxrootd.fnal.gov//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/70000/92DF66B4-2A9E-4A49-8B43-A1B2F4B24379.root",
    "root://cmsxrootd.fnal.gov//store/data/Run2018A/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9-v2/2550000/171C5530-5150-7C44-9029-40794C1FDBF5.root",
]

# Fields requested for every object collection that carries a four-vector.
_P4_FIELDS = ["pt", "eta", "phi", "mass"]

# Branch group -> fields to read. Fields of the special "event" group are used
# as bare branch names; every other field is read as "<group>_<field>".
branches = {
    "Muon": _P4_FIELDS + ["miniIsoId", "tightId", "charge"],
    "FatJet": ["particleNet_TvsQCD"] + _P4_FIELDS,
    "Jet": ["btagDeepB", "jetId"] + _P4_FIELDS,
    "PuppiMET": ["pt", "phi"],
    "HLT": ["Mu50"],
    "Pileup": ["nTrueInt"],
    "event": ["genWeight", "run", "luminosityBlock", "event"],
}

# Subset of `branches` that is MC-specific.
mc_branches = {
    "event": ["genWeight"],
    "Pileup": ["nTrueInt"],
}

# Groups whose fields are emitted as flat top-level entries in the query
# rather than as one zipped record.
one_D_branches = ["PuppiMET", "Pileup"]

preprocess_config = {
    "branches": branches,
    "ignore_missing": False,  # TODO: is this implemented?
    "mc_branches": mc_branches,
    # "skimming": skimming_config,
}

# True when the input files are real data rather than simulation.
data = True


In [3]:
if data:
    # Data run: strip the MC-specific entries listed in `mc_branches` from the
    # matching group of `branches`. The assignment edits the dict in place, so
    # `preprocess_config["branches"]` (the same object) sees the filtered lists.
    for group in mc_branches:
        if group not in branches:
            continue
        mc_only = mc_branches[group]
        branches[group] = [name for name in branches[group] if name not in mc_only]

    # Drop groups that ended up empty. This rebinds `branches` to a new dict;
    # the dict referenced by `preprocess_config` keeps its empty groups.
    branches = dict((group, names) for group, names in branches.items() if names)

# Branch map actually requested from ServiceX.
need_these_branches = branches

In [4]:
# Event-level cuts, appended in this order after the branch selection of the
# generated query. Each cut reads the flat keys produced by that selection.
skim_selections = [
    f"Where( lambda e : {cut} )"
    for cut in (
        "e['HLT_Mu50']",
        "e['PuppiMET_pt'] > 50",
    )
]

In [5]:

def _selection_expression(all_branches, one_D_branches):
    """Return the dictionary literal used inside the query's ``Select`` lambda.

    Every field of a group other than ``"event"`` is read as
    ``events['<group>_<field>']``. Groups with a single field, and groups named
    in ``one_D_branches``, become flat ``'<group>_<field>'`` entries; all other
    groups become one nested record followed by ``.Zip()``. Fields of the
    ``"event"`` group are read by their bare name and placed last. Groups
    without any field are skipped.

    Raises:
        ValueError: if no branch at all ends up in the selection.
    """
    selection = {}
    for group, fields in all_branches.items():
        if group == "event":
            continue
        columns = {field: f"events['{group}_{field}']" for field in fields}
        if len(columns) == 1 or group in one_D_branches:
            for field, column in columns.items():
                selection[f"{group}_{field}"] = column
        elif columns:
            record = ", ".join(f"{field!r}: {column}" for field, column in columns.items())
            selection[group] = "{" + record + "}.Zip()"

    for name in all_branches.get("event", []):
        selection[name] = f"events['{name}']"

    if not selection:
        raise ValueError("no branches requested: the ServiceX selection would be empty")
    return "{" + ", ".join(f"{key!r}: {expr}" for key, expr in selection.items()) + "}"


def generate_servicex_yaml(all_branches, one_D_branches, filelist, sample_name="Sample",
                           selections=None, output_path="../../servicex_query.yaml"):
    """
    Write a ServiceX submission YAML holding one FuncADL_Uproot sample.

    Since AST / qastle only accept literal strings as inputs, the query has to
    be generated as text beforehand (or on the fly) and handed to ServiceX as
    a file.

    Args:
        all_branches: mapping of branch group -> list of field names. The group
            "event" holds branches that are read by their bare name.
        one_D_branches: groups whose fields are emitted as flat entries rather
            than as one zipped record.
        filelist: input file URLs, written as the sample's !FileList.
        sample_name: name of the sample in the spec.
        selections: query fragments (e.g. "Where(...)") chained after the
            Select; defaults to the notebook-level ``skim_selections``.
        output_path: where the YAML is written.

    Raises:
        ValueError: if ``all_branches`` selects nothing.
    """
    if selections is None:
        # Backward-compatible default: the cuts defined at notebook level.
        selections = skim_selections

    selection_dict = _selection_expression(all_branches, one_D_branches)
    # Chain each skim with its own leading dot so that an empty list does not
    # leave a dangling "." at the end of the query.
    query = f"FromTree('Events').Select(lambda events: {selection_dict})" + "".join(
        f".{step}" for step in selections
    )
    # A single quote inside a YAML single-quoted scalar is escaped by doubling it.
    quoted_name = str(sample_name).replace("'", "''")

    # The "\n  " separator indents every line after "Sample:" by two extra
    # spaces; the leading spaces of the entries below are relative to that.
    spec = "\n  ".join(
        [
            "Sample:",
            f"- Name: '{quoted_name}'",
            "  Dataset: !FileList",
            f"    {list(filelist)}",
            "  Query: !FuncADL_Uproot |",
            f"          {query}",
        ]
    )
    with open(output_path, "w") as f:
        f.write(spec)

    print(f"Saved {os.path.basename(output_path)}")


In [6]:
# Write the query spec for the single-muon sample. The sample name is the key
# under which the delivered files are looked up further down.
generate_servicex_yaml(
    need_these_branches,
    one_D_branches,
    filelist,
    sample_name="Single Muon",
)

Saved servicex_query.yaml


In [7]:
# Submit the generated spec to ServiceX with the local cache bypassed
# (ignore_local_cache=True). The recorded output below shows the result as a
# mapping from sample name to a list of local file paths.
files = deliver("../../servicex_query.yaml", ignore_local_cache=True)
print(f"Files: {files}")

Output()

Files: {'Single Muon': ['/tmp/servicex_cms-jovyan/597c9e0b-24dd-4a1d-a1c2-166b2be28db8/root___cmsxrootd.fnal.gov__store_data_Run2018A_SingleMuon_NANOAOD_UL2018_MiniAODv2_NanoAODv9-v2_2550000_171C5530-5150-7C44-9029-40794C1FDBF5.root']}


In [8]:
# Test: the request must have delivered at least one file for the sample.
import uproot

if len(files['Single Muon']) == 0:
    # `raise "text"` is itself a TypeError (exceptions must derive from
    # BaseException), so raise a real exception type.
    raise RuntimeError("Incorrect run")
output = files['Single Muon'][0]
# The file handle is deliberately left open: `t` is read by the cells below.
f = uproot.open(output)
t = f['servicex;1']

In [9]:
# List the branch names stored in the delivered tree.
t.keys()

['nMuon',
 'Muon_pt',
 'Muon_eta',
 'Muon_phi',
 'Muon_mass',
 'Muon_miniIsoId',
 'Muon_tightId',
 'Muon_charge',
 'nFatJet',
 'FatJet_particleNet_TvsQCD',
 'FatJet_pt',
 'FatJet_eta',
 'FatJet_phi',
 'FatJet_mass',
 'nJet',
 'Jet_btagDeepB',
 'Jet_jetId',
 'Jet_pt',
 'Jet_eta',
 'Jet_phi',
 'Jet_mass',
 'PuppiMET_pt',
 'PuppiMET_phi',
 'HLT_Mu50',
 'run',
 'luminosityBlock',
 'event']

In [10]:
# Read back the HLT_Mu50 branch. The skim filters on this flag, so presumably
# every entry is True — worth confirming by eye.
t['HLT_Mu50'].arrays()

In [11]:
# Read back the PuppiMET_pt branch. The skim requires PuppiMET_pt > 50, so
# presumably all values exceed 50 — worth confirming by eye.
t['PuppiMET_pt'].arrays()