# Creating filesets of datasets

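This assumes that the datasets have already been published in DAS.
The output files are JSON files: the full fileset is written to `fileset_2017_das.json`, and the optional per-chunk files are written to the `infiles` directory.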

In [1]:
import os 
import json 

In [2]:
"""Make dictionary of datasets and its files"""
filesets = {}

# maximum files per json
maxfiles_per_json = 20

filesets['2017'] = {}

# From Jan PFNano production
# this is preUL
datasets_preUL = !dasgoclient --query="dataset=/*/cmantill-RunIIFall17Jan22-PU2017_12Apr2018_94X_mc2017_realistic_v14-*-48cf03c3f8bf040d30c72392be4bf937/USER instance=prod/phys03"
# this is UL
datasets_UL = !dasgoclient --query="dataset=/*/cmantill-RunIISummer19UL17Jan22-106X_mc2017_realistic_v6-v*-3c6a7e6cdd956a176f75369cad3a5feb/USER instance=prod/phys03"

datasets = datasets_preUL + datasets_UL
for dataset in datasets:
    newquery = "file dataset=%s instance=prod/phys03"%dataset
    filearray = !dasgoclient --query="file ${newquery}"
    # warning - this only works for datasets  in FNAL
    fileeosarray = [f.replace('/store/','root://cmseos.fnal.gov//store/') for f in filearray]
    filesets['2017'][dataset.split('/')[1]] = fileeosarray
    

In [3]:
# Show which samples were collected, then persist the whole 2017 fileset
# (sample name -> list of file URLs) as one pretty-printed, key-sorted JSON file.
print(filesets['2017'].keys())
with open("fileset_2017_das.json", 'w') as out:
    out.write(json.dumps(filesets['2017'], indent=4, sort_keys=True))

dict_keys(['BulkGravTohhTohVVhbb_narrow_M-1000_TuneCP5_13TeV-madgraph-pythia8', 'BulkGravTohhTohVVhbb_narrow_M-1400_TuneCP5_13TeV-madgraph-pythia8', 'BulkGravTohhTohVVhbb_narrow_M-1800_TuneCP5_13TeV-madgraph-pythia8', 'BulkGravTohhTohVVhbb_narrow_M-2000_TuneCP5_13TeV-madgraph-pythia8', 'BulkGravTohhTohVVhbb_narrow_M-2500_TuneCP5_13TeV-madgraph-pythia8', 'BulkGravTohhTohVVhbb_narrow_M-4500_TuneCP5_13TeV-madgraph-pythia8', 'GluGluHToWWToLNuQQ_M125_NNPDF31_TuneCP5_PSweights_13TeV_powheg_JHUGen710_pythia8', 'GluGluToHHTo2B2VLNu2J_node_cHHH0_TuneCP5_PSWeights_13TeV-powheg-pythia8', 'GluGluToHHTo2B2VLNu2J_node_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8', 'GluGluToHHTo2B2VLNu2J_node_cHHH2p45_TuneCP5_PSWeights_13TeV-powheg-pythia8', 'GluGluToHHTo2B2VLNu2J_node_cHHH5_TuneCP5_PSWeights_13TeV-powheg-pythia8', 'GluGluToHHTo2B2WToLNu2J_node_10_TuneCP5_PSWeights_13TeV-madgraph-pythia8', 'GluGluToHHTo2B2WToLNu2J_node_11_TuneCP5_PSWeights_13TeV-madgraph-pythia8', 'GluGluToHHTo2B2WToLNu2J_node_12_Tun

In [None]:
def split(year,sets,maxfiles_per_json):
    """Split the fileset of one year into JSON files of bounded size.

    Every file listed under ``sets[year]`` is paired with its dataset name,
    the resulting flat list is cut into consecutive chunks of at most
    ``maxfiles_per_json`` files, and each chunk is written to
    ``infiles/<year>_<i>.json`` as a ``{dataset: [file, ...]}`` dictionary.
    The ``infiles`` directory is created if it does not exist.

    Parameters
    ----------
    year : key into ``sets``; also used in the output file names.
    sets : dict mapping year -> {dataset name: list of file paths/URLs}.
    maxfiles_per_json : maximum number of files stored in one JSON file.

    Returns
    -------
    None. Prints the number of JSON files that were written.
    """
    prefix = 'root://cmsxrootd.fnal.gov/'

    fsets = []
    for n, flist in sets[year].items():
        for f in flist:
            # Entries that already carry a protocol (e.g.
            # root://cmseos.fnal.gov//store/...) are kept as they are:
            # prepending the redirector to them would give an invalid URL.
            url = f if '://' in f else prefix + f
            fsets.append({'file': url,
                          'dataset': n})
    nfiles = len(fsets)

    # Ceiling division: an exact multiple of maxfiles_per_json (or an empty
    # fileset) must not produce a trailing empty JSON file.
    njsons = -(-nfiles // maxfiles_per_json)

    # Make sure the output directory exists before writing into it.
    os.makedirs('infiles', exist_ok=True)

    for i in range(njsons):
        subfilesets = {}
        for f in fsets[i*maxfiles_per_json:(i+1)*maxfiles_per_json]:
            # Group the files of this chunk by the dataset they belong to.
            subfilesets.setdefault(f['dataset'], []).append(f['file'])

        this_file = 'infiles/'+str(year)+'_'+str(i)+'.json'
        with open(this_file, 'w') as json_file:
            json.dump(subfilesets, json_file,indent=4, sort_keys=True)

    print('Created ' + str(njsons) + ' files')

In [None]:
# Skipping this for now; uncomment the call below to split the 2017 fileset into JSON files under infiles/
#split('2017',filesets,maxfiles_per_json)