Playing around with matching files by lumisection (LS) for the 2-file solution.

To get JSON:

```
das_client.py --query="file,lumi dataset=/ttHTobb_M125_13TeV_powheg_pythia8/RunIIFall15DR76-25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/AODSIM" --format=json --limit=100 > aod.json
```

```
das_client.py --query="file,lumi dataset=/ttHTobb_M125_13TeV_powheg_pythia8/RunIIFall15DR76-25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/GEN-SIM-RAW" --format=json --limit=100 > raw.json
```

In [74]:
import json

In [2]:
# Load the DAS `file,lumi` query result saved for the GEN-SIM-RAW dataset.
with open('raw.json') as f:
    raw_dict = json.load(f)

# Load the DAS `file,lumi` query result saved for the AODSIM dataset.
with open('aod.json') as f:
    aod_dict = json.load(f)

# Exploration of JSON dicts

In [14]:
raw_dict.keys()

[u'status',
 u'mongo_query',
 u'ctime',
 u'nresults',
 u'timestamp',
 u'incache',
 u'data',
 u'apilist']

Files are under the `data` key, stored as a list (1 entry = 1 file).

In [5]:
type(raw_dict['data'])

list

In [6]:
raw_dict['data'][0]

{u'_id': u'57123b74e139182c3b74d99b',
 u'cache_id': [u'571235a1e139182c3b742806'],
 u'das': {u'api': [u'file_lumi4dataset'],
  u'condition_keys': [u'dataset.name'],
  u'expire': 1460812780,
  u'instance': u'prod/global',
  u'primary_key': u'file.name',
  u'record': 1,
  u'services': [{u'dbs3': [u'dbs3']}],
  u'system': [u'dbs3'],
  u'ts': 1460812660.332109},
 u'das_id': [u'571235a0e139182c3b7427e8'],
 u'file': [{u'name': u'/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/GEN-SIM-RAW/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/0002DED1-DFC2-E511-BC7C-20CF3027A61A.root'}],
 u'lumi': [{u'number': [[589, 589],
    [602, 602],
    [609, 609],
    [637, 637],
    [10487, 10487],
    [10498, 10498],
    [10506, 10506],
    [10511, 10511],
    [10520, 10520],
    [10543, 10543]]}],
 u'qhash': u'230da56b3426866bdcf25a94cbd97994'}

In [11]:
raw_dict['data'][0]['file'][0]['name']

u'/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/GEN-SIM-RAW/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/0002DED1-DFC2-E511-BC7C-20CF3027A61A.root'

In [13]:
raw_dict['data'][0]['lumi'][0]['number']

[[589, 589],
 [602, 602],
 [609, 609],
 [637, 637],
 [10487, 10487],
 [10498, 10498],
 [10506, 10506],
 [10511, 10511],
 [10520, 10520],
 [10543, 10543]]

# Making our 2-file solution 

What we need to do:

1) build a list of the AOD files, with LS for each
 
2) make a list of RAW files for same dataset, with LS for each (is this necessary?)

3) do matching: for each AOD file, find the RAW file(s) that cover the AOD file.

4) ...

5) profit?

In [22]:
# DatasetFile = namedtuple('DatasetFile', ['name', 'lumi'])

In [64]:
class DatasetFile(object):
    """A dataset file together with the lumisection ranges it holds.

    Attributes
    ----------
    name : str
        File name, stored as given.
    lumi : list[list[int, int]]
        Inclusive lumisection ranges, e.g. [[589, 589], [630, 631]].
    parents : list
        Starts out empty; meant to be filled in by the caller.
    """

    def __init__(self, name, lumi):
        self.parents = []
        self.lumi = lumi
        self.name = name

    def has_lumi(self, ls):
        """Return True if lumisection `ls` lies inside any range of this file.

        Both edges of each range are inclusive.
        """
        return any(edges[0] <= ls <= edges[1] for edges in self.lumi)

    def __repr__(self):
        fields = (self.name, self.lumi, self.parents)
        return 'DatasetFile(name=%s, lumi=%s, parents=%s)' % fields

### 1) AOD file info

In [65]:
# One DatasetFile per record under `data`: the name comes from the record's
# first `file` entry, the lumisection ranges from its first `lumi` entry.
aod_files = [DatasetFile(name=entry['file'][0]['name'], lumi=entry['lumi'][0]['number']) for entry in aod_dict['data']]

### 2) RAW file info

In [66]:
# One DatasetFile per record under `data`: the name comes from the record's
# first `file` entry, the lumisection ranges from its first `lumi` entry.
raw_files = [DatasetFile(name=entry['file'][0]['name'], lumi=entry['lumi'][0]['number']) for entry in raw_dict['data']]

In [67]:
aod_files

[DatasetFile(name=/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/AODSIM/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/04F64DC2-1DC3-E511-8C46-B083FED138B3.root, lumi=[[589, 589], [602, 602], [609, 609], [620, 620], [628, 628], [630, 631], [637, 637], [10480, 10480], [10486, 10487], [10498, 10498], [10506, 10506], [10511, 10511], [10516, 10517], [10520, 10521], [10541, 10543], [10545, 10546], [10548, 10548], [12864, 12864], [12886, 12886], [13021, 13021], [13023, 13023], [13040, 13040], [13043, 13043], [13527, 13527], [13534, 13534], [13536, 13536], [13545, 13545]], parents=[]),
 DatasetFile(name=/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/AODSIM/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/062B6ADF-2FC3-E511-B674-02163E012699.root, lumi=[[611, 611], [10494, 10494], [10501, 10501], [10505, 10505], [10547, 10547], [10552, 10554], [10557, 10558], [12801, 12802], [12804, 12804], [12806, 12806], [12844, 12844], [12850, 12850], [12885, 

### 3) Method to retrieve file(s) for specified LS

In [68]:
def find_matching_ls_range(raw_files, ls_range):
    """Find all files holding at least one lumisection inside ls_range.

    A file is returned as soon as any of its lumisection ranges overlaps
    `ls_range`. No check is made that the returned files, taken together,
    contain every lumisection of `ls_range`.

    Parameters
    ----------
    raw_files : list[DatasetFile]
        List of files to match against. Each file must expose a `lumi`
        attribute: a list of inclusive [first, last] lumisection ranges.
    ls_range : list[int, int]
        Inclusive edges of lumisection range to match, e.g. [610, 621]

    Returns
    -------
    list[DatasetFile]
        Unique files overlapping ls_range, in the order they appear in
        `raw_files`. Empty if nothing overlaps or if ls_range is reversed
        (first edge greater than last edge).
    """
    first, last = ls_range[0], ls_range[1]
    matching_files = []
    seen = set()
    for f in raw_files:
        if f in seen:
            continue
        # Two inclusive integer ranges share a lumisection exactly when the
        # larger lower edge does not exceed the smaller upper edge. This
        # replaces probing every single lumisection in ls_range, so the cost
        # no longer grows with the width of the range.
        if any(max(first, lr[0]) <= min(last, lr[1]) for lr in f.lumi):
            seen.add(f)
            matching_files.append(f)
    return matching_files


def find_matching_files(raw_files, ls_ranges):
    """Find all files that hold lumisections from the ranges in ls_ranges.

    Each range is looked up with `find_matching_ls_range` and the results
    are merged without duplicates.

    Parameters
    ----------
    raw_files : list[DatasetFile]
        List of files to match against.
    ls_ranges : list[list[int, int]]
        List of edges of lumisection ranges to match,
        e.g. [[610, 621], [701, 711]]

    Returns
    -------
    list[DatasetFile]
        List of unique DatasetFiles matched to ls_ranges, in order of
        first match.

    Raises
    ------
    RuntimeError
        If a range in `ls_ranges` has no matching file in `raw_files` at
        all. A range for which the lookup returns at least one file does
        not raise, even if those files do not contain every lumisection
        of that range.
    """
    matching_files = []
    seen = set()
    for lsr in ls_ranges:
        res = find_matching_ls_range(raw_files, lsr)
        if not res:
            # Wrap in a 1-tuple: a bare tuple-valued range would be unpacked
            # as several % arguments and raise TypeError instead.
            raise RuntimeError('No matching RAW file for this LS %s' % (lsr,))
        for f in res:
            if f not in seen:
                seen.add(f)
                matching_files.append(f)
    return matching_files

In [69]:
# Try it on the first AOD file: show its lumisection ranges, then the RAW
# files matched to those ranges.
print aod_files[0].lumi
find_matching_files(raw_files, aod_files[0].lumi)

[[589, 589], [602, 602], [609, 609], [620, 620], [628, 628], [630, 631], [637, 637], [10480, 10480], [10486, 10487], [10498, 10498], [10506, 10506], [10511, 10511], [10516, 10517], [10520, 10521], [10541, 10543], [10545, 10546], [10548, 10548], [12864, 12864], [12886, 12886], [13021, 13021], [13023, 13023], [13040, 13040], [13043, 13043], [13527, 13527], [13534, 13534], [13536, 13536], [13545, 13545]]


[DatasetFile(name=/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/GEN-SIM-RAW/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/3CDF872F-ECC2-E511-9F17-B083FED138B3.root, lumi=[[10480, 10480], [10486, 10486], [10516, 10517], [10521, 10521], [10541, 10542], [10546, 10546]], parents=[]),
 DatasetFile(name=/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/GEN-SIM-RAW/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/34DB90A2-EAC2-E511-BC26-20CF307C98DC.root, lumi=[[10545, 10545], [10548, 10548], [12864, 12864], [12886, 12886], [12929, 12929], [12932, 12932], [12945, 12945], [12958, 12958]], parents=[]),
 DatasetFile(name=/store/mc/RunIIFall15DR76/ttHTobb_M125_13TeV_powheg_pythia8/GEN-SIM-RAW/25nsPUfixed30NzshcalRaw_76X_mcRun2_asymptotic_v12-v1/10000/0002DED1-DFC2-E511-BC7C-20CF3027A61A.root, lumi=[[589, 589], [602, 602], [609, 609], [637, 637], [10487, 10487], [10498, 10498], [10506, 10506], [10511, 10511], [10520, 10520], [10543, 10543]], parents=[

In [72]:
%%timeit
# Store on every AOD file the RAW files matched to its lumisection ranges.
for f in aod_files:
    f.parents = find_matching_files(raw_files, f.lumi)

10 loops, best of 3: 68.8 ms per loop


In [73]:
len(aod_files)

20