In [1]:
import os
import sys
import numpy as np
import itertools
import pickle as pkl
import h5py

## Access Issues
Two Directories have files that can't be accessed: 

* **/project/projectdirs/dayabay/data/exp/dayabay/2013/p14a/**
    * has some empty directories
    * these seem to all be regenerated in /project/projectdirs/dayabay/data/exp/dayabay/2013/p15a/Neutrino/

* **/project/projectdirs/dayabay/data/exp/dayabay/2012/p14a**
    * sym links to /global/projecta/projectdirs/dayabay/data/exp/dayabay/2012/p14a
    * permission denied
    
## Solution
All unfound files except /project/projectdirs/dayabay/data/exp/dayabay/2012/p14a/Neutrino/1105/recon.Neutrino.0034955.Physics.EH3-Merged.P14A-P._0025.root can be found in /project/projectdirs/dayabay/data/exp/dayabay/year/p15a


In [2]:
def get_id(file_string):
    fs_split = file_string.split('.')
    return fs_split[2] + fs_split[6] + fs_split[4]

In [3]:
def get_filepaths(directory,filter_fxn=None):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple (dirpath, dirnames, filenames). Only takes files that make
    filter_fxn evaluate to true
    """
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            if filter_fxn is not None:
                if not filter_fxn(filename):
                    continue
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths  # Self-explanatory.

In [5]:
def get_eh(rootfile):
    fs_split = rootfile.split('.')
    return fs_split[4].split('-')[0]

In [4]:
with open("./FileList-6Oct-Official-1") as f:
    content = [x.strip('\n') for x in f.readlines()]
with(open("./FileList-6Oct-Official-2")) as f:
    content2 = [x.strip('\n') for x in f.readlines()]
new_content = content + content2
new_content = list(set(new_content))

In [52]:
print "official list 1 has %i elemeents, official list \
2 has %i elements. Their sum is %i and the set of the union has %i elements" %\
(len(content), len(content2), len(content) + len(content2),  len(new_content))

official list 1 has 189159 elemeents, official list 2 has 118961 elements. Their sum is 308120 and the set of the union has 308120 elements


In [6]:
eh1 = [c for c in new_content if get_eh(c) == 'EH1' ]

In [7]:
len(eh1)

137544

In [74]:
old_content = content
content = new_content


In [54]:
#list of files in content that can cannot be read
cant = [ f for f in content if not os.path.exists(f) ]
can = [ f for f in content if os.path.exists(f) ]

In [57]:
print can[0]
print cant[1]
print len(cant)
print len(can)

/project/projectdirs/dayabay/data/exp/dayabay/2012/p14a/Neutrino/0103/recon.Neutrino.0021367.Physics.EH1-Merged.P14A-P._0092.root
/project/projectdirs/dayabay/data/exp/dayabay/2013/p14a/Neutrino/0422/recon.Neutrino.0038701.Physics.EH1-Merged.P14A-P._0026.root
261585
46535


In [8]:
print "I cant read fies from these years %s" % str(list(set([c.split('/')[7] for c in cant ])))

I cant read fies from these years ['2014', '2013', '2012']


In [58]:
can_ids = [get_id(c) for c in can]
cant_ids = [get_id(c) for c in cant]

In [59]:
print can_ids[0]

0021367_0092EH1-Merged


In [11]:
new_2013 = '/project/projectdirs/dayabay/data/exp/dayabay/2013/p15a/Neutrino/'
new_2012 =  '/project/projectdirs/dayabay/data/exp/dayabay/2012/p15a/Neutrino/'
new_2014 = '/project/projectdirs/dayabay/data/exp/dayabay/2014/p15a/Neutrino/'

In [76]:
new_files_2012 = get_filepaths(new_2012,lambda f: '.root' in f) 
new_files_2013 = get_filepaths(new_2013,lambda f: '.root' in f) 
new_files_2014 = get_filepaths(new_2014,lambda f: '.root' in f) 

In [77]:
new_files = new_files_2013 + new_files_2012 + new_files_2014 

In [79]:
new_files[100000]
#can[0]



In [80]:
print xml

[]


In [81]:
new_ids = {get_id(fil): fil for fil in new_files}

In [82]:
cant_dict = {get_id(f): f for f in cant}

In [83]:
num_cant = len(cant)
num_no_need_to_recov = len(set(can_ids).intersection(set(new_ids.keys())))

recov = list(set(cant_ids).intersection(set(new_ids.keys())))
num_recov = len(recov)
not_recovered = list(set(cant_ids).difference(set(new_ids.keys())))
print "Number of files I can't read: %i, total recoveredthat were needed in p15a: %i. recov that I could already read anyway %i " % (num_cant, num_recov, num_no_need_to_recov)
print "Total files on list: %i, Total files recovered from 2012,2013,2014: \
%i, Files recov not on list: %i" % (len(content), len(new_files), len(new_files) - num_recov - num_no_need_to_recov)
#print not_recovered
files_not_recovered = [f for f in cant if get_id(f) in not_recovered]
print "file not recovered: ", files_not_recovered
print len(content)
print len(new_ids)

Number of files I can't read: 261585, total recoveredthat were needed in p15a: 261585. recov that I could already read anyway 44123 
Total files on list: 308120, Total files recovered from 2012,2013,2014: 314147, Files recov not on list: 8439
file not recovered:  []
308120
314144


In [84]:
files_recovered =  [new_ids[f_id] for f_id in recov]

In [85]:
len(files_recovered)

261585

In [86]:
new_official_list = files_recovered + can

In [87]:
len(new_official_list)

308120

In [88]:
with open('./FileList-14Mar-Recovered-1-2', 'w') as f:
    for item in new_official_list:
        f.write("%s\n" % item)

In [89]:
!ls


FileList-14Mar-Recovered-1-2  extract_background.ipynb
FileList-6Oct-Official-1      makedataset_withtime.py
FileList-6Oct-Official-2      write_new_filelist.ipynb


In [90]:
with open('./FileList-14Mar-Recovered-1-2', 'r') as f:
    test_content = [x.strip('\n') for x in f.readlines()]

In [91]:
for i, c in enumerate(test_content):
    assert test_content[i] == new_official_list[i], "Ahhh sopmething went wrong!"