In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy

import subprocess32 as subprocess
import os.path
from time import time

import ROOT
ROOT.enableJSVis()

import root_numpy
# import root_pandas

In [None]:
# --- Configuration -----------------------------------------------------------
# Data-taking period: joined onto MAIN_DIR to build the catalogue directory and
# embedded in the name of every output file.
PERIOD = 'LHC18o'

# Root directory under which the period directory is looked up.
MAIN_DIR = '/alice/data/2018/'

# Name of the ROOT file searched for (with alien_find) inside each run directory.
FNAME = 'FilterEvents_Trees.root'

# Branches read from each tree; they also name the columns of the merged output.
# Kept as a list: it is passed as `branches=` to root_numpy.tree2array.
BRANCHES = [
    'runNumber',
    'evtTimeStamp',
    'v0.fEffMass',
]

In [None]:
def measure_time(tic, operation=''):
    """
    Print the time elapsed since `tic` and return a fresh time stamp.

    Parameters
    ----------
    tic : float
        Start time of the measured operation, as returned by `time()`.
    operation : str
        Label of the measured operation, used only in the printed message.

    Returns
    -------
    float
        A new `time()` reading taken after printing, to be used as the
        `tic` of the next measurement.
    """
    # A single pre-formatted argument keeps the output identical under the
    # Python 2 print statement and the Python 3 print function.
    print('\t\texec. time of {operation}: {t} sec'.format(operation=operation, t=time()-tic))
    return time()

def process_file(fname, branches=BRANCHES):
    """
    Read the 'V0s' tree of a ROOT file into a 2D numpy array.

    Parameters
    ----------
    fname : str
        File path; it is appended to the 'alien:///' prefix and opened
        with ROOT.TFile.Open.
    branches : list of str
        Branch names handed to root_numpy.tree2array. They become the
        columns of the returned array, in this order.

    Returns
    -------
    numpy.ndarray or None
        One row per tree entry and one column per branch (zero rows for an
        empty tree). None if the file cannot be opened, the tree is not
        found, or the conversion fails; a message is printed in each case.
    """
    tic = time()
    start = tic
    froot = ROOT.TFile.Open('alien:///{fname}'.format(fname=fname))
    if not froot:
        print('File opening FAILED')
        return None

    # The file is open from here on: `finally` closes it on every exit path,
    # including the early returns below.
    try:
        try:
            t = froot.FindObjectAny('V0s')
        except Exception:
            t = None
        # A missing object comes back as a null (falsy) pointer rather than
        # as an exception, so it has to be checked explicitly.
        if not t:
            print('FindObjectAny FAILED')
            return None
        tic = measure_time(tic, 'Open and get tree')

        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during a slow grid read.
        try:
            rec = root_numpy.tree2array(t, branches=branches)
        except Exception:
            print('tree2array FAILED')
            return None
        if len(rec):
            # structured records -> plain 2D array with one common dtype
            arr = np.array([list(r) for r in rec])
        else:
            # keep the 2D contract for an empty tree so callers can stack it
            arr = np.empty((0, len(rec.dtype.names or ())))
        tic = measure_time(tic, 'tree2array')
    finally:
        froot.Close()
    measure_time(tic, 'close()')

    # Messages are pre-formatted into a single string so that the output is
    # the same under Python 2 and Python 3.
    print('\t\toutput array shape:  {shape}'.format(shape=arr.shape))
    print('\t\texec. time of process_file():  {t} sec'.format(t=time()-start))
    return arr

In [None]:
# Open the AliEn grid connection needed by the 'alien://' file access below.
# Fail fast here: without a connection every later file open would fail one
# by one. NOTE(review): assumes a failed Connect returns a null pointer, which
# PyROOT treats as falsy -- confirm against the ROOT version in use.
grid_connection = ROOT.TGrid.Connect("alien://")
if not grid_connection:
    raise RuntimeError('ROOT.TGrid.Connect("alien://") FAILED -- check the AliEn token/environment')

In [None]:
%%time


# For every run of the period: locate the FNAME files on the grid, read the
# selected branches from each of them and write one merged tree per run.
period_dir = os.path.join(MAIN_DIR, PERIOD)
cmd = 'alien_ls {period_dir}'.format(period_dir=period_dir)
output_runs = subprocess.check_output(cmd, shell=True)
# Drop blank entries: the trailing newline of the listing would otherwise
# produce an empty run name that gets processed like a real run.
runs_lst = [r.strip() for r in output_runs.split('\n') if r.strip()]

# Make sure the output directory exists before any slow grid access, so the
# final write cannot fail only because the directory is missing.
output_dir = 'data_validation_V0s'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

print(runs_lst)
for run in runs_lst:
    print('\n\n\n\n')
    print('RUN =  {run}'.format(run=run))
    output_fname = 'data_validation_V0s/validation_V0s_{PERIOD}_{run}.root'.format(PERIOD=PERIOD, run=run)

    # Runs that already have an output file are skipped (allows resuming).
    if os.path.isfile(output_fname):
        print('output file already exists -- continue')
        continue

    run_dir = os.path.join(period_dir, run, 'pass1/MergedTrees')
    # The Python-side timeout (10 s) fires first; the shell-level `timeout 30`
    # presumably acts as a backstop for a command left running -- TODO confirm.
    cmd = 'timeout 30 alien_find {run_dir} "{FNAME}"'.format(run_dir=run_dir, FNAME=FNAME)
    tic = time()
    files = None
    for i in range(3):
        try:
            files = subprocess.check_output(cmd, shell=True, timeout=10)
        except subprocess.TimeoutExpired:
            print('\ttimeout in iter:  {i}'.format(i=i))
            continue
        except subprocess.CalledProcessError as err:
            # Non-zero exit status: do not abort the whole loop. Keep whatever
            # was printed so the checks below can classify the failure.
            files = err.output
        break
    print('exec. time of alien_find:  {t} sec'.format(t=time()-tic))
    if not files:
        print('alien_find FAILED for {run} -- continue with other run'.format(run=run))
        continue
    if 'does not exist in the catalogue' in files:
        print('no pass1 dir for this run -- continue')
        print(files)
        continue
    files_lst = [f for f in files.split('\n') if FNAME in f]
    print('#files found {n}'.format(n=len(files_lst)))
    arr_lst = []
    if not files_lst:
        continue

    for f in files_lst:
        print('\tprocessing  {f}'.format(f=f))
        print('\t' + '- '*30)
        arr = process_file(f)
        # Keep only successful, non-empty reads: an empty array adds no rows
        # and may not have the 2D shape np.vstack needs.
        if arr is not None and len(arr):
            arr_lst.append(arr)
        print('\t' + '- '*30)
    if not arr_lst:
        print('0 files were read successfully -- no output file produced')
        continue

    # arr -> tree and save: stack the per-file rows, turn the columns into a
    # record array named after BRANCHES and write it as tree 'v0_mass'.
    arr_merged = np.vstack(arr_lst)
    arr_merged = np.core.records.fromarrays(arr_merged.transpose(),
                                            names=', '.join(BRANCHES))
    root_numpy.array2root(arr_merged, output_fname, 'v0_mass')