In [2]:
import sys

from os import listdir
from os.path import isfile, join

import numpy as np

import uproot3 as upr3
import pandas as pd
from ROOT import TLorentzVector

import awkward as ak
import pyarrow as pa

from pathlib import Path

GeV = 1000

In [10]:
path = "/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS"

files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
files.sort()

In [11]:
def myy(ph_pt, ph_eta, ph_phi, ph_m):
        
    ph1 = TLorentzVector()
    ph2 = TLorentzVector()
    
    ph1.SetPtEtaPhiM(ph_pt[0],ph_eta[0],ph_phi[0],ph_m[0])
    ph2.SetPtEtaPhiM(ph_pt[1],ph_eta[1],ph_phi[1],ph_m[1])

    return (ph1 + ph2).M()

In [12]:
def mll(el_pt, el_eta, el_phi, el_m, mu_pt, mu_eta, mu_phi, mu_m):
        
    l1 = TLorentzVector()
    l2 = TLorentzVector()
    
    if len(el_pt)>0:
        l1.SetPtEtaPhiM(el_pt[0],el_eta[0],el_phi[0],el_m[0])
        l2.SetPtEtaPhiM(el_pt[1],el_eta[1],el_phi[1],el_m[1])
    else:
        l1.SetPtEtaPhiM(mu_pt[0],mu_eta[0],mu_phi[0],mu_m[0])
        l2.SetPtEtaPhiM(mu_pt[1],mu_eta[1],mu_phi[1],mu_m[1])

    return (l1 + l2).M()

In [17]:
arrays_to_open = [
    "ph_OR_NOSYS_pt", "ph_OR_NOSYS_phi", "ph_OR_NOSYS_eta", "ph_OR_NOSYS_m",
    "el_OR_NOSYS_pt", "el_OR_NOSYS_phi", "el_OR_NOSYS_eta", "el_OR_NOSYS_m", "el_OR_NOSYS_charge",
    "mu_OR_NOSYS_pt", "mu_OR_NOSYS_phi", "mu_OR_NOSYS_eta", "mu_OR_NOSYS_charge"
]

for file in files:
    
    print(path + file)
    print("Opening...")
    df = pd.DataFrame(upr3.open(file)["particles"].arrays(arrays_to_open, namedecode="utf-8"))
    print("Opened!")
    
    print("Starting with {} events".format(len(df)))
    
    print("Applying cuts...")
    new_df = df.copy()
    new_df = new_df[
        (df.apply(lambda row: len(row.ph_OR_NOSYS_pt), axis=1) == 2) &
        (
            (
                (df.apply(lambda row: len(row.mu_OR_NOSYS_pt), axis=1) == 2) & 
                (df.apply(lambda row: len(row.el_OR_NOSYS_pt), axis=1) == 0)
            )  |
            (
                (df.apply(lambda row: len(row.el_OR_NOSYS_pt), axis=1) == 2) &
                (df.apply(lambda row: len(row.mu_OR_NOSYS_pt), axis=1) == 0)
            )
        )
    ]
    print("Cuts applied!")
    
    print(len(new_df))
    if len(new_df)==0:
        print("No events left! Continuing...\n")
        continue
    
    print("Creating new variables...")
    new_df["mu_OR_NOSYS_m"] = new_df.apply(lambda row: np.full_like(row.mu_OR_NOSYS_pt, 105.6583755),axis=1)
    new_df["myy"] = new_df.apply(
        lambda row: myy(row.ph_OR_NOSYS_pt,row.ph_OR_NOSYS_eta,row.ph_OR_NOSYS_phi,row.ph_OR_NOSYS_m),
        axis=1
    )
    new_df["mll"] = new_df.apply(
        lambda row: mll(
            row.el_OR_NOSYS_pt,row.el_OR_NOSYS_eta,row.el_OR_NOSYS_phi,row.el_OR_NOSYS_m,
            row.mu_OR_NOSYS_pt,row.mu_OR_NOSYS_eta,row.mu_OR_NOSYS_phi,row.mu_OR_NOSYS_m
        ),
        axis=1
    )
    new_df["ll_charge"] = new_df.apply(lambda row: sum(row.el_OR_NOSYS_charge) + sum(row.mu_OR_NOSYS_charge),axis=1)
    print("Variables created!")
    
    print("Applying final cuts...")
    final_df = new_df.copy()
    final_df = final_df[
        (final_df.apply(lambda row: row.myy, axis=1) >= 100*GeV) &
        (final_df.apply(lambda row: row.myy, axis=1) <= 200*GeV) &
        (final_df.apply(lambda row: row.mll, axis=1) >= 81*GeV) &
        (final_df.apply(lambda row: row.mll, axis=1) <= 101*GeV) &
        (final_df.apply(lambda row: row.ll_charge, axis=1) == 0)
    ]
    final_df = final_df.reset_index(drop=True)
    print("Final cuts applied!")
    
    print(len(final_df))
    if len(final_df)==0:
        print("No events left! Continuing...\n")
        continue
    
    print("Saving...")
    ak_array = ak.from_arrow(pa.Table.from_pandas(final_df))
    filename = Path(file)
    filename_replace_ext = filename.with_suffix('.parquet')
    print(str(filename_replace_ext))
    ak.to_parquet(ak_array, str(filename_replace_ext))
    
    del df
    del new_df
    del final_df
    print("Saved and finished!\n")

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000001.root
Opening...
Opened!
Starting with 2064 events
Applying cuts...
Cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000002.root
Opening...
Opened!
Starting with 1934 events
Applying cuts...
Cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000003.root
Openi

Opened!
Starting with 88282 events
Applying cuts...
Cuts applied!
2
Creating new variables...
Variables created!
Applying final cuts...
Final cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000021.root
Opening...
Opened!
Starting with 29206 events
Applying cuts...
Cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000022.root
Opening...
Opened!
Starting with 5381 events
Applying cuts...
Cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.dat

Opened!
Starting with 44716 events
Applying cuts...
Cuts applied!
4
Creating new variables...
Variables created!
Applying final cuts...
Final cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000040.root
Opening...
Opened!
Starting with 14457 events
Applying cuts...
Cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000041.root
Opening...
Opened!
Starting with 53200 events
Applying cuts...
Cuts applied!
3
Creating new variables...
Variables created!
Applying final cuts...
Final cuts applied!
0
No events left

Opened!
Starting with 104039 events
Applying cuts...
Cuts applied!
6
Creating new variables...
Variables created!
Applying final cuts...
Final cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000059.root
Opening...
Opened!
Starting with 52364 events
Applying cuts...
Cuts applied!
1
Creating new variables...
Variables created!
Applying final cuts...
Final cuts applied!
0
No events left! Continuing...

/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000060.root
Opening...
Opened!
Starting with 72297 events
Applying cuts...
Cuts applied!
1
Creating new 

IndexError: indexes 838336512:1732890391 are beyond the end of data source '/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS/user.rasmith.30346040.ANALYSIS._000069.root'

In [19]:
path = "/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/user.rasmith.TOPQ1.data16.diphoton_ntuple.root_ANALYSIS"

files = [join(path, f) for f in listdir(path) if isfile(join(path, f)) and "parquet" in f]
files.sort()

In [21]:
dfs = []

for file in files:

    df = pd.read_parquet(file, engine='pyarrow')
    dfs.append(df)

In [22]:
df = pd.concat(dfs)

In [23]:
df

Unnamed: 0,ph_OR_NOSYS_pt,ph_OR_NOSYS_phi,ph_OR_NOSYS_eta,ph_OR_NOSYS_m,el_OR_NOSYS_pt,el_OR_NOSYS_phi,el_OR_NOSYS_eta,el_OR_NOSYS_m,el_OR_NOSYS_charge,mu_OR_NOSYS_pt,mu_OR_NOSYS_phi,mu_OR_NOSYS_eta,mu_OR_NOSYS_charge,mu_OR_NOSYS_m,myy,mll,ll_charge
0,"[56136.746, 27138.54]","[-1.7658815, 1.2258216]","[0.17738795, -1.8213241]","[0.0, 0.0]",[],[],[],[],[],"[47239.055, 24778.86]","[2.9902744, 0.33311865]","[-2.3730123, -1.0076123]","[1.0, -1.0]","[105.65838, 105.65838]",120256.936807,83402.068693,0.0
0,"[113026.44, 66965.57]","[-1.889858, 2.808263]","[0.030318156, -0.3367155]","[0.0, 0.0]",[],[],[],[],[],"[29176.635, 62051.23]","[2.064617, 1.1562636]","[1.9697148, 0.13902153]","[1.0, -1.0]","[105.65838, 105.65838]",128003.384304,96732.608384,0.0
0,"[65101.62, 23937.818]","[0.17083432, 1.7656554]","[0.021048814, 2.0031922]","[0.0, 0.0]","[67577.73, 73735.04]","[-2.678523, -1.6657364]","[-0.4239365, -1.2456447]","[0.510998, 0.510998]","[-1.0, 1.0]",[],[],[],[],[],107707.102739,90812.603046,0.0


In [10]:
ak_array = ak.from_arrow(pa.Table.from_pandas(df))

In [11]:
ak.to_parquet(ak_array, path + "ntuple_HIGG1D1.data16.allYear.parquet")