In [35]:
import sys
import numpy as np

import uproot3 as upr3
import pandas as pd
from ROOT import TLorentzVector

import awkward as ak
import pyarrow as pa

from pathlib import Path

# Multiplier that turns a threshold quoted in GeV into the units the mass
# values are compared in (presumably MeV, i.e. 1 GeV = 1000 MeV — TODO confirm
# against the ntuple's unit convention).
GeV = 1_000

In [32]:
# Directory that holds the input ntuples; kept as a string with a trailing
# slash because file names are appended to it by plain concatenation.
path = "/gpfs/slac/atlas/fs1/d/recsmith/aswt/samples/data16/"

# One ntuple per period letter, in processing order. Letters H and J are not
# part of the list.
files = [f"ntuple_HIGG1D1.data16.period{period}.root" for period in "ABCDEFGIKL"]

In [15]:
def myy(ph_pt, ph_eta, ph_phi, ph_m):
    """Return the invariant mass of the sum of the first two four-vectors.

    Each argument is an indexable sequence; only the entries at index 0 and
    index 1 are read. Every (pt, eta, phi, m) pair is loaded into a ROOT
    ``TLorentzVector`` via ``SetPtEtaPhiM`` and the mass of the summed vector
    is returned, in the same units as the inputs.

    Raises whatever indexing raises (e.g. ``IndexError``) when a sequence has
    fewer than two entries.
    """
    four_vectors = []
    for idx in (0, 1):
        vec = TLorentzVector()
        vec.SetPtEtaPhiM(ph_pt[idx], ph_eta[idx], ph_phi[idx], ph_m[idx])
        four_vectors.append(vec)

    first, second = four_vectors
    return (first + second).M()

In [None]:
# Branches read from the "analysis" tree of every input file.
arrays_to_open = ["ph_pt_NOSYS", "ph_phi", "ph_eta", "ph_m"]

# Per input file: load the branches above, keep rows whose ph_pt_NOSYS holds
# exactly two entries, attach the invariant mass of those two objects as "myy",
# keep rows with 100*GeV <= myy <= 200*GeV and write the survivors next to the
# input file with a .parquet extension.
for file in files:

    print(path + file)
    print("Opening...")
    df = pd.DataFrame(upr3.open(path + file)["analysis"].arrays(arrays_to_open, namedecode="utf-8"))
    print("Opened!")

    print("Applying cuts...")
    # Count entries directly on the column instead of a row-wise
    # DataFrame.apply; this also behaves on an empty frame, where
    # apply(axis=1) hands back a DataFrame rather than a boolean Series.
    n_objects = df["ph_pt_NOSYS"].map(len)
    # .copy() detaches the selection so adding a column below cannot trigger
    # SettingWithCopyWarning; only the selected rows are duplicated.
    new_df = df[n_objects == 2].copy()
    print("Cuts applied!")

    print("Creating new variables...")
    # Explicit float64 Series aligned on the surviving index: the column keeps
    # a numeric dtype even when no row passed the selection.
    new_df["myy"] = pd.Series(
        [
            myy(pt, eta, phi, m)
            for pt, eta, phi, m in zip(
                new_df["ph_pt_NOSYS"], new_df["ph_eta"], new_df["ph_phi"], new_df["ph_m"]
            )
        ],
        index=new_df.index,
        dtype="float64",
    )
    print("Variables created!")

    print("Applying final cuts...")
    # between() is inclusive on both ends, i.e. the same >= / <= window as
    # before, evaluated once on the existing column.
    in_mass_window = new_df["myy"].between(100 * GeV, 200 * GeV)
    final_df = new_df[in_mass_window].reset_index(drop=True)
    print("Final cuts applied!")

    print("Saving...")
    ak_array = ak.from_arrow(pa.Table.from_pandas(final_df))
    filename = Path(file)
    filename_replace_ext = filename.with_suffix('.parquet')
    print(path + str(filename_replace_ext))
    ak.to_parquet(ak_array, path + str(filename_replace_ext))

    # Drop the per-file frames before the next iteration so their memory can
    # be reclaimed while the next file is loaded.
    del df
    del new_df
    del final_df
    print("Saved and finished!\n")