# Pandas, ntuple choices, and plots

I spoke to Rafael today, and we talked about the old ntuples, i.e. the ones with variables like "jet_trk_pdg_id". The worry was that we wanted to use reconstructed muons, but the tracks in this ntuple were matched to truth muons. Rafael said that the tracks themselves are reconstructed, even if they are matched to truth muons, so I think it should be okay to use them?

In [1]:
import glob
import numpy as np
import pandas as pd
from root_numpy import root2array

# Scale factor used when writing the pT cuts below (e.g. 20*GeV).
# NOTE(review): presumably the ntuple stores momenta in MeV -- confirm.
GeV = 1000

In [2]:
# Names of the ntuple branches to read; the order here is the order passed to root2array.
branchNames = [
    # Jet-level branches
    "jet_LabDr_HadF",
    "jet_JVT",
    "jet_aliveAfterOR",
    "jet_aliveAfterORmu",
    "jet_pt",
    "jet_eta",
    "jet_phi",
    # Per-track branches (the "jet_trk" prefix is what flattenToTrks keys on)
    "jet_trk_pdg_id",
    "jet_trk_pt",
    "jet_trk_eta",
    "jet_trk_phi",
    "jet_trk_ip3d_d0",
    "jet_trk_ip3d_z0",
    "jet_trk_ip3d_d0sig",
    "jet_trk_ip3d_z0sig",
]

In [3]:
# Sort the matches so the same files are picked on every run (glob order is arbitrary).
files = sorted(glob.glob("/gpfs/slac/atlas/fs1/d/rafaeltl/public/RNNIP/FTAG_ntups/user.rateixei.mc16_13TeV.410470.PhPy8EG_A14_ttbar_hdamp258p75_nonallhad.AOD..dijetSamplesNominal20180629_Akt4EMTo/*.root"))
if not files:
    raise IOError("No ntuple files matched the glob pattern")

# Read at most the first 10 ntuples. Slicing (rather than a fixed range) avoids an
# IndexError when fewer than 10 files are available.
chunks = []
for i, fname in enumerate(files[:10], 1):
    chunks.append(root2array(fname, branches=branchNames))
    print("Added ntuple #{}".format(i))

# Stack once at the end instead of re-copying the growing array on every iteration.
data = np.hstack(chunks)

Added ntuple #1
Added ntuple #2
Added ntuple #3
Added ntuple #4
Added ntuple #5
Added ntuple #6
Added ntuple #7
Added ntuple #8
Added ntuple #9
Added ntuple #10


I've been learning about pandas! So let's start by flattening the ntuples to the jet level:

In [4]:
def flatten(column):
    """Remove one level of nesting from ``column``.

    Every entry of ``column`` is iterated and its elements are collected, in
    order, into a single NumPy array.

    Args:
        column: An iterable whose entries are themselves iterable.

    Returns:
        A NumPy array of the concatenated elements. If flattening raises
        ``TypeError`` or ``ValueError`` (e.g. the entries are plain scalars),
        ``column`` itself is returned unchanged.
    """
    try:
        values = []
        for entry in column:
            # extend() iterates the entry, so a non-iterable entry raises TypeError here.
            values.extend(entry)
        return np.array(values)
    except (TypeError, ValueError):
        return column

In [5]:
# Wrap the stacked ntuple array in a DataFrame so each branch can be handled by name.
# NOTE(review): presumably one column per entry in branchNames -- confirm.
df = pd.DataFrame(data)

In [6]:
# Remove one level of nesting from every column using the flatten() helper defined above
# (the original call used the undefined name flattenToJets).
# .items() replaces .iteritems(), which was removed in pandas 2.0.
df_flat = pd.DataFrame({k: flatten(c) for k, c in df.items()})

Then let's flatten down to the track level and apply cuts:

In [7]:
def flattenToTrks(f, k, c, keyToFlattenTo):
    """Expand one column so that it has one entry per track.

    Args:
        f: Mapping from column name to column (e.g. a DataFrame).
        k: Name of the column being expanded.
        c: Values of column ``k``; only used when ``k`` is a track column.
        keyToFlattenTo: Name of a column in ``f`` whose entries are per-row
            arrays; their lengths decide how often each value is repeated.

    Returns:
        A NumPy array. For columns whose name contains ``'jet_trk'`` the nested
        entries of ``c`` are concatenated; for any other column each value of
        ``f[k]`` is repeated once per element of the matching entry in
        ``f[keyToFlattenTo]``.
    """
    if 'jet_trk' in k:
        # Track columns are already nested per row: just unroll them.
        return np.array([trk for row in c for trk in row])

    row_values = f[k]
    trk_entries = f[keyToFlattenTo]
    # An array of ones shaped like each row's track list; multiplying by the
    # row's value broadcasts that value to every track.
    ones_per_row = [np.full_like(trks, 1) for trks in trk_entries]
    repeated = [
        value * np.array(ones)
        for value, ones in zip(row_values, ones_per_row)
    ]
    return np.concatenate(repeated)
    
def applyCutsJets(f, c):
    """Return the entries of ``c`` that pass the jet selection evaluated on ``f``.

    The selection is built element-wise with ``&`` / ``|``. Python's ``and`` /
    ``or`` cannot combine array-valued comparisons and raise "The truth value
    of an array with more than one element is ambiguous".

    Args:
        f: Mapping from column name to column (e.g. a DataFrame). Must provide
            ``jet_trk_pdg_id``, ``jet_LabDr_HadF``, ``jet_pt``, ``jet_eta``,
            ``jet_JVT``, ``jet_aliveAfterOR`` and ``jet_aliveAfterORmu``.
        c: The column to filter; must be aligned with the columns of ``f``.

    Returns:
        ``c`` indexed by the boolean selection mask.

    NOTE(review): the pdg-id requirement compares ``jet_trk_pdg_id`` to a
    scalar, so every row of ``f`` presumably has to hold a single track
    (i.e. ``f`` is already flattened to track level) -- confirm.
    """
    jet_pt = f["jet_pt"]
    abs_eta = abs(f["jet_eta"])

    # The JVT requirement is waived for jets with pT > 60 GeV or |eta| > 2.4.
    passes_jvt = (jet_pt > 60*GeV) | (abs_eta > 2.4) | (f["jet_JVT"] > 0.59)

    # Each comparison is parenthesised because & binds tighter than ==, < and >.
    mask = (
        # We only want to keep muon tracks
        (abs(f["jet_trk_pdg_id"]) == 13)
        # We only want light jets
        & (f["jet_LabDr_HadF"] == 0)
        # Nicole's jet cuts
        & (jet_pt > 20*GeV)
        & (abs_eta < 2.5)
        & passes_jvt
        & (f["jet_aliveAfterOR"] == 1)
        & (f["jet_aliveAfterORmu"] == 1)
    )
    return c[mask]

def applyCutsMuons(f, c):
    """Return the entries of ``c`` that pass the muon-track cuts evaluated on ``f``.

    The two requirements are combined element-wise with ``&``. Python's ``and``
    cannot combine array-valued comparisons and raises "The truth value of an
    array with more than one element is ambiguous".

    Args:
        f: Mapping from column name to column (e.g. a DataFrame). Must provide
            ``jet_trk_pt`` and ``jet_trk_eta``.
        c: The column to filter; must be aligned with the columns of ``f``.

    Returns:
        ``c`` indexed by the boolean selection mask.
    """
    # Muon cuts: track pT above 1 GeV and |eta| below 2.5.
    # Each comparison is parenthesised because & binds tighter than < and >.
    mask = (f["jet_trk_pt"] > 1*GeV) & (abs(f["jet_trk_eta"]) < 2.5)
    return c[mask]

In [8]:
# Filter every column of df_flat with the jet selection.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
# NOTE(review): applyCutsJets also cuts on the per-track jet_trk_pdg_id, while df_flat is
# described as flattened to jet level only; this frame may need flattening to track level
# before the cut can be evaluated -- confirm.
df_jetcuts = pd.DataFrame({k: applyCutsJets(df_flat, c) for k, c in df_flat.items()})

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# Expand every column to one entry per track, using jet_trk_pdg_id to set how many times
# each non-track value is repeated.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
df_flattrks = pd.DataFrame({k: flattenToTrks(df_flat, k, c, "jet_trk_pdg_id") for k, c in df_flat.items()})

In [None]:
# NOTE(review): this cell originally called keepMuons, which is not defined anywhere in the
# visible notebook. applyCutsMuons takes the same (frame, column) arguments and is presumably
# the intended function -- confirm.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
df_muons = pd.DataFrame({k: applyCutsMuons(df_flattrks, c) for k, c in df_flattrks.items()})

In [None]:
# Bare expression at the end of the cell: shows the resulting table in the notebook.
df_muons