In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import datetime
import gc
import warnings
import time
warnings.filterwarnings('ignore')
import csv
from util.theory import deltas

In [2]:
# Switch between the full sample (True) and the smaller default sample (False).
full = False

# Location string in "<file path>:<object path inside the file>" form.
data_loc = (
    r"./data/simulated_cosmics_full.root:nuselection/CalorimetryAnalyzer"
    if full
    else r"./data/simulated_cosmics.root:CalorimetryAnalyzer"
)
print("Using data location:", data_loc)

Using data location: ./data/simulated_cosmics.root:CalorimetryAnalyzer


In [3]:
# Don't import ROOT unless absolutely necessary (takes a long time)
# import ROOT
import uproot

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


In [4]:
# Open the object addressed by data_loc ("<file>:<object path>") with uproot.
# Later cells read its branches into pandas via tree.arrays(..., library='pd').
tree = uproot.open(data_loc)

In [5]:
# Branches read into the per-particle frame (part_df).
per_particle_variables = [
    'backtracked_e',
    'backtracked_pdg',
    'backtracked_purity',
]

# Branches read into the principal frame (df), which is indexed by (entry, subentry).
variables = [
    'dedx_y',
    'rr_y',
    'pitch_y',
]

# Branches needed only to build the slimming mask: track start/end points
# plus the backtracked energy and PDG code.
slimmer_variables = [
    'trk_sce_start_x', 'trk_sce_start_y', 'trk_sce_start_z',
    'trk_sce_end_x', 'trk_sce_end_y', 'trk_sce_end_z',
    'backtracked_e', 'backtracked_pdg',
]

### Tag for removal
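The following cell builds the boolean array <code>mask</code>, which has one entry per row (particle) of the dataframe: rows where <code>mask</code> is true are kept, and all other rows are tagged for removal. The main criterion is whether we think the particle both enters and exits the detector: a particle is kept only if both the start and the end of its track lie within 2 (in the units of the track coordinates) of a detector boundary. If either end of the track is not at a boundary, the particle is tagged for removal. The mask additionally requires a backtracked muon (|PDG| = 13) with backtracked energy between 1 and 10.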
- TODO: Speed this up using jit

### Generate principal dataframe and slim
In the future, we will want to do this in batches, as even the slimmed data will be too large to hold in memory all at once. Here, the data is loaded into memory in its entirety and then slimmed accordingly. Even if we slim better, as far as I know there is no way around loading the data entirely before slimming: the uproot documentation seems to suggest there is not, although there may be a way in raw C++.

In [57]:
def distance_to_edge(r):
    """Return the smallest absolute distance from ``r`` to any detector face.

    The detector is modelled as the box x in [0, 256], y in [-116, 116],
    z in [0, 1036]. For each axis the absolute distance to the lower and the
    upper face is computed, and the minimum over all six faces is returned.
    Because absolute values are used, a point outside the box also yields a
    non-negative distance.

    Parameters
    ----------
    r : array-like
        A single point ``(x, y, z)`` or a batch of points whose last axis has
        length 3. Anything ``np.asarray`` accepts works (ndarray, list, tuple,
        pandas Series).

    Returns
    -------
    numpy scalar or ndarray
        A scalar for a single point; for a batch of shape ``(..., 3)`` an
        array of shape ``(...)`` with one distance per point.
    """
    # Rows are the x, y, z axes; columns are the lower and upper face positions.
    dimensions = np.array([[0, 256], [-116, 116], [0, 1036]])

    # Coerce to a plain array first: multi-dimensional indexing such as
    # series[:, np.newaxis] is not supported on pandas Series in recent pandas.
    r = np.asarray(r)

    # (..., 3, 1) against (3, 2) -> (..., 3, 2); reduce over axis and face.
    return np.min(np.abs(dimensions - r[..., np.newaxis]), axis=(-2, -1))

print("Preparing Slimming Mask...")
slimmerdf = tree.arrays(slimmer_variables, library='pd')

# Pull the track end-points out as plain (N, 3) numpy arrays, selected by
# column name. Iterating over numpy rows avoids the per-row Series that
# DataFrame.iterrows() builds (slow) and avoids handing pandas Series slices
# to distance_to_edge.
start_points = slimmerdf[['trk_sce_start_x', 'trk_sce_start_y', 'trk_sce_start_z']].to_numpy()
end_points = slimmerdf[['trk_sce_end_x', 'trk_sce_end_y', 'trk_sce_end_z']].to_numpy()
start_dists = np.array([distance_to_edge(point) for point in start_points])
end_dists = np.array([distance_to_edge(point) for point in end_points])

# Keep only muons (|pdg| == 13) with backtracked energy strictly between 1 and 10.
energy_mask = (slimmerdf.backtracked_e > 1) & (slimmerdf.backtracked_e < 10) & (np.abs(slimmerdf.backtracked_pdg) == 13)

# A particle is kept when both track ends are within 2 of a detector face and
# it passes the energy / PDG selection. mask is a plain boolean ndarray with
# one entry per row of slimmerdf.
mask = (start_dists < 2) & (end_dists < 2) & energy_mask.to_numpy()
print("Will remove", np.sum(~mask), "particles")

Preparing Slimming Mask...
Will remove 3431 particles


In [58]:
# Generate DataFrame with the data
# There seems to be a memory leak in pandas https://github.com/pandas-dev/pandas/issues/2659. This causes the
# allocated memory for the dataframe to be much higher than required. As of now there is no simple fix that I
# can find, so I will have to work around it.
# Maybe look into this further later if it is a problem with the larger dataset.

print("Generating Principal Dataframe...")
part_df = tree.arrays(per_particle_variables, library='pd')
df = tree.arrays(variables[0], library='pd')
print("Loaded", variables[0], "data...")
size = sys.getsizeof(df)

# Slim according to mask
part_df = part_df.loc[mask]

# Broadcast the per-particle mask to one flag per row of the multi-indexed
# frame by looking it up with each row's outer index level. It is stored under
# its own name so that `mask` keeps its per-particle shape and this cell can be
# re-run without first re-running the slimming cell.
hit_mask = mask[df.index.get_level_values(0)]
print(len(hit_mask))
df = df.loc[hit_mask, :]

# This loop loads in the next column of the dataframe, slims it, and appends it to df.
# NOTE(review): hit_mask was built from the index of the first column; this assumes
# every column in `variables` has the same rows in the same order -- confirm.
for name in variables[1:]:
    next_col = tree.arrays(name, library='pd')
    print("Loaded", name, "data...")
    size += sys.getsizeof(next_col[name])
    next_col = next_col.loc[hit_mask, :]
    df = df.join(next_col, on=['entry', 'subentry'])

# Name the per-particle index so it can be joined against with on='entry'.
part_df.index.name = 'entry'
print("Generated!")
print("Original Size:", size/10**6, "MB")
print("Slimmed Size:", sys.getsizeof(df)/10**6, "MB")

Generating Principal Dataframe...
Loaded dedx_y data...
1828660
Loaded rr_y data...
Loaded pitch_y data...
Generated!
Original Size: 45.395421 MB
Slimmed Size: 19.694047 MB


### Begin Analysis
The following cell initializes all the necessary variables to be used in the analysis loop.

#### Initialize some debug counting variables
These variables keep track of some important data regarding how many particles are ignored, how many data points are ignored, and the number of data points / particles that are ignored for each of the various possible reasons. This way we can keep track of the main reasons why data from certain particles is not being considered.
- TODO: Move the bookkeeping for negative-energy ("nege"), high-energy ("highe"), and non-muon particles, and the count of bad particles, to the slimming section

In [142]:
def truncate(df, min_points=10):
    """Cut a single-particle (multi-indexed) dataframe just before its first bad datapoint.

    The outer index level is dropped, then rows are scanned in order for the
    first "bad" one, i.e. a row with any of:

    * ``dedx_y > 100``
    * ``e_y <= 0``
    * ``pitch_y < 0.3`` or ``pitch_y > 0.3 / cos(70 degrees)``

    Only the rows before that first bad row are kept; the bad row itself and
    everything after it are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a two-level index and the columns ``dedx_y``, ``e_y`` and
        ``pitch_y``.
    min_points : int, optional
        If fewer than this many rows precede the first bad row, an empty frame
        is returned instead. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The frame with the outer index level dropped: unchanged if no row is
        bad, otherwise cut as described above (possibly empty).
    """
    df = df.droplevel(level=0)

    max_pitch = 0.3/np.cos(70*np.pi/180)
    is_bad = (
        (df.dedx_y > 100)
        | (df.e_y <= 0)
        | (df.pitch_y < 0.3)
        | (df.pitch_y > max_pitch)
    ).to_numpy()

    if not is_bad.any():
        return df

    # Use the *position* of the first bad row rather than its index label:
    # iloc slices by position, and the two only coincide when the remaining
    # index happens to be 0, 1, 2, ...
    first_bad = int(np.argmax(is_bad))
    if first_bad < min_points:
        return df.iloc[:0]

    return df.iloc[:first_bad]

In [143]:
def delta_rm(df):
    """Remove the delta-ray rows from a single muon's (multi-indexed) dataframe.

    The outer index level is dropped, the ``dedx_y`` column is handed to
    ``deltas`` as a numpy array, and the rows it reports are dropped.

    NOTE(review): the locations returned by ``deltas`` are passed to
    ``DataFrame.drop``, which treats them as index *labels* -- presumably they
    are positions into the array, which only match the labels when the
    remaining index is 0, 1, 2, ...; confirm against ``util.theory.deltas``.
    """
    muon_hits = df.droplevel(level=0)

    # deltas returns a pair; only the first element (the locations) is needed here.
    delta_locs, _ = deltas(muon_hits.dedx_y.to_numpy())
    return muon_hits.drop(delta_locs, axis=0)

In [164]:
def display_uptime(start, msg=''):
    """Print the time elapsed since ``start`` and return the current timer reading.

    Parameters
    ----------
    start : float
        An earlier ``time.perf_counter()`` reading.
    msg : str, optional
        Text printed in front of the elapsed time (followed by a space).

    Returns
    -------
    float
        The ``time.perf_counter()`` reading taken by this call, so it can be
        used as the ``start`` of the next measurement.
    """
    now = time.perf_counter()
    minutes, seconds = divmod(now - start, 60)
    print(f'{msg} {int(minutes)}m {seconds:0.1f}s')
    return now

In [204]:
def analyze_data(df, part_df):
    """Build the per-datapoint analysis table from the slimmed dataframes.

    Steps, each timed and reported with ``display_uptime``:

    1. Within each outer-index group, take the difference between consecutive
       ``rr_y`` values (the first row of a group uses its own ``rr_y``),
       multiply by ``dedx_y`` and divide by 1000, and accumulate the result
       per group. That running sum is subtracted from the particle's
       ``backtracked_e`` to give the column ``e_y``.
       (The factor 1000 is presumably a MeV -> GeV conversion -- confirm.)
    2. Apply ``truncate`` to every group.
    3. Apply ``delta_rm`` to every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by (entry, subentry) with columns ``dedx_y``, ``rr_y``
        and ``pitch_y``.
    part_df : pandas.DataFrame
        Frame with a ``backtracked_e`` column, joined onto ``df`` via ``entry``.

    Returns
    -------
    pandas.DataFrame
        Columns ``dedx_y``, ``pitch_y`` and ``e_y`` with a fresh 0..n-1 index.
    """
    t_start = time.perf_counter()

    print('Analyzing...', end='')
    step_lengths = df.groupby(level=0)['rr_y'].diff().fillna(df['rr_y'])
    step_eloss = step_lengths * df['dedx_y'] / 1000
    cumulative_eloss = step_eloss.groupby(level=0).cumsum()

    data = df[['dedx_y', 'pitch_y']].join(part_df['backtracked_e'], on='entry')
    data['backtracked_e'] = data['backtracked_e'] - cumulative_eloss
    data = data.rename(columns={'backtracked_e': 'e_y'})
    t_analyzed = display_uptime(t_start)

    print('Applying Cuts...', end='')
    data = data.groupby(level=0).apply(truncate)
    t_cut = display_uptime(t_analyzed)

    print('Removing Delta-Rays...', end='')
    data = data.groupby(level=0).apply(delta_rm)
    display_uptime(t_cut)

    # Discard the (entry, subentry) index; rows are simply numbered afterwards.
    data = data.reset_index(drop=True)

    display_uptime(t_start, 'Done! Total Analysis Time:')
    return data

In [178]:
# Run the full analysis (energy reconstruction, cuts, delta-ray removal) on the
# slimmed frames; progress and timings are printed by analyze_data itself.
data = analyze_data(df, part_df)

Analyzing... 0m 7.8s
Applying Cuts... 0m 29.3s
Removing Delta-Rays... 0m 16.6s
Done! Total Analysis Time: 0m 53.7s


In [179]:
# Show the analysed table using the notebook's rich display.
display(data)

Unnamed: 0,dedx_y,pitch_y,e_y
0,2.541412,0.563403,9.336881
1,2.566944,0.563348,9.335584
2,2.129791,0.563345,9.334373
3,2.324910,0.563407,9.333061
4,2.042750,0.563404,9.331923
...,...,...,...
738653,0.413425,0.753983,6.236037
738654,2.991644,0.753983,6.235998
738655,2.457547,0.754043,6.234173
738656,2.168005,0.754043,6.232259
