In [1]:
import pandas as pd
import numpy as np
from pyjet import cluster,DTYPE_PTEPM
import math
import h5py

# Pre-processing

In [2]:
# m_12 = sqrt ( (E_1 + E_2)^2 - (p_x1 + p_x2)^2 - (p_y1 + p_y2)^2 - (p_z1 + p_z2)^2 )
def invariant_mass(jet1, jet2):
    """Return the invariant mass of the system formed by two jets.

    Parameters
    ----------
    jet1, jet2 : objects exposing numeric ``e``, ``px``, ``py`` and ``pz``
        attributes (energy and momentum components).

    Returns
    -------
    float
        sqrt(E^2 - |p|^2) of the summed four-vectors, never negative.

    Notes
    -----
    Floating-point cancellation can make E^2 - |p|^2 slightly negative for
    (nearly) massless or collinear systems. ``math.sqrt`` would raise
    ``ValueError`` on such input, so the squared mass is clamped at zero.
    """
    e_sum = jet1.e + jet2.e
    px_sum = jet1.px + jet2.px
    py_sum = jet1.py + jet2.py
    pz_sum = jet1.pz + jet2.pz
    mass_squared = e_sum**2 - px_sum**2 - py_sum**2 - pz_sum**2
    # Clamp rounding-induced negatives instead of crashing the whole run.
    return math.sqrt(max(mass_squared, 0.0))

In [3]:
# HDF5 events file that the chunked reader below loads with pd.read_hdf.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
path = '/anomalyvol/data/events_LHCO2020_BlackBox1.h5'

In [4]:
# Number of rows read from the HDF5 file per chunk.
chunk_size = 20000
# Upper bound on the row index the chunked reader will reach before wrapping.
total_size = 1000000 # 1 mil max

def generator(path, chunk_size=10000,total_size=1000000):
    """Endlessly yield consecutive row chunks of an HDF5 file.

    Each item is the result of ``pd.read_hdf`` for rows
    ``[k * chunk_size, (k + 1) * chunk_size)``. Once the chunk after the
    next one would extend past ``total_size``, the reader wraps back to the
    first chunk, so only whole chunks inside ``total_size`` are produced
    (a trailing partial chunk is never read). The generator never terminates
    on its own; callers decide how many chunks to pull with ``next``.

    Parameters
    ----------
    path : path of the HDF5 file passed to ``pd.read_hdf``.
    chunk_size : number of rows per yielded chunk.
    total_size : row limit used to decide when to wrap around.
    """
    chunk_index = 0

    while True:
        first_row = chunk_index * chunk_size
        last_row = (chunk_index + 1) * chunk_size
        yield pd.read_hdf(path, start=first_row, stop=last_row)

        # Advance; restart from the beginning when the next chunk would
        # run past total_size.
        chunk_index += 1
        next_chunk_end = (chunk_index + 1) * chunk_size
        if next_chunk_end > total_size:
            chunk_index = 0

# Module-level chunk reader; it is stateful, so every next(gen) advances it
# and re-running a consuming cell continues from where the last run stopped.
gen = generator(path, chunk_size, total_size)

In [None]:
# Invariant mass of the two leading clustered jets, one entry per event
# (NaN when an event yields fewer than two jets).
data = []

for iteration in range(total_size // chunk_size):

    events = np.array(next(gen))
    rows = events.shape[0]
    cols = events.shape[1]
    # Each particle occupies three consecutive columns (pT, eta, phi).
    # Trailing columns that do not complete a triplet are ignored.
    n_particles = cols // 3

    for i in range(rows):
        particles = events[i][:n_particles * 3].reshape(n_particles, 3)
        # Keep only slots with positive pT. Selecting by mask (rather than
        # writing at the raw slot index) stays in bounds even if an empty
        # slot precedes a real particle.
        particles = particles[particles[:, 0] > 0]

        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = particles[:, 0]
        pseudojets_input['eta'] = particles[:, 1]
        pseudojets_input['phi'] = particles[:, 2]
        # any remaining fields of DTYPE_PTEPM keep their zero initialisation

        # cluster jets from the particles in one observation
        sequence = cluster(pseudojets_input, R=1.0, p=-1)
        jets = sequence.inclusive_jets()
        # NOTE(review): assumes inclusive_jets() returns jets ordered so that
        # the first two are the leading ones — confirm against pyjet docs.
        if len(jets) < 2:
            # A dijet mass is undefined here; record NaN so that `data`
            # stays aligned with the event index instead of raising.
            mass = float('nan')
        else:
            mass = invariant_mass(jets[0], jets[1])
        data.append(mass)

In [None]:
# Keep a second reference to the computed list. This is an alias, not a copy:
# both names point at the same list object until one of them is rebound.
loaded_data = data
# Uncomment to restore `data` from the saved reference:
# data = loaded_data

In [None]:
# Wrap the per-event values in a DataFrame (one row per entry of `data`).
df = pd.DataFrame(data)

In [None]:
# Line plot of the computed values against the row index.
df.plot.line()

In [None]:
# Rows whose value exceeds the 8000 threshold.
# `.loc` needs a 1-D boolean mask: passing the 2-D frame `df > 8000` raises
# "Cannot index with multidimensional key", so compare the single column.
# NOTE(review): the threshold is presumably in the same units as the
# invariant mass (GeV?) — confirm.
outliers = df.loc[df.iloc[:, 0] > 8000]

In [None]:
# Fraction of rows selected as outliers (displayed as the cell output).
len(outliers) / len(df)