![eagle](../../eagleeye/eagle_eye_log.jpeg)


In [3]:
"""
@author: Andre Scaffidi (AndreScaffidi)
@author: Sebastian Springer (sspringe137)

This is a notebook to create figures 2 and 8 from the paper

"""

# Download the LHC Olympics R&D data from Zenodo if you don't have it already. 
# https://zenodo.org/records/4536377
# Can then use load_the_data.py to create the npy file we use for analysis. 
# Current data file is in ./data/LHC_data1p1M_new_features.npy so don't need to do this step

!wget https://zenodo.org/records/6466204/files/events_anomalydetection_v2.features.h5?download=1 .
!mv 'events_anomalydetection_v2.features.h5?download=1' events_anomalydetection_v2.features.h5


--2024-10-29 10:17:57--  https://zenodo.org/records/6466204/files/events_anomalydetection_v2.features.h5?download=1
Resolving zenodo.org (zenodo.org)... 188.184.103.159, 188.185.79.172, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.184.103.159|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74315238 (71M) [application/octet-stream]
Saving to: ‘events_anomalydetection_v2.features.h5?download=1’


2024-10-29 10:18:00 (20.4 MB/s) - ‘events_anomalydetection_v2.features.h5?download=1’ saved [74315238/74315238]

--2024-10-29 10:18:00--  http://./
Resolving . (.)... failed: Name or service not known.
wget: unable to resolve host address ‘.’
FINISHED --2024-10-29 10:18:00--
Total wall clock time: 3.8s
Downloaded: 1 files, 71M in 3.5s (20.4 MB/s)


In [1]:
"""
Read the data and partition the features as per Sec. 3.2 of the paper.
The result is a numpy array with columns corresponding to 
['|p|1', '|p|2', 'tau21j1', 'tau21j2', 'tau32j1', 'tau32j2' , 'mj1', 'mj2', 'label']
"""
import pandas as pd
import numpy as np

file_path              = 'data/events_anomalydetection_v2.features.h5'
df_features            = pd.read_hdf(file_path)

# Feature engineering:
#   |p|i    -- magnitude of the (px, py, pz) vector of jet i
#   tau21ji -- ratio tau2 / tau1 of jet i
#   tau32ji -- ratio tau3 / tau2 of jet i
df_features['|p|1']    = np.sqrt(np.sum(df_features[['pxj1', 'pyj1', 'pzj1']]**2, axis=1))
df_features['|p|2']    = np.sqrt(np.sum(df_features[['pxj2', 'pyj2', 'pzj2']]**2, axis=1))
df_features['tau21j1'] = df_features['tau2j1']/df_features['tau1j1']
df_features['tau21j2'] = df_features['tau2j2']/df_features['tau1j2']
df_features['tau32j1'] = df_features['tau3j1']/df_features['tau2j1']
df_features['tau32j2'] = df_features['tau3j2']/df_features['tau2j2']

# Move the engineered features to the front; the original columns keep their relative order.
new_features           = ['|p|1', '|p|2', 'tau21j1', 'tau21j2', 'tau32j1', 'tau32j2']
df_features            = df_features[new_features + [col for col in df_features.columns if col not in new_features]]

# Select the output columns by NAME rather than by position, so the result
# always matches the documented layout
# ['|p|1', '|p|2', 'tau21j1', 'tau21j2', 'tau32j1', 'tau32j2', 'mj1', 'mj2', 'label']
# even if the column order of the input file changes. A missing column now
# raises a KeyError instead of silently picking the wrong data.
# copy=True guarantees a writable array for the in-place NaN replacement below.
output_columns         = new_features + ['mj1', 'mj2', 'label']
LHC_data1p1M           = df_features[output_columns].to_numpy(copy=True)

# Replace NaN entries (e.g. from 0/0 in the tau ratios) with 0.
# Only NaN is replaced; infinite values, if any, are left untouched.
LHC_data1p1M[np.isnan(LHC_data1p1M)]                  = 0     # Andre suggestion

np.save('data/LHC_data1p1M_new_features.npy',LHC_data1p1M)