In [1]:
import os
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import json
import importlib
import ipynbname
%matplotlib inline

# Derive the project directories from this notebook's own location.
base_path = os.path.dirname(ipynbname.path())   # directory containing this notebook
parent_path = os.path.dirname(base_path)        # one level above the notebook directory
# Join path components with os.path.join instead of hand-written '/' separators.
raw_data_path = os.path.join(parent_path, 'data', 'raw')

print(f"parent path: {parent_path}")
print(f"base path: {base_path}")
print(f"raw data path: {raw_data_path}")

parent path: /home/sid/coding/mass_recon
base path: /home/sid/coding/mass_recon/notebooks
raw data path: /home/sid/coding/mass_recon/data/raw


In [4]:
# Display the current module search path (diagnostic only; this expression has no side effects).
sys.path

['/home/sid/software/install/root_install/lib',
 '/usr/lib64/python313.zip',
 '/usr/lib64/python3.13',
 '/usr/lib64/python3.13/lib-dynload',
 '',
 '/home/sid/.local/lib/python3.13/site-packages',
 '/usr/lib64/python3.13/site-packages',
 '/usr/lib/python3.13/site-packages',
 '/home/sid/coding/mass_recon/utils',
 '/home/sid/coding/mass_recon/utils']

In [5]:
# Make the project's `utils` directory importable and load the helper module.
scripts_path = os.path.join(parent_path, "utils")
# Guard the append: re-running this cell must not keep adding the same
# directory to sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
import data_utils
# Reload so that edits to data_utils are picked up without restarting the kernel.
importlib.reload(data_utils)
# Star import kept on purpose: later cells call get_data_path, get_data and
# filter_data unqualified (presumably all provided by data_utils).
from data_utils import *

## 1. Load the Data

To load the data, we need to specify:
- `side`: either `electron` or `positron`
- `scat`: `n` selects the data with physics effects turned off; `y` selects the data with them turned on

`get_data_path` returns the paths of the raw training and test data files for the given `side` and `scat`.

In [6]:
# Look up the raw electron train/test file paths for the scat='n' configuration.
path_train_e, path_test_e = get_data_path(
    side='electron',
    scat='n',
    data_path=raw_data_path,
)

print(f"Train:\t{path_train_e},\nTest:\t{path_test_e}")

Train:	/home/sid/coding/mass_recon/data/raw/ElectronCoords_no_scat.dat,
Test:	/home/sid/coding/mass_recon/data/raw/ElectronSort_no_scat.dat


`get_data()` returns the data in the form of a Pandas DataFrame

In [7]:
# Load the electron training file first, then the electron test file.
e_train, e_test = get_data(file_path=path_train_e), get_data(file_path=path_test_e)

In [8]:
# Look up the raw positron train/test file paths for the scat='n' configuration.
path_train_p, path_test_p = get_data_path(
    side='positron',
    scat='n',
    data_path=raw_data_path,
)

print(f"Train Path:\t{path_train_p},\nTest Path:\t{path_test_p}")

Train Path:	/home/sid/coding/mass_recon/data/raw/PositronCoords_no_scat.dat,
Test Path:	/home/sid/coding/mass_recon/data/raw/PositronSort_no_scat.dat


In [9]:
# Load the positron training file first, then the positron test file.
p_train, p_test = get_data(file_path=path_train_p), get_data(file_path=path_test_p)

In [10]:
# Row counts of the raw training frames, one per line.
print(
    f"Electron Training Data Length:\t{len(e_train)}",
    f"Positron Training Data Length:\t{len(p_train)}",
    sep="\n",
)

Electron Training Data Length:	1000013
Positron Training Data Length:	997376


In [11]:
# Row counts of the raw test frames, one per line.
print(
    f"E Test Data Length:\t{len(e_test)}",
    f"P Test Data Length:\t{len(p_test)}",
    sep="\n",
)

E Test Data Length:	20000
P Test Data Length:	19946


## 2. Analyze and Filter the Training Data

The Electron and Positron datasets should ideally have the same length, so the data needs to be filtered. We keep only the events whose EventID appears in both datasets; any data point whose EventID has no match in the other dataset is removed.

The Electron dataset should contain only events with `TrackID==1`. All other data points should be removed.

In [12]:
# Flag electron rows whose TrackID is not 1; build the mask once and reuse it.
e_outlier_mask = e_train['TrackID'] != 1
print(f"num: {len(e_train[e_outlier_mask])}")
# EventIDs of the flagged electron rows.
e_eventid = e_train.loc[e_outlier_mask, 'EventID'].to_numpy()

num: 13


The Positron dataset should contain only events with `TrackID==2`. All other data points should be removed.

In [13]:
# Flag positron rows whose TrackID is not 2; build the mask once and reuse it.
p_outlier_mask = p_train['TrackID'] != 2
print(f"num: {len(p_train[p_outlier_mask])}")
# EventIDs of the flagged positron rows.
p_eventid = p_train.loc[p_outlier_mask, 'EventID'].to_numpy()

num: 622


In [14]:
# Pool the flagged EventIDs from both sides into one array.
# The arrays are simply appended; duplicates are not removed here.
bad_eventid = np.concatenate([e_eventid, p_eventid], axis=0)
print(f"Number of bad events: {len(bad_eventid)}")

Number of bad events: 635


In [15]:
# Remove every row whose EventID was flagged on either side, from both frames.
e_keep_mask = ~e_train['EventID'].isin(bad_eventid)
p_keep_mask = ~p_train['EventID'].isin(bad_eventid)
e_train_filtered = e_train[e_keep_mask]
p_train_filtered = p_train[p_keep_mask]

In [16]:
# Sanity check: count of electron rows with TrackID != 1 before and after filtering.
print(
    f"(Before) num: {len(e_train[e_train['TrackID']!=1])}",
    f"(After) num: {len(e_train_filtered[e_train_filtered['TrackID']!=1])}",
    sep="\n",
)

(Before) num: 13
(After) num: 0


In [17]:
# Sanity check: count of positron rows with TrackID != 2 before and after filtering.
print(
    f"(Before) num: {len(p_train[p_train['TrackID']!=2])}",
    f"(After) num: {len(p_train_filtered[p_train_filtered['TrackID']!=2])}",
    sep="\n",
)

(Before) num: 622
(After) num: 0


In [18]:
# Row counts after the TrackID-based filtering.
print(
    f"len of Electron Dataset: {len(e_train_filtered)}",
    f"len of Positron Dataset: {len(p_train_filtered)}",
    sep="\n",
)

len of Electron Dataset: 999458
len of Positron Dataset: 996665


The length mismatch means that some `EventID`s on the electron side are not present on the positron side, or vice versa. Let us check.

In [19]:
# EventIDs remaining on each side after the TrackID filtering.
# NOTE: this rebinds e_eventid / p_eventid, which earlier cells used for the
# EventIDs of the TrackID outliers.
e_eventid = e_train_filtered.loc[:, 'EventID'].to_numpy()
p_eventid = p_train_filtered.loc[:, 'EventID'].to_numpy()

In [20]:
# EventIDs present on one side only (np.setdiff1d returns unique values).
only_in_e = np.setdiff1d(e_eventid, p_eventid)
only_in_p = np.setdiff1d(p_eventid, e_eventid)
print(f"Number of EventID in E Side which are not in P Side: {len(only_in_e)}")
print(f"Number of EventID in P Side which are not in E Side: {len(only_in_p)}")

Number of EventID in E Side which are not in P Side: 2793
Number of EventID in P Side which are not in E Side: 0


In [21]:
# Keep only rows whose EventID occurs on both the electron and the positron side.
common_eventid = np.intersect1d(e_eventid, p_eventid)
e_common_mask = e_train_filtered['EventID'].isin(common_eventid)
p_common_mask = p_train_filtered['EventID'].isin(common_eventid)
e_train_filtered2 = e_train_filtered[e_common_mask]
p_train_filtered2 = p_train_filtered[p_common_mask]

In [22]:
# Row counts after restricting both sides to the common EventIDs.
print(
    f"len of e: {len(e_train_filtered2)}",
    f"len of p: {len(p_train_filtered2)}",
    sep="\n",
)

len of e: 996665
len of p: 996665


We can now do the same with the function `filter_data`

In [24]:
# Run the same filtering through the helper. It is given file paths rather
# than DataFrames; per its recorded log output it also saves the filtered
# files and the list of removed EventIDs.
df_e, df_p, ids = filter_data(
    electron_path=path_train_e,
    positron_path=path_train_p,
    side='train',
    scat='n',
)

Initial Electron rows: 1000013, Initial Positron rows: 997376
TrackID Outliers - Electron: 13, Positron: 622
After TrackID Filtering - Electron: 999458, Positron: 996665
Common EventIDs found: 996665
Unmatched EventIDs after filtering: 2793
After Common EventID Filtering - Electron: 996665, Positron: 996665
Final Electron: 996665, Final Positron: 996665
Removed 3348 outliers from dataset: /home/sid/coding/mass_recon/data/raw/ElectronCoords_no_scat.dat
Removed 711 outliers from dataset: /home/sid/coding/mass_recon/data/raw/PositronCoords_no_scat.dat
Filtered Electron data saved at: /home/sid/coding/mass_recon/data/filtered/ElectronCoords_no_scat.dat
Filtered Positron data saved at: /home/sid/coding/mass_recon/data/filtered/PositronCoords_no_scat.dat
Saved 3428 removed EventIDs to: /home/sid/coding/mass_recon/data/filtered/train_n_removed_eventids.txt


In [25]:
# Row counts of the frames returned by filter_data.
print(
    f"len of e: {len(df_e)}",
    f"len of p: {len(df_p)}",
    sep="\n",
)

len of e: 996665
len of p: 996665


We do the same for the test data

In [27]:
# Filter the test files with the same helper.
# NOTE: e_test / p_test are rebound here from the raw test frames to the
# filtered ones.
e_test, p_test, bad_ids = filter_data(
    electron_path=path_test_e,
    positron_path=path_test_p,
    side='test',
    scat='n',
)

Initial Electron rows: 20000, Initial Positron rows: 19946
TrackID Outliers - Electron: 0, Positron: 10
After TrackID Filtering - Electron: 19991, Positron: 19935
Common EventIDs found: 19935
Unmatched EventIDs after filtering: 56
After Common EventID Filtering - Electron: 19935, Positron: 19935
Final Electron: 19935, Final Positron: 19935
Removed 65 outliers from dataset: /home/sid/coding/mass_recon/data/raw/ElectronSort_no_scat.dat
Removed 11 outliers from dataset: /home/sid/coding/mass_recon/data/raw/PositronSort_no_scat.dat
Filtered Electron data saved at: /home/sid/coding/mass_recon/data/filtered/ElectronSort_no_scat.dat
Filtered Positron data saved at: /home/sid/coding/mass_recon/data/filtered/PositronSort_no_scat.dat
Saved 66 removed EventIDs to: /home/sid/coding/mass_recon/data/filtered/test_n_removed_eventids.txt


In [28]:
# Row counts of the filtered electron and positron test frames.
(len(e_test), len(p_test))

(19935, 19935)

## 3. Filter All the Data using `filter_data()`

In [39]:
# NO SCATTERING DATA (PHYSICS SWITCHED OFF)
scat = 'n'
path_train_e_n, path_test_e_n = get_data_path(side='electron', scat=scat, data_path=raw_data_path)
path_train_p_n, path_test_p_n = get_data_path(side='positron', scat=scat, data_path=raw_data_path)

# The three return values are discarded; the call is made for whatever
# filter_data does besides returning them (its log output reports saved files).
_, _, _ = filter_data(
    electron_path=path_train_e_n,
    positron_path=path_train_p_n,
    side='train',
    scat=scat,
)
_, _, _ = filter_data(
    electron_path=path_test_e_n,
    positron_path=path_test_p_n,
    side='test',
    scat=scat,
)

Initial Electron rows: 1000013, Initial Positron rows: 997376
TrackID Outliers - Electron: 13, Positron: 622
After TrackID Filtering - Electron: 999458, Positron: 996665
Common EventIDs found: 996665
Unmatched EventIDs after filtering: 2793
After Common EventID Filtering - Electron: 996665, Positron: 996665
Final Electron: 996665, Final Positron: 996665
Removed 3348 outliers from dataset: /home/sid/coding/mass_recon/data/raw/ElectronCoords_no_scat.dat
Removed 711 outliers from dataset: /home/sid/coding/mass_recon/data/raw/PositronCoords_no_scat.dat
Filtered Electron data saved at: /home/sid/coding/mass_recon/data/filtered/ElectronCoords_no_scat.dat
Filtered Positron data saved at: /home/sid/coding/mass_recon/data/filtered/PositronCoords_no_scat.dat
Saved 3428 removed EventIDs to: /home/sid/coding/mass_recon/data/filtered/train_n_removed_eventids.txt
Initial Electron rows: 20000, Initial Positron rows: 19946
TrackID Outliers - Electron: 0, Positron: 10
After TrackID Filtering - Electron

In [40]:
# SCATTERING DATA (PHYSICS SWITCHED ON)
scat = 'y'
path_train_e_y, path_test_e_y = get_data_path(side='electron', scat=scat, data_path=raw_data_path)
path_train_p_y, path_test_p_y = get_data_path(side='positron', scat=scat, data_path=raw_data_path)

# The three return values are discarded; the call is made for whatever
# filter_data does besides returning them (its log output reports saved files).
_, _, _ = filter_data(
    electron_path=path_train_e_y,
    positron_path=path_train_p_y,
    side='train',
    scat=scat,
)
_, _, _ = filter_data(
    electron_path=path_test_e_y,
    positron_path=path_test_p_y,
    side='test',
    scat=scat,
)

Initial Electron rows: 999151, Initial Positron rows: 998844
TrackID Outliers - Electron: 830, Positron: 541
After TrackID Filtering - Electron: 996953, Positron: 996936
Common EventIDs found: 995268
Unmatched EventIDs after filtering: 3353
After Common EventID Filtering - Electron: 995268, Positron: 995268
Final Electron: 995268, Final Positron: 995268
Removed 3883 outliers from dataset: /home/sid/coding/mass_recon/data/raw/ElectronCoords_wide_acp.dat
Removed 3576 outliers from dataset: /home/sid/coding/mass_recon/data/raw/PositronCoords_wide_acp.dat
Filtered Electron data saved at: /home/sid/coding/mass_recon/data/filtered/ElectronCoords_wide_acp.dat
Filtered Positron data saved at: /home/sid/coding/mass_recon/data/filtered/PositronCoords_wide_acp.dat
Saved 4724 removed EventIDs to: /home/sid/coding/mass_recon/data/filtered/train_y_removed_eventids.txt
Initial Electron rows: 19812, Initial Positron rows: 19790
TrackID Outliers - Electron: 27, Positron: 17
After TrackID Filtering - El

## 4. Filter the Signal too

We remove the bad EventIDs from the Signal file too.

In [29]:
# Switch the electron paths and frames over to the scat='y' files.
# NOTE: this rebinds path_train_e, path_test_e, e_train and e_test, which
# earlier cells bound to the scat='n' data.
path_train_e, path_test_e = get_data_path(
    side='electron',
    scat='y',
    data_path=raw_data_path,
)

print(f"Train:\t{path_train_e},\nTest:\t{path_test_e}")

e_train = get_data(file_path=path_train_e)
e_test = get_data(file_path=path_test_e)
print(f"E Train Length:\t{len(e_train)}\nE Test Length:\t{len(e_test)}")

Train:	/home/sid/coding/mass_recon/data/raw/ElectronCoords_wide_acp.dat,
Test:	/home/sid/coding/mass_recon/data/raw/ElectronSort_signal.dat
E Train Length:	999151
E Test Length:	19812


In [33]:
# Switch the positron paths and frames over to the scat='y' files.
# NOTE: this rebinds path_train_p, path_test_p, p_train and p_test, which
# earlier cells bound to the scat='n' data.
path_train_p, path_test_p = get_data_path(
    side='positron',
    scat='y',
    data_path=raw_data_path,
)

print(f"Train Path:\t{path_train_p},\nTest Path:\t{path_test_p}")

p_train = get_data(file_path=path_train_p)
p_test = get_data(file_path=path_test_p)
print(f"P Train Length:\t{len(p_train)}\nP Test Length:\t{len(p_test)}")

Train Path:	/home/sid/coding/mass_recon/data/raw/PositronCoords_wide_acp.dat,
Test Path:	/home/sid/coding/mass_recon/data/raw/PositronSort_signal.dat
P Train Length:	998844
P Test Length:	19790


In [34]:
# Filter the scat='y' *training* files and keep the removed EventIDs in bad_ids.
# NOTE(review): the filtered training frames are bound to e_test / p_test
# although the inputs are the training paths with side='train' - confirm
# whether the test paths with side='test' were intended.
# NOTE(review): bad_ids is used below to filter the signal file - confirm the
# signal's EventIDs correspond to this (training) dataset.
e_test, p_test, bad_ids = filter_data(
    electron_path=path_train_e,
    positron_path=path_train_p,
    side='train',
    scat='y',
)

Initial Electron rows: 999151, Initial Positron rows: 998844
TrackID Outliers - Electron: 830, Positron: 541
After TrackID Filtering - Electron: 996953, Positron: 996936
Common EventIDs found: 995268
Unmatched EventIDs after filtering: 3353
After Common EventID Filtering - Electron: 995268, Positron: 995268
Final Electron: 995268, Final Positron: 995268
Removed 3883 outliers from dataset: /home/sid/coding/mass_recon/data/raw/ElectronCoords_wide_acp.dat
Removed 3576 outliers from dataset: /home/sid/coding/mass_recon/data/raw/PositronCoords_wide_acp.dat
Filtered Electron data saved at: /home/sid/coding/mass_recon/data/filtered/ElectronCoords_wide_acp.dat
Filtered Positron data saved at: /home/sid/coding/mass_recon/data/filtered/PositronCoords_wide_acp.dat
Saved 4724 removed EventIDs to: /home/sid/coding/mass_recon/data/filtered/train_y_removed_eventids.txt


In [35]:
# Build the signal file path with os.path.join instead of a hand-written '/' separator.
signal_file_path = os.path.join(raw_data_path, "Signal_DL_SIG_13_30_elec36_pos20.dat")
# Load the signal file with the same loader used for the electron/positron files.
df_signal = get_data(signal_file_path)
print(f"Length Signal: {len(df_signal)}")

Length Signal: 20749


In [36]:
# Drop signal rows whose EventID is listed in bad_ids.
signal_keep_mask = ~df_signal['EventID'].isin(bad_ids)
df_signal_filtered = df_signal[signal_keep_mask]
print(f"Length Filtered Signal: {len(df_signal_filtered)}")

Length Filtered Signal: 20658
