In [20]:
import os
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import json
import funcs
import importlib
importlib.reload(funcs)
from funcs import *
import ipynbname

%matplotlib inline

# sys.path.append("~/files/rough_work/path_recon_ml_models")
# Directory of the path returned by ipynbname.path().
base_path = os.path.dirname(ipynbname.path())
# Build the data directory with os.path.join rather than gluing on a
# hard-coded '/' separator.
data_path = os.path.join(base_path, 'mass_recon_data')
print(f"base path: {base_path}")
print(f"data path: {data_path}")

base path: /home/sid/coding/rough_work/path_recon_ml_models
data path: /home/sid/coding/rough_work/path_recon_ml_models/mass_recon_data


## 1. Load the Data

In [120]:
# Selector values handed to get_data_path in the next cell.
side, scat = 'electron', 'y'

In [121]:
# Look up the train/test file paths for the currently selected side.
electron_paths = get_data_path(side, scat, data_path)
data_train_e, data_test_e = electron_paths
print(f"Train:\t{data_train_e},\nTest:\t{data_test_e}")

Train:	/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/ElectronCoords_wide_acp.dat,
Test:	/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/ElectronSort_signal.dat


In [122]:
# Column names applied to the headerless, whitespace-delimited data files.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
e_train = pd.read_csv(data_train_e, sep=r'\s+', header=None, names=COLUMN_NAMES)
e_test = pd.read_csv(data_test_e, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [125]:
# Switch the selector values to the positron side for the next lookup.
side, scat = 'positron', 'y'

In [127]:
# Look up the train/test file paths for the currently selected side.
positron_paths = get_data_path(side, scat, data_path)
data_train_p, data_test_p = positron_paths
print(f"Train:\t{data_train_p},\nTest:\t{data_test_p}")

Train:	/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/PositronCoords_wide_acp.dat,
Test:	/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/PositronSort_signal.dat


In [128]:
# Column names applied to the headerless, whitespace-delimited data files.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
p_train = pd.read_csv(data_train_p, sep=r'\s+', header=None, names=COLUMN_NAMES)
p_test = pd.read_csv(data_test_p, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [129]:
# Row counts of the two training frames, each shown next to its source path.
print(
    f"E Training Data ({data_train_e}):\t{len(e_train)}",
    f"P Training Data ({data_train_p}):\t{len(p_train)}",
    sep="\n",
)

E Training Data (/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/ElectronCoords_wide_acp.dat):	995268
P Training Data (/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/PositronCoords_wide_acp.dat):	995268


In [130]:
# Row counts of the two test frames, each shown next to its source path.
print(
    f"E Test Data ({data_test_e}):\t{len(e_test)}",
    f"P Test Data ({data_test_p}):\t{len(p_test)}",
    sep="\n",
)

E Test Data (/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/ElectronSort_signal.dat):	19812
P Test Data (/home/sid/files/rough_work/path_recon_ml_models/mass_recon_data/PositronSort_signal.dat):	19790


## 2. Analyze and Filter the Training Data

In [131]:
# EventIDs of electron training rows whose TrackID is not 1; they feed the
# bad_eventid list built a few cells below.
e_eventid = e_train.loc[e_train['TrackID'] != 1, 'EventID'].to_numpy()
print(f"num: {len(e_train[e_train['TrackID']!=1])}")

num: 0


In [132]:
# EventIDs of positron training rows whose TrackID is not 2; they feed the
# bad_eventid list built a few cells below.
p_eventid = p_train.loc[p_train['TrackID'] != 2, 'EventID'].to_numpy()
print(f"num: {len(p_train[p_train['TrackID']!=2])}")

num: 0


In [133]:
# Pool the flagged EventIDs from both sides into one array.
bad_eventid = np.concatenate([e_eventid, p_eventid], axis=0)

In [134]:
# Drop every row whose EventID was flagged, on both sides, so an event
# rejected on one side is removed from the other as well.
e_row_is_bad = e_train['EventID'].isin(bad_eventid)
p_row_is_bad = p_train['EventID'].isin(bad_eventid)
e_train_filtered = e_train.loc[~e_row_is_bad]
p_train_filtered = p_train.loc[~p_row_is_bad]

In [135]:
# Count of electron rows with TrackID != 1 before and after filtering.
print(
    f"(Before) num: {len(e_train[e_train['TrackID']!=1])}",
    f"(After) num: {len(e_train_filtered[e_train_filtered['TrackID']!=1])}",
    sep="\n",
)

(Before) num: 0
(After) num: 0


In [136]:
# Count of positron rows with TrackID != 2 before and after filtering.
print(
    f"(Before) num: {len(p_train[p_train['TrackID']!=2])}",
    f"(After) num: {len(p_train_filtered[p_train_filtered['TrackID']!=2])}",
    sep="\n",
)

(Before) num: 0
(After) num: 0


In [137]:
# Row counts of the filtered training frames.
print(
    f"len of e: {len(e_train_filtered)}",
    f"len of p: {len(p_train_filtered)}",
    sep="\n",
)

len of e: 995268
len of p: 995268


If the two lengths differ, some `EventID` values on the electron side are missing from the positron side, or vice versa. Let us check:

In [138]:
# EventID columns of the filtered training frames as NumPy arrays.
e_eventid, p_eventid = (
    frame['EventID'].to_numpy()
    for frame in (e_train_filtered, p_train_filtered)
)

In [139]:
# Number of distinct EventIDs that appear on one side only, in each direction.
print(
    f"Number of EventID in E Side which are not in P Side: {len(np.setdiff1d(e_eventid, p_eventid))}",
    f"Number of EventID in P Side which are not in E Side: {len(np.setdiff1d(p_eventid, e_eventid))}",
    sep="\n",
)

Number of EventID in E Side which are not in P Side: 0
Number of EventID in P Side which are not in E Side: 0


In [140]:
# Keep only the rows whose EventID is present on both sides.
common_eventid = np.intersect1d(e_eventid, p_eventid)
e_row_is_common = e_train_filtered['EventID'].isin(common_eventid)
p_row_is_common = p_train_filtered['EventID'].isin(common_eventid)
e_train_filtered2 = e_train_filtered.loc[e_row_is_common]
p_train_filtered2 = p_train_filtered.loc[p_row_is_common]

In [141]:
# Row counts after restricting both sides to the common EventIDs.
print(
    f"len of e: {len(e_train_filtered2)}",
    f"len of p: {len(p_train_filtered2)}",
    sep="\n",
)

len of e: 995268
len of p: 995268


In [142]:
# Write the filtered training frames back as space-separated text with no
# header or index. NOTE: the targets are data_train_e / data_train_p, the same
# paths the training data was read from, so the input files are overwritten.
for cleaned_frame, target_path in (
    (e_train_filtered2, data_train_e),
    (p_train_filtered2, data_train_p),
):
    cleaned_frame.to_csv(target_path, sep=' ', header=False, index=False)

In [143]:
# Reload the training data from disk to check whether a size mismatch remains.
# Column names applied to the headerless, whitespace-delimited data files.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
e_train = pd.read_csv(data_train_e, sep=r'\s+', header=None, names=COLUMN_NAMES)
p_train = pd.read_csv(data_train_p, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [144]:
# Row counts of the reloaded training frames.
print(
    f"Length of Training E: {len(e_train)}",
    f"Length of Training P: {len(p_train)}",
    sep="\n",
)

Length of Training E: 995268
Length of Training P: 995268


In [66]:
# Count the rows with an unexpected TrackID in the reloaded training frames.
print(
    f"# of Datapoints with TrackID !=1 in Training E: {len(e_train[e_train['TrackID']!=1])}",
    f"# of Datapoints with TrackID !=2 in Training P: {len(p_train[p_train['TrackID']!=2])}",
    sep="\n",
)

# of Datapoints with TrackID !=1 in Training E: 0
# of Datapoints with TrackID !=2 in Training P: 0


## 3. Analyze and Filter the Test Data

In [150]:
# Start with an empty accumulator; later cells append arrays of EventIDs to it.
bad_events_to_be_removed = list()
bad_events_to_be_removed

[]

In [145]:
# EventIDs of test rows with an unexpected TrackID (not 1 on the electron side,
# not 2 on the positron side), pooled into one array.
bad_eventid_e = e_test.loc[e_test['TrackID'] != 1, 'EventID'].to_numpy()
bad_eventid_p = p_test.loc[p_test['TrackID'] != 2, 'EventID'].to_numpy()
bad_eventid_total = np.concatenate([bad_eventid_e, bad_eventid_p], axis=0)
print(
    f"# of bad EventID in Test E: {len(bad_eventid_e)}",
    f"# of bad EventID in Test P: {len(bad_eventid_p)}",
    f"Total # bad EventID: {len(bad_eventid_total)}",
    sep="\n",
)

# of bad EventID in Test E: 27
# of bad EventID in Test P: 17
Total # bad EventID: 44


In [151]:
# Record the TrackID-based bad EventIDs in the accumulator list.
# NOTE: this mutates the list in place, so re-running the cell appends the same
# array again; it must also run before bad_eventid_total is recomputed on the
# filtered frames further down.
bad_events_to_be_removed.append(bad_eventid_total)

In [152]:
# Display the accumulated list of bad-EventID arrays.
bad_events_to_be_removed

[array([  501,  1408,  1874,  2032,  2062,  4243,  5040,  5704,  6477,
         7884,  7964,  8330,  9080, 10010, 10019, 10723, 10743, 11146,
        11420, 11703, 12574, 13867, 14628, 14905, 16156, 16400, 16833,
         3871,  5467,  5744,  6218,  6632,  7048,  7474,  8750,  8854,
        10957, 12168, 13197, 13904, 14917, 15026, 15139, 19724])]

In [153]:
# Drop every test row whose EventID was flagged, on both sides.
e_row_is_bad = e_test['EventID'].isin(bad_eventid_total)
p_row_is_bad = p_test['EventID'].isin(bad_eventid_total)
e_test_filtered = e_test.loc[~e_row_is_bad]
p_test_filtered = p_test.loc[~p_row_is_bad]

In [154]:
# Repeat the TrackID check on the filtered test frames.
# NOTE: this rebinds bad_eventid_e / bad_eventid_p / bad_eventid_total.
bad_eventid_e = e_test_filtered.loc[e_test_filtered['TrackID'] != 1, 'EventID'].to_numpy()
bad_eventid_p = p_test_filtered.loc[p_test_filtered['TrackID'] != 2, 'EventID'].to_numpy()
bad_eventid_total = np.concatenate([bad_eventid_e, bad_eventid_p], axis=0)
print(
    f"# of bad EventID in Test E: {len(bad_eventid_e)}",
    f"# of bad EventID in Test P: {len(bad_eventid_p)}",
    f"Total # bad EventID: {len(bad_eventid_total)}",
    sep="\n",
)

# of bad EventID in Test E: 0
# of bad EventID in Test P: 0
Total # bad EventID: 0


In [155]:
# Row counts of the filtered test frames.
print(
    f"length of Test E: {len(e_test_filtered)}",
    f"length of Test P: {len(p_test_filtered)}",
    sep="\n",
)

length of Test E: 19741
length of Test P: 19729


In [156]:
# The lengths differ, so pull out the EventID columns to find the events
# common to both sides.
e_eventid, p_eventid = (
    frame['EventID'].to_numpy()
    for frame in (e_test_filtered, p_test_filtered)
)

In [157]:
# Number of distinct EventIDs that appear on one side only, in each direction.
# The second label previously read "Test N"; the difference is taken against
# the electron side, so it is labelled "Test E".
print(f"# of EventID present in Test E but not in Test P: {len(np.setdiff1d(e_eventid, p_eventid))}")
print(f"# of EventID present in Test P but not in Test E: {len(np.setdiff1d(p_eventid, e_eventid))}")

# of EventID present in Test E but not in Test P: 222
# of EventID present in Test P but not in Test N: 210


In [160]:
# Add the one-sided EventIDs (E-only first, then P-only) to the accumulator.
# NOTE: in-place mutation, so re-running the cell adds the arrays again.
bad_events_to_be_removed.extend([
    np.setdiff1d(e_eventid, p_eventid),
    np.setdiff1d(p_eventid, e_eventid),
])

In [164]:
# Keep only the test rows whose EventID is present on both sides.
common_eventid = np.intersect1d(e_eventid, p_eventid)
e_row_is_common = e_test_filtered['EventID'].isin(common_eventid)
p_row_is_common = p_test_filtered['EventID'].isin(common_eventid)
e_test_filtered2 = e_test_filtered.loc[e_row_is_common]
p_test_filtered2 = p_test_filtered.loc[p_row_is_common]

In [165]:
# Row counts after restricting both test frames to the common EventIDs.
print(
    f"len of Test E: {len(e_test_filtered2)}",
    f"len of Test P: {len(p_test_filtered2)}",
    sep="\n",
)

len of Test E: 19519
len of Test P: 19519


In [166]:
# Write the filtered test frames back as space-separated text with no header
# or index. NOTE: the targets are data_test_e / data_test_p, the same paths
# the test data was read from, so the input files are overwritten.
for cleaned_frame, target_path in (
    (e_test_filtered2, data_test_e),
    (p_test_filtered2, data_test_p),
):
    cleaned_frame.to_csv(target_path, sep=' ', header=False, index=False)

In [167]:
# Reload the test data from disk to check whether a size mismatch remains.
# Column names applied to the headerless, whitespace-delimited data files.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
e_test = pd.read_csv(data_test_e, sep=r'\s+', header=None, names=COLUMN_NAMES)
p_test = pd.read_csv(data_test_p, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [168]:
# Row counts of the reloaded test frames.
print(
    f"Length of Test E: {len(e_test)}",
    f"Length of Test P: {len(p_test)}",
    sep="\n",
)

Length of Test E: 19519
Length of Test P: 19519


In [169]:
# Count the rows with an unexpected TrackID in the reloaded test frames.
print(
    f"# of Datapoints with TrackID !=1 in Test E: {len(e_test[e_test['TrackID']!=1])}",
    f"# of Datapoints with TrackID !=2 in Test P: {len(p_test[p_test['TrackID']!=2])}",
    sep="\n",
)

# of Datapoints with TrackID !=1 in Test E: 0
# of Datapoints with TrackID !=2 in Test P: 0


## 4. Filter the Signal Data

In [183]:
# Signal file to be filtered. This is a relative path, so it is resolved against
# the current working directory rather than built from data_path.
signal_file = "mass_recon_data/Signal_DL_SIG_13_30_elec36_pos20.dat"

In [184]:
# Column names applied to the headerless, whitespace-delimited signal file.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
df_signal = pd.read_csv(signal_file, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [179]:
# Flatten the accumulated list of EventID arrays into a single array.
# NOTE(review): bad_eventids is not referenced by any of the cells visible
# below; the signal is filtered against the cleaned positron test EventIDs.
bad_eventids = np.concatenate(bad_events_to_be_removed, axis=0)

In [187]:
# Column names applied to the headerless, whitespace-delimited data files.
COLUMN_NAMES = [
    'OUT', 'EventID', 'TrackID', 'ParticleCount1', 'ParticleCount2', 'X', 'Y',
    'dX', 'dY', 'E', 'P', 'ip', 'oop', 'vert_x', 'vert_y', 'vert_z'
]

# Ask for the positron paths explicitly. This cell assigns the positron
# variables, so it should not depend on whatever `side` was last set to in an
# earlier cell.
data_train_p, data_test_p = get_data_path('positron', scat, data_path)

# Re-read the (already cleaned) positron test file. The separator is a regex,
# so it is written as a raw string: '\s' inside a plain literal is an invalid
# escape sequence (SyntaxWarning on Python 3.12+).
p_test = pd.read_csv(data_test_p, sep=r'\s+', header=None, names=COLUMN_NAMES)

In [188]:
# EventID column of the positron test frame (kept as a pandas Series).
test_p_eventid = p_test.loc[:, 'EventID']

In [190]:
# Keep only the signal rows whose EventID is present in the positron test set.
signal_row_is_kept = df_signal['EventID'].isin(test_p_eventid)
df_signal_filtered = df_signal.loc[signal_row_is_kept]

In [192]:
# Write the filtered signal back as space-separated text with no header or
# index. NOTE: signal_file is the path df_signal was read from, so the input
# file is overwritten.
df_signal_filtered.to_csv(
    signal_file,
    sep=' ',
    header=False,
    index=False,
)