In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
'''             0       1     2       3         4           5        6        7       8       9       10
hasTHETA_CHI2  ['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'chi2', 'ndof', 'pdgId']
'hasEPZ'      ['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'energy', 'pz', 'pdgId']
'hasMEPZ'      ['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'mass', 'energy', 'pz', 'pdgId']
In this file, the -1 element (pdgId) is converted to eta, calculated from theta. 

'''

"       0       1     2       3         4           5        6        7       8      9       10\nnoMEPZ'  ['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'pdgId']\n'hasEPZ' ['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'energy', 'pz', 'pdgId']\n'hasMEPZ'['pt', 'eta', 'phi', 'charge', 'POCA_x', 'POCA_y', 'POCA_z', 'mass', 'energy', 'pz', 'pdgId']\n\n\n"

In [2]:
'''
80/20/20 split train/test/val.
'''
pTcut = 1.0
ptc = "pTcut:" + str(pTcut) + "GeV"
iserr = "noErr"
opts = ['hasTHETA_CHI2', 'hasEPZ', 'hasMEPZ']
pocas = ["pocas", "ips"]
scal = "scal"
poca = "pocas"

for mepz in opts[:1]:
    print(mepz)
    train_frac = 0.6
    test_frac = 0.2

    PFC_data_np = np.load('PFC_data_%s_%s.npy'%(mepz, poca))
    print(PFC_data_np.shape)
    SV_true_np = np.load('SV_true_%s_%s.npy'%(mepz, poca))

    nev = PFC_data_np.shape[0] # no. events
    nvar = PFC_data_np.shape[2] # no. vars

    #Cuts
    for j in range (PFC_data_np.shape[0]):
        for k in range(PFC_data_np.shape[1]):
            #Cut on charge, pdgId, pT)
            if (PFC_data_np[j,k,3] == 0 or PFC_data_np[j,k,-1] == 0 or PFC_data_np[j,k,0] < pTcut): 
                PFC_data_np[j,k] = np.zeros(PFC_data_np.shape[-1])

    #find max no. of particles in a jet after this cut
    maxp = 0
    for j in range (PFC_data_np.shape[0]):
        nump = 0
        for k in range(PFC_data_np.shape[1]):
            if (PFC_data_np[j,k,0] != 0): #this means we have an actual particle, not zero padding
                nump += 1
                #convert pdgId to theta using theta = 2*atan(exp(-eta))
                PFC_data_np[j,k,-1] = 2 * np.arctan(np.exp(-1*PFC_data_np[j,k,1]))
            if (PFC_data_np[j,k,0] < 0 or (PFC_data_np[j,k,0] < 1 and PFC_data_np[j,k,0] != 0)):
                print(j, k, "We have a pT problem!")
        if (nump > maxp): 
            maxp = nump
        #order in pT so the all-zero particles will always be the ones lost when i cull
        PFC_data_np[j] = PFC_data_np[j, np.flip((PFC_data_np[j,:,0].argsort()), 0)]
        
    
    
    
    print("maxp (after three cuts): ", maxp)    
    PFC_data_np_new = PFC_data_np[:,:maxp,:]
    maxval = np.amax(PFC_data_np_new)
    maxind = np.where(PFC_data_np_new == maxval)
    minval = np.amin(PFC_data_np_new)
    minind = np.where(PFC_data_np_new == minval)
    print("max, inds", maxval, maxind)
    print("min: ", minval, minind)

    print("First two entries pre-scale:", PFC_data_np_new[:2,:,:])

    #Apply Scaling
    scaler = StandardScaler()
    arr = PFC_data_np_new.reshape(PFC_data_np_new.shape[0]*PFC_data_np_new.shape[1], PFC_data_np_new.shape[2])
    scaler.fit(arr)
    arr = scaler.transform(arr)
    PFC_data_np_new = arr.reshape(PFC_data_np_new.shape[0], PFC_data_np_new.shape[1], PFC_data_np_new.shape[2])
    
    print("First two entries post-scale:", PFC_data_np_new[:2,:,:])
    
    cut1 = int(nev*train_frac)
    cut2 = cut1 + int(nev*test_frac)

    X_train_np = PFC_data_np_new[:cut1]
    y_train_np = SV_true_np[:cut1]

    X_test_np = PFC_data_np_new[cut1:cut2]
    y_test_np = SV_true_np[cut1:cut2]

    X_validate_np = PFC_data_np_new[cut2:]
    y_validate_np = SV_true_np[cut2:]
    
    X_train = torch.tensor(X_train_np, dtype=torch.float)
    y_train = torch.tensor(y_train_np,dtype=torch.float)

    X_test = torch.tensor(X_test_np, dtype=torch.float)
    y_test = torch.tensor(y_test_np, dtype=torch.float)

    X_validate = torch.tensor(X_validate_np, dtype=torch.float)
    y_validate = torch.tensor(y_validate_np, dtype = torch.float)

    torch.save(X_train, 'X_train_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    torch.save(y_train, 'y_train_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    torch.save(X_test, 'X_test_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    torch.save(y_test, 'y_test_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    torch.save(X_validate, 'X_validate_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    torch.save(y_validate, 'y_validate_%s_%s_%s_%s_%s.pt'%(iserr, mepz, poca, scal, ptc))
    
    
    
    

noMEPZ
(70191, 75, 8)
maxp (after three cuts):  29
max, inds 300.1837158203125 (array([53256]), array([5]), array([6]))
min:  -211.0 (array([    0,     0,     1, ..., 70190, 70190, 70190]), array([ 4,  5,  0, ..., 14, 15, 16]), array([7, 7, 7, ..., 7, 7, 7]))
First two entries pre-scale: [[[ 2.70000000e+01 -5.86870909e-01  2.04605913e+00  1.00000000e+00
    1.82914257e-01  2.01659396e-01  1.26696360e+00  2.11000000e+02]
  [ 1.33203125e+01 -7.12302029e-01  1.88804519e+00  1.00000000e+00
   -4.33726460e-02  1.13134116e-01  1.51744652e+00 -1.30000000e+01]
  [ 1.11718750e+01 -5.61235368e-01  1.98179793e+00  1.00000000e+00
    8.18332955e-02  1.58565059e-01  1.21789801e+00  2.11000000e+02]
  [ 6.65234375e+00 -5.44938505e-01  2.09342480e+00  1.00000000e+00
    1.45719278e+00  9.81468558e-01  2.84448206e-01  2.11000000e+02]
  [ 4.06250000e+00 -5.63432693e-01  2.02369690e+00 -1.00000000e+00
    1.53552204e-01  1.92033201e-01  1.22246337e+00 -2.11000000e+02]
  [ 2.19335938e+00 -6.98019326e-01  

(70191, 75, 10)
maxp (after three cuts):  29
max, inds 653.8782958984375 (array([55725]), array([0]), array([7]))
min:  -551.42138671875 (array([68457]), array([0]), array([8]))
First two entries pre-scale: [[[ 2.70000000e+01 -5.86870909e-01  2.04605913e+00  1.00000000e+00
    1.82914257e-01  2.01659396e-01  1.26696360e+00  3.17849350e+01
   -1.67708855e+01  2.11000000e+02]
  [ 1.33203125e+01 -7.12302029e-01  1.88804519e+00  1.00000000e+00
   -4.33726460e-02  1.13134116e-01  1.51744652e+00  1.68451500e+01
   -1.03110228e+01 -1.30000000e+01]
  [ 1.11718750e+01 -5.61235368e-01  1.98179793e+00  1.00000000e+00
    8.18332955e-02  1.58565059e-01  1.21789801e+00  1.29787846e+01
   -6.60443687e+00  2.11000000e+02]
  [ 6.65234375e+00 -5.44938505e-01  2.09342480e+00  1.00000000e+00
    1.45719278e+00  9.81468558e-01  2.84448206e-01  7.66603279e+00
   -3.80721903e+00  2.11000000e+02]
  [ 4.06250000e+00 -5.63432693e-01  2.02369690e+00 -1.00000000e+00
    1.53552204e-01  1.92033201e-01  1.22246337

First two entries post-scale: [[[ 5.26323181e+00 -1.00550953e+00  2.16049192e+00  1.91513722e+00
    2.28360130e-01  2.27238084e-01  4.46920272e-01  3.03331477e+00
   -1.96495657e+00  1.98642041e+00]
  [ 2.42185645e+00 -1.21944027e+00  1.99386628e+00  1.91513722e+00
   -1.05749054e-01  9.84834011e-02  5.18805255e-01  1.46335154e+00
   -1.20960113e+00 -1.26751855e-01]
  [ 1.97560960e+00 -9.61786491e-01  2.09272851e+00  1.91513722e+00
    7.91156216e-02  1.64559969e-01  4.32839156e-01  1.05705040e+00
   -7.76187914e-01  1.98642041e+00]
  [ 1.03686851e+00 -9.33991156e-01  2.21043900e+00  1.91513722e+00
    2.10981306e+00  1.36142349e+00  1.64952531e-01  4.98754219e-01
   -4.49107629e-01  1.98642041e+00]
  [ 4.98938225e-01 -9.65534169e-01  2.13691096e+00 -1.92276196e+00
    1.85007504e-01  2.13237364e-01  4.34149348e-01  1.89864316e-01
   -2.85962579e-01 -1.99464520e+00]
  [ 1.10703470e-01 -1.19508021e+00  1.99406829e+00 -1.92276196e+00
   -8.75581161e-02  1.15170708e-01  5.05129441e-01 -1

hasMEPZ
(70191, 75, 11)
maxp (after three cuts):  29
max, inds 653.8782958984375 (array([55725]), array([0]), array([8]))
min:  -551.42138671875 (array([68457]), array([0]), array([9]))
First two entries pre-scale: [[[ 2.70000000e+01 -5.86870909e-01  2.04605913e+00  1.00000000e+00
    1.82914257e-01  2.01659396e-01  1.26696360e+00  1.39526367e-01
    3.17849350e+01 -1.67708855e+01  2.11000000e+02]
  [ 1.33203125e+01 -7.12302029e-01  1.88804519e+00  1.00000000e+00
   -4.33726460e-02  1.13134116e-01  1.51744652e+00  1.05712891e-01
    1.68451500e+01 -1.03110228e+01 -1.30000000e+01]
  [ 1.11718750e+01 -5.61235368e-01  1.98179793e+00  1.00000000e+00
    8.18332955e-02  1.58565059e-01  1.21789801e+00  1.39526367e-01
    1.29787846e+01 -6.60443687e+00  2.11000000e+02]
  [ 6.65234375e+00 -5.44938505e-01  2.09342480e+00  1.00000000e+00
    1.45719278e+00  9.81468558e-01  2.84448206e-01  1.39526367e-01
    7.66603279e+00 -3.80721903e+00  2.11000000e+02]
  [ 4.06250000e+00 -5.63432693e-01  2.023

First two entries post-scale: [[[ 5.26323181e+00 -1.00550953e+00  2.16049192e+00  1.91513722e+00
    2.28360130e-01  2.27238084e-01  4.46920272e-01  1.69999032e+00
    3.03331477e+00 -1.96495657e+00  1.98642041e+00]
  [ 2.42185645e+00 -1.21944027e+00  1.99386628e+00  1.91513722e+00
   -1.05749054e-01  9.84834011e-02  5.18805255e-01  1.14351826e+00
    1.46335154e+00 -1.20960113e+00 -1.26751855e-01]
  [ 1.97560960e+00 -9.61786491e-01  2.09272851e+00  1.91513722e+00
    7.91156216e-02  1.64559969e-01  4.32839156e-01  1.69999032e+00
    1.05705040e+00 -7.76187914e-01  1.98642041e+00]
  [ 1.03686851e+00 -9.33991156e-01  2.21043900e+00  1.91513722e+00
    2.10981306e+00  1.36142349e+00  1.64952531e-01  1.69999032e+00
    4.98754219e-01 -4.49107629e-01  1.98642041e+00]
  [ 4.98938225e-01 -9.65534169e-01  2.13691096e+00 -1.92276196e+00
    1.85007504e-01  2.13237364e-01  4.34149348e-01  1.69999032e+00
    1.89864316e-01 -2.85962579e-01 -1.99464520e+00]
  [ 1.10703470e-01 -1.19508021e+00  1.99