In [1]:
import pandas as pd
import numpy as np
from pyjet import cluster,DTYPE_PTEPM

# Pre processing

In [2]:
path = path = '/anomalyvol/data/events_LHCO2020_backgroundMC_Pythia.h5'

In [3]:
df = pd.read_hdf(path,stop=1000) # just read first 1000 events

In [4]:
all_events = df.values

In [5]:
rows = all_events.shape[0]
cols = all_events.shape[1]
data = []

In [6]:
for i in range(rows):
    pseudojets_input = np.zeros(len([x for x in all_events[i][::3] if x > 0]), dtype=DTYPE_PTEPM)
    for j in range(cols // 3):
        if (all_events[i][j*3]>0):
            pseudojets_input[j]['pT'] = all_events[i][j*3]
            pseudojets_input[j]['eta'] = all_events[i][j*3+1]
            pseudojets_input[j]['phi'] = all_events[i][j*3+2]
        pass
    sequence = cluster(pseudojets_input, R=1.0, p=-1)
    jets = sequence.inclusive_jets()
    for k in range(len(jets)):
        jet = []
        jet.append(jets[k].pt)
        jet.append(jets[k].eta)
        jet.append(jets[k].phi)
        jet.append(jets[k].mass)
        data.append(jet)

In [7]:
data = np.array(data)
data.shape

(14795, 4)

In [8]:
# looking for bad data
bad_data = 0
for i in range(len(data)):
    if(0.0 in data[i]):
        print(data[i])
        bad_data += 1

[1.14669609 0.38852206 2.76795745 0.        ]
[0.39850035 0.71061379 0.47403628 0.        ]
[ 1.03505492 -0.63787401 -2.29100299  0.        ]
[ 0.86411041 -1.07041073  0.51943153  0.        ]
[ 0.3633931   0.25732511 -2.30640078  0.        ]
[0.83586824 0.03770738 0.02798815 0.        ]
[ 0.75008154 -0.20511004  2.81549978  0.        ]
[0.68774503 2.13499832 1.63541198 0.        ]
[0.94247574 0.11074729 1.84531426 0.        ]
[1.07450008 1.1747762  3.01616168 0.        ]
[ 0.9007169   0.99564695 -0.63307905  0.        ]
[1.84172845 1.57989311 0.0058518  0.        ]
[ 0.64557874 -2.13930798 -0.2995446   0.        ]
[3.00485945 0.58341414 1.47977638 0.        ]
[ 0.59982067 -0.94319236 -2.62025762  0.        ]
[ 0.66232765 -0.55070883  1.12762892  0.        ]
[ 0.71634895 -1.78028464  2.07532454  0.        ]
[ 0.26076359 -0.53986913  2.90156698  0.        ]
[1.77076709 0.33344129 1.08254683 0.        ]
[1.38311613 0.0942976  0.13351126 0.        ]
[0.55915505 0.21554288 2.87588239 0.    

[ 1.13005853 -0.77902609 -3.05477333  0.        ]
[ 2.99637771 -0.60527068 -2.6844573   0.        ]
[ 2.19303393 -0.33169627 -0.50877905  0.        ]
[ 3.01721978  0.3041524  -2.43942666  0.        ]
[ 0.91745889  0.08398183 -2.98438072  0.        ]
[ 0.62256813  1.78474343 -2.33784819  0.        ]
[ 1.11139703  0.99215287 -2.84892178  0.        ]
[ 0.74016535  0.93497664 -0.90829629  0.        ]
[ 0.73525625 -0.03046078  2.3198967   0.        ]
[1.48436105 1.11628532 1.84006488 0.        ]
[ 0.94256085 -1.88000619 -1.1373893   0.        ]
[0.38128456 0.74237204 2.52361488 0.        ]
[ 1.39599764 -0.48468807 -1.98809707  0.        ]
[ 0.21155588 -0.99707013  2.89939094  0.        ]
[ 3.68566585  0.49110258 -0.96738195  0.        ]
[ 4.61271143 -1.79622173 -1.12663972  0.        ]
[ 1.7951926  -0.76584697  2.37826729  0.        ]


In [24]:
(bad_data, len(data))

(439, 14795)

In [10]:
# checking distribution of data
(data[:][1].mean(), data[:][1].std()) # eta

(344.6350920188037, 387.5184029169282)

# VAE Model

In [11]:
import keras
from keras.layers import Lambda, Input, Dense, Flatten, Reshape
from keras.models import Model, Sequential
from keras import metrics
from keras import backend as K

Using TensorFlow backend.


In [12]:
def sampling(args):
    """
    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """

    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

In [13]:
inter_dim = 16
final_dim = 8
latent_dim = 2
input_dim = 4

In [14]:
# encoder
x = Input(shape=(input_dim,))
#x_flat = Flatten()(x)
h1 = Dense(inter_dim, activation='relu')(x)
h2 = Dense(final_dim, activation='relu')(h1)
z_mean = Dense(latent_dim)(h2)
z_log_sigma = Dense(latent_dim)(h2)

In [15]:
# random sampling
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])

encoder = Model(inputs = x, outputs = z)

In [16]:
# decoder
decoder_h2 = Dense(final_dim, activation='relu')
decoder_h1 = Dense(inter_dim, activation='relu')
decoder_mean = Dense(input_dim, activation='sigmoid')

h2_decoded = decoder_h2(z)
h1_decoded = decoder_h1(h2_decoded)
x_decoded_mean = decoder_mean(h1_decoded)
# x_decoded = Reshape(input_shape)(x_decoded_mean)

In [17]:
vae = Model(inputs = x, outputs = x_decoded_mean, name = 'vae')

In [18]:
def vae_loss(x, y):
    xent_loss = metrics.binary_crossentropy(x, y)
    kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma), axis=-1)
    return xent_loss + kl_loss

vae.compile(optimizer='rmsprop', loss=vae_loss)

In [19]:
vae.summary()

Model: "vae"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 16)           80          input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 8)            136         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 2)            18          dense_2[0][0]                    
________________________________________________________________________________________________

# Train

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train = data[0:int(len(data) * 0.8)]
x_val = data[int(len(data) * 0.8):]
batch_size = 10
epochs = 150

In [22]:
x_train.shape

(11836, 4)

In [23]:
hist = vae.fit(x_train, x_train,
               shuffle=True,
               epochs=epochs,
               batch_size=batch_size,
               validation_data=(x_val, x_val))

Train on 11836 samples, validate on 2959 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150

KeyboardInterrupt: 