In [1]:
import pandas as pd
import numpy as np
from pyjet import cluster,DTYPE_PTEPM

# Pre processing

In [2]:
path = path = '/anomalyvol/data/events_LHCO2020_backgroundMC_Pythia.h5'

In [3]:
df = pd.read_hdf(path,stop=1000) # just read first 1000 events
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2090,2091,2092,2093,2094,2095,2096,2097,2098,2099
0,3.587869,-2.323472,-2.597121,1.497173,-2.480994,-2.269457,0.848844,-2.465643,-2.096595,0.961511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.921213,-0.745233,1.018857,0.689363,-0.642245,3.050711,1.999174,-0.343135,-0.322586,1.580572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.580352,-2.412026,1.680236,0.429869,-0.778697,-1.453413,0.856914,-2.243512,0.217628,0.407344,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.579134,-0.243543,-2.561824,0.312690,-0.283086,-0.281626,0.775053,-2.062494,-1.598718,0.868891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.644219,-2.457281,-2.670996,0.186128,-1.757650,2.719159,0.346987,-2.318233,-0.155036,0.501437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.493004,-0.886976,-0.391002,0.534181,-2.081904,2.548825,0.458036,-1.230976,2.204294,0.639672,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.404833,-0.510012,2.969482,0.300500,-2.248194,1.012964,0.316375,-1.815956,1.011110,0.604675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.407082,-2.195407,2.632497,0.413497,-1.212703,2.704103,0.578276,-0.291654,1.065150,0.508287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.573155,-0.755578,0.347725,0.751455,-1.929361,0.759764,0.798534,-0.532149,0.979545,0.941284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
all_events = df.values

In [5]:
rows = all_events.shape[0]
cols = all_events.shape[1]
data = np.zeros((rows, cols // 3, 4))

In [6]:
for i in range(rows):
    pseudojets_input = np.zeros(len([x for x in all_events[i][::3] if x > 0]), dtype=DTYPE_PTEPM)
    for j in range(cols // 3):
        if (all_events[i][j*3]>0):
            pseudojets_input[j]['pT'] = all_events[i][j*3]
            pseudojets_input[j]['eta'] = all_events[i][j*3+1]
            pseudojets_input[j]['phi'] = all_events[i][j*3+2]
        pass
    sequence = cluster(pseudojets_input, R=1.0, p=-1)
    jets = sequence.inclusive_jets()
    for k in range(len(jets)):
        data[i][k][0] = jets[k].pt
        data[i][k][1] = jets[k].eta
        data[i][k][2] = jets[k].phi
        data[i][k][3] = jets[k].mass

In [7]:
data.shape

(1000, 700, 4)

# VAE Model

In [8]:
import keras
from keras.layers import Lambda, Input, Dense, Flatten, Reshape
from keras.models import Model, Sequential
from keras.losses import mse, binary_crossentropy
from keras import backend as K

Using TensorFlow backend.


In [9]:
def sampling(args):
    """
    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """

    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

In [10]:
h_dim = 8
latent_dim = 2
input_shape = (cols // 3, 4)

In [11]:
# encoder
x = Input(shape=input_shape)
x_flat = Flatten()(x)
h = Dense(h_dim, activation='relu')(x_flat)
z_mean = Dense(latent_dim)(h)
z_log_sigma = Dense(latent_dim)(h)

In [12]:
# random sampling
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])

encoder = Model(inputs = x, outputs = z)

In [13]:
# decoder
decoder = Sequential([
    Dense(h_dim, input_dim=latent_dim, activation='relu'),
    Dense(np.prod(input_shape), activation='sigmoid'),
    Reshape(input_shape)
])

pred = decoder(z)

In [14]:
vae = Model(inputs = x, outputs = pred, name = 'vae')

In [15]:
def vae_loss(x, x_decoded_mean):
    xent_loss = K.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma), axis=-1)
    return xent_loss + kl_loss

vae.compile(optimizer='rmsprop', loss=vae_loss)

# Train

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train = data[0:int(len(data) * 0.8)]
x_val = data[int(len(data) * 0.8):]
batch_size = 100
epochs = 15

In [20]:
x_train.shape

(800, 700, 4)

In [23]:
hist = vae.fit(
    x_train,
    x_train
)

Epoch 1/1


InvalidArgumentError:  Incompatible shapes: [32] vs. [32,700,4]
	 [[node loss/sequential_1_loss/vae_loss/add_4 (defined at /opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3009) ]] [Op:__inference_keras_scratch_graph_1345]

Function call stack:
keras_scratch_graph
