# IMPORTS

In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn import datasets, metrics, model_selection, svm
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
np.random.seed(31415)
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

# IMPORT DATASETS

In [2]:
# Read the signal and background samples and tag each one with its class label
# (signal = 1, background = 0) in a `LABEL` column.
signals = pd.read_csv(
    r'/Users/cadodo/Desktop/main/LIP/CSV/ResmMed4000mX1lb0p2yp0p4.csv'
).assign(LABEL=1)
BG = pd.read_csv(
    r'/Users/cadodo/Desktop/main/LIP/CSV/bkg.csv'
).assign(LABEL=0)

# SPLITTING BG

In [3]:
# INPUTS
# The 11 kinematic features fed to the autoencoder (jet_DL1r_max is left out for now).
bg_feature_cols = ['MET_px', 'MET_py', 'jet_e', 'jet_px', 'jet_py', 'jet_pz',
                   'ljet_e', 'ljet_px', 'ljet_py', 'ljet_pz', 'HT']

# Event weight first, then the features, then the bookkeeping columns.
X_bg = BG.loc[:, ['normalisedCombinedWeight'] + bg_feature_cols
                 + ['gen_split', 'train_weight', 'LABEL']]

# Row selectors for the three splits recorded in `gen_split`.
bg_is_train = X_bg['gen_split'] == 'train'
bg_is_test = X_bg['gen_split'] == 'test'
bg_is_val = X_bg['gen_split'] == 'val'

# WEIGHTS OF bg_test
nCW_bg_test = X_bg.loc[bg_is_test, 'normalisedCombinedWeight']

# bg_train TO TRAIN AUTOENCODER
bg_train = X_bg.loc[bg_is_train, bg_feature_cols]

# bg_test TO JOIN TO signals_test TO TEST AUTOENCODER
bg_test = X_bg.loc[bg_is_test, bg_feature_cols]

# bg_val TO JOIN TO signals_val TO VALIDATE AUTOENCODER
bg_val = X_bg.loc[bg_is_val, bg_feature_cols]


# DEFINE LABELS TO USE FOR THE ROC CURVE
y_bg = BG.loc[:, ['gen_split', 'LABEL']]

# Keep only the LABEL column (as a one-column DataFrame) for each split.
y_bg_train = y_bg.loc[y_bg['gen_split'] == 'train', ['LABEL']]
y_bg_test = y_bg.loc[y_bg['gen_split'] == 'test', ['LABEL']]
y_bg_val = y_bg.loc[y_bg['gen_split'] == 'val', ['LABEL']]

# SPLITTING SIGNALS

In [4]:
# INPUTS
# The 11 kinematic features fed to the autoencoder (jet_DL1r_max is left out for now).
signal_feature_cols = ['MET_px', 'MET_py', 'jet_e', 'jet_px', 'jet_py', 'jet_pz',
                       'ljet_e', 'ljet_px', 'ljet_py', 'ljet_pz', 'HT']

# Event weight first, then the features, then the bookkeeping columns.
X_signals = signals.loc[:, ['normalisedCombinedWeight'] + signal_feature_cols
                           + ['gen_split', 'train_weight', 'LABEL']]

# Row selectors for the three splits recorded in `gen_split`.
signal_is_train = X_signals['gen_split'] == 'train'
signal_is_test = X_signals['gen_split'] == 'test'
signal_is_val = X_signals['gen_split'] == 'val'

# WEIGHTS OF signals_test
nCW_signals_test = X_signals.loc[signal_is_test, 'normalisedCombinedWeight']

signals_train = X_signals.loc[signal_is_train, signal_feature_cols]

# signals_test TO JOIN TO bg_test TO TEST AUTOENCODER
signals_test = X_signals.loc[signal_is_test, signal_feature_cols]

# signals_val TO JOIN TO bg_val TO VALIDATE AUTOENCODER
signals_val = X_signals.loc[signal_is_val, signal_feature_cols]


# DEFINE LABELS TO USE FOR THE ROC CURVE
y_signals = signals.loc[:, ['gen_split', 'LABEL']]

# Keep only the LABEL column (as a one-column DataFrame) for each split.
y_signals_train = y_signals.loc[y_signals['gen_split'] == 'train', ['LABEL']]
y_signals_test = y_signals.loc[y_signals['gen_split'] == 'test', ['LABEL']]
y_signals_val = y_signals.loc[y_signals['gen_split'] == 'val', ['LABEL']]

# DEFINE WEIGHTS

In [5]:
# Per-event `train_weight` for each split, as Series that keep the original row index.

# BG WEIGHTS
weight_bg_train, weight_bg_test, weight_bg_val = (
    X_bg.loc[X_bg['gen_split'] == split_name, 'train_weight']
    for split_name in ('train', 'test', 'val')
)

# SIGNALS WEIGHTS
weight_signals_train, weight_signals_test, weight_signals_val = (
    X_signals.loc[X_signals['gen_split'] == split_name, 'train_weight']
    for split_name in ('train', 'test', 'val')
)

# SUM WEIGHTS

In [6]:
# Total training weight per class as (background, signal) for every split.
class_weights_train = tuple(
    w.to_numpy().sum() for w in (weight_bg_train, weight_signals_train)
)
class_weights_test = tuple(
    w.to_numpy().sum() for w in (weight_bg_test, weight_signals_test)
)
class_weights_val = tuple(
    w.to_numpy().sum() for w in (weight_bg_val, weight_signals_val)
)

print("class_weights_train (BG, signals):", class_weights_train)
print("class_weights_test (BG, signals):", class_weights_test)
print("class_weights_val (BG, signals):", class_weights_val)

class_weights_train (BG, signals): (0.9999999999999982, 1.0)
class_weights_test (BG, signals): (0.9999999999999997, 1.0)
class_weights_val (BG, signals): (1.0000000000000004, 1.0)


# CONCATENATING DATASETS

In [7]:
# Training and validation use background events only; the test set stacks signal
# rows on top of background rows with a fresh 0..n-1 index.

# JOIN NECESSARY CSVs FOR X
X_train = bg_train
X_val = bg_val
X_test = pd.concat([signals_test, bg_test], ignore_index=True)

# JOIN NECESSARY CSVs FOR y
y_val = y_bg_val
y_test = pd.concat([y_signals_test, y_bg_test], ignore_index=True).to_numpy()

# JOIN NECESSARY CSVs FOR weight
weight_train = weight_bg_train.to_numpy()
weight_val = weight_bg_val
# Test-set weights follow the same signal-then-background order, as column vectors.
weight_test = (
    pd.concat([weight_signals_test, weight_bg_test], ignore_index=True)
    .to_numpy()
    .reshape(-1, 1)
)
nCW_test = (
    pd.concat([nCW_signals_test, nCW_bg_test], ignore_index=True)
    .to_numpy()
    .reshape(-1, 1)
)

# STANDARDISATION OF INPUTS

In [8]:
from sklearn.preprocessing import StandardScaler


def _print_feature_stats(frame):
    """Print one `name: mean +/- std` line per column of `frame`."""
    for feature, mean, std in zip(frame.columns, frame.mean(0), frame.std(0)):
        print("{:9}: {:7.4f} +/- {:7.4f}".format(feature, mean, std))


print("Original mean and variance:")
_print_feature_stats(X_train)

# Fit the scaler on the training frame only, then apply the same transform to the
# test and validation frames so all three share one scale.
scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

print("\nStandardised mean and variance:")
_print_feature_stats(X_train)

Original mean and variance:
MET_px   :  0.8456 +/- 314.9951
MET_py   : -3.0400 +/- 315.0266
jet_e    : 698.2610 +/- 424.8866
jet_px   : -1.0216 +/- 351.9621
jet_py   :  3.0931 +/- 353.5471
jet_pz   : -5.0986 +/- 645.8425
ljet_e   : 726.7014 +/- 394.2431
ljet_px  : -1.1067 +/- 365.4329
ljet_py  :  2.8156 +/- 366.9482
ljet_pz  : -5.3815 +/- 638.0405
HT       : 1115.6198 +/- 473.4075

Standardised mean and variance:
MET_px   : -0.0000 +/-  1.0000
MET_py   :  0.0000 +/-  1.0000
jet_e    :  0.0000 +/-  1.0000
jet_px   :  0.0000 +/-  1.0000
jet_py   :  0.0000 +/-  1.0000
jet_pz   :  0.0000 +/-  1.0000
ljet_e   :  0.0000 +/-  1.0000
ljet_px  :  0.0000 +/-  1.0000
ljet_py  :  0.0000 +/-  1.0000
ljet_pz  : -0.0000 +/-  1.0000
HT       : -0.0000 +/-  1.0000


In [49]:
# Per-column flag: does the standardised training frame contain any NaN?
X_train.isna().any()

MET_px     False
MET_py     False
jet_e      False
jet_px     False
jet_py     False
jet_pz     False
ljet_e     False
ljet_px    False
ljet_py    False
ljet_pz    False
HT         False
dtype: bool

In [50]:
# Per-column flag: does the standardised training frame contain +inf or -inf?
X_train.isin([np.inf, -np.inf]).any()

MET_px     False
MET_py     False
jet_e      False
jet_px     False
jet_py     False
jet_pz     False
ljet_e     False
ljet_px    False
ljet_py    False
ljet_pz    False
HT         False
dtype: bool

In [52]:
# Sanity check: (rows, features) of the standardised training frame.
X_train.shape

(3438041, 11)

# VAE

Create a sampling layer

In [9]:
class Sampling(layers.Layer):
    """Reparameterisation layer for the VAE latent space.

    Takes the pair ``(z_mean, z_log_var)`` and returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon``, where ``epsilon`` is standard
    normal noise with one value per (row, latent dimension).
    """

    def call(self, inputs):
        """Sample a latent vector for every row of ``inputs``.

        Args:
            inputs: a two-element sequence ``(z_mean, z_log_var)``.

        Returns:
            A tensor with the same shape as ``z_mean`` holding the sampled z.
        """
        mean, log_var = inputs

        # Dynamic shape, so the layer works for any batch size at run time.
        mean_shape = tf.shape(mean)
        n_rows, n_latent = mean_shape[0], mean_shape[1]

        noise = tf.random.normal(shape=(n_rows, n_latent))
        sigma = tf.exp(0.5 * log_var)  # log-variance -> standard deviation

        return mean + sigma * noise

Build the ENCODER

In [29]:
# Bind `keras` to the Keras bundled with TensorFlow so that Input/Model come from
# the same package as `layers` (imported from `tensorflow.keras` at the top).
# A standalone `import keras` can resolve to a different Keras install, and objects
# from the two packages are not guaranteed to be interchangeable.
from tensorflow import keras

latent_dim = 2
shape = X_train.shape[1]   # Number of features

# `shape=(shape,)` excludes the batch axis: the model expects batches of
# `shape`-dimensional vectors.
encoder_inputs = keras.Input(shape=(shape,))
print(encoder_inputs,'\n')

# Three ReLU layers narrowing 64 -> 32 -> 16 before the latent heads.
x = encoder_inputs
for units in (64, 32, 16):
    x = layers.Dense(units, activation="relu")(x)
    print(x,'\n')

# Two linear heads: mean and log-variance of the latent Gaussian.
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
print(z_mean,'\n')
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
print(z_log_var,'\n')

# Sample z from (z_mean, z_log_var) with the reparameterisation layer.
z = Sampling()([z_mean, z_log_var])
print(z,'\n')

# The encoder returns all three tensors so the training step can compute the KL term.
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 11), dtype=tf.float32, name='input_6'), name='input_6', description="created by layer 'input_6'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None), name='dense_16/Relu:0', description="created by layer 'dense_16'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None), name='dense_17/Relu:0', description="created by layer 'dense_17'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='dense_18/Relu:0', description="created by layer 'dense_18'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='z_mean/BiasAdd:0', description="created by layer 'z_mean'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='z_log_var/BiasAdd:0', description="created by layer 'z_log_var'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='sampling_4/add:0

Build the DECODER

In [32]:
# The decoder widens the latent vector 16 -> 32 -> 64 and then projects linearly
# back to the original number of input features (`shape`).
latent_inputs = keras.Input(shape=(latent_dim,))   # (None, latent_dim)
print(latent_inputs,'\n')

x = latent_inputs
for width in (16, 32, 64):
    x = layers.Dense(width, activation="relu")(x)
    print(x,'\n')

# No activation on the output layer.
decoder_outputs = layers.Dense(shape, activation=None)(x)
print(decoder_outputs,'\n')

decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name='input_9'), name='input_9', description="created by layer 'input_9'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='dense_27/Relu:0', description="created by layer 'dense_27'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None), name='dense_28/Relu:0', description="created by layer 'dense_28'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None), name='dense_29/Relu:0', description="created by layer 'dense_29'") 

KerasTensor(type_spec=TensorSpec(shape=(None, 11), dtype=tf.float32, name=None), name='dense_30/BiasAdd:0', description="created by layer 'dense_30'") 

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 2)]               0         
                                     

Custom train_step and define VAE as a Model

In [46]:
class VAE(keras.Model):
    """Variational autoencoder built from an existing encoder and decoder.

    The encoder must return ``(z_mean, z_log_var, z)`` and the decoder must map
    ``z`` back to the input space. Both models are stored by reference, so their
    weights are shared with every other ``VAE`` created from the same objects.

    Args:
        encoder: model returning ``(z_mean, z_log_var, z)`` for a batch.
        decoder: model reconstructing the inputs from ``z``.
        **kwargs: forwarded to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means of each loss term, reported in the progress bar.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Loss trackers owned by this model, in reporting order."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: the batch from ``fit``. Either the inputs alone, or a tuple whose
                first element is the inputs (when targets or sample weights are
                also passed to ``fit``); only the inputs are used.

        Returns:
            Dict with the running means of ``loss``, ``reconstruction_loss`` and
            ``kl_loss``.
        """
        # With `fit(x, y)` or sample weights Keras hands over a tuple; the VAE is
        # unsupervised, so keep only the inputs instead of feeding the tuple to
        # the encoder.
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            # NOTE(review): the DEBUG cells show the shared encoder returning NaN
            # even for random input, which suggests its weights were already
            # non-finite (e.g. from an earlier diverged run) rather than a fault
            # in this call. Confirm, and rebuild the encoder/decoder before retraining.
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # Mean Squared Error. The loss object already averages over the
            # batch, so the outer reduce_mean acts on a scalar.
            mse = tf.keras.losses.MeanSquaredError()
            reconstruction_loss = tf.reduce_mean(mse(data, reconstruction))

            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1),
            # averaged over batch and latent dimensions.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(kl_loss)

            # Total Loss -> MSE + KL
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

Train the model

In [51]:
# `encoder` and `decoder` are module-level models, so `VAE(encoder, decoder)` reuses
# whatever weights they hold right now. Once a run has produced a NaN loss the
# weights are NaN too, and every later fit (or plain forward pass) returns NaN.
# Fail fast with an actionable message instead of training on poisoned weights.
for sub_model in (encoder, decoder):
    if not all(np.isfinite(w).all() for w in sub_model.get_weights()):
        raise ValueError(
            "{} has non-finite weights; re-run the ENCODER/DECODER cells to "
            "rebuild it before training.".format(sub_model.name)
        )

vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam(0.0001))

# Previously observed here: loss: nan - reconstruction_loss: nan - kl_loss: nan.
# TerminateOnNaN stops as soon as the reported loss becomes NaN instead of running
# the remaining epochs on a diverged model.
history = vae.fit(
    X_train,
    epochs=30,
    batch_size=1024,
    callbacks=[keras.callbacks.TerminateOnNaN()],
)



Epoch 1/30


2023-07-07 16:35:51.074366: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 

# DEBUG

In [14]:
# Wrap the existing module-level `encoder` and `decoder` in a new VAE. The VAE
# stores them by reference, so `mymodel.encoder` is the same object (and weights)
# as `encoder`.
mymodel = VAE(encoder, decoder)

In [15]:
# Random stand-in for X_train: 2000 rows of 11 features, uniform on [0, 1).
x = np.random.random_sample((2000, 11))

In [16]:
# Run the random batch through the shared encoder. This rebinds the module-level
# names z_mean, z_log_var and z, which previously held the symbolic tensors created
# in the ENCODER cell.
z_mean, z_log_var, z = mymodel.encoder(x) # encode the data

In [17]:
# Inspect the latent means the encoder returned for the random batch.
z_mean

<tf.Tensor: shape=(2000, 2), dtype=float32, numpy=
array([[nan, nan],
       [nan, nan],
       [nan, nan],
       ...,
       [nan, nan],
       [nan, nan],
       [nan, nan]], dtype=float32)>

In [18]:
# Inspect the latent log-variances the encoder returned for the random batch.
z_log_var

<tf.Tensor: shape=(2000, 2), dtype=float32, numpy=
array([[nan, nan],
       [nan, nan],
       [nan, nan],
       ...,
       [nan, nan],
       [nan, nan],
       [nan, nan]], dtype=float32)>

In [19]:
# Inspect the sampled latent vectors the encoder returned for the random batch.
z

<tf.Tensor: shape=(2000, 2), dtype=float32, numpy=
array([[nan, nan],
       [nan, nan],
       [nan, nan],
       ...,
       [nan, nan],
       [nan, nan],
       [nan, nan]], dtype=float32)>