References:

https://keras.io/examples/generative/vae/

https://www.tensorflow.org/xla/tutorials/autoclustering_xla

Goal:

1) Expand 1D features to 2D

2) VAE as front end preprocessing

a) New latent features (not used in this case)

b) VAE decoder output (this is used for 2D-CNN input)

3) 2D-CNN

a) Evaluate XLA Enable/Disable speed

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import matplotlib.pyplot as plt
import gc

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# NOTE(review): the names imported from standalone `keras` on the next three lines are not
# referenced in the visible part of this notebook, which uses `tf.keras` / `layers` instead.
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


import seaborn as sns
        
# Directory from which train.csv, test.csv and sample_submission.csv are read.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

In [None]:
# True restricts the run to the first 20000 rows of each table.
DEBUG = False
# True trains the VAE and uses its decoder output as CNN input;
# False feeds the raw outer-product images to the CNN.
VAE_OUTPUT = True

In [None]:
# Training table, indexed by the `id` column; preview the first two rows.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head(n=2))

In [None]:
# Test table, indexed by the `id` column; preview the first two rows.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head(n=2))

In [None]:
# Submission template, indexed by the `id` column; preview the first two rows.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')
display(submission.head(n=2))

In [None]:
# (rows, columns) of the three loaded tables, in the order train, test, submission.
tuple(frame.shape for frame in (train, test, submission))

In [None]:
# Work in single precision throughout.
train = train.astype('float32')
test = test.astype('float32')

# DEBUG keeps only the first 20000 rows of each table; otherwise every row is kept.
row_window = slice(20000) if DEBUG else slice(None)
train_df = train[row_window].copy()
test_df = test[row_window].copy()

In [None]:
f, ax = plt.subplots(nrows=3, ncols=5, figsize=(15, 12))

# Walk the 3x5 grid row by row: panels 1-14 show cont1..cont14,
# and the 15th (bottom-right) panel shows the target.
for k, panel in enumerate(ax.flat, start=1):
    if k != 15:
        sns.distplot(train_df[f'cont{k}'].values, ax=panel)
        panel.set_xlabel('cont' + str(k))
    else:
        sns.distplot(train_df['target'], ax=panel)

plt.tight_layout()
plt.show()

In [None]:
# Detach the label column from the feature table.
target = train_df.pop('target')
# Placeholder labels for the test rows: the training-target mean, repeated once per row.
y_test = np.full(len(test_df), target.mean(), dtype=np.float64)

In [None]:
# Side length of the per-sample square image: one row/column per column left in train_df.
IMG_SIZE = len(train_df.columns)
IMG_SIZE

In [None]:
# Plain NumPy views of the feature tables.
x_train = train_df.to_numpy()
x_test = test_df.to_numpy()

In [None]:
# Lift each 1D feature vector to 2D by multiplying it with itself (outer product),
# which gives the model more degrees of freedom to work with.
# Per sample, entry (i, j) is feature_i * feature_j, so it resembles a per-sample
# covariance estimate whose diagonal holds the squared features.
# At this point the arrays are laid out as (feature, sample, feature).
train_cov = x_train.T[:, :, np.newaxis] * x_train[np.newaxis, :, :]
test_cov = x_test.T[:, :, np.newaxis] * x_test[np.newaxis, :, :]

train_cov.shape, test_cov.shape

In [None]:
# Swap the first two axes so the sample axis leads; `target` holds the label of each train sample.
train_cov = train_cov.transpose(1, 0, 2)
test_cov = test_cov.transpose(1, 0, 2)
train_cov.shape, test_cov.shape

In [None]:
# Stack the train and test images along the sample axis and append a channel axis,
# giving one (14, 14, 1) image per sample.
train_test_cov = np.concatenate((train_cov, test_cov), axis=0).reshape(-1, 14, 14, 1)

train_test_cov.shape

In [None]:
# Drop the names bound to the DataFrames, then ask the garbage collector to run.
del train_df
del test_df
del train
del test
gc.collect()

In [None]:
class Sampling(layers.Layer):
    """Reparameterisation layer: draw a latent vector z from (z_mean, z_log_var).

    `call` receives the pair (z_mean, z_log_var) and returns
    z_mean + exp(0.5 * z_log_var) * noise, with one random-normal noise value
    per (batch row, latent dimension).
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        # The noise tensor has the same (batch, dim) extent as z_mean.
        n_rows, n_dims = tf.shape(z_mean)[0], tf.shape(z_mean)[1]
        noise = tf.keras.backend.random_normal(shape=(n_rows, n_dims))
        std = tf.exp(0.5 * z_log_var)
        return z_mean + std * noise

In [None]:
latent_dim = 24

# Encoder: two stride-2 convolutions, a small dense layer, then the two latent heads.
encoder_inputs = keras.Input(shape=(IMG_SIZE, IMG_SIZE, 1))
x = encoder_inputs
for n_filters in (32, 64):
    x = layers.Conv2D(n_filters, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)

# Mean and log-variance of the latent distribution, plus one sample drawn from it.
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])

encoder = keras.Model(inputs=encoder_inputs, outputs=[z_mean, z_log_var, z], name="encoder")

encoder.summary()

In [None]:
# Decoder: dense projection to a 7x7x64 map, one stride-2 transposed convolution
# (7x7 -> 14x14), one stride-1 transposed convolution, then a single-channel sigmoid output.
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(7 * 7 * 64, activation="relu")(latent_inputs)
x = layers.Reshape((7, 7, 64))(x)
for n_filters, stride in ((64, 2), (32, 1)):
    x = layers.Conv2DTranspose(n_filters, 3, activation="relu", strides=stride, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)

decoder = keras.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")

decoder.summary()

In [None]:
class VAE(keras.Model):
    """Variational autoencoder that trains an encoder/decoder pair with a custom step.

    Args:
        encoder: model mapping a batch of images to (z_mean, z_log_var, z).
        decoder: model mapping a batch of latent vectors z to reconstructed images.
        **kwargs: forwarded to `keras.Model`.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def train_step(self, data):
        """Run one optimisation step and return the losses for logging.

        The total loss is the reconstruction loss (mean binary cross-entropy
        scaled by IMG_SIZE * IMG_SIZE) plus the KL term
        -0.5 * mean(1 + z_log_var - z_mean^2 - exp(z_log_var)).

        Returns:
            dict with the keys "loss", "reconstruction_loss" and "kl_loss".
        """
        # `fit` may hand over (inputs, targets); only the inputs are used.
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            # Use the models owned by this instance rather than module-level globals,
            # so the VAE trains whatever encoder/decoder it was constructed with.
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(keras.losses.binary_crossentropy(data, reconstruction))
            reconstruction_loss *= (IMG_SIZE * IMG_SIZE)
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

In [None]:
def compare_images(encoder,decoder, test_sample):
    """Plot the first eight samples of `test_sample` against their reconstructions.

    The samples are encoded, the sampled latent vectors are decoded, and two
    figures are drawn: a 2x8 grid of images (actual on top, reconstruction
    below) and a 4x2 grid comparing the diagonals of each image pair.

    Args:
        encoder: callable returning (z_mean, z_log_var, z) for a batch of images.
        decoder: callable mapping z to reconstructed images.
        test_sample: batch holding at least eight images.
    """
    _, _, z = encoder(test_sample)
    # NOTE(review): for the decoder built in this notebook these are sigmoid
    # outputs of shape (batch, 14, 14, 1), not logits.
    predictions = decoder(z)

    # Figure 1: 2D per-sample images, actual (row 0) above reconstruction (row 1).
    fig, ax = plt.subplots(nrows=2, ncols=8, figsize=(28, 8))
    fig.subplots_adjust(hspace = .5)

    image_rows = ((0, test_sample, 'actual'), (1, predictions, 'predict'))
    for col in range(8):
        for row, images, xlabel in image_rows:
            panel = ax[row, col]
            panel.imshow(tf.squeeze(images[col]) , cmap = "gray")
            panel.set_title('ID'+str(col))
            panel.set_xlabel(xlabel)
            panel.set_ylabel('covariance')

    # Figure 2: 1D view; the diagonal of each image holds the squared features.
    fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(28, 12))
    fig.subplots_adjust(hspace = .5)

    for sample_id in range(8):
        # Samples fill the 4x2 grid row by row.
        row, col = divmod(sample_id, 2)
        panel = ax[row, col]
        panel.plot(tf.linalg.diag_part(tf.squeeze(test_sample[sample_id])),'b.-',label='actual')
        panel.plot(tf.linalg.diag_part(tf.squeeze(predictions[sample_id])),'r^-',label='predict')
        panel.legend()
        panel.set_title('ID'+str(sample_id))
        panel.set_ylabel('features square')
        

In [None]:
# Wrap the encoder/decoder pair and attach an Adam optimizer with default settings.
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())

if VAE_OUTPUT:
    # Train on the combined train+test images (no labels involved).
    hist = vae.fit(train_test_cov, epochs=8, batch_size=128)

    # Visual sanity check on the first eight images.
    test_sample = train_test_cov[:8]
    compare_images(encoder, decoder, test_sample)

# The combined array is dropped whether or not the VAE was trained.
del train_test_cov

In [None]:
def _vae_decode(images, batch_size):
    """Pass `images` through the trained encoder and decoder in mini-batches.

    Every row of `images` is processed, including a trailing partial batch, so
    the result always has exactly len(images) rows. Each batch is encoded, the
    sampled latent vector z (third encoder output) is decoded, and the decoded
    batches are joined along the sample axis.

    Args:
        images: array of shape (n, 14, 14, 1).
        batch_size: number of rows sent through the models at once (bounds memory).

    Returns:
        NumPy array of decoder outputs with len(images) rows.
    """
    decoded = []
    for start in range(0, len(images), batch_size):
        _, _, z_batch = encoder(images[start:start + batch_size])
        decoded.append(decoder(z_batch).numpy())
    if not decoded:
        # No rows at all: return an empty array of matching trailing shape.
        return np.empty((0,) + images.shape[1:], dtype=images.dtype)
    return np.concatenate(decoded, axis=0)


train_cov = train_cov.reshape(-1,14,14,1)
test_cov = test_cov.reshape(-1,14,14,1)

if VAE_OUTPUT:

    BATCH_SIZE = 10000
    # Number of *full* batches. Kept for reference only: they are no longer used to
    # limit iteration, because doing so silently dropped a trailing partial batch
    # and left the features shorter than `target` / `submission`.
    EPOCH_STEPS = len(train_cov)//BATCH_SIZE
    VAL_STEPS = len(test_cov)//BATCH_SIZE

    # Predict one mini-batch at a time to avoid out-of-memory errors.
    x_train = _vae_decode(train_cov, BATCH_SIZE)
    x_test = _vae_decode(test_cov, BATCH_SIZE)

else:
    x_train = train_cov
    x_test = test_cov
    del train_cov,test_cov

gc.collect()

x_train.shape, target.shape, x_test.shape

Comparing XLA Enable/Disable

In [None]:
# The timing comparison needs a GPU: stop here if TensorFlow reports none.
assert tf.test.gpu_device_name()
tf.keras.backend.clear_session()
# First measurement runs with XLA disabled.
tf.config.optimizer.set_jit(False)


# Hold out 40% of the rows for validation (60% are used for training).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    target,
    train_size=0.60,
)

tuple(part.shape for part in (x_train, y_train, x_val, y_val))

In [None]:
l2_reg = 5e-4  # L2 regularization rate; not referenced by the active layers below


def generate_model():
    """Build the uncompiled 2D-CNN regressor.

    Architecture: Conv(32, same) -> ReLU -> Conv(32) -> ReLU -> Conv(64, same)
    -> ReLU -> MaxPool(2x2) -> Dropout(0.25) -> Flatten -> Dense(512) -> ReLU
    -> Dense(1) -> linear. The input shape is read from the global `x_train`.

    Disabled variants left out of the active model: kernel_regularizer=l2(l2_reg)
    on the convolutions, max-pooling plus Dropout(0.25) after the second
    convolution, a second Conv(64) block, and Dropout(0.2) before the output layer.
    """
    conv_kernel = (3, 3)
    stack = [
        # First convolutional stage.
        tf.keras.layers.Conv2D(32, conv_kernel, padding='same', input_shape=x_train.shape[1:]),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Conv2D(32, conv_kernel),
        tf.keras.layers.Activation('relu'),
        # Second convolutional stage with pooling and dropout.
        tf.keras.layers.Conv2D(64, conv_kernel, padding='same'),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        # Regression head.
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dense(1),
        tf.keras.layers.Activation('linear'),
    ]
    return tf.keras.models.Sequential(stack)


model = generate_model()

In [None]:
def compile_model(model):
    """Compile `model` for regression and return it.

    Uses Adam with learning rate 1e-4, MSE as the loss and mean absolute error
    as an extra metric.

    Args:
        model: Keras model to compile; it is compiled in place.

    Returns:
        The same model object, compiled.
    """
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(loss='mse', optimizer=opt, metrics=[tf.keras.metrics.MeanAbsoluteError()])
    return model

model = compile_model(model)

In [None]:
# Stop once the validation loss has not improved by at least 0.001 for 10 epochs,
# and roll the model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Print the layer-by-layer summary of the CNN.
model.summary()

In [None]:
def train_model(model, x_train, y_train, x_val, y_val, epochs=20):
    """Fit `model` on the training split while monitoring the validation split.

    Training uses shuffled batches of 256 samples and the module-level
    `early_stopping` callback. Nothing is returned; the model is updated in place.

    Args:
        model: compiled model exposing `fit`.
        x_train, y_train: training inputs and targets.
        x_val, y_val: validation inputs and targets.
        epochs: maximum number of epochs to run.
    """
    fit_options = dict(
        batch_size=256,
        epochs=epochs,
        validation_data=(x_val, y_val),
        callbacks=[early_stopping],
        shuffle=True,
    )
    model.fit(x_train, y_train, **fit_options)

def warmup(model, x_train, y_train, x_val, y_val):
    """Train for one throwaway epoch, then restore the starting weights.

    The point is to trigger JIT compilation ahead of the timed run so that
    compilation time is not part of the measurement. Only the weights returned
    by `get_weights` are restored.
    """
    weights_before = model.get_weights()
    train_model(model, x_train, y_train, x_val, y_val, epochs=1)
    model.set_weights(weights_before)

# Throwaway epoch first, so one-off start-up cost is not part of the timed run.
warmup(model, x_train, y_train, x_val, y_val)

# Timed training run (IPython `%time` magic), with XLA disabled via set_jit(False) earlier.
%time train_model(model, x_train, y_train, x_val, y_val)

# Validation loss (MSE) and mean absolute error of the trained model.
scores = model.evaluate(x_val, y_val, verbose=1)

print('Val loss:', scores[0])
print('Val mae:', scores[1])

In [None]:
# Evaluation metric: root mean squared error (RMSE) on the validation split,
# computed as the square root of the mean squared error.
y_val_pred = model.predict(x_val)
val_mse = mean_squared_error(y_val, y_val_pred)
score_rmse = np.sqrt(val_mse)
print(f"RMSE: {score_rmse:.5f}\n")

In [None]:
# We need to clear the session to enable JIT in the middle of the program.

tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True) # Enable XLA.

# Fresh, untrained model with the same architecture and optimizer settings.
model = compile_model(generate_model())
# Warm-up epoch, then the timed run (IPython `%time` magic), mirroring the XLA-disabled measurement.
warmup(model, x_train, y_train, x_val, y_val)
%time train_model(model, x_train, y_train, x_val, y_val)

# Validation loss (MSE) and mean absolute error with XLA enabled.
scores = model.evaluate(x_val, y_val, verbose=1)
print('Val loss:', scores[0])
print('Val mae:', scores[1])

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values and show the RMSE in the title.

    Args:
        name: label used as the title prefix.
        y: true target values.
        yhat: predicted values, aligned with `y`.
        num_to_plot: only the first `num_to_plot` points are drawn; the RMSE
            still uses all of them.
        lims: (low, high) limits applied to both axes and to the identity line.
        figsize: matplotlib figure size.
    """
    plt.figure(figsize=figsize)
    # RMSE as sqrt(MSE), the same way the RMSE cells of this notebook compute it.
    # This avoids the `squared=False` keyword, which newer scikit-learn releases
    # deprecate and remove.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    # Identity line: perfect predictions would fall on it.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
# Evaluation metric: root mean squared error (RMSE) on the validation split,
# computed as the square root of the mean squared error.
y_val_pred = model.predict(x_val)
val_mse = mean_squared_error(y_val, y_val_pred)
score_rmse = np.sqrt(val_mse)
print(f"RMSE: {score_rmse:.5f}\n")

In [None]:
 # Scatter the validation targets against the CNN predictions, titled with the RMSE.
 plot_results('2D-CNN', y_val, y_val_pred)

In [None]:
# Write a submission file only for full (non-debug) runs.
if not DEBUG:
    test_predictions = model.predict(x_test)
    submission['target'] = test_predictions
    submission.to_csv('2DCNN.csv')

Conclusions:

1) The VAE KL loss needs some tuning.

2) Over 20 epochs, training with XLA enabled is about 6 seconds faster than with XLA disabled, which is not much of an improvement.