#  Ubiquant Market Prediction on TPU
In this notebook I create a Ubiquant Market Prediction model and train it on a TPU using a TFRecord dataset. The notebook has two modes: training mode and inference mode. In training mode, you need to switch the Accelerator to TPU and enable Internet access. In inference mode, a notebook cannot be submitted with a TPU, so you need to switch the Accelerator to GPU or CPU and disable Internet access. To see how the TFRecord dataset is created, have a look at my notebook [Create TF-Record for UMP dataset](https://www.kaggle.com/lonnieqin/create-tf-record-for-ump-dataset).
## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K
from kaggle_datasets import KaggleDatasets

## Distribution Strategy

In [None]:
# Allow TensorFlow to place an op on another device when the requested one is unavailable.
tf.config.set_soft_device_placement(True)

# Hardware detection: try to resolve a TPU cluster. No arguments are needed when the
# TPU_NAME environment variable is set (always the case on Kaggle). A ValueError from
# the resolver (or from printing its master address) is treated as "no TPU".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster, initialise it, and distribute across its cores.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

## Configurations

In [None]:
# Run mode: modes[0] trains on TPU, modes[1] only loads weights and predicts.
modes = ["training", "inference"]
mode = modes[1]

# Global batch size: 128 examples for every replica in the strategy.
batch_size = 128 * strategy.num_replicas_in_sync

# The GCS location of the TFRecord dataset is only looked up in training mode.
if mode == modes[0]:
    GCS_DS_PATH = KaggleDatasets().get_gcs_path("ump-tf-record-time-series-split-10-fold")

## Import dataset

## Make Tensorflow dataset
Here I use the first 9 folds as the training set and the last fold as the validation set. You can experiment with different data-splitting strategies.

In [None]:
if mode == modes[0]:
    # Every TFRecord file found one directory level below the dataset root.
    paths = tf.io.gfile.glob(GCS_DS_PATH + '*/*.tfrecords')
    # Folds 0-8 are used for training; the order of `paths` is preserved.
    train_paths = [
        path
        for path in paths
        for i in range(9)
        if f"fold_{i}.tfrecords" in path
    ]
    # Fold 9 is held out for validation.
    valid_paths = [path for path in paths if "fold_9.tfrecords" in path]

In [None]:
def decode_function(record_bytes):
    """Parse one serialized example into a dict of tensors.

    Args:
        record_bytes: A serialized example, as yielded by a TFRecord dataset.

    Returns:
        A dict with keys "features" (float32, length 300), "time_id" (int64
        scalar), "investment_id" (int64 scalar) and "target" (float32 scalar).
    """
    schema = {
        "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(record_bytes, schema)

def preprocess(item):
    """Turn a batched, decoded example dict into a (features, target) pair.

    The batch dimension is inferred with -1 rather than read from the
    module-level `batch_size`, so the function works for whatever batch size
    the dataset was actually built with. When the incoming batch has a fully
    static shape, the reshaped tensors keep a static shape as well.

    Args:
        item: Dict with a "features" entry holding 300 values per example and
            a "target" entry holding one value per example.

    Returns:
        A tuple (features, target) with shapes [batch, 300] and [batch].
    """
    features = tf.reshape(item["features"], [-1, 300])
    target = tf.reshape(item["target"], [-1])
    return features, target

def make_dataset(file_paths, batch_size=1024, mode="train"):
    """Build a batched, prefetched tf.data pipeline from TFRecord files.

    Args:
        file_paths: TFRecord file path(s) to read.
        batch_size: Number of examples per batch; incomplete final batches are
            dropped so every batch has the same size.
        mode: "train" enables shuffling and non-deterministic ordering; any
            other value keeps the records in file order.

    Returns:
        A `tf.data.Dataset` yielding whatever `preprocess` returns for each
        batch of decoded examples.
    """
    ds = tf.data.TFRecordDataset(file_paths)
    ds = ds.map(decode_function)
    # Cache the decoded records BEFORE shuffling. Caching after shuffle/batch
    # would store the first epoch's shuffled batches and replay exactly the
    # same batches every epoch, silently disabling per-epoch reshuffling.
    ds = ds.cache()
    options = tf.data.Options()
    if mode == "train":
        # NOTE(review): the shuffle buffer equals one batch, so mixing is only
        # local; consider a larger buffer if memory allows.
        ds = ds.shuffle(batch_size)
        options.experimental_deterministic = False
    ds = ds.with_options(options)
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.map(preprocess)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

In [None]:
%%time
# Datasets are only built in training mode; inference never reads the TFRecords.
if mode == modes[0]:
    # Training pipeline (make_dataset's default mode is "train").
    train_ds = make_dataset(train_paths, batch_size=batch_size)
    # Validation pipeline: any mode other than "train" skips shuffling.
    valid_ds = make_dataset(
        valid_paths,
        batch_size=batch_size,
        mode="valid",
    )

## Modeling

In [None]:
def correlation(x, y, axis=-2):
    """Metric returning one minus the Pearson correlation of two tensors.

    The correlation is computed along `axis` (default -2). Because the value
    returned is ``1 - r``, 0 means perfectly correlated and 2 means perfectly
    anti-correlated.

    Args:
        x: First tensor (or tensor-like); its dtype is used for the computation.
        y: Second tensor, cast to the dtype of `x`.
        axis: Axis along which the correlation is reduced.

    Returns:
        A tensor of ``1 - r`` values with `axis` reduced away.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    count = tf.cast(tf.shape(x)[axis], x.dtype)

    x_mean = tf.reduce_sum(x, axis=axis) / count
    y_mean = tf.reduce_sum(y, axis=axis) / count

    # Sums of squared deviations and of cross deviations along `axis`.
    x_ss = tf.reduce_sum(tf.math.squared_difference(x, x_mean), axis=axis)
    y_ss = tf.reduce_sum(tf.math.squared_difference(y, y_mean), axis=axis)
    cross = tf.reduce_sum((x - x_mean) * (y - y_mean), axis=axis)

    pearson = cross / tf.sqrt(x_ss * y_ss)
    one = tf.constant(1.0, dtype=x.dtype)
    return one - pearson

def correlationLoss(x, y, axis=-2):
    """Loss combining ``1 - Pearson r`` with a small scaled squared-error term.

    The Pearson term rewards predictions that are correlated with the labels;
    the squared-error term (weight 0.01) additionally pulls the two towards
    the same mean and variance. The result is averaged over all remaining
    elements and returned as a float32 scalar.

    Args:
        x: First tensor (or tensor-like); its dtype is used for the computation.
            NOTE(review): when passed as a Keras `loss`, this presumably
            receives y_true and `y` receives y_pred - confirm.
        y: Second tensor, cast to the dtype of `x`.
        axis: Axis along which the statistics are reduced (default -2).

    Returns:
        A scalar float32 tensor.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    xmean = tf.reduce_sum(x, axis=axis) / n
    ymean = tf.reduce_sum(y, axis=axis) / n
    xsqsum = tf.reduce_sum(tf.math.squared_difference(x, xmean), axis=axis)
    ysqsum = tf.reduce_sum(tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum((x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xsqsum * ysqsum)
    # Mean squared difference, scaled by the standard deviation of `y`.
    sqdif = tf.reduce_sum(tf.math.squared_difference(x, y), axis=axis) / n / tf.sqrt(ysqsum / n)
    loss = tf.reduce_mean(tf.constant(1.0, dtype=x.dtype) - corr + (0.01 * sqdif))
    # tf.cast rather than tf.convert_to_tensor(..., dtype=tf.float32): the
    # latter does not cast and raises when the inputs are not already float32
    # (for example under mixed precision).
    return tf.cast(loss, tf.float32)


In [None]:
def get_model(strategy):
    """Build and compile the dense regression network under `strategy`.

    The network takes 300 float32 inputs, runs them through five swish-activated
    dense layers (512, 256, 256, 128, 32 units; the 1st, 3rd and 5th use an
    "l2" kernel regularizer) and ends in a single linear unit named "target".

    Args:
        strategy: Distribution strategy whose scope is used to create the
            model, its metrics and its optimizer.

    Returns:
        The compiled Keras model.
    """
    # (units, use_l2) for each hidden layer, in order.
    hidden_spec = [(512, True), (256, False), (256, True), (128, False), (32, True)]
    with strategy.scope():
        stack = [tf.keras.Input((300,), dtype=tf.float32, name="features")]
        for units, use_l2 in hidden_spec:
            regularizer = "l2" if use_l2 else None
            stack.append(
                layers.Dense(units, activation='swish', kernel_regularizer=regularizer)
            )
        stack.append(layers.Dense(1, name="target"))
        model = tf.keras.Sequential(stack)

        rmse = keras.metrics.RootMeanSquaredError(name="rmse")
        model.compile(
            optimizer=tf.optimizers.Adam(3e-4),
            loss=correlationLoss,
            metrics=["mae", "mape", rmse, correlation],
        )
        return model

Let's see what the Model looks like.

In [None]:
# Reset Keras' global state before building a fresh model.
keras.backend.clear_session()
# Build and compile the network under the selected distribution strategy.
model = get_model(strategy)
# Print the per-layer summary, then draw the architecture with tensor shapes.
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
%%time
# List of models later handed to `inference`; it ends up holding the single model.
models = []
# Keep only the best checkpoint on disk and stop early after 10 epochs of patience.
checkpoint = keras.callbacks.ModelCheckpoint("model.h5", save_best_only=True)
early_stop = keras.callbacks.EarlyStopping(patience=10)
with strategy.scope():
    if mode != modes[0]:
        # Inference mode: reuse the weights produced by an earlier training run.
        model.load_weights("../input/ubiquant-market-prediction-on-tpu-output/model.h5")
    else:
        history = model.fit(
            train_ds,
            validation_data=valid_ds,
            epochs=30,
            callbacks=[checkpoint, early_stop],
        )
        # Restore the checkpointed weights rather than keeping the last epoch's.
        model.load_weights("model.h5")
        # One figure per metric, training curve next to validation curve.
        for metric in ("loss", "mae", "rmse", "correlation"):
            pd.DataFrame(history.history, columns=[metric, f"val_{metric}"]).plot()
            plt.title(metric.upper())
            plt.show()
models.append(model)

## Model Evaluation

In [None]:
# Validation-set Pearson correlation; only meaningful right after training.
if mode == modes[0]:
    # Gather every validation label into one flat array.
    y_val = np.concatenate([y.numpy().reshape(-1) for _, y in valid_ds])
    # Predict ONCE and reuse the result; a second predict() call would run the
    # whole validation set through the model again for no benefit.
    y_pred = model.predict(valid_ds)
    # The slice guards against a length mismatch between labels and predictions.
    pearson_score = stats.pearsonr(y_pred.reshape(-1), y_val[:y_pred.shape[0]])[0]
    print(f"Pearson Correlation: {pearson_score}")

## Submission

In [None]:
def inference(models, ds):
    """Average the flattened predictions of several models on the same input.

    Args:
        models: Iterable of objects exposing ``predict(ds)`` that returns an
            array with a ``reshape`` method.
        ds: Input handed unchanged to every model's ``predict``.

    Returns:
        The element-wise mean of the flattened per-model predictions.
    """
    flat_predictions = [member.predict(ds).reshape(-1) for member in models]
    return np.mean(flat_predictions, axis=0)

In [None]:
import ubiquant

# Names of the 300 feature columns fed to the model: f_0 ... f_299.
features = [f"f_{i}" for i in range(300)]

# Competition environment and its iterator over test batches.
env = ubiquant.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    # Predict this batch with every model, then submit the averaged prediction.
    batch_predictions = inference(models, test_df[features])
    sample_prediction_df['target'] = batch_predictions
    env.predict(sample_prediction_df)