# Jane Street Market Prediction - TPU Training


Run some setup code.

In [None]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

# Set up the TPU: resolve the cluster, connect to it, initialize the TPU
# system, and build the distribution strategy used when the model is created.
# NOTE(review): tf.distribute.experimental.TPUStrategy is deprecated in newer
# TensorFlow releases in favor of tf.distribute.TPUStrategy -- confirm against
# the TensorFlow version pinned in this environment before changing it.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# number of replicas kept in sync by the strategy; used to scale BATCH_SIZE
tpu_replicas = tpu_strategy.num_replicas_in_sync

# Get the dataset credential from the Google Cloud SDK and pass it to
# TensorFlow. This needs to run after TPU initialization.
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

# Set TensorFlow's global random seed.
tf.random.set_seed(42)

Settings.

In [None]:
# training parameters
# EPOCHS is the number of epochs passed to model.fit; BATCH_SIZE is the batch
# size applied to the datasets, scaled by the number of TPU replicas
EPOCHS = 5
BATCH_SIZE = 512 * tpu_replicas

# cross-validation parameters
# folds are indexed 0..FOLDS-1; fold HOLDOUT is excluded from training and
# used as the validation set
FOLDS = 5
HOLDOUT = 4

# model parameters
# WINDOW_SIZE is the number of consecutive samples in each sliding window;
# NOISE is the value passed to the GaussianNoise layer
WINDOW_SIZE = 20
NOISE = 0.1

`get_dataset()` returns a dataset generated from the folds in the list `folds`.

In [None]:
def get_dataset(folds, repeat=False, shuffle=False, cache=False):
    """Build a batched dataset of sliding windows from the given folds.

    The TFRecord files matching ``fold{fold}/*.tfrec`` under the Kaggle
    dataset's GCS path are read, each record is parsed as a float32 tensor,
    and consecutive records are grouped into windows of ``WINDOW_SIZE``
    samples (shift 1). Windows whose rows span more than one date are
    dropped.

    Args:
        folds: iterable of fold indices to read.
        repeat: if true, the dataset repeats indefinitely.
        shuffle: if true, samples are shuffled with a buffer of
            ``16 * BATCH_SIZE`` elements.
        cache: if true, the samples are cached after the first pass.

    Returns:
        A ``tf.data.Dataset`` of ``(X, y)`` batches of size ``BATCH_SIZE``.
        ``X`` is reshaped to ``[WINDOW_SIZE, 130]`` per sample (columns
        ``feature_0`` .. ``feature_129``) and ``y`` to ``[1]`` per sample.
        ``y`` is ``(1 + sign(resp)) / 2`` at the last row of the window,
        i.e. 1.0 for a positive response, 0.0 for a negative one and 0.5
        when the response is exactly zero.
    """
    # load a dictionary mapping column names to column indices
    col_file = os.path.join(os.pardir, "input",
                            "jane-street-market-prediction-data",
                            "columns.json")
    with open(col_file) as file:
        cols = json.load(file)

    # shorthand notation for autotune option
    auto = tf.data.experimental.AUTOTUNE

    # opens a tf record in filename as a dataset that parses serialized
    # tensors and returns sliding windows of WINDOW_SIZE samples
    def open_windowed_ds(filename):
        ds = tf.data.TFRecordDataset(filename)
        ds = ds.map(lambda x: tf.io.parse_tensor(x, tf.float32),
                    num_parallel_calls=auto)
        ds = ds.window(WINDOW_SIZE, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda x: x.batch(WINDOW_SIZE))
        return ds

    # create a dataset with filenames of tf records in files_ds
    # then interleave the datasets obtained by calling
    # open_windowed_ds(x) on each element of files_ds
    data_path = KaggleDatasets().get_gcs_path()
    patterns = [data_path + f"/fold{fold}" + "/*.tfrec" for fold in folds]
    files = tf.io.gfile.glob(patterns)
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    ds = files_ds.interleave(open_windowed_ds, num_parallel_calls=auto)

    # filter out any time series spanning multiple dates
    def single_date(series):
        dates, _ = tf.unique(series[:, cols["date"]])
        return tf.equal(tf.size(dates), tf.constant(1))

    ds = ds.filter(single_date)

    # separate the series into a training sample consisting
    # of the features and a label indicating whether the
    # response at final time is positive
    # need to explicitly reshape the tensors here for things
    # to work properly on TPU
    def collate(series):
        X = series[:, cols["feature_0"]:(cols["feature_129"] + 1)]
        y = (1.0 + tf.sign(series[-1, cols["resp"]])) / 2.0
        return tf.reshape(X, [WINDOW_SIZE, 130]), tf.reshape(y, [1])

    ds = ds.map(collate, num_parallel_calls=auto)

    # allow the dataset to ignore the order for speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    ds = ds.with_options(ignore_order)

    # cache the finite, unshuffled samples first: caching after repeat()
    # would try to store an endless stream, and caching after shuffle()
    # would freeze the order drawn during the first pass
    if cache:
        ds = ds.cache()

    # check if the dataset should repeat once exhausted
    if repeat:
        ds = ds.repeat()

    # check if we should shuffle the dataset
    if shuffle:
        ds = ds.shuffle(16 * BATCH_SIZE)

    # set the batch size of the dataset
    ds = ds.batch(BATCH_SIZE)

    # prefetch new batches in the background
    ds = ds.prefetch(auto)

    return ds

Define the model.

In [None]:
# get features from training dataset for the normalization layer
train_folds = [fold for fold in range(FOLDS) if fold != HOLDOUT]
norm_ds = get_dataset(train_folds)
norm_ds = norm_ds.map(lambda x, y: x)

# build and compile the model on the TPU
with tpu_strategy.scope():

    # design notes / open questions:
    #   - normalization layer
    #   - gaussian noise (is this first or second!?) (strong, probably...)
    #   - conv1d (separable conv1d?)
    #   - max pooling
    #   - experiment with Gaussian dropout - it's multiplicative like a stock?
    #   - WaveNet type of architecture... what kind of regularization in there?
    #   - norm -> noise -> batch norm?
    #   - use tanh activation?

    # input layer: one window of WINDOW_SIZE samples with 130 features each
    inputs = layers.Input(shape=[WINDOW_SIZE, 130], name="inputs")

    # normalization layer, adapted to the features of the training folds
    norm = preprocessing.Normalization(name="normalization")
    norm.adapt(norm_ds)
    flow = norm(inputs)

    # gaussian noise
    flow = layers.GaussianNoise(NOISE, name="noise")(flow)

    # convolutional net (for now)
    flow = layers.Conv1D(filters=256,
                         kernel_size=7,
                         activation="relu",
                         padding="same",
                         data_format="channels_last")(flow)

    flow = layers.MaxPooling1D(2)(flow)

    flow = layers.Conv1D(filters=256,
                         kernel_size=3,
                         activation="relu",
                         padding="same",
                         data_format="channels_last")(flow)

    flow = layers.Conv1D(filters=256,
                         kernel_size=3,
                         activation="relu",
                         padding="same",
                         data_format="channels_last")(flow)

    flow = layers.MaxPooling1D(2)(flow)

    # flattened layers and dense logic towards the end
    flow = layers.Flatten()(flow)

    flow = layers.Dense(128, activation="relu")(flow)
    flow = layers.Dropout(0.5)(flow)

    flow = layers.Dense(64, activation="relu")(flow)
    flow = layers.Dropout(0.5)(flow)

    # the sigmoid is applied inside the final dense layer, so the outputs
    # already lie between 0 and 1 and no separate activation layer is needed
    outputs = layers.Dense(1, activation="sigmoid")(flow)

    model = keras.Model(inputs=inputs, outputs=outputs, name="model")

    # binary cross-entropy as loss function, ADAM as optimizer,
    # track precision, recall, and area under the ROC curve
    # the metrics are named explicitly: Keras otherwise appends a counter
    # ("recall_1", ...) when this cell is re-run in the same session, and the
    # history keys read by the plotting cell would no longer exist
    metrics = [
        keras.metrics.Recall(name="recall"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.AUC(name="auc"),
    ]
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=metrics)

    # print a summary of the model
    model.summary()

Train the model.

In [None]:
# get the training and validation datasets
train_ds = get_dataset(train_folds, repeat=True, shuffle=True)
valid_ds = get_dataset([HOLDOUT], cache=True)

# load stats dictionary to get the number of training samples
stats_file = os.path.join(os.pardir, "input",
                          "jane-street-market-prediction-data",
                          "stats.json")

with open(stats_file) as file:
    stats = json.load(file)

# the training dataset repeats forever, so the epoch length must be given
# explicitly; it is derived from the folds that are actually trained on
# (previously the length of the holdout fold was used here)
# NOTE(review): assumes stats.json holds a "length" entry for every fold
# index, not only for the holdout fold -- confirm against the data set
train_samples = sum(stats[str(fold)]["length"] for fold in train_folds)
steps_per_epoch = train_samples // BATCH_SIZE

# train the model
hist = model.fit(train_ds,
                 epochs=EPOCHS,
                 steps_per_epoch=steps_per_epoch,
                 validation_data=valid_ds)

Plot the learning curve and associated metrics.

In [None]:
hist_df = pd.DataFrame(hist.history)


def _draw_curves(title, curves):
    """Draw one 8x5 figure with a line for each (column, label) pair.

    The columns are looked up in ``hist_df``; the x axis is labelled
    "Epoch" and the y axis is left unlabelled.
    """
    plt.figure(figsize=(8, 5))
    for column, label in curves:
        sns.lineplot(data=hist_df[column], label=label)
    plt.title(title)
    plt.grid(True)
    plt.xlabel("Epoch")
    plt.ylabel("")
    plt.show()


# loss
# ("lr", "Learning rate") can be appended here if the history records it
_draw_curves("Loss", [
    ("loss", "Training loss"),
    ("val_loss", "Validation loss"),
])

# training metrics
_draw_curves("Training metrics", [
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("auc", "Area under ROC curve"),
])

# validation metrics
_draw_curves("Validation metrics", [
    ("val_precision", "Precision"),
    ("val_recall", "Recall"),
    ("val_auc", "Area under ROC curve"),
])

Save the model architecture and weights.

In [None]:
# write the architecture as JSON next to the notebook
with open("model.json", "w") as json_file:
    architecture = model.to_json()
    json_file.write(architecture)

# write the trained weights to a separate HDF5 file
model.save_weights("model.h5")