# Jane Street Market Prediction - TPU Training

To run this notebook on Kaggle, open the interactive editor and select `Google Cloud SDK` from the `Addons` dropdown menu. Follow the instructions to link a Google Cloud account. Then select `TPU v3-8` as the accelerator in the `Settings` pane on the right before running the notebook.

In [None]:
import gc
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, PrecisionAtRecall
from tensorflow.keras.optimizers import Adam, SGD

from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

# make the x,y labels legible on plots
plt.rc("axes", labelsize=16)

# set up the TPU: resolve the cluster, connect to it, initialize the
# TPU system and build the distribution strategy the model is compiled under
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
tpu_replicas = tpu_strategy.num_replicas_in_sync

# get the dataset credential from the Google Cloud SDK
# and pass the credential to tensorflow
# this needs to run after TPU initialization
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

# set tensorflow's random seed
tf.random.set_seed(13)

In [None]:
# hyperparameters for model training
# the batch size is global, i.e. it scales with the number of TPU replicas
BATCH_SIZE = tpu_replicas * 4096
LEARNING_RATE = 0.005
EPOCHS = 200
SHUFFLE_BUFFER = BATCH_SIZE * 4
LABEL_SMOOTHING = 0.01

# number of consecutive samples in each time series window
WINDOW_SIZE = 20
# standard deviation of the gaussian noise added to the inputs
NOISE = 0.05

# recall level at which precision is reported in the metrics
METRIC_RECALL = 0.55

# cross-validation setup: one fold is held out for validation
# and every other fold is used for training
FOLDS = 5
HOLDOUT = 4
TRAIN_FOLDS = [k for k in range(FOLDS) if k != HOLDOUT]

# persist the parameters other notebooks depend on in params.json
params = dict(holdout=HOLDOUT, window_size=WINDOW_SIZE)
with open(os.path.join(os.curdir, "params.json"), "w") as file:
    file.write(json.dumps(params))

# read the stats dictionary, which holds the number of training samples
stats_file = os.path.join(
    os.pardir,
    "input",
    "jane-street-market-prediction-data",
    "stats.json",
)

with open(stats_file) as file:
    stats = json.load(file)

# NOTE(review): the entry is keyed by the holdout fold; presumably its
# "length" is the training-set size when that fold is held out - confirm
SAMPLES = stats[str(HOLDOUT)]["length"]

In [None]:
def dataset(folds, repeat=False, shuffle=False, cache=False):
    """Build a batched tf.data pipeline of sliding windows over the folds.

    Every tf record file of the requested folds is read as a stream of
    serialized float32 tensors, turned into sliding windows of WINDOW_SIZE
    consecutive samples (shift 1), and windows spanning more than one date
    are discarded. Each remaining window becomes a pair ``(X, y)``:

    * ``X`` has shape ``[WINDOW_SIZE, 130]`` and holds the columns
      ``feature_0`` .. ``feature_129``;
    * ``y`` has shape ``[1]`` and is 1.0 when the response of the last
      sample in the window is strictly positive, otherwise 0.0.

    Args:
        folds: iterable of fold indices whose ``fold{n}/*.tfrec`` files
            are read from the attached Kaggle dataset's GCS path.
        repeat: if true, repeat the dataset indefinitely.
        shuffle: if true, shuffle samples with a buffer of SHUFFLE_BUFFER
            elements and allow a non-deterministic element order.
        cache: if true, cache the batched dataset.

    Returns:
        A ``tf.data.Dataset`` of batches of up to BATCH_SIZE ``(X, y)``
        pairs, prefetched in the background.
    """
    # load a dictionary mapping feature names to columns
    col_file = os.path.join(os.pardir, "input",
                            "jane-street-market-prediction-data",
                            "columns.json")
    with open(col_file) as file:
        cols = json.load(file)

    # shorthand notation for autotune option
    auto = tf.data.experimental.AUTOTUNE

    # opens a tf record in filename as a dataset that parses serialized
    # tensors and returns sliding windows of WINDOW_SIZE samples
    def open_windowed_ds(filename):
        ds = tf.data.TFRecordDataset(filename)
        ds = ds.map(lambda x: tf.io.parse_tensor(x, tf.float32),
                    num_parallel_calls=auto)
        ds = ds.window(WINDOW_SIZE, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda x: x.batch(WINDOW_SIZE, drop_remainder=True))
        return ds

    # create a dataset with filenames of tf records in files_ds
    # then interleave the datasets obtained by calling
    # open_windowed_ds(x) on each element of files_ds
    data_path = KaggleDatasets().get_gcs_path()
    patterns = [data_path + f"/fold{fold}" + "/*.tfrec" for fold in folds]
    files = tf.io.gfile.glob(patterns)
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    ds = files_ds.interleave(open_windowed_ds, num_parallel_calls=auto)

    # filter out any time series spanning multiple dates
    def single_date(series):
        dates, _ = tf.unique(series[:, cols["date"]])
        return tf.equal(tf.size(dates), tf.constant(1))

    ds = ds.filter(single_date)

    # separate the series into a training sample consisting
    # of the features and a label indicating whether the
    # response at final time is positive
    # need to explicitly reshape the tensors here for things
    # to work properly on TPU
    def collate(series):
        X = series[:, cols["feature_0"]:(cols["feature_129"] + 1)]
        # compare against zero instead of using (1 + sign) / 2, which maps
        # a response of exactly zero to the non-binary label 0.5
        y = tf.cast(series[-1, cols["resp"]] > 0.0, tf.float32)
        return tf.reshape(X, [WINDOW_SIZE, 130]), tf.reshape(y, [1])

    ds = ds.map(collate, num_parallel_calls=auto)

    # if shuffling, allow the dataset to ignore the order for speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = not shuffle
    ds = ds.with_options(ignore_order)

    # check if the dataset should repeat once exhausted
    ds = ds.repeat() if repeat else ds

    # check if we should shuffle the dataset; the buffer size comes from
    # the shared SHUFFLE_BUFFER parameter rather than a duplicated literal
    ds = ds.shuffle(SHUFFLE_BUFFER) if shuffle else ds

    # set the batch size of the dataset
    ds = ds.batch(BATCH_SIZE)

    # check if we should cache the dataset
    ds = ds.cache() if cache else ds

    # prefetch new batches in the background
    ds = ds.prefetch(auto)

    return ds

In [None]:
# wave block
def Wave(inputs, filters, size, depth):
    """Apply a stack of gated, dilated 1D convolutions with skip connections.

    The input is first projected to `filters` channels by a pointwise
    convolution and batch-normalized. Then `depth` gated units follow; unit
    `d` uses dilation rate 2 ** d, multiplies a tanh branch with a sigmoid
    branch, applies a pointwise convolution and adds the unit's input back.

    Args:
        inputs: channels-last tensor fed to the block.
        filters: number of filters of every convolution in the block.
        size: kernel size of the dilated convolutions.
        depth: number of gated units.

    Returns:
        The output tensor of the last batch normalization.
    """
    def conv(tensor, kernel_size, dilation_rate=1):
        # every convolution of the block shares these settings
        layer = layers.Conv1D(filters=filters,
                              kernel_size=kernel_size,
                              padding="same",
                              dilation_rate=dilation_rate,
                              data_format="channels_last")
        return layer(tensor)

    def branch(tensor, activation, dilation_rate):
        # dilated convolution followed by the given activation
        convolved = conv(tensor, size, dilation_rate)
        return layers.Activation(activation)(convolved)

    # pointwise projection to the working number of channels
    flow = conv(inputs, 1)
    flow = layers.BatchNormalization()(flow)

    for level in range(depth):
        residual = flow
        dilation = 2 ** level

        # gated activation: tanh branch modulated by sigmoid branch
        signal = branch(residual, "tanh", dilation)
        gate = branch(residual, "sigmoid", dilation)
        gated = layers.Multiply()([signal, gate])
        gated = layers.BatchNormalization()(gated)

        # pointwise convolution before the skip connection
        gated = conv(gated, 1)

        flow = layers.Add()([residual, gated])
        flow = layers.BatchNormalization()(flow)

    return flow

In [None]:
# compile model on the TPU
with tpu_strategy.scope():
    # every wave block below emits this many feature maps; the dropout
    # noise shapes must use the same channel count or they cannot be
    # broadcast against the block outputs
    wave_filters = 64

    # under the distribution strategy the model is called once per replica
    # on BATCH_SIZE // tpu_replicas samples, so the dropout mask is sized
    # for the per-replica batch rather than the global batch
    # (the training dataset repeats, so training batches are always full)
    replica_batch = BATCH_SIZE // tpu_replicas

    # a mask of shape [batch, 1, channels] is shared across the time axis,
    # i.e. entire feature maps are dropped
    noise_shape = [replica_batch, 1, wave_filters]

    # input & normalization
    inputs = layers.Input(shape=[WINDOW_SIZE, 130])
    flow = layers.BatchNormalization()(inputs)

    # gaussian noise
    flow = layers.GaussianNoise(stddev=NOISE)(flow)

    # wavenet with normalized dropouts of entire feature maps
    # the first block is a bottleneck designed to strip the
    # redundancy from the features
    for _ in range(3):
        flow = Wave(flow, wave_filters, 3, 4)
        flow = layers.AlphaDropout(rate=0.1, noise_shape=noise_shape)(flow)

    # dense logic for final prediction
    flow = layers.Flatten()(flow)
    flow = layers.Dense(units=1)(flow)
    outputs = layers.Activation("sigmoid")(flow)

    # optimization parameters
    loss = BinaryCrossentropy(label_smoothing=LABEL_SMOOTHING)
    optimizer = Adam(learning_rate=LEARNING_RATE)
    metrics = [PrecisionAtRecall(recall=METRIC_RECALL, name="p@r"),
               AUC(name="auc")]

    # compile the model and print a summary
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    model.summary()

In [None]:
# build the training dataset (repeated and shuffled) and the
# cached validation dataset read from the holdout fold
train_ds = dataset(TRAIN_FOLDS, repeat=True, shuffle=True)
valid_ds = dataset([HOLDOUT], cache=True)

# learning rate schedule: halve the rate when the validation
# loss stops improving, down to a floor of 0.0005
rate = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
    min_lr=0.0005,
    min_delta=0.001,
)

# early stopping: stop once the validation AUC stalls
# and roll back to the best weights seen
stopping = EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=20,
    min_delta=0.001,
    restore_best_weights=True,
)

# the training dataset repeats forever, so the length
# of an epoch has to be given explicitly
hist = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    steps_per_epoch=SAMPLES // BATCH_SIZE,
    callbacks=[rate, stopping],
)

In [None]:
hist_df = pd.DataFrame(hist.history)

# one figure per tracked quantity:
# (training column, validation column, figure title)
curves = [
    # loss
    ("loss", "val_loss", "Loss"),
    # precision at recall
    ("p@r", "val_p@r", f"Precision at {int(100 * METRIC_RECALL)}% recall"),
    # area under ROC curve
    ("auc", "val_auc", "Area under the ROC curve"),
]

for train_col, valid_col, title in curves:
    plt.figure(figsize=(8, 5))
    sns.lineplot(data=hist_df[train_col], label="Training")
    sns.lineplot(data=hist_df[valid_col], label="Validation")
    plt.title(title)
    plt.grid(True)
    plt.xlabel("Epoch")
    plt.ylabel("")
    plt.show()

In [None]:
# gather the validation labels one sample at a time
y_ds = valid_ds.unbatch().map(lambda features, target: target)
labels = np.vstack(list(y_ds.as_numpy_iterator()))

# predict probabilities for the validation features
X_ds = valid_ds.map(lambda features, target: features)
probs = model.predict(X_ds)

# precision vs recall
precisions, recalls, thresholds = precision_recall_curve(labels, probs)

# precision and recall as functions of the decision threshold;
# the curves hold one more point than there are thresholds
plt.figure(figsize=(8, 5))
for curve, color, name in ((precisions, "tab:blue", "Precision"),
                           (recalls, "tab:orange", "Recall")):
    plt.plot(thresholds, curve[:-1], color, label=name)
plt.legend()
plt.xlabel("Threshold")
plt.title("Precision/recall at threshold")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# precision as a function of recall
plt.figure(figsize=(8, 5))
plt.plot(recalls, precisions, "tab:blue")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision at recall")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# ROC curve, with the diagonal drawn in gray for reference
false_positives, true_positives, thresholds = roc_curve(labels, probs)

plt.figure(figsize=(8, 5))
plt.plot(false_positives, true_positives, "tab:blue")
plt.plot([0, 1], [0, 1], "tab:gray")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

In [None]:
# store the architecture as JSON and the weights as HDF5
architecture = model.to_json()
with open("model.json", "w") as f:
    f.write(architecture)

model.save_weights("model.h5")