# Jane Street Market Prediction - TPU Training

To run this notebook on Kaggle, open the interactive editor and select `Google Cloud SDK` from the `Addons` dropdown menu. Follow the instructions to link a Google Cloud account. Then select `TPU v3-8` as the accelerator in the `Settings` pane on the right before running the notebook.

Run some setup code.

In [1]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, PrecisionAtRecall
from tensorflow.keras.optimizers import SGD

from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

# make the x,y labels legible on plots
plt.rc("axes", labelsize=16)

# set up the TPU: resolve the TPU cluster, connect to it, initialize the
# TPU system, then build a distribution strategy on top of it
# NOTE(review): tf.distribute.experimental.TPUStrategy is the older name of
# tf.distribute.TPUStrategy -- confirm which one the installed TensorFlow
# version expects before changing it
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# number of replicas kept in sync by the strategy; the settings cell
# multiplies the per-replica batch size by this value
tpu_replicas = tpu_strategy.num_replicas_in_sync

# get dataset credential from the Google Cloud SDK
# and pass the credential to tensorflow
# this needs to run after TPU initialization
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

# set tensorflow's random seed
tf.random.set_seed(13)

Settings.

In [2]:
# --- training parameters ---
EPOCHS = 100
# 32 samples per TPU replica
BATCH_SIZE = tpu_replicas * 32
LEARNING_RATE = 0.01
# presumably the recall level for the PrecisionAtRecall metric -- confirm
MIN_RECALL = 0.55
LABEL_SMOOTHING = 0

# --- cross-validation parameters ---
FOLDS = 5
HOLDOUT = 4

# --- model parameters ---
# number of consecutive samples in each sliding window
WINDOW_SIZE = 40
NOISE = 0.05

# other notebooks read the holdout fold and the window size from
# params.json, so serialize them into the current directory
params = dict(holdout=HOLDOUT, window_size=WINDOW_SIZE)
with open(os.path.join(os.curdir, "params.json"), "w") as file:
    file.write(json.dumps(params))

`get_dataset()` returns a dataset generated from the folds in the list `folds`.

In [3]:
def get_dataset(folds, repeat=False, shuffle=False, cache=False):
    """Build a batched dataset of sliding windows from the given folds.

    The tf records of every requested fold are read, cut into sliding
    windows of WINDOW_SIZE consecutive rows, restricted to windows that
    lie within a single date and turned into ``(X, y, s)`` samples, where
    for a single sample

    * ``X`` has shape ``[WINDOW_SIZE, 131]``: the columns ``feature_0``
      through ``feature_129`` of every row, plus one extra column holding
      ``resp`` for all rows except the last one (whose entry is zero),
    * ``y`` has shape ``[1]`` and equals 1.0, 0.5 or 0.0 when ``resp`` of
      the last row is positive, zero or negative respectively,
    * ``s`` has shape ``[WINDOW_SIZE]`` and holds the ``ts_id`` column.

    Args:
        folds: iterable of fold identifiers; the records are looked up
            under ``<gcs path>/fold{fold}/*.tfrec``.
        repeat: if True, the dataset restarts once it is exhausted.
        shuffle: if True, samples are shuffled with a buffer of
            ``4 * BATCH_SIZE`` elements and the pipeline is allowed to
            produce elements in non-deterministic order.
        cache: if True, the collated samples are cached (``cache()`` is
            called without a filename) before repeating, shuffling and
            batching.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` samples,
        with prefetching enabled.
    """
    # load a dictionary mapping feature names to columns
    col_file = os.path.join(os.pardir, "input",
                            "jane-street-market-prediction-data",
                            "columns.json")
    with open(col_file) as file:
        cols = json.load(file)

    # shorthand notation for autotune option
    auto = tf.data.experimental.AUTOTUNE

    # opens a tf record in filename as a dataset that parses serialized
    # tensors and returns sliding windows of WINDOW_SIZE samples
    def open_windowed_ds(filename):
        ds = tf.data.TFRecordDataset(filename)
        ds = ds.map(lambda x: tf.io.parse_tensor(x, tf.float32), num_parallel_calls=auto)
        ds = ds.window(WINDOW_SIZE, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda x: x.batch(WINDOW_SIZE))
        return ds

    # create a dataset with filenames of tf records in files_ds
    # then interleave the datasets obtained by calling
    # open_windowed_ds(x) on each element of files_ds
    # NOTE(review): get_gcs_path() is called without a dataset name --
    # confirm this resolves to the tf record dataset when several inputs
    # are attached to the notebook
    data_path = KaggleDatasets().get_gcs_path()
    patterns = [data_path + f"/fold{fold}" + "/*.tfrec" for fold in folds]
    files = tf.io.gfile.glob(patterns)
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    ds = files_ds.interleave(open_windowed_ds, num_parallel_calls=auto)

    # filter out any time series spanning multiple dates
    def single_date(series):
        dates, _ = tf.unique(series[:, cols["date"]])
        return tf.equal(tf.size(dates), tf.constant(1))

    ds = ds.filter(single_date)

    # separate the series into a training sample consisting
    # of the features and a label indicating whether the
    # response at final time is positive
    # need to explicitly reshape the tensors here for things
    # to work properly on TPU
    def collate(series):
        # responses of all rows but the last, zero-padded at the end so
        # that the response being predicted never enters the features
        resp = tf.reshape(series[:-1, cols["resp"]], [WINDOW_SIZE - 1, 1])
        resp = tf.pad(resp, [[0, 1], [0, 0]])
        X = series[:, cols["feature_0"]:(cols["feature_129"] + 1)]
        X = tf.concat([X, resp], 1)
        # maps sign(resp) in {-1, 0, 1} to {0.0, 0.5, 1.0}
        y = (1.0 + tf.sign(series[-1, cols["resp"]])) / 2.0
        s = series[:, cols["ts_id"]]
        return tf.reshape(X, [WINDOW_SIZE, 131]), tf.reshape(y, [1]), tf.reshape(s, [WINDOW_SIZE])

    ds = ds.map(collate, num_parallel_calls=auto)

    # if shuffling, allow the dataset to ignore the order for speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = not shuffle
    ds = ds.with_options(ignore_order)

    # cache before repeating and shuffling: caching after repeat() would
    # try to store an endless dataset, and caching after shuffle() would
    # replay the first shuffled order on every later pass
    if cache:
        ds = ds.cache()

    # check if the dataset should repeat once exhausted
    if repeat:
        ds = ds.repeat()

    # check if we should shuffle the dataset
    if shuffle:
        ds = ds.shuffle(4 * BATCH_SIZE)

    # set the batch size of the dataset
    ds = ds.batch(BATCH_SIZE)

    # prefetch new batches in the background
    ds = ds.prefetch(auto)

    return ds

In [None]:
# look at the first ten individual (unbatched) samples of folds 0-3
ds = get_dataset([0, 1, 2, 3]).unbatch()

for X, y, s in ds.take(10):
    # features, label and ts_ids on separate lines, then a blank-line gap
    print(X, y, s, "\n\n\n", sep="\n")

In [None]:
# folder holding the competition's data files
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")

# read the training data and index it by ts_id
train_df = pd.read_csv(os.path.join(comp_folder, "train.csv")).set_index("ts_id")

# downcast every 64-bit float column to 32-bit floats
float_cols = train_df.select_dtypes(include="float64").columns
train_df = train_df.astype(dict.fromkeys(float_cols, np.float32))

# replace missing values by the sentinel value -100
train_df = train_df.fillna(-100)

In [None]:
# show a ten-row slice of the training data as a quick sanity check
print(train_df[10:20])