In [None]:
import json
import os

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

# set up the TPU: resolve the TPU cluster, connect this runtime to it,
# initialize the TPU system, then build a distribution strategy on top
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# number of replicas the strategy keeps in sync (used to size the batch)
tpu_replicas = tpu_strategy.num_replicas_in_sync

# get the dataset credential from the Google Cloud SDK
# and pass the credential to tensorflow
# this needs to run after TPU initialization
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

# set tensorflow's global random seed
tf.random.set_seed(42)

In [None]:
# number of training samples, written as the product 20 * 50000
# NOTE(review): presumably 20 record files of 50000 samples each -- confirm
TRAINING_SAMPLES = 20 * 50000

# batch size should be divisible by number of replicas,
# so it is defined as 8 samples per TPU replica
BATCH_SIZE = 8 * tpu_replicas
# number of consecutive time steps whose features make up one sample;
# get_dataset reads windows of WINDOW_SIZE + 1 rows and uses the last
# row of each window only for the label
WINDOW_SIZE = 3

In [None]:
def get_dataset(folder):
    """Build the windowed ``tf.data`` input pipeline for one data folder.

    Every ``*.tfrec`` file found under ``<gcs_path>/<folder>`` is read as a
    stream of serialized float32 tensors. Within each file, sliding windows
    of ``WINDOW_SIZE + 1`` consecutive records are stacked into one series.
    Series whose rows do not all share the same date are dropped, and each
    remaining series is split into a feature block and a binary label.

    Args:
        folder: name of the sub-folder (below the dataset's GCS path) that
            holds the TFRecord files. The value ``"train"`` additionally
            makes the dataset repeat indefinitely and shuffle its samples.

    Returns:
        An unbatched ``tf.data.Dataset`` of ``(X, y)`` pairs. ``X`` holds the
        selected feature columns of the first ``WINDOW_SIZE`` rows of a
        series; ``y`` is ``1.0`` when the ``resp`` column of the last row is
        strictly positive and ``0.0`` otherwise.

    Raises:
        FileNotFoundError: if no ``*.tfrec`` file matches the folder.
    """
    # load a dictionary mapping feature names to column indices
    col_file = os.path.join(os.pardir, "input",
                            "jane-street-market-prediction-data",
                            "columns.json")
    with open(col_file) as file:
        cols = json.load(file)

    # shorthand notation for autotune option
    auto = tf.data.experimental.AUTOTUNE

    def open_windowed_ds(filename):
        """Open one TFRecord file as a dataset of stacked sliding windows.

        Each record is parsed as a serialized float32 tensor. Windows of
        WINDOW_SIZE + 1 consecutive records (shifted by one record, with
        incomplete trailing windows dropped) are batched into a single
        tensor per window. The last row of a window provides the label
        for the first WINDOW_SIZE rows.
        """
        ds = tf.data.TFRecordDataset(filename)
        ds = ds.map(lambda x: tf.io.parse_tensor(x, tf.float32))
        ds = ds.window(WINDOW_SIZE + 1, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one with its full
        # length turns it back into a single stacked tensor
        ds = ds.flat_map(lambda x: x.batch(WINDOW_SIZE + 1))
        return ds

    # create a dataset with filenames of tf records in files_ds
    # then interleave the datasets obtained by calling
    # open_windowed_ds(x) on each element of files_ds
    data_path = KaggleDatasets().get_gcs_path()
    pattern = data_path + "/" + folder + "/*.tfrec"
    files = tf.io.gfile.glob(pattern)
    if not files:
        # an empty list would be turned into a float32 dataset and fail
        # later with an unrelated dtype error, so fail early and clearly
        raise FileNotFoundError("no TFRecord files match " + pattern)
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    ds = files_ds.interleave(open_windowed_ds, num_parallel_calls=auto)

    def single_date(series):
        """Return a scalar bool tensor: True if all rows share one date."""
        dates, _ = tf.unique(series[:, cols["date"]])
        return tf.math.equal(tf.size(dates), 1)

    # filter out any time series spanning multiple dates
    ds = ds.filter(single_date)

    def collate(series):
        """Split a series into (features of leading rows, binary label)."""
        # NOTE(review): the slice end is exclusive, so the column at
        # cols["feature_129"] itself is NOT part of X. Left unchanged here
        # because including it would change the feature dimension seen by
        # downstream code -- confirm whether feature_129 should be included.
        X = series[:-1, cols["feature_0"]:cols["feature_129"]]
        # this Python conditional is evaluated on a tensor; it relies on
        # tf.data tracing the mapped function through AutoGraph
        y = 1.0 if series[-1, cols["resp"]] > 0.0 else 0.0
        return (X, y)

    # separate the series into a training sample consisting
    # of the features at the first WINDOW_SIZE times and a
    # label indicating whether the response at time
    # WINDOW_SIZE + 1 is positive
    ds = ds.map(collate, num_parallel_calls=auto)

    # allow the dataset to ignore the order for speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    ds = ds.with_options(ignore_order)

    # for the training dataset, we shuffle samples and repeat
    # when we reach the end of the dataset
    # (repeat comes first, so samples from consecutive passes over the
    # data can be mixed inside the shuffle buffer)
    if folder == "train":
        ds = ds.repeat()
        ds = ds.shuffle(16 * BATCH_SIZE)

    # batching is currently disabled in this function: the returned
    # dataset yields individual (X, y) samples
    # ds = ds.batch(BATCH_SIZE)

    # prefetch upcoming elements in the background
    ds = ds.prefetch(auto)

    return ds