# Jane Street - GPU

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, PrecisionAtRecall
from tensorflow.keras.optimizers import Adam, SGD

# seed TensorFlow's global random generator so repeated runs start from the same state
tf.random.set_seed(seed=13)

In [None]:
# path of the competition's training file, relative to the parent directory
file = os.path.join(
    os.pardir,
    "input",
    "jane-street-market-prediction",
    "train.csv",
)

# parse a single row just to learn the column names, then map every column
# to a 32 bit float before loading the complete file
dtype = dict.fromkeys(pd.read_csv(file, nrows=1).columns, np.float32)
full_df = pd.read_csv(file, dtype=dtype, engine="c")

In [None]:
# carve the training and validation windows out of the full data by date;
# both bounds of each window are inclusive
train_df = full_df[(full_df["date"] >= 86) & (full_df["date"] <= 375)]
valid_df = full_df[(full_df["date"] >= 425) & (full_df["date"] <= 500)]

# impute missing values in both sets with the per-column medians computed
# on the training window only
median = train_df.median()
train_df, valid_df = train_df.fillna(median), valid_df.fillna(median)

# keep the imputation values on disk
median.to_csv("impute.csv")

# model inputs: every column whose name contains "feature"
features = [col for col in train_df.columns if "feature" in col]
train_X, valid_X = (part[features].to_numpy() for part in (train_df, valid_df))

# labels: 1.0 where resp is strictly positive, 0.0 otherwise
train_y = (train_df["resp"] > 0.0).to_numpy(dtype=np.float32)
valid_y = (valid_df["resp"] > 0.0).to_numpy(dtype=np.float32)

print(f"Class imbalance: {train_y.mean()}")

In [None]:
# training loop and optimizer settings
EPOCHS = 200
BATCH_SIZE = 512
LEARNING_RATE = 1e-3

# regularization settings: standard deviation of the Gaussian noise layer
# and the label smoothing passed to the loss
NOISE = 0.1
LABEL_SMOOTHING = 0.0

# recall level at which precision is reported in the metrics
METRIC_RECALL = 0.55

In [None]:
# input & normalization
# Current Keras releases expose the layer as `layers.Normalization` and no
# longer provide the `layers.experimental` namespace, while older releases
# only have the experimental path. Prefer the stable name and fall back to
# the experimental one so this cell runs on both.
try:
    Normalization = layers.Normalization
except AttributeError:
    Normalization = layers.experimental.preprocessing.Normalization

# the normalization statistics are learned from the training features only
norm = Normalization()
norm.adapt(train_X)
inputs = layers.Input(shape=[len(features)])
flow = norm(inputs)

# additive Gaussian noise on the normalized inputs (stddev set by NOISE)
flow = layers.GaussianNoise(stddev=NOISE)(flow)

# single hidden block: dense -> swish -> dropout
flow = layers.Dense(units=70)(flow)
flow = layers.Activation(keras.activations.swish)(flow)
flow = layers.Dropout(rate=0.5)(flow)

# one sigmoid output unit for the binary label
flow = layers.Dense(units=1)(flow)
outputs = layers.Activation("sigmoid")(flow)

# optimization parameters
loss = BinaryCrossentropy(label_smoothing=LABEL_SMOOTHING)
optimizer = Adam(learning_rate=LEARNING_RATE)
# the metric names below determine the history keys ("p@r", "auc") and the
# validation keys ("val_p@r", "val_auc") that the callbacks and plots rely on
metrics = [PrecisionAtRecall(recall=METRIC_RECALL, name="p@r"), AUC(name="auc")]

# compile the model and print a summary
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

In [None]:
# learning rate schedule: halve the rate once validation AUC has gone 5 epochs
# without improving by at least 0.001, never dropping below 0.0005
rate = ReduceLROnPlateau(
    monitor="val_auc",
    mode="max",
    min_delta=0.001,
    patience=5,
    factor=0.5,
    min_lr=0.0005,
)

# early stopping: give up after 20 epochs without such an improvement and
# restore the best weights seen
stopping = EarlyStopping(
    monitor="val_auc",
    mode="max",
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

# train on the training window, evaluating on the validation window each epoch
hist = model.fit(
    x=train_X,
    y=train_y,
    validation_data=(valid_X, valid_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[rate, stopping],
)

In [None]:
# per-epoch training history, one column per logged metric
hist_df = pd.DataFrame(hist.history)


def _plot_history(frame, metric, title):
    """Plot the training and validation curves of one metric per epoch.

    Args:
        frame: history table holding a `metric` column and a matching
            `val_<metric>` column.
        metric: name of the training column to draw.
        title: text shown above the figure.
    """
    plt.figure(figsize=(8, 5))
    sns.lineplot(data=frame[metric], label="Training")
    sns.lineplot(data=frame[f"val_{metric}"], label="Validation")
    plt.title(title)
    plt.grid(True)
    plt.xlabel("Epoch")
    plt.ylabel("")
    plt.show()


# loss
_plot_history(hist_df, "loss", "Loss")

# precision at recall
# the percent format rounds to the nearest whole percent; truncating
# 100 * METRIC_RECALL with int() can show one percent too few when the
# product lands just below a whole number (e.g. 0.29 -> 28)
_plot_history(hist_df, "p@r", f"Precision at {METRIC_RECALL:.0%} recall")

# area under ROC curve
_plot_history(hist_df, "auc", "Area under the ROC curve")

In [None]:
# score the validation set once; every curve below reuses these probabilities
probs = model.predict(valid_X)

# precision vs recall
precisions, recalls, thresholds = precision_recall_curve(valid_y, probs)

# both quantities as a function of the decision threshold; the last entry of
# each curve is dropped so the arrays line up with the thresholds
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, precisions[:-1], color="tab:blue", label="Precision")
ax.plot(thresholds, recalls[:-1], color="tab:orange", label="Recall")
ax.legend()
ax.set_xlabel("Threshold")
ax.set_title("Precision/recall at threshold")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# precision plotted directly against recall
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(recalls, precisions, color="tab:blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision at recall")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# ROC curve, with the diagonal drawn in gray as a reference line
false_positives, true_positives, thresholds = roc_curve(valid_y, probs)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(false_positives, true_positives, color="tab:blue")
ax.plot([0, 1], [0, 1], color="tab:gray")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

In [None]:
# write the trained weights to model.h5
model.save_weights("model.h5")

# write the architecture, serialized as JSON, to model.json
with open("model.json", "w") as handle:
    handle.write(model.to_json())