# Jane Street - Single-time NN (GPU)

In [None]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Fix TensorFlow's global seed so repeated runs start from the same state.
tf.random.set_seed(42)

# Load the competition training table and downcast every float64 column to
# float32.
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")
train_csv = os.path.join(comp_folder, "train.csv")
df = pd.read_csv(train_csv)
float64_cols = df.select_dtypes(include="float64").columns
df = df.astype(dict.fromkeys(float64_cols, np.float32))

# Missing values are replaced by the sentinel -100 in every column.
df.fillna(-100, inplace=True)

# Split by date to reduce temporal correlations between training/test:
# rows with 350 <= date < 400 are deliberately left out of both sets.
train_df = df.loc[df["date"] < 350]
test_df = df.loc[df["date"] >= 400]

# Every column whose name contains "feature" is a model input.
features = [col for col in df.columns if "feature" in col]


def _split_xy(frame):
    """Return (feature matrix, 0/1 label vector) for *frame*.

    The label is 1 where ``resp`` is strictly positive and 0 otherwise.
    """
    X = frame[features].to_numpy()
    y = frame["resp"].gt(0).astype(int).to_numpy()
    return X, y


train_X, train_y = _split_xy(train_df)
test_X, test_y = _split_xy(test_df)

In [None]:
# construct neural network
# The input width is taken from the training matrix instead of a literal so
# the model always matches the number of columns selected into `train_X`.
inputs = layers.Input(shape=(train_X.shape[1],))

# Normalization layer whose statistics are fitted on the training rows only.
norm = preprocessing.Normalization()
norm.adapt(train_X)
flow = norm(inputs)

# Gaussian noise on the normalized inputs (a Keras regularization layer).
flow = layers.GaussianNoise(stddev=0.1)(flow)

# Four hidden blocks of decreasing width, each:
# batch norm -> dense -> swish activation -> dropout.
for units in [256, 128, 64, 32]:
    flow = layers.BatchNormalization()(flow)
    flow = layers.Dense(units=units)(flow)
    flow = layers.Activation(keras.activations.swish)(flow)
    flow = layers.Dropout(rate=0.2)(flow)

# Single sigmoid unit: one value in (0, 1) per input row.
outputs = layers.Dense(units=1, activation="sigmoid")(flow)

model = keras.Model(inputs=inputs, outputs=outputs, name="model")

# Compile the model: binary cross-entropy with light label smoothing, Adam,
# and two ranking metrics tracked during training.
MIN_RECALL = 0.55  # recall level at which precision is reported

loss = tf.keras.losses.BinaryCrossentropy(
    name="binary_crossentropy",
    label_smoothing=1e-2,
)
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = [
    keras.metrics.PrecisionAtRecall(recall=MIN_RECALL, name="p@r"),
    keras.metrics.AUC(name="auc"),
]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.summary()


# Train with early stopping on validation AUC: stop after 10 epochs without
# an improvement of at least 1e-3 and roll back to the best weights seen.
stopping = keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    min_delta=1e-3,
    patience=10,
    restore_best_weights=True,
)

callbacks = [stopping]

# NOTE(review): the test split doubles as the validation set that drives
# early stopping, so scores computed on it afterwards are optimistic.
hist = model.fit(
    train_X,
    train_y,
    validation_data=(test_X, test_y),
    epochs=30,
    callbacks=callbacks,
)

# Predicted probabilities on the test split, paired with the true labels.
probs = model.predict(test_X)
labels = test_y

In [None]:
# Per-epoch training history as a table: one column per logged quantity.
hist_df = pd.DataFrame(hist.history)


def _plot_history(curves, title):
    """Show one figure with a line per (history column, legend label) pair."""
    plt.figure(figsize=(8, 5))
    for column, legend in curves:
        sns.lineplot(data=hist_df[column], label=legend)
    plt.title(title)
    plt.grid(True)
    plt.xlabel("Epoch")
    plt.ylabel("")
    plt.show()


# loss
_plot_history(
    [("loss", "Training loss"), ("val_loss", "Validation loss")],
    "Loss",
)

# training metrics
_plot_history(
    [
        ("p@r", f"Precision at {int(100 * MIN_RECALL)}% recall"),
        ("auc", "Area under ROC curve"),
    ],
    "Training metrics",
)

# validation metrics
_plot_history(
    [
        ("val_p@r", f"Precision at {int(100 * MIN_RECALL)}% recall"),
        ("val_auc", "Area under ROC curve"),
    ],
    "Validation metrics",
)


# Precision/recall analysis of the predicted probabilities.
precisions, recalls, thresholds = precision_recall_curve(labels, probs)

# Figure 1: precision and recall as functions of the decision threshold.
# The curves have one more point than `thresholds`, so the last is dropped.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, precisions[:-1], color="tab:blue", label="Precision")
ax.plot(thresholds, recalls[:-1], color="tab:orange", label="Recall")
ax.legend()
ax.set_xlabel("Threshold")
ax.set_title("Precision/recall at threshold")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# Figure 2: precision plotted directly against recall.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(recalls, precisions, color="tab:blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision at recall")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()


# ROC curve of the predicted probabilities, with the diagonal drawn in gray
# as a visual reference line.
fpr, tpr, thresholds = roc_curve(labels, probs)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr, tpr, color="tab:blue")
ax.plot([0, 1], [0, 1], color="tab:gray")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()