# Jane Street - Utility

For the $i$-th day, the model is given the score
$$p_i = \sum_j \mbox{weight}_{ij} \cdot \mbox{resp}_{ij}$$
where the sum ranges over all accepted trades for the day. The final score of the model over all days is then
$$u = \min(\max(t, 0), 6) \cdot \sum_i p_i $$
with
$$t = \frac{\sum_i p_i}{\sqrt{\sum_i p_i^2}} \sqrt{\frac{250}{\mbox{total number of days}}}.$$

In this notebook we investigate how to maximize the utility $u$, given either a probability that $\mbox{resp}_{ij}$ is positive or an estimate of $\mbox{resp}_{ij}$ itself.

In [None]:
from time import time

import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve

import tensorflow as tf
import tensorflow.keras as keras

# wall-clock start of the notebook; the final cell subtracts it from the
# end time to report the total runtime in minutes
tick = time()

# make the x,y labels legible on plots
plt.rc("axes", labelsize=16)

In [None]:
# folder of the raw competition data; in the visible part of the notebook it
# is only referenced by the disabled CSV loading code below
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")

# NOTE: loading train.csv is currently disabled. When enabled it indexes the
# frame by ts_id, downcasts float64 columns to float32 and fills missing
# values with the column medians.
#df = pd.read_csv(os.path.join(comp_folder, "train.csv"))
#df.set_index("ts_id", inplace=True)
#df = df.astype({c: np.float32 for c in df.select_dtypes(include="float64").columns})
#df.fillna(df.median(), inplace=True)

In [None]:
#df["prod"] = df["weight"] * df["resp"]
#sns.pairplot(df[["resp", "weight", "prod"]],
#             diag_kind="kde", plot_kws={"s": 3})
#plt.show()

In [None]:
# Restore the trained model: the architecture is stored as JSON and the
# weights are stored separately in an HDF5 file next to it.
model_folder = os.path.join(
    os.pardir, "input", "jane-street-market-prediction-tpu"
)

# step 1: rebuild the (still untrained) network from its JSON description
with open(os.path.join(model_folder, "model.json"), "r") as f:
    model_json = f.read()
model = keras.models.model_from_json(model_json)

# step 2: load the trained weights into that network
model.load_weights(os.path.join(model_folder, "model.h5"))

In [None]:
# Rebuild the validation data pipeline of the training notebook
# (the training notebook documents the individual steps).

data_folder = os.path.join(
    os.pardir, "input", "jane-street-market-prediction-data"
)

# columns.json is indexed by column name below (e.g. cols["date"]) to find
# the position of a column inside a record
with open(os.path.join(data_folder, "columns.json")) as file:
    cols = json.load(file)

# NOTE: reading the settings from params.json is disabled; the values are
# hard-coded below instead.
#with open(os.path.join(model_folder, "params.json")) as file:
#    params = json.loads(file.read())

#HOLDOUT = params["holdout"]
#WINDOW_SIZE = params["window_size"]

# number of the fold whose files are evaluated (selects the "fold{HOLDOUT}" folder)
HOLDOUT = 4
# number of consecutive records that form one sample
WINDOW_SIZE = 1

# let tf.data pick the degree of parallelism / prefetch buffer size
auto = tf.data.experimental.AUTOTUNE

def open_windowed_ds(filename):
    """Read a TFRecord file as a dataset of sliding windows of records.

    Every record is parsed as a serialized float32 tensor. Consecutive
    records are grouped into windows of WINDOW_SIZE records that advance by
    one record at a time; windows with fewer than WINDOW_SIZE records are
    dropped. Each window is returned as a single batched tensor.
    """
    def parse_record(raw):
        return tf.io.parse_tensor(raw, tf.float32)

    def stack_window(window):
        # a window is itself a dataset; batching it yields one stacked tensor
        return window.batch(WINDOW_SIZE)

    records = tf.data.TFRecordDataset(filename)
    records = records.map(parse_record, num_parallel_calls=auto)
    windows = records.window(WINDOW_SIZE, shift=1, drop_remainder=True)
    return windows.flat_map(stack_window)

def single_date(series):
    """Return a boolean tensor telling whether a window lies within one day.

    The result is true when the date column of `series` contains exactly one
    distinct value.
    """
    date_column = series[:, cols["date"]]
    distinct_dates, _ = tf.unique(date_column)
    return tf.equal(tf.size(distinct_dates), tf.constant(1))

def collate(series):
    """Split a window of records into features and the targets of its last row.

    Returns a 4-tuple:
      * the columns feature_0 .. feature_129 of all rows, shaped [WINDOW_SIZE, 130]
      * resp, weight and date of the last row, each reshaped to shape [1]
    """
    first_feature = cols["feature_0"]
    past_last_feature = cols["feature_129"] + 1
    features = tf.reshape(
        series[:, first_feature:past_last_feature], [WINDOW_SIZE, 130]
    )

    def last_row_value(column):
        # value of `column` in the final row of the window, as a 1-element tensor
        return tf.reshape(series[-1, cols[column]], [1])

    return (
        features,
        last_row_value("resp"),
        last_row_value("weight"),
        last_row_value("date"),
    )

# all TFRecord files of the hold-out fold
patterns = f"{data_folder}/fold{HOLDOUT}/*.tfrec"
files = tf.io.gfile.glob(patterns)
files_ds = tf.data.Dataset.from_tensor_slices(files)

# files -> sliding windows -> windows within a single day -> (X, resp, weight, date)
ds = (
    files_ds
    .interleave(open_windowed_ds, num_parallel_calls=auto)
    .filter(single_date)
    .map(collate, num_parallel_calls=auto)
    .prefetch(auto)
)




#ds = ds.take(100000)






# retrieve the labels and predictions of the model

# collect (resp, weight, date) of every sample into one array; its three
# rows are unpacked into the per-sample vectors below
rwd_ds = ds.map(lambda X, r, w, d: [r, w, d])
rwd = np.hstack(list(rwd_ds.as_numpy_iterator()))
resp, weight, date = rwd

# np.bincount needs integer day indices. Use a wide integer type: uint8 only
# holds 0..255, so any larger day index would wrap around modulo 256 and be
# merged with an unrelated day when the daily scores are accumulated.
date = date.astype(np.int64)

# binary target: 1.0 where resp is strictly positive, 0.0 otherwise
labels = np.heaviside(resp, 0.0)

# model outputs for the same samples, in the same order as the labels
X_ds = ds.map(lambda X, r, w, d: X)
probs = model.predict(X_ds).squeeze()

# precision vs recall
precisions, recalls, thresholds = precision_recall_curve(labels, probs)

# precision and recall as functions of the decision threshold; the last
# entry of each curve is dropped so they line up with `thresholds`
plt.figure(figsize=(8, 5))
for curve, color, name in (
    (precisions, "tab:blue", "Precision"),
    (recalls, "tab:orange", "Recall"),
):
    plt.plot(thresholds, curve[:-1], color, label=name)
plt.title("Precision/recall at threshold")
plt.xlabel("Threshold")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.legend()
plt.show()

# precision plotted directly against recall
plt.figure(figsize=(8, 5))
plt.plot(recalls, precisions, "tab:blue")
plt.title("Precision at recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# ROC curve
false_positives, true_positives, thresholds = roc_curve(labels, probs)

plt.figure(figsize=(8, 5))
plt.plot(false_positives, true_positives, "tab:blue")
# gray diagonal as a reference line
plt.plot([0, 1], [0, 1], "tab:gray")
plt.title("ROC curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

In [None]:
def utility(threshold):
    """Return the utility score obtained by trading above `threshold`.

    Reads the module-level arrays `probs`, `date`, `weight` and `resp`
    (one entry per sample). A trade is accepted when its predicted
    probability is strictly greater than `threshold`.

    With p_i the sum of weight * resp over the accepted trades of day i:

        t = sum(p) / sqrt(sum(p^2)) * sqrt(250 / number of days)
        u = min(max(t, 0), 6) * sum(p)

    Returns 0.0 when no day has a non-zero score (e.g. no trade is accepted),
    which avoids dividing by zero.
    """
    # 1.0 for accepted trades, 0.0 otherwise (also 0.0 exactly at the threshold)
    action = np.heaviside(probs - threshold, 0.0)
    # daily scores, indexed by day number
    p = np.bincount(date, weight * resp * action)

    total = np.sum(p)
    total_sq = np.sum(p ** 2)
    if total_sq == 0.0:
        return 0.0

    # Count only the days that actually occur in the data. p.size is
    # max(date) + 1 and would also count day numbers without any sample,
    # which would shrink t whenever the evaluated days do not start at 0
    # or have gaps.
    n_days = np.count_nonzero(np.bincount(date))

    t = total / np.sqrt(total_sq) * np.sqrt(250 / n_days)
    u = min(max(t, 0), 6) * total
    return u

# evaluate the utility on a grid of candidate thresholds
thresholds = np.arange(0.01, 1.0, 0.01)
utilities = np.array([utility(thresh) for thresh in thresholds])

# rescale to [0, 1] so the curve fills the plot; if every threshold yields
# the same utility there is nothing to rescale and dividing would give NaN
span = utilities.max() - utilities.min()
if span > 0:
    utilities = (utilities - utilities.min()) / span
else:
    utilities = np.zeros_like(utilities)

plt.figure(figsize=(8, 5))
plt.plot(thresholds, utilities, "tab:blue")
plt.xlabel("Threshold")
# the utility is on the y axis (a second xlabel call would overwrite "Threshold")
plt.ylabel("Utility")
plt.title("Utility at threshold")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# the rescaling is monotone, so the argmax is that of the raw utilities
best = thresholds[np.argmax(utilities)]
print(f"Best threshold at p = {best}")

# persist the chosen threshold in the working directory
with open(os.path.join(os.curdir, "threshold.json"), "w") as file:
    json.dump({"threshold": best}, file)

# report the total runtime of the notebook
tock = time()
print(f"Notebook took {(tock -tick) / 60} minutes")