# Target definition

Setting the action to 1 whenever `resp > 0` is the natural choice. Here I check how the Jane Street utility changes when the target is instead defined as `resp > threshold`, for several different thresholds.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import janestreet
import warnings
import seaborn as sns
import matplotlib.ticker as mtick

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score


# Notebook-wide plotting defaults: theme first, then the default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.figsize': (12, 4)})
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
def utility_score_numba(date, weight, resp, action):
    """Compute the Jane Street utility score for a set of trades.

    The daily profit is ``p_i = sum(weight * resp * action)`` over the rows of
    day ``i``. With ``t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n_days)``
    the score is ``min(max(t, 0), 6) * sum(p_i)``.

    Parameters
    ----------
    date : array of non-negative integers
        Day identifier of every row (used as bin index by ``np.bincount``).
    weight, resp, action : arrays with the same length as ``date``
        Row weight, realised return and chosen action (0/1 or boolean).

    Returns
    -------
    float
        The utility score; ``0.0`` when there are no rows or when no trade
        produced any profit or loss.
    """
    date = np.asarray(date)
    if date.size == 0:
        # Nothing to score; the formula would otherwise divide by zero days.
        return 0.0

    # Profit per day; days absent from `date` show up as zero-valued bins.
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    spread = np.sqrt(np.sum(Pi ** 2))
    if spread == 0:
        # Every daily profit is zero (e.g. action is 0 everywhere): 0/0 would give NaN.
        return 0.0

    # Count the days actually present. len(Pi) equals max(date) + 1, which
    # over-counts when the data does not start at day 0 (e.g. an out-of-time slice).
    n_days = np.unique(date).size
    t = total / spread * np.sqrt(250 / n_days)
    u = min(max(t, 0), 6) * total
    return u

def jane_utility(data, action_column="action"):
    """Score a frame with the utility function.

    Reads the ``date``, ``weight`` and ``resp`` columns of ``data`` together
    with the column named by ``action_column``, and passes their underlying
    arrays to ``utility_score_numba``.
    """
    date, weight, resp, action = (
        data[name].values for name in ("date", "weight", "resp", action_column)
    )
    return utility_score_numba(date, weight, resp, action)

In [None]:
# Load the competition training set from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
# Optional: uncomment the next line to prototype on a 5% random sample of the rows.
#data = data.sample(frac=0.05)

In [None]:
# Summary statistics of `resp`, the column the candidate targets are thresholded on.
data["resp"].describe()

In [None]:
# Candidate cut-offs: for each one a boolean target `resp > threshold` is built.
TARGET_THRESHOLDS = [-1e-2, -1e-4, -1e-5, 0, 1e-5, 1e-4, 1e-2]
# Rows with TIME_COLUMN <= TIME_SPLIT are "in time"; later rows form the out-of-time set.
TIME_SPLIT = 300
# Column used for the in-time / out-of-time split.
TIME_COLUMN = "date"
# NOTE(review): TARGET is not referenced anywhere else in the visible notebook.
TARGET = "action"

In [None]:
# Model inputs: every column whose name contains the substring "feature".
features = [column_name for column_name in data.columns if "feature" in column_name]

In [None]:
# Add one boolean target column per candidate threshold: True where resp exceeds it.
for threshold in TARGET_THRESHOLDS:
    data[f"action_{threshold}"] = data["resp"].gt(threshold)

In [None]:
# Chronological split: dates up to TIME_SPLIT are "in time", later dates are held out.
in_time = data[data[TIME_COLUMN] <= TIME_SPLIT]
# Prediction columns are added to out_of_time later on, so take an explicit copy:
# this keeps the assignment from being a chained assignment on a selection of `data`.
out_of_time = data[data[TIME_COLUMN] > TIME_SPLIT].copy()

In [None]:
# Hold out 20% of the in-time rows as a test set, with a fixed random_state.
# NOTE(review): this looks like a row-level random split (scikit-learn shuffles by
# default), so train and test rows would share the same dates — confirm this is intended.
train, test = train_test_split(in_time, 
                               test_size=0.2, 
                               random_state=42)

In [None]:
# One default LightGBM classifier per candidate threshold, in the order of TARGET_THRESHOLDS.
models = [LGBMClassifier() for _ in TARGET_THRESHOLDS]
for threshold, model in zip(TARGET_THRESHOLDS, models):
    target_column = "action_" + str(threshold)
    prediction_column = "model_" + str(threshold)
    model.fit(train[features], train[target_column])
    # Store the predicted class of both evaluation sets under a per-threshold column.
    test[prediction_column] = model.predict(test[features])
    out_of_time[prediction_column] = model.predict(out_of_time[features])

In [None]:
# Build the evaluation frame once, with only the columns the plot needs.
# Concatenating the full-width frames inside the loop repeated the same expensive
# copy for every threshold.
model_columns = ["model_" + str(threshold) for threshold in TARGET_THRESHOLDS]
evaluation_columns = ["date", "resp", "weight"] + model_columns
evaluation = pd.concat(
    [test[evaluation_columns], out_of_time[evaluation_columns]],
    ignore_index=True,
)
weighted_resp = evaluation["resp"] * evaluation["weight"]

for column in model_columns:
    # Daily sum of weight * resp * action, smoothed with a 60-day moving average.
    daily_pnl = (weighted_resp * evaluation[column]).groupby(evaluation["date"]).sum()
    daily_pnl.rolling(60).mean().plot(label=column)

# Mark where the out-of-time period starts, spanning the current y-range.
xmin, xmax, ymin, ymax = plt.axis()
plt.vlines(TIME_SPLIT, ymin, ymax, linestyle="dotted", color="red", label="Out of time split")
plt.legend(bbox_to_anchor=(1.05, 1.0))
plt.title("Performance moving average of 60 periods window for both test and out of time periods", pad=16)
plt.ylabel("sum(Weight * Resp * Action)")
plt.xlabel("Date")
plt.show()

In [None]:
# Print the utility of each threshold's model on the in-time test set and the out-of-time set.
for threshold in TARGET_THRESHOLDS:
    prediction_column = "model_" + str(threshold)
    print("-----------")
    print(threshold)
    in_time_score = jane_utility(test, prediction_column)
    print("Test Jane Utility (in time): {:.2f}".format(in_time_score))
    oot_score = jane_utility(out_of_time, prediction_column)
    print("Out of time Jane Utility: {:.2f}".format(oot_score))

## Bootstrapping

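To make the comparison more robust, we use bootstrapping: in each round the data is resampled with replacement, the models are refit, and the utility is recomputed, giving a distribution of utilities per threshold instead of a single number.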

In [None]:
# Number of bootstrap rounds; every round resamples the data and refits every model.
N_BOOT = 50

In [None]:
# Per-threshold lists of utilities, one entry appended per bootstrap round.
experiment_dict = {
    threshold: {"test_utility": [], "oot_utility": []}
    for threshold in TARGET_THRESHOLDS
}
# The only columns jane_utility needs besides the predicted action.
score_columns = ["date", "weight", "resp"]

for n_round in range(N_BOOT):
    # Seed each resample with its round number so a re-run reproduces the same results.
    # NOTE(review): rows are resampled with replacement *before* the train/test split,
    # so copies of the same original row can land in both train and test.
    resampled_data = data.sample(frac=1.0, replace=True, random_state=n_round)

    # Round-local names: the notebook-level in_time / out_of_time / train / test frames
    # (and their prediction columns) are left untouched, so earlier cells stay re-runnable.
    boot_in_time = resampled_data[resampled_data[TIME_COLUMN] <= TIME_SPLIT]
    boot_out_of_time = resampled_data[resampled_data[TIME_COLUMN] > TIME_SPLIT]
    boot_train, boot_test = train_test_split(boot_in_time,
                                             test_size=0.2,
                                             random_state=42)

    for threshold, model in zip(TARGET_THRESHOLDS, models):
        model.fit(boot_train[features], boot_train["action_" + str(threshold)])

        # Attach predictions to a narrow copy instead of adding columns to a filtered
        # slice; "action" is the default column name read by jane_utility.
        test_scored = boot_test[score_columns].assign(
            action=model.predict(boot_test[features]))
        oot_scored = boot_out_of_time[score_columns].assign(
            action=model.predict(boot_out_of_time[features]))

        experiment_dict[threshold]["test_utility"].append(jane_utility(test_scored))
        experiment_dict[threshold]["oot_utility"].append(jane_utility(oot_scored))
        

In [None]:
metrics = ["test_utility", "oot_utility"]

# Rows are thresholds, columns are metrics, and every cell holds the list of bootstrap scores.
per_threshold = pd.DataFrame.from_dict(experiment_dict).transpose()
# Expand each list into one row per bootstrap round, then put the metrics side by side.
exploded = [per_threshold[[metric]].explode(metric) for metric in metrics]
results = pd.concat(exploded, axis=1)
# Express the threshold in units of 1e-4 (the index is multiplied by 10,000).
results["threshold"] = results.index * 1e4
results = results.astype(float)


In [None]:
fig, ax = plt.subplots(figsize=(12, 4))

# Draw explicitly on the axes created above rather than on whichever axes is current.
sns.boxplot(data=results, x="threshold", y="test_utility", ax=ax)

plt.title("Test set Jane Utility for different action thresholds")
# The threshold column is the raw threshold multiplied by 10,000, i.e. in units of 1e-4.
plt.xlabel("Threshold to define target (x 1e-4)")
plt.xticks(rotation=45)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 4))
# Draw explicitly on the axes created above rather than on whichever axes is current.
sns.boxplot(data=results, x="threshold", y="oot_utility", ax=ax)
plt.title("Out of time set Jane Utility for different action thresholds")
# The threshold column is the raw threshold multiplied by 10e3 (= 10,000), so the
# axis is in units of 1e-4; the previous "(10e-3)" label did not match that scaling.
plt.xlabel("Threshold to define target (x 1e-4)")
plt.xticks(rotation=45)
plt.show()

# Conclusion

There is a range of thresholds around 0 for building the target that may be worth investigating further.