In [None]:
# Install datatable, which is used below to read the CSV files; stdout is sent to /dev/null.
!pip install datatable > /dev/null

In [None]:
import os
import gc

import numpy as np
import pandas as pd
import datatable as dt
import matplotlib.pyplot as plt

In [None]:
# Directory holding the competition CSVs. The trailing slash matters because
# file names are appended to this string directly when the files are read.
root_dir = "/kaggle/input/jane-street-market-prediction/"

In [None]:
%%time

# Build both CSV paths first, then read each file with datatable and convert
# the result to a pandas DataFrame.
train_csv = f"{root_dir}train.csv"
sample_sub_csv = f"{root_dir}example_sample_submission.csv"

df = dt.fread(train_csv).to_pandas()
sample_sub = dt.fread(sample_sub_csv).to_pandas()

In [None]:
# Show the training data loaded from train.csv.
df

> This dataset contains an anonymized set of features, feature_{0...129}, representing real stock market data. Each row in the dataset represents a trading opportunity, for which you will be predicting an action value: 1 to make the trade and 0 to pass on it. 

We can see the features here grouped into dates along with `resp`

> Each trade has an associated weight and resp, which together represents a return on the trade. The date column is an integer which represents the day of the trade, while ts_id represents a time ordering.

`resp * weight` seems to be an indication of the return for that trading opportunity

>In the training set, train.csv, you are provided a resp value, as well as several other resp_{1,2,3,4} values that represent returns over different time horizons. 

The other `resp_{1, 2, 3, 4}` columns also represent returns, but over different [time horizons](https://www.investopedia.com/terms/t/timehorizon.asp)

## Cumulative return

If we take weighted `resp` as an indication of return, let's see overall return

https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance

In [None]:
# Running total of the raw `resp` column, one point per row of the frame.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative return", fontsize=18)
ax.plot(df["resp"].cumsum())

Let's see how other `resp`'s performed

In [None]:
# Overlay the running total of `resp` and of each horizon-specific column.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative return", fontsize=18)

# Each Series keeps its column name, which becomes its legend entry.
for resp_name in ("resp", "resp_1", "resp_2", "resp_3", "resp_4"):
    df[resp_name].cumsum().plot(ax=ax)

ax.legend(loc="upper left")

### Weight

In [None]:
# Same overlay as before, but every return is multiplied by the row's weight
# before being accumulated.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative return", fontsize=18)

for resp_name in ("resp", "resp_1", "resp_2", "resp_3", "resp_4"):
    # The product of two differently named Series has no name, so give it the
    # label that should appear in the legend.
    weighted = (df[resp_name] * df["weight"]).rename(f"weighted_{resp_name}")
    weighted.cumsum().plot(ax=ax)

ax.legend(loc="upper left")


This also includes the trades with zero weights and the ones which should be avoided

Looks like `resp_4` is holding good for longer time horizons and `resp` is highly correlated to that

## Combining resps

In [None]:
# Names of every column whose label begins with "resp", in frame order.
resp_cols = df.columns[df.columns.str.startswith("resp")].tolist()

In [None]:
# Row-wise mean of the horizon-specific returns (every resp column except
# `resp` itself), accumulated over all rows. `resp` is excluded by name rather
# than by position so the result does not depend on the column order.
other_resp_cols = [col for col in resp_cols if col != "resp"]
df[other_resp_cols].mean(axis=1).cumsum().plot()

`resp` has better cumulative returns than the mean of the other resps

## Returns for a day

Since the decisions made are with a time-stamp, we can check how much return we can get in a day

In [None]:
# Rows of the training data whose `date` is 0.
day_0 = df.loc[df["date"].eq(0)]

In [None]:
# Show the rows selected for date 0.
day_0

In [None]:
# Running totals of `resp` and the horizon-specific columns, restricted to the
# rows of date 0.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative return for a day", fontsize=18)

# Each Series keeps its column name, which becomes its legend entry.
for resp_name in ("resp", "resp_1", "resp_2", "resp_3", "resp_4"):
    day_0[resp_name].cumsum().plot(ax=ax)

ax.legend(loc="upper left")

#### Weighted resp => returns

In [None]:
# Weighted returns for date 0: each return is multiplied by the row's weight
# and the products are accumulated over the day.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative return for a day", fontsize=18)

for resp_name in ("resp", "resp_1", "resp_2", "resp_3", "resp_4"):
    # Name the product so it gets the intended legend entry.
    weighted = (day_0[resp_name] * day_0["weight"]).rename(f"weighted_{resp_name}")
    weighted.cumsum().plot(ax=ax)

ax.legend(loc="upper left")


## Grouping by date

In [None]:
# Group the rows of the training data by their `date` value.
grouped = df.groupby("date")

In [None]:
# Daily weighted return: for each date, the sum over that day's rows of
# resp * weight, accumulated across dates.
# The product is taken per row *before* summing. Multiplying the daily sum of
# resp by the daily sum of weight is not a weighted return.
fig, ax = plt.subplots(figsize=(15, 5))

for resp_name in ("resp", "resp_1", "resp_2", "resp_3", "resp_4"):
    daily_return = (df[resp_name] * df["weight"]).groupby(df["date"]).sum()
    daily_return.rename(f"grouped_weigh_{resp_name}").cumsum().plot(ax=ax)

# The x axis is the date (one point per day), not the individual trade.
# Labels are set after plotting because pandas may overwrite the x label with
# the index name when it draws a Series whose index is named.
ax.set_xlabel("Date", fontsize=18)
ax.set_ylabel("Cumulative return", fontsize=18)

ax.legend(loc="upper left")

## Action

Let's use positive `resp` as an indication of making a trade

In [None]:
# Binary target: 1 where `resp` is strictly positive, 0 otherwise.
df["action"] = df["resp"].gt(0).astype(int)

In [None]:
# Show only the rows whose `action` is 1.
df[df["action"]==1]

## Evaluation

> This competition is evaluated on a utility score. Each row in the test set represents a trading opportunity for which you will be predicting an action value, 1 to make the trade and 0 to pass on it. Each trade j has an associated weight and resp, which represents a return.

For each date i, we define:

#### $$ p_i = \sum_j(weight_{ij} * resp_{ij} * action_{ij}), $$
#### $$ t = \frac{\sum p_i }{\sqrt{\sum p_i^2}} * \sqrt{\frac{250}{|i|}}, $$
where |i| is the number of unique dates in the test set. 
The utility is then defined as:
#### $$u = min(max(t,0), 6)  \sum p_i.$$

So, the utility function scores based on daily overall returns.

## Utility function for a date

https://www.kaggle.com/renataghisloti/understanding-the-utility-score-function


### p_i

#### $$ p_i = \sum_j(weight_{ij} * resp_{ij} * action_{ij}), $$

#### All trades executed

In [None]:
# p_i for date 0 when every row is traded (action = 1 on each row).
p_i = day_0["weight"].mul(day_0["resp"]).mul(1).sum()
print(p_i)

#### No trades executed

In [None]:
# p_i for date 0 when no row is traded (action = 0 on each row).
p_i = day_0["weight"].mul(day_0["resp"]).mul(0).sum()
print(p_i)

Only trades with positive `resp`: this is the maximum p_i we can have for a day

In [None]:
# p_i for date 0 when only the rows with positive `resp` are traded.
# The mask is built from `day_0` itself so it shares the index of the other two
# operands; a mask taken from the full `df` would make pandas align the day-0
# rows against every row of the training data.
p_i = (day_0["weight"] * day_0["resp"] * (day_0["resp"] > 0)).sum()
print(p_i)

### t_i

#### $$ t = \frac{\sum p_i }{\sqrt{\sum p_i^2}} * \sqrt{\frac{250}{|i|}}, $$

where |i| is the number of unique dates in the test set. 250 seems to indicate a span of trading for a year. 

**So, I guess t is the annualized daily Sharpe ratio.**

Let's compute it over all the days in the training data

In [None]:
# Utility score when only the rows with positive `resp` are traded
# (action = 1 exactly where resp > 0).
n_days = df["date"].nunique()

# weight * resp * action per row, then summed per date in a single pass.
# Grouping on the actual date values avoids re-filtering the whole frame for
# every day and does not assume the dates are exactly 0 .. n_days - 1.
pi_arr = (
    (df["weight"] * df["resp"] * (df["resp"] > 0))
    .groupby(df["date"])
    .sum()
    .to_numpy()
)

t = (pi_arr.sum() / np.sqrt((pi_arr ** 2).sum())) * np.sqrt(250 / n_days)
print(f"t: {t}")

# t is clipped to the range [0, 6] before scaling the total return.
utility_score = np.minimum(np.maximum(t, 0), 6) * pi_arr.sum()
print(f"Utility score for {n_days} days: {utility_score}")

In [None]:
# Utility score when every row is traded (action = 1 everywhere).
n_days = df["date"].nunique()

# weight * resp per row, then summed per date in a single pass.
# Grouping on the actual date values avoids re-filtering the whole frame for
# every day and does not assume the dates are exactly 0 .. n_days - 1.
pi_arr = (df["weight"] * df["resp"]).groupby(df["date"]).sum().to_numpy()

t = (pi_arr.sum() / np.sqrt((pi_arr ** 2).sum())) * np.sqrt(250 / n_days)
print(f"t: {t}")

# t is clipped to the range [0, 6] before scaling the total return.
utility_score = np.minimum(np.maximum(t, 0), 6) * pi_arr.sum()
print(f"Utility score for {n_days} days: {utility_score}")

### Making a target

Since this is the maximum utility score we can achieve on the training data,

we can set action = 1 for all the rows which contribute to a higher utility score and 0 for the others.