In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
import scipy.stats
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import statsmodels.api as sm
import xgboost as xgb

from tqdm import tqdm
import os

# 0. Create Environment

In [None]:
# Competition API: the environment hands out the test batches (iter_test) and
# receives the predictions (env.predict) in the final loop of this notebook.
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# 1. Data Preparation

## Read the data

Use datatable to read the data faster:

In [None]:
# Parse the training CSV with datatable's fread, then convert the result to a
# pandas DataFrame in one chained expression (no intermediate frame is kept).
train = dt.fread('../input/jane-street-market-prediction/train.csv').to_pandas()


In [None]:
# Keep only the rows whose weight is non-zero.
has_weight = train['weight'].ne(0)
train = train.loc[has_weight]

## Manage missing values

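First, check the shape of the dataset. The raw file has 2 390 491 rows and 138 columns; the shape printed below is smaller because zero-weight rows were dropped above.
Second, check whether there are any missing values: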

In [None]:
# Report the frame's dimensions, then list only the columns that contain at
# least one missing value together with their missing-value counts.
print(train.shape)
missing_val_count_by_column = train.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

All missing values occur in feature columns, never in the target variable (resp), which is among the first 10 columns. Good.

Notably, some features have many missing values; let's fill them with the column means:

In [None]:
# Impute every missing value with the mean of its column.
# The filled frame is assigned back instead of being mutated in place, and the
# mean is restricted to numeric columns so that a non-numeric column (should one
# ever appear) cannot make the call fail.
train = train.fillna(train.mean(numeric_only=True))

## Define key variables/matrices

Define an action variable and an exact list of features:

In [None]:
# Binary target: 1 when resp is strictly positive, 0 otherwise.
train['action'] = train['resp'].gt(0).astype('int')

# Every column whose name contains the substring 'feature', in frame order.
features = list(filter(lambda name: 'feature' in name, train.columns))

Finally, we will use all features (X) to predict resp (y) and make an action:

In [None]:
# Design matrix: all feature columns, feature_0 (the buying/selling indicator)
# included. Alternative kept for reference, dropping feature_0:
#X = train[features[1:]]
X = train.loc[:, features]

# Target vector: the binary action column.
y = train['action']

Other potentially useful stuff:

In [None]:
# Names of the four auxiliary response columns.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4']

# Plain NumPy views of the columns used repeatedly further down.
weights = train['weight'].to_numpy()
resp = train['resp'].to_numpy()
resps_rest = train[resp_cols].to_numpy()
dates = train['date'].to_numpy()
f0 = train['feature_0'].to_numpy()

# Candidate sample weights: weight scaled by |resp| and by |resp| squared.
abs_resp = np.abs(resp)
weights_resp = weights * abs_resp
weights_resp_sq = weights * (abs_resp ** 2)

# 2. Data exploration

## Return's visualization

I will have a look at the different returns: resp and the other responses (resp_1 to resp_4).

In [None]:
# Line plot of the whole resp series.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(resp)
ax.set_title('Returns [resp]')
ax.grid(True)
plt.show()

Not very informative, let's look into a subset:

In [None]:
# Same plot restricted to the first 500 observations.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(resp[0:500])
ax.set_title('Returns [resp]')
ax.grid(True)
plt.show()

Ok, now we can observe some volatility clustering.

Let's look into other returns (resps), the same subset from the beginning of the sample:

In [None]:
# First 500 observations of resp_1 .. resp_4, one line per column of resps_rest.
fig, ax = plt.subplots(figsize=(15, 7))
for col_idx in range(4):
    ax.plot(resps_rest[0:500, col_idx])
ax.legend(['resp_1','resp_2','resp_3','resp_4'])
ax.set_title('Returns [resp_1, resp_2, resp_3, resp_4]')
ax.grid(True)
plt.show()

There is not so much difference visible this way. However, resp_4 looks more volatile than resp_1. 

Finally, let's compare resp with resp_1 and resp_4 over an even smaller period for the sake of clarity:

In [None]:
# First 300 observations of resp, resp_1 (column 0) and resp_4 (column 3).
fig, ax = plt.subplots(figsize=(15, 7))
for series in (resp[0:300], resps_rest[0:300, 0], resps_rest[0:300, 3]):
    ax.plot(series)
ax.legend(['resp','resp_1','resp_4'])
ax.set_title('Returns [resp, resp_1, resp_4]')
ax.grid(True)
plt.show()

We see that resp is often somewhere between resp_1 and resp_4 in terms of volatility. Probably, resp was generated as an average of resp_1-resp_4.


Let's look at the distribution of returns, it looks like normal, but with excess mass near 0 and probably heavy tails:

In [None]:
# Histogram of resp using 400 bins.
fig, ax = plt.subplots(figsize=(15, 7))
ax.hist(resp, bins=400)
ax.set_title('Returns [resp]')
ax.grid(True)
plt.show()

Let's look at the estimate of the kurtosis:


In [None]:
# Sample kurtosis of resp.
# NOTE(review): with scipy's default arguments this is presumably the excess
# kurtosis (0 for a normal distribution) — confirm against the scipy docs.
resp_kurtosis = scipy.stats.kurtosis(resp)
print(resp_kurtosis)

Indeed, it is very different from the kurtosis of the normal distribution.

## ACF and PACF plots

I will again use a subsample of the data (otherwise the computations take too much time) to plot the autocorrelation and partial autocorrelation functions for returns and squared returns:

In [None]:
resp_sq = resp ** 2


def _decorate_axes(*axes):
    """Fade all four spines to alpha 0.3 and set tick labels to size 12."""
    for axis in axes:
        for side in ("top", "bottom", "right", "left"):
            axis.spines[side].set_alpha(.3)
        axis.tick_params(axis='both', labelsize=12)


# One figure per correlogram type (ACF first, then PACF). In each figure the
# left panel uses the returns and the right panel the squared returns; both are
# computed on the first 100000 observations with 10 lags and alpha=0.01.
for plot_correlogram in (sm.graphics.tsa.plot_acf, sm.graphics.tsa.plot_pacf):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5), dpi=80)
    plot_correlogram(resp[0:100000], ax=ax1, lags=10, alpha=0.01)
    plot_correlogram(resp_sq[0:100000], ax=ax2, lags=10, alpha=0.01)
    _decorate_axes(ax1, ax2)
    plt.show()

Overall, we see that the returns show almost no serial correlation, while the squared returns certainly show some. So there is serial (nonlinear) dependence in the returns that can be exploited.


# 3. Prepare for Validation
I would like to split the sample into training and validation portions. Unfortunately, for time series the standard random split is not appropriate. scikit-learn provides a TimeSeriesSplit class
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html), but applying it would take more training time than I am ready to allocate right now.

Thus, for now let's just split manually into two parts, keeping the order of observations, 70%/30%:

In [None]:
# 70% of the 1981287 observations; the printed product motivates the rounded
# cut-off below. Rows before t_split go to training, the remaining rows to validation.
n_obs, train_share = 1981287, 0.7
print(n_obs * train_share)
t_split = 1386900

In [None]:
# Mean and variance of the target on each side of the split, followed by a
# two-sample t-test comparing the two parts.
y_head = y[0:t_split].values
y_tail = y[t_split:1981287].values
for part in (y_head, y_tail):
    print([round(np.mean(part), 3), round(np.var(part), 3)])
print(scipy.stats.ttest_ind(y_head, y_tail))

The means are statistically different, so I have to come up with a better splitting scheme later.

# 4. Baseline Model

### Sample split

In [None]:
# Chronological split at t_split, by position (the frame's index labels are not
# contiguous after the zero-weight rows were filtered out, so .iloc is used to
# make the positional intent explicit).
X_train = X.iloc[:t_split] # to be changed later
y_train = y.iloc[:t_split]

# Validation starts exactly at t_split: the previous `t_split+1` start silently
# left row t_split out of both the training and the validation part.
X_val = X.iloc[t_split:]
y_val = y.iloc[t_split:]

# Candidate sample weights for the training rows (NumPy arrays, same positions).
weights_resp_train = weights_resp[:t_split]
weights_resp_sq_train = weights_resp_sq[:t_split]

In [None]:
# Baseline gradient-boosted tree classifier; the hyper-parameters are gathered
# in a dict first so they can be read (and compared) at a glance.
base_boost_tree_params = dict(
    n_estimators=500,
    max_depth=10,
    gamma=0.5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=-999,
    random_state=7,
    tree_method='gpu_hist',
)
base_boost_tree = xgb.XGBClassifier(**base_boost_tree_params)
# --> results in 0.4838

In [None]:
# Train the baseline classifier on the training part of the split only.
base_boost_tree.fit(X_train, y_train)

### Evaluate Prediction Quality

In [None]:
# Predict the action for the validation rows.
y_pred = base_boost_tree.predict(X_val)

# mean_absolute_error expects (y_true, y_pred); the arguments were previously
# passed the other way round. The value is unchanged because the absolute error
# is symmetric. For 0/1 labels it equals the share of misclassified rows.
base_boost_tree_mae = mean_absolute_error(y_val, y_pred)
print(base_boost_tree_mae)

# 5. Baseline+-

1) Decrease the number of estimators from 500 to 50:

In [None]:
#base_boost_tree_1 = xgb.XGBClassifier(
#    n_estimators=50,
#    max_depth=10,
#    gamma=0.5,
#    learning_rate=0.05,
#    subsample=0.8,
#    colsample_bytree=0.8,
#    missing=-999,
#    random_state=7,
#    tree_method='gpu_hist'  
#)

In [None]:
#base_boost_tree_1.fit(X_train, y_train)

In [None]:
#y_pred = base_boost_tree_1.predict(X_val)
#base_boost_tree_1_mae = mean_absolute_error(y_pred, y_val)
#print(base_boost_tree_1_mae) 

This change results in minor decrease of MAE but takes much less time to train.

2) Increase gamma from 0.5 to 0.7:

In [None]:
# Variant 2: 50 estimators and gamma raised to 0.7.
base_boost_tree_2_params = dict(
    n_estimators=50,
    max_depth=10,
    gamma=0.7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=-999,
    random_state=7,
    tree_method='gpu_hist',
)
base_boost_tree_2 = xgb.XGBClassifier(**base_boost_tree_2_params)

In [None]:
#base_boost_tree_2.fit(X_train, y_train)
#y_pred = base_boost_tree_2.predict(X_val)
#base_boost_tree_2_mae = mean_absolute_error(y_pred, y_val)
#print(base_boost_tree_2_mae) 

3) Change subsample from 0.8 to 0.7:

In [None]:
# Variant 3: 50 estimators, gamma 0.7 and subsample lowered to 0.7.
base_boost_tree_3_params = dict(
    n_estimators=50,
    max_depth=10,
    gamma=0.7,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.8,
    missing=-999,
    random_state=7,
    tree_method='gpu_hist',
)
base_boost_tree_3 = xgb.XGBClassifier(**base_boost_tree_3_params)

In [None]:
#base_boost_tree_3.fit(X_train, y_train)
#y_pred = base_boost_tree_3.predict(X_val)
#base_boost_tree_3_mae = mean_absolute_error(y_pred, y_val)
#print(base_boost_tree_3_mae) 

# 6. Baseline XGBClassifier with sample weights
(Python reports that the weights are unused, so this step is skipped.)

In [None]:
# XGBoost classifier intended for the sample-weighted fit: 500 estimators,
# depth 11 and subsample 0.9.
weighted_XGB_params = dict(
    n_estimators=500,
    max_depth=11,
    gamma=0.5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    missing=-999,
    random_state=7,
    tree_method='gpu_hist',
)
weighted_XGB = xgb.XGBClassifier(**weighted_XGB_params)

In [None]:
#weighted_XGB.fit(X_train, y_train, sample_weight = weights_resp_train)

In [None]:
#y_pred = weighted_XGB.predict(X_val)
#weighted_XGB_mae = mean_absolute_error(y_pred, y_val)
#print(weighted_XGB_mae) # -> 0.4845

 # 7. LGBMClassifier with sample weights

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier configured like the XGBoost baseline.
# The keyword arguments gamma, missing and tree_method that were passed here
# before are XGBoost options, not LightGBM parameters: LightGBM only reported
# them as unknown and ignored them, so dropping them leaves the model unchanged
# while removing the misleading settings and the warnings.
weighted_LGBM = LGBMClassifier(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.05,
    # NOTE(review): per the LightGBM docs row subsampling is only active when
    # subsample_freq > 0 — confirm whether bagging is actually intended here.
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=7,
)
# TODO: add num_leaves; if the effect of XGBoost's gamma is wanted, the LightGBM
# counterpart is min_split_gain.


In [None]:
# Train the LightGBM classifier on the training part of the split.
# No sample_weight is passed in this call, despite the estimator's name.
weighted_LGBM.fit(X_train, y_train)

In [None]:
# Predict the action for the validation rows.
y_pred = weighted_LGBM.predict(X_val)

# mean_absolute_error expects (y_true, y_pred); the arguments were previously
# passed the other way round. The value is unchanged because the absolute error
# is symmetric. For 0/1 labels it equals the share of misclassified rows.
weighted_LGBM_mae = mean_absolute_error(y_val, y_pred)
print(weighted_LGBM_mae)

In [None]:
#for s_weights in [weights_resp_train, weights_resp_sq_train]:
#    weighted_LGBM.fit(X_train, y_train, sample_weight=s_weights)
#    y_pred = weighted_LGBM.predict(X_val)
#    weighted_LGBM_mae = mean_absolute_error(y_pred, y_val)
#    print(weighted_LGBM_mae)

# Making Predictions

### Whole Sample Model Fitting

In [None]:
# Choose the estimator used for the submission. This binds a second name to the
# same estimator object; it does not create a copy.
# Alternative kept for reference:
#final_model = base_boost_tree_3
final_model = weighted_LGBM

In [None]:
# Fit the chosen estimator on the whole sample (training and validation rows
# together) before it is used on the test batches.
final_model.fit(X, y)

In [None]:
# Column means of the training features. The training frame was mean-imputed
# before X was built, so these are the values its gaps were filled with (up to
# floating-point rounding); filling with a column's own mean leaves that mean unchanged.
feature_means = X.mean()

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Select the training feature columns by name, in training order, and
    # impute missing values the same way as in training. Previously the raw
    # test rows (NaNs included) were passed to a model fitted on imputed data.
    X_test = test_df[features].fillna(feature_means)
    y_preds = final_model.predict(X_test)
    sample_prediction_df.action = y_preds
    # Hand the filled-in prediction frame back to the competition environment.
    env.predict(sample_prediction_df)