In [1]:
import os
os.chdir('..')

from typing import Any, Dict, Optional

import pandas as pd
import numpy as np

import xtx.utils.dev_utils as dev_utils
from xtx.features.feature_extractor import FeatureExtractor
from xtx.modeling.runners import CrossValClassificationRunner, CrossValRunner
from xtx.modeling.stacking import RunnersStacking
from xtx.modeling.time_folds import TimeFolds

pd.set_option("display.max_columns", 100)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


logger = dev_utils.init_logger("logging/train.log")
data_path = 'data/data.pkl'

For this exercise we have a prediction problem.

The set-up is the following:

1 - For each point you have target variable "y", and 60 features (4 groups of 15 features representing the state of the order book at each point in time).

2 - Whenever there is a NaN in any of askRate,bidRate,askSize,bidSize, it means that the book is empty beyond that level (you can interpret it as the size being zero).

3 - Your forecast of y(k) can depend on askRate[:k+1,:], bidRate[:k+1,:], askSize[:k+1,:], bidSize[:k+1,:], i.e. all history including the current observation.

4 - The objective function is mean squared error, and you will be evaluated based on the out-of-sample performance of the model.

You can use the programming language/packages of your choice.

Ideally, your solution should have ‘train’ part that will use data.csv file for training and produce a model; and ‘predict’ part that will load data in the same format (but without ‘y’) and produce a prediction.

You'll have writing access to it so that once you have finished, you can upload your solution. There is no particular deadline, you can take your time doing this.

As for benchmark scores, on test dataset 0.17 correlation is very good, while 0.15 correlation is mediocre (can be achieved with reasonable linear model). (used correlation here just for optics, objective function is MSE)


# Data

In [2]:
feature_extractor = FeatureExtractor(data_path)
data = feature_extractor.data
data.style.set_properties(**{'background-color': 'red'}, subset=['askRate0'])
data.head()

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,...,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,y
0,1619.5,1620.0,1621.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20,27,11,14,35,10,1,10,13,-0.5
1,1619.5,1620.0,1621.0,1621.5,0.0,0.0,0.0,0.0,0.0,0.0,...,20,27,11,14,35,10,1,10,13,-0.5
2,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,...,20,27,11,14,35,10,1,10,13,-0.5
3,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,...,20,27,11,14,35,10,1,10,13,-0.5
4,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,...,20,27,11,14,35,10,1,10,13,-0.5


# Just look to random moments
- green is ask, blue is bid

In [10]:
from xtx.utils.modeling_utils import draw_moment_book
for _ in range(10):
    draw_moment_book(data.sample().iloc[0], 4)

rate,1603.500000,1603.000000,1602.500000,1602.000000,1601.500000,1601.000000,1600.500000,1600.000000
bid#,0,0,0,0,5,49,9,19
ask#,11,16,7,1,0,0,0,0


rate,1633.000000,1632.500000,1632.000000,1631.500000,1631.000000,1630.500000,1630.000000,1629.500000
bid#,0,0,0,0,1,7,15,5
ask#,4,15,8,3,0,0,0,0


rate,1602.500000,1602.000000,1601.500000,1601.000000,1600.500000,1600.000000,1599.500000,1599.000000
bid#,0,0,0,0,1,12,4,3
ask#,7,15,6,2,0,0,0,0


rate,1644.500000,1644.000000,1643.500000,1643.000000,1642.500000,1642.000000,1641.500000,1641.000000
bid#,0,0,0,0,1,6,5,4
ask#,7,4,4,4,0,0,0,0


rate,1640.500000,1640.000000,1639.500000,1639.000000,1638.500000,1638.000000,1637.500000,1637.000000
bid#,0,0,0,0,3,8,10,10
ask#,9,30,17,5,0,0,0,0


rate,1614.000000,1613.500000,1613.000000,1612.500000,1611.000000,1610.500000,1610.000000,1609.500000
bid#,0,0,0,0,24,1,25,2
ask#,3,1,23,3,0,0,0,0


rate,1673.500000,1673.000000,1672.500000,1672.000000,1671.500000,1671.000000,1670.500000,1670.000000
bid#,0,0,0,0,3,18,11,17
ask#,16,20,15,5,0,0,0,0


rate,1599.500000,1599.000000,1598.500000,1598.000000,1597.000000,1596.500000,1596.000000,1595.500000
bid#,0,0,0,0,10,14,12,10
ask#,6,11,9,6,0,0,0,0


rate,1569.000000,1568.500000,1568.000000,1567.500000,1567.000000,1566.500000,1566.000000,1565.500000
bid#,0,0,0,0,2,18,124,10
ask#,11,15,12,14,0,0,0,0


rate,1652.500000,1652.000000,1651.500000,1651.000000,1650.500000,1650.000000,1649.500000,1649.000000
bid#,0,0,0,0,2,24,17,12
ask#,10,34,13,26,0,0,0,0


# Define target

- Let's define the **Mid price** as average between largest bid and lower ask rates
- Let's see the target correlating with different statistics of this value.
- The Mid price 87-difference is highly correlated with our target!
- Our real task is to predict how the value of the average price will change after 87 steps.
- If we assume that each step is a millisecond, then we need to predict how the mid price will change in almost 0.1 seconds.
- 87 probably related to https://en.wikipedia.org/wiki/Black_Monday_(1987) :)

In [33]:
mid_price = -(data['askRate0'] + data['bidRate0']).diff(-87).iloc[:-87] / 2
np.corrcoef(mid_price, data['y'].iloc[:-87])

array([[1.        , 0.99913505],
       [0.99913505, 1.        ]])

## Extract Common Sense features

Once we know what we need to predict let's just build some common sense features:
- 'ask_rate_0' - just chippest sail price from the market
- 'mid_price' - something like price by market opinion on current moment. Also our target is change of this statistic.
- 'mid_price_log' - log for linear models 
- 'ask_len' - cumulative ask size for any price 
- 'bid_len' - cumulative bid size for any price
- 'wap0', - weighted average price aka 'macro price' by ask0 and bid0
- 'wap1' - weighted average price aka 'macro price' by ask1 and bid1
- 'len_ratio' - ask_len / bid_len
- 'volume_imbalance' - (bid_size - ask_size) / (ask_size + bid_size) by ask0 and bid0
- 'volume_imbalance_1' - (bid_size - ask_size) / (ask_size + bid_size) by ask1 and bid1
- 'volume_imbalance_2' - (bid_size - ask_size) / (ask_size + bid_size) by ask2 and bid2


Have time dependency:
- 'increased_ask_counts' - amount of ask rates where count increased by last step
- 'increased_ask_rank' - smallest rank of ask rates where count increased by last step
- 'decreased_ask_counts'- amount of ask rates where count decreased by last step
- 'decreased_ask_rank' - smallest rank of ask rates where count increased by last step
- 'increased_bid_counts' - amount of bid rates where count increased by last step
- 'increased_bid_rank' - smallest rank of bid rates where count increased by last step
- 'decreased_bid_counts' - amount of bid rates where count decreased by last step
- 'decreased_bid_rank' - smallest rank of bid rates where count decreased by last step

Inspired by:
- https://gist.github.com/sebjai/574d46939eb6efe9e38fe3dbe0a3dc32
- https://iextrading.com/docs/stoikov_micro-price.pdf

In [None]:
## Extract 

In [35]:
base_features = feature_extractor.get_base_features()
base_features

Unnamed: 0,ask_rate_0,mid_price,mid_price_log,ask_len,bid_len,wap0,wap1,len_ratio,volume_imbalance,volume_imbalance_1,volume_imbalance_2,increased_ask_counts,increased_ask_rank,decreased_ask_counts,decreased_ask_rank,increased_bid_counts,increased_bid_rank,decreased_bid_counts,decreased_bid_rank
0,1619.5,1617.25,7.389101,35,192,1618.937500,1617.000000,0.182292,0.750000,0.000000,-0.920000,0,15,0,15,0,15,0,15
1,1619.5,1617.25,7.389101,40,192,1618.937500,1617.000000,0.208333,0.750000,0.000000,-0.920000,1,3,0,15,0,15,0,15
2,1619.5,1617.25,7.389101,42,192,1618.937500,1617.000000,0.218750,0.750000,0.000000,-0.920000,1,4,0,15,0,15,0,15
3,1619.5,1617.25,7.389101,62,192,1618.937500,1617.000000,0.322917,0.750000,0.000000,-0.920000,1,4,0,15,0,15,0,15
4,1619.5,1617.25,7.389101,72,192,1618.937500,1617.000000,0.375000,0.750000,0.000000,-0.920000,1,4,0,15,0,15,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497661,1576.0,1575.50,7.362962,150,654,1575.769287,1576.000000,0.229358,0.538462,0.600000,0.512195,1,0,0,15,0,15,0,15
3497662,1576.0,1575.50,7.362962,149,654,1575.833374,1576.000000,0.227829,0.666667,0.600000,0.512195,0,15,1,0,0,15,0,15
3497663,1576.0,1575.50,7.362962,150,654,1575.769287,1576.000000,0.229358,0.538462,0.600000,0.512195,1,0,0,15,0,15,0,15
3497664,1576.0,1575.50,7.362962,169,654,1575.769287,1576.000000,0.258410,0.538462,0.600000,0.512195,6,3,5,4,0,15,0,15


In [36]:
base_features.columns

Index(['ask_rate_0', 'mid_price', 'mid_price_log', 'ask_len', 'bid_len',
       'wap0', 'wap1', 'len_ratio', 'volume_imbalance', 'volume_imbalance_1',
       'volume_imbalance_2', 'increased_ask_counts', 'increased_ask_rank',
       'decreased_ask_counts', 'decreased_ask_rank', 'increased_bid_counts',
       'increased_bid_rank', 'decreased_bid_counts', 'decreased_bid_rank'],
      dtype='object')

## Validation scheme or How to prevent overfitting?

### Intuition
Usually for time series better use time-depended hold-out validation because watching to the future is not honest and can lead to serious overfitting.
But in our case signal is so weak and almost dissapear after few time steps.
Also as I showed above in our feature engineering pipeline we don't use some deep time aggregations:
- we are not looking to the future
- we look into the past no more than 80 steps (maybe it's also space for improvent)

### Solution
Taking into account these facts, it was decided for validation to use Time Folds with minibatchs and hold-out test:
- hold-out test dataset selecting from last p percents of data, where p is customizable parameter
- whole dataset divided into minibatchs with fixed length
- each minibatch can be selected for train or test but only in its entirety (not acceptable when part of minibatch selected for train and other for validation)
- in case of kfold validation each minibatch correspond to some fold_id
- to prevent the possibility of the model looking into validation folds some percent of validation data cut out in the beginning and the end of minibatch  

### Example bellow:
- n_folds: number of folds for our metrics uncertainty estimation, possible models ensembling or stacking
- minifold_size: if we suppose that each step is 1ms - 10,000 steps is 10 sec
- neutral_ratio: ratio of data to skip in the beginning and ending of validation dataset
- test_ratio: hold our test dataset ratio
- test_neutral_ratio: ratio of data to skip in the beginning of test dataset (use only for training)

In [85]:
time_folds = TimeFolds(n_folds=5, minifold_size=10000, neutral_ratio=0.1, test_ratio=0.2, test_neutral_ratio=0.1,)
time_folds.fit(base_features, data.y)

In [86]:
train_data = time_folds.get_train_data(fold_id=0, include_minifolds=True)
print(f'Number of minifolds on train: {train_data.minifold.max()}')

minifold_starts = train_data.minifold.diff().fillna(1) > 0
print(f'Minifold start indexs: {train_data[minifold_starts].index.tolist()}')
train_data.head()

Number of minifolds on train: 279
Minifold start indexs: [10000, 20000, 30000, 40000, 60000, 70000, 80000, 90000, 110000, 120000, 130000, 140000, 160000, 170000, 180000, 190000, 210000, 220000, 230000, 240000, 260000, 270000, 280000, 290000, 310000, 320000, 330000, 340000, 360000, 370000, 380000, 390000, 410000, 420000, 430000, 440000, 460000, 470000, 480000, 490000, 510000, 520000, 530000, 540000, 560000, 570000, 580000, 590000, 610000, 620000, 630000, 640000, 660000, 670000, 680000, 690000, 710000, 720000, 730000, 740000, 760000, 770000, 780000, 790000, 810000, 820000, 830000, 840000, 860000, 870000, 880000, 890000, 910000, 920000, 930000, 940000, 960000, 970000, 980000, 990000, 1010000, 1020000, 1030000, 1040000, 1060000, 1070000, 1080000, 1090000, 1110000, 1120000, 1130000, 1140000, 1160000, 1170000, 1180000, 1190000, 1210000, 1220000, 1230000, 1240000, 1260000, 1270000, 1280000, 1290000, 1310000, 1320000, 1330000, 1340000, 1360000, 1370000, 1380000, 1390000, 1410000, 1420000, 1430

Unnamed: 0,ask_rate_0,mid_price,mid_price_log,ask_len,bid_len,wap0,wap1,len_ratio,volume_imbalance,volume_imbalance_1,volume_imbalance_2,increased_ask_counts,increased_ask_rank,decreased_ask_counts,decreased_ask_rank,increased_bid_counts,increased_bid_rank,decreased_bid_counts,decreased_bid_rank,minifold
10000,1619.0,1618.75,7.390027,180,88,1618.821411,1618.552612,2.045455,0.285714,-0.263158,-0.263158,0,15,0,15,1,1,0,15,1
10001,1619.0,1618.75,7.390027,178,88,1618.875,1618.552612,2.022727,0.5,-0.263158,-0.263158,0,15,1,0,0,15,0,15,1
10002,1619.0,1618.75,7.390027,177,88,1618.909058,1618.552612,2.011364,0.636364,-0.263158,-0.263158,0,15,1,0,0,15,0,15,1
10003,1619.0,1618.75,7.390027,177,88,1618.909058,1618.65625,2.011364,0.636364,-0.125,-0.363636,1,2,1,1,0,15,0,15,1
10004,1619.0,1618.75,7.390027,176,88,1618.949951,1618.65625,2.0,0.8,-0.125,-0.363636,0,15,1,0,0,15,0,15,1


In [87]:
valid_data = time_folds.get_valid_data(fold_id=0, include_minifolds=True)
print(f'Number of minifolds on valid: {valid_data.minifold.max()}')

minifold_starts = valid_data.minifold.diff().fillna(1) > 0
print(f'Minifold start indexs: {valid_data[minifold_starts].index.tolist()}')
valid_data.head()

Number of minifolds on valid: 275
Minifold start indexs: [1001, 51001, 101001, 151001, 201001, 251001, 301001, 351001, 401001, 451001, 501001, 551001, 601001, 651001, 701001, 751001, 801001, 851001, 901001, 951001, 1001001, 1051001, 1101001, 1151001, 1201001, 1251001, 1301001, 1351001, 1401001, 1451001, 1501001, 1551001, 1601001, 1651001, 1701001, 1751001, 1801001, 1851001, 1901001, 1951001, 2001001, 2051001, 2101001, 2151001, 2201001, 2251001, 2301001, 2351001, 2401001, 2451001, 2501001, 2551001, 2601001, 2651001, 2701001, 2751001]


Unnamed: 0,ask_rate_0,mid_price,mid_price_log,ask_len,bid_len,wap0,wap1,len_ratio,volume_imbalance,volume_imbalance_1,volume_imbalance_2,increased_ask_counts,increased_ask_rank,decreased_ask_counts,decreased_ask_rank,increased_bid_counts,increased_bid_rank,decreased_bid_counts,decreased_bid_rank,minifold
1001,1620.0,1619.75,7.390644,378,178,1619.533325,1619.75,2.123595,-0.866667,0.0,-0.714286,0,15,0,15,9,1,6,0,0
1002,1620.0,1619.75,7.390644,378,175,1619.533325,1619.666626,2.16,-0.866667,-0.111111,-0.714286,0,15,0,15,0,15,1,1,0
1003,1620.0,1619.75,7.390644,378,175,1619.5625,1619.666626,2.16,-0.75,-0.111111,-0.714286,0,15,0,15,1,0,1,7,0
1004,1620.0,1619.75,7.390644,378,174,1619.533325,1619.666626,2.172414,-0.866667,-0.111111,-0.714286,0,15,0,15,0,15,1,0,0
1005,1620.0,1619.75,7.390644,378,173,1619.533325,1619.666626,2.184971,-0.866667,-0.111111,-0.714286,0,15,0,15,3,8,5,7,0
