In [1]:
import os
os.chdir('..')

import pandas as pd
import numpy as np
import matplotlib.pylab as plt

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
from tqdm.auto import tqdm

from xtx.modeling.time_folds import TimeFolds
from sklearn.linear_model import Ridge, Lasso
from xtx.modeling.evaluation import ridge_eval
from xtx.modeling.runners import CrossValRunner
from xtx.features.feature_extractor import FeatureExtractor 

N_SCALING = 2500000

def shrink_dtype(df: pd.DataFrame):
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float32)

MODEL_CONFIGS = {
    'default_ridge': {
        'model_module': 'sklearn.linear_model',
        'model_cls': 'Ridge',
        'model_params': {'alpha': 100}
    },
    'default_lasso': {
        'model_module': 'sklearn.linear_model',
        'model_cls': 'Lasso',
        'model_params': {'alpha': 0.01}
    },
    'default_lgbm': {
        'model_module': 'lightgbm',
        'model_cls': 'LGBMRegressor',
        'model_params': {
            'n_jobs': -1, 
            'num_leaves': 13, 
            'learning_rate': 0.01,
            'n_estimators': 500, 
            'reg_lambda': 1, 
            'colsample_bytree': 0.7, 
            'subsample': 0.05
        }
    },
}

  from IPython.core.display import display, HTML


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# data = pd.read_pickle('data/data.pkl').fillna(0)
# target_col = 'y'
# target = data[target_col]
# # data.info()

In [4]:
feature_extractor = FeatureExtractor('data/data.pkl')
data = feature_extractor.data
data.head()

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,askRate10,askRate11,askRate12,askRate13,askRate14,askSize0,askSize1,askSize2,askSize3,askSize4,askSize5,askSize6,askSize7,askSize8,askSize9,askSize10,askSize11,askSize12,askSize13,askSize14,bidRate0,bidRate1,bidRate2,bidRate3,bidRate4,bidRate5,bidRate6,bidRate7,bidRate8,bidRate9,bidRate10,bidRate11,bidRate12,bidRate13,bidRate14,bidSize0,bidSize1,bidSize2,bidSize3,bidSize4,bidSize5,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,y
0,1619.5,1620.0,1621.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10,24,0,0,0,0,0,0,0,0,0,0,0,0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7,10,1,10,20,3,20,27,11,14,35,10,1,10,13,-0.5
1,1619.5,1620.0,1621.0,1621.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10,24,5,0,0,0,0,0,0,0,0,0,0,0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7,10,1,10,20,3,20,27,11,14,35,10,1,10,13,-0.5
2,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10,24,5,2,0,0,0,0,0,0,0,0,0,0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7,10,1,10,20,3,20,27,11,14,35,10,1,10,13,-0.5
3,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10,24,5,22,0,0,0,0,0,0,0,0,0,0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7,10,1,10,20,3,20,27,11,14,35,10,1,10,13,-0.5
4,1619.5,1620.0,1621.0,1621.5,1622.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10,24,5,32,0,0,0,0,0,0,0,0,0,0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7,10,1,10,20,3,20,27,11,14,35,10,1,10,13,-0.5


In [5]:
base_features = feature_extractor.get_base_features()
base_features.head()

Unnamed: 0,ask_rate_0,mid_price,mid_price_log,ask_len,bid_len,wap0,wap1,len_ratio,volume_imbalance,volume_imbalance_1,volume_imbalance_2,increased_ask_counts,increased_ask_rank,decreased_ask_counts,decreased_ask_rank,increased_bid_counts,increased_bid_rank,decreased_bid_counts,decreased_bid_rank
0,1619.5,1617.25,7.389101,35,192,1618.9375,1617.0,0.182292,0.75,0.0,-0.92,0,15,0,15,0,15,0,15
1,1619.5,1617.25,7.389101,40,192,1618.9375,1617.0,0.208333,0.75,0.0,-0.92,1,3,0,15,0,15,0,15
2,1619.5,1617.25,7.389101,42,192,1618.9375,1617.0,0.21875,0.75,0.0,-0.92,1,4,0,15,0,15,0,15
3,1619.5,1617.25,7.389101,62,192,1618.9375,1617.0,0.322917,0.75,0.0,-0.92,1,4,0,15,0,15,0,15
4,1619.5,1617.25,7.389101,72,192,1618.9375,1617.0,0.375,0.75,0.0,-0.92,1,4,0,15,0,15,0,15


In [6]:
base_features = feature_extractor.get_base_features()

time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.15, test_ratio=0.25, test_neutral_ratio=0.1)
time_folds.fit(base_features, data.y)

cross_val_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_ridge'])
cross_val_runner.fit(verbose=True)

  0%|          | 0/5 [00:00<?, ?it/s]

|    | dataset   | metric_name   |   fold_0 |   fold_1 |   fold_2 |   fold_3 |   fold_4 |
|---:|:----------|:--------------|---------:|---------:|---------:|---------:|---------:|
|  0 | val       | mse           |    0.503 |    0.46  |    0.461 |    0.621 |    0.575 |
|  1 | val       | corr          |    0.139 |    0.164 |    0.138 |    0.125 |    0.121 |
|  2 | test      | mse           |    0.333 |    0.333 |    0.332 |    0.333 |    0.332 |
|  3 | test      | corr          |    0.151 |    0.151 |    0.151 |    0.151 |    0.152 |
        		Val  corr averaged: 0.137
        		Val   MSE averaged: 0.524
        		Test corr averaged: 0.151
        		Test  MSE averaged: 0.333
        ------------------------------------------------------------------
        		Averaged test  MSE: 0.333
        		Averaged test corr: 0.151
        


In [6]:
from xtx.feature_extractor import FeatureExtractor
fe = FeatureExtractor('data/data.pkl')
base_features_v2 = fe.get_base_features()

In [11]:
time_folds.fit(base_features_v2, data.y)

cross_val_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_ridge'])
cross_val_runner.fit(verbose=True)

  0%|          | 0/5 [00:00<?, ?it/s]

|    | dataset   | metric_name   |   fold_0 |   fold_1 |   fold_2 |   fold_3 |   fold_4 |
|---:|:----------|:--------------|---------:|---------:|---------:|---------:|---------:|
|  0 | val       | mse           |    0.503 |    0.46  |    0.461 |    0.62  |    0.575 |
|  1 | val       | corr          |    0.135 |    0.164 |    0.139 |    0.127 |    0.121 |
|  2 | test      | mse           |    0.459 |    0.443 |    0.452 |    0.482 |    0.469 |
|  3 | test      | corr          |    0.039 |    0.04  |    0.04  |    0.037 |    0.037 |
        		Val  corr averaged: 0.137
        		Val   MSE averaged: 0.524
        		Test corr averaged: 0.039
        		Test  MSE averaged: 0.461
        ------------------------------------------------------------------
        		Averaged test  MSE: 0.460
        		Averaged test corr: 0.038
        


In [16]:
%%time
base_features['minifold'] = np.arange(base_features.shape[0]) // 5
group = base_features.groupby('minifold')

CPU times: user 8.75 ms, sys: 5.42 ms, total: 14.2 ms
Wall time: 13 ms


In [6]:
features_list = []
for n_per_row in tqdm((5, 15, 50, 100)):
    current_features = pd.read_pickle(f'../new_artefacts/features_{n_per_row}.pkl')
    if n_per_row == 3:
        selected_cols = ['ask_flatten_mean_3', 'bid_flatten_mean_3', 'wap_flatten_3']
        drop_cols = [col for col in current_features.columns if col not in selected_cols]
    elif n_per_row < 10:
        drop_cols = [
            f'flatten_spread_{n_per_row}_median',
            f'ask_flatten_iqr_{n_per_row}',
            f'bid_flatten_iqr_{n_per_row}',
            f'ask_flatten_median_{n_per_row}',
            f'bid_flatten_median_{n_per_row}',
            f'ask_flatten_len_{n_per_row}',
            f'bid_flatten_len_{n_per_row}',
        ]
    elif n_per_row < 100:
        drop_cols = [f'ask_flatten_len_{n_per_row}', f'bid_flatten_len_{n_per_row}']
    else:
        drop_cols = []
        
    current_features.drop(drop_cols, axis=1, inplace=True)
    for col in current_features.columns:
        if current_features[col].dtype == np.float64:
            current_features[col] = current_features[col].astype(np.float32)
    features_list.append(current_features)
topk_features = pd.concat(features_list, axis=1)

# usecols = list(set(usecols))
topk_features.head()

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,ask_flatten_mean_5,ask_flatten_std_5,ask_flatten_skew_5,ask_flatten_kurtosis_5,bid_flatten_mean_5,bid_flatten_skew_5,bid_flatten_kurtosis_5,flatten_spread_5_mean,wap_flatten_5,bid_flatten_std_5,ask_flatten_mean_15,ask_flatten_median_15,ask_flatten_std_15,ask_flatten_iqr_15,ask_flatten_skew_15,ask_flatten_kurtosis_15,bid_flatten_mean_15,bid_flatten_median_15,bid_flatten_iqr_15,bid_flatten_skew_15,bid_flatten_kurtosis_15,flatten_spread_15_mean,flatten_spread_15_median,wap_flatten_15,bid_flatten_std_15,ask_flatten_mean_50,ask_flatten_median_50,ask_flatten_std_50,ask_flatten_iqr_50,ask_flatten_skew_50,ask_flatten_kurtosis_50,bid_flatten_mean_50,bid_flatten_median_50,bid_flatten_iqr_50,bid_flatten_skew_50,bid_flatten_kurtosis_50,flatten_spread_50_mean,flatten_spread_50_median,wap_flatten_50,ask_flatten_len_100,ask_flatten_mean_100,ask_flatten_median_100,ask_flatten_std_100,ask_flatten_iqr_100,ask_flatten_skew_100,ask_flatten_kurtosis_100,bid_flatten_len_100,bid_flatten_mean_100,bid_flatten_median_100,bid_flatten_iqr_100,bid_flatten_skew_100,bid_flatten_kurtosis_100,flatten_spread_100_mean,flatten_spread_100_median,wap_flatten_100,bid_flatten_std_100
0,1619.900024,0.2,-1.5,0.25,1615.0,0.0,-3.0,0.003034,1617.449951,0.0,1620.233276,1620.0,0.478423,0.5,0.779935,-0.835753,1614.466675,1614.0,1.0,0.133631,-1.982143,0.003572,0.003717,1617.349976,0.498888,1620.671387,1621.0,0.491976,1.0,-0.9077,-0.954024,1612.359985,1612.0,3.0,0.483242,-1.276247,0.005155,0.005583,1617.249023,35,1620.671387,1621.0,0.491976,1.0,-0.9077,-0.954024,100,1609.400024,1610.0,6.0,0.302392,-1.384458,0.007003,0.006832,1617.749146,3.209361
1,1619.900024,0.2,-1.5,0.25,1615.0,0.0,-3.0,0.003034,1617.449951,0.0,1620.233276,1620.0,0.478423,0.5,0.779935,-0.835753,1614.466675,1614.0,1.0,0.133631,-1.982143,0.003572,0.003717,1617.349976,0.498888,1620.775024,1621.0,0.535607,1.0,-0.740104,-0.663021,1612.359985,1612.0,3.0,0.483242,-1.276247,0.005219,0.005583,1617.035034,40,1620.775024,1621.0,0.535607,1.0,-0.740104,-0.663021,100,1609.400024,1610.0,6.0,0.302392,-1.384458,0.007068,0.006832,1617.525024,3.209361
2,1619.900024,0.2,-1.5,0.25,1615.0,0.0,-3.0,0.003034,1617.449951,0.0,1620.233276,1620.0,0.478423,0.5,0.779935,-0.835753,1614.466675,1614.0,1.0,0.133631,-1.982143,0.003572,0.003717,1617.349976,0.498888,1620.833374,1621.0,0.584183,0.75,-0.404726,-0.404813,1612.359985,1612.0,3.0,0.483242,-1.276247,0.005255,0.005583,1616.965088,42,1620.833374,1621.0,0.584183,0.75,-0.404726,-0.404813,100,1609.400024,1610.0,6.0,0.302392,-1.384458,0.007104,0.006832,1617.45166,3.209361
3,1619.900024,0.2,-1.5,0.25,1615.0,0.0,-3.0,0.003034,1617.449951,0.0,1620.233276,1620.0,0.478423,0.5,0.779935,-0.835753,1614.466675,1614.0,1.0,0.133631,-1.982143,0.003572,0.003717,1617.349976,0.498888,1621.02002,1621.0,0.685274,0.5,-0.258492,-0.673599,1612.359985,1612.0,3.0,0.483242,-1.276247,0.005371,0.005583,1616.689941,62,1621.209717,1621.0,0.72706,1.0,-0.500972,-0.770738,100,1609.400024,1610.0,6.0,0.302392,-1.384458,0.007338,0.006832,1616.689941,3.209361
4,1619.900024,0.2,-1.5,0.25,1615.0,0.0,-3.0,0.003034,1617.449951,0.0,1620.233276,1620.0,0.478423,0.5,0.779935,-0.835753,1614.466675,1614.0,1.0,0.133631,-1.982143,0.003572,0.003717,1617.349976,0.498888,1621.02002,1621.0,0.685274,0.5,-0.258492,-0.673599,1612.359985,1612.0,3.0,0.483242,-1.276247,0.005371,0.005583,1616.689941,72,1621.319458,1621.5,0.727942,1.0,-0.707885,-0.606647,100,1609.400024,1610.0,6.0,0.302392,-1.384458,0.007406,0.007143,1616.329956,3.209361


In [8]:
topk_usecols = [
    # 'ask_flatten_mean_5',
    'bid_flatten_mean_5',
    'wap_flatten_15',
    'ask_flatten_skew_50',
    'bid_flatten_mean_50',
    'bid_flatten_kurtosis_50',
    'ask_flatten_mean_100',
    'ask_flatten_iqr_100',
]

base_features = make_base_features(data)
from xtx.time_folds import TimeFolds
time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0., test_ratio=0.25, test_neutral_ratio=0.1)

x = pd.concat((base_features, topk_features.loc[:, topk_usecols]), axis=1)

# time_folds.fit(x, data.y)
# cross_val_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_ridge'])
# cross_val_runner.fit(verbose=True)

In [8]:
# time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0., test_ratio=0.25, test_neutral_ratio=0.1)
# time_folds.fit(x, data.y)

# cross_val_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_lgbm'])
# cross_val_runner.fit(verbose=True)

In [11]:
base_features = make_base_features(data)
from xtx.time_folds import TimeFolds
time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0., test_ratio=0.25, test_neutral_ratio=0.1)
# time_folds.fit(base_features, data.y)
# ridge_eval(time_folds, 3, 100)

base_features = pd.concat((base_features, topk_features.loc[:, topk_usecols]), axis=1)
for window in (3, 5, 10, 20, 40, 80):
    # ask bid size changes
    ask_size_diff = data[ask_size_cols].diff(window)
    bid_size_diff = data[bid_size_cols].diff(window)

    ask_size_diff.columns = np.arange(15)
    base_features['increased_ask_counts'] = (ask_size_diff> 0).sum(axis=1)
    base_features['decreased_ask_counts'] = (ask_size_diff < 0).sum(axis=1)

    # base_features[f'increased_ask_counts_{window}_volume'] = (ask_size_diff * (ask_size_diff > 0)).sum(1)
    # base_features[f'decreased_ask_counts_{window}_volume'] = (-ask_size_diff * (ask_size_diff < 0)).sum(1)
    # (ask_size_diff > 0) * ask_size_diff

    bid_size_diff.columns = np.arange(15)
    base_features['increased_bid_counts'] = (bid_size_diff > 0).sum(axis=1)
    base_features['decreased_bid_counts'] = (bid_size_diff < 0).sum(axis=1)    
    
    base_features[f'volume_imbalance_{window}'] = base_features['volume_imbalance'].diff(window)
    base_features[f'mid_price_log_{window}'] = base_features[f'mid_price_log'].diff(window)

for window in (40, 80):
    # base_features[f'mid_price_log_std_{window}'] = base_features[f'mid_price_log'].rolling(window).std()
    # base_features[f'mid_price_log_mean_diff_{window}'] = base_features[f'mid_price_log'].rolling(window).mean() - base_features[f'mid_price_log']
    base_features[f'mid_price_log_max_diff_{window}'] = base_features[f'mid_price_log'].rolling(window).max() - base_features[f'mid_price_log']
    # base_features[f'mid_price_log_std_{window}'] = base_features[f'mid_price_log'].diff(window).rolling(window).std()
    
for window in (10, 20, 40, 80):    
    base_features[f'wap0_{window}_mean'] = base_features['wap0'].rolling(window).mean()
    base_features[f'wap0_{window}_std'] = base_features['wap0'].rolling(window).std() #overfit?
    base_features[f'wap0_{window}_max'] = base_features['wap0'].rolling(window).max() #overfit?

    # base_features[f'wap1_{window}_mean'] = base_features['wap1'].rolling(window).mean()
    # base_features[f'wap1_{window}_std'] = base_features['wap1'].rolling(window).std() #overfit?
    # base_features[f'wap1_{window}_max'] = base_features['wap1'].rolling(window).max() #overfit?

    base_features[f'volume_imbalance_{window}_mean'] = base_features['volume_imbalance'].rolling(window).mean()
    base_features[f'volume_imbalance_{window}_max'] = base_features['volume_imbalance'].rolling(window).max()
    base_features[f'volume_imbalance_{window}_std'] = base_features['volume_imbalance'].rolling(window).std()
    base_features[f'volume_imbalance_{window}_skew'] = base_features['volume_imbalance'].rolling(window).skew()
    # base_features[f'volume_imbalance_{window}_iqr'] = base_features['volume_imbalance'].rolling(window).quantile(0.75) - \
        # base_features['volume_imbalance'].rolling(window).quantile(0.25)
    base_features[f'len_ratio_{window}_mean'] = base_features['len_ratio'].rolling(window).mean()
    base_features[f'len_ratio_{window}_std'] = base_features['len_ratio'].rolling(window).std()
    # base_features[f'len_ratio_{window}_max'] = base_features['len_ratio'].rolling(window).max()
    
    # base_features[f'len_ratio_{window}_mean_diff'] = base_features[f'len_ratio_{window}_mean'].diff(window)

# base_features['mid_price_rolling_max_100'] = base_features.mid_price.diff(87).rolling(500).std()
# base_features['mid_price_rolling_median_100'] = base_features.mid_price_log.rolling(87)
# base_features['mid_price_rolling_max_100'] = base_features.mid_price_log.rolling(1000).max() - base_features.mid_price_log
# base_features['mid_price_rolling_mean_10_diff'] = base_features.mid_price_log.rolling(400).mean().diff(10)
# base_features['mid_price_rolling_std_100_diff'] = base_features.mid_price_log.rolling(100).max()
# base_features['mid_price_rolling_std_10_diff'] = base_features.mid_price_log.rolling(400).std().diff(10)
# time_folds.fit(base_features, data.y)

# cross_val_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_ridge'])
# cross_val_runner.fit(verbose=True)

In [16]:
time_folds = TimeFolds(n_folds=10, minifold_size=60000, neutral_ratio=0., test_ratio=0.2, test_neutral_ratio=0.1)
time_folds.fit(base_features, data.y)

In [17]:
ridge_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_ridge'])
ridge_runner.fit(verbose=True)

  0%|          | 0/10 [00:00<?, ?it/s]

|    | dataset   | metric_name   |   fold_0 |   fold_1 |   fold_2 |   fold_3 |   fold_4 |   fold_5 |   fold_6 |   fold_7 |   fold_8 |   fold_9 |
|---:|:----------|:--------------|---------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|
|  0 | val       | mse           |    0.468 |    0.491 |    0.411 |    0.623 |    0.505 |    0.512 |    0.439 |    0.494 |    0.49  |    0.547 |
|  1 | val       | corr          |    0.151 |    0.156 |    0.17  |    0.136 |    0.156 |    0.158 |    0.162 |    0.139 |    0.16  |    0.126 |
|  2 | test      | mse           |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |    0.334 |
|  3 | test      | corr          |    0.16  |    0.16  |    0.16  |    0.162 |    0.16  |    0.16  |    0.16  |    0.16  |    0.162 |    0.16  |
        		Val  corr averaged: 0.152
        		Val   MSE averaged: 0.498
        		Test corr averaged: 0.160
        		Test  MSE av

In [18]:
lgbm_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_lgbm'])
lgbm_runner.fit(verbose=True)

  0%|          | 0/10 [00:00<?, ?it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16781
[LightGBM] [Info] Number of data points in the train set: 2374627, number of used features: 74
[LightGBM] [Info] Start training from score -0.000130
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l2: 0.474747
[100]	valid_0's l2: 0.472638
[150]	valid_0's l2: 0.47144
[200]	valid_0's l2: 0.470682
[250]	valid_0's l2: 0.470159
[300]	valid_0's l2: 0.46977
[350]	valid_0's l2: 0.46949
[400]	valid_0's l2: 0.469301
[450]	valid_0's l2: 0.469161
[500]	valid_0's l2: 0.469066
Did not meet early stopping. Best iteration is:
[500]	valid_0's l2: 0.469066
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16780
[LightGBM] [Info] Number of data points in the train set: 2376674, number of used features: 74
[LightGBM] [Info] 

In [23]:
current_lgbm = {
    'model_module': 'lightgbm',
    'model_cls': 'LGBMRegressor',
    'model_params': {
        'n_jobs': -1, 
        'num_leaves': 255, 
        'learning_rate': 0.01,
        'n_estimators': 2000, 
        'reg_lambda': 1, 
        'reg_alpha': 1,
        'colsample_bytree': 0.7, 
        'subsample': 0.05
    }
}
time_folds = TimeFolds(n_folds=10, minifold_size=60000, neutral_ratio=0., test_ratio=0.2, test_neutral_ratio=0.1)
time_folds.fit(base_features, data.y)
lgbm_runner = CrossValRunner(time_folds, **current_lgbm)
lgbm_runner.fit(verbose=True)

  0%|          | 0/10 [00:00<?, ?it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16781
[LightGBM] [Info] Number of data points in the train set: 2374627, number of used features: 74
[LightGBM] [Info] Start training from score -0.000130
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l2: 0.473226
[100]	valid_0's l2: 0.470574
[150]	valid_0's l2: 0.469298
[200]	valid_0's l2: 0.468512
[250]	valid_0's l2: 0.468087
Early stopping, best iteration is:
[271]	valid_0's l2: 0.467952
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16780
[LightGBM] [Info] Number of data points in the train set: 2376674, number of used features: 74
[LightGBM] [Info] Start training from score 0.000054
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l2: 0.496603
[100]	valid_0's l2: 0.493799
[1

In [24]:
current_lgbm_v2 = {
    'model_module': 'lightgbm',
    'model_cls': 'LGBMRegressor',
    'model_params': {
        'n_jobs': -1, 
        'num_leaves': 255, 
        'learning_rate': 0.01,
        'n_estimators': 2000, 
        'reg_lambda': 1, 
        'reg_alpha': 1,
        'colsample_bytree': 0.7, 
        'subsample': 0.3
    }
}

lgbm_runner_v2 = CrossValRunner(time_folds, **current_lgbm_v2)
lgbm_runner_v2.fit(verbose=True)

  0%|          | 0/10 [00:00<?, ?it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16781
[LightGBM] [Info] Number of data points in the train set: 2374627, number of used features: 74
[LightGBM] [Info] Start training from score -0.000130
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l2: 0.473226
[100]	valid_0's l2: 0.470574
[150]	valid_0's l2: 0.469298
[200]	valid_0's l2: 0.468512
[250]	valid_0's l2: 0.468087
Early stopping, best iteration is:
[271]	valid_0's l2: 0.467952
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16780
[LightGBM] [Info] Number of data points in the train set: 2376674, number of used features: 74
[LightGBM] [Info] Start training from score 0.000054
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l2: 0.496603
[100]	valid_0's l2: 0.493799
[1

In [21]:
from sklearn.metrics import mean_squared_error
predicted = (ridge_runner.averaged_test * 0.6 + cross_val_runner.averaged_test * 0.3 + lasso_runner.averaged_test * 0.1)
corr_score = np.corrcoef(predicted, ridge_runner.report.test_target)[0,1]
mse_score = mean_squared_error(predicted, ridge_runner.report.test_target)
mse_score, corr_score

(0.33092325076038726, 0.16676269522399212)

In [None]:
lasso_runner = CrossValRunner(time_folds, **MODEL_CONFIGS['default_lasso'])
lasso_runner.fit(verbose=True)

In [59]:
train_meta = np.vstack((cross_val_runner.oof, ridge_runner.oof, lasso_runner.oof)).T[:time_folds.train_size]
nan_idxs = np.isnan(train_meta).any(1)
meta_model = Ridge(alpha=1)
train_meta_target = time_folds.target[:time_folds.train_size]
train_meta_target = train_meta_target[~nan_idxs]

meta_model.fit(train_meta[~nan_idxs], train_meta_target)

test_meta = np.vstack((ridge_runner.averaged_test, cross_val_runner.averaged_test, lasso_runner.averaged_test)).T
test_target = ridge_runner.report.test_target
predicted_test = meta_model.predict(test_meta)

corr_score = np.corrcoef(predicted_test, test_target)[0,1]
mse_score = mean_squared_error(predicted_test, test_target)
mse_score, corr_score


(0.3319893397593852, 0.16341332428995836)

In [55]:
nan_idxs

array([ True,  True,  True, ..., False, False, False])

In [49]:
oof_x = cross_val_runner.oof[:time_folds.train_size]
nan_idxs = np.isnan(oof_x)
oof_x = oof_x[~nan_idxs]

oof_target = time_folds.target[:time_folds.train_size]
oof_target = oof_target[~nan_idxs]
np.corrcoef(oof_x, oof_target)

array([[1.        , 0.13624127],
       [0.13624127, 1.        ]])

In [41]:
oof_x[~nan_idxs]

NameError: name 'nan_idxs' is not defined

In [62]:
# base_features['progressive_ask_count'] = np.dot(data[ask_size_cols].values, np.arange(1, 16)[::-1]) / 15
# base_features['progressive_bid_count'] = np.dot(data[bid_size_cols].values, np.arange(1, 16)[::-1]) / 15
# base_features[['progressive_ask_count', 'progressive_bid_count']]

In [63]:
# base_features['ask_rate0_changed'] = data.askRate0.diff() 
# base_features['bid_rate0_changed'] = data.bidRate0.diff() 

# base_features

In [57]:
base_features = make_base_features(data)
base_columns = base_features.columns.tolist()

In [58]:
topk_features[col]

0          1615.0
1          1615.0
2          1615.0
3          1615.0
4          1615.0
            ...  
3497661    1575.0
3497662    1575.0
3497663    1575.0
3497664    1575.0
3497665    1575.0
Name: bid_flatten_mean_5, Length: 3497666, dtype: float64

In [76]:
time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)
usecols = base_columns

time_folds.fit(base_features, data.y)
best_score, _ = ridge_eval(time_folds, 3, 100)
for col in topk_features.columns:
    base_features[col] = topk_features[col]
    usecols.append(col)
    
    time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)
    time_folds.fit(base_features.loc[:, usecols], data.y)
    mse_score, test_mse_score = ridge_eval(time_folds, 3, 100, verbose=False)
    if mse_score < best_score:
        best_score = mse_score
        print('=' * 50)
        print(col, best_score)
    else:
        usecols = usecols[:-1]
        print(col, mse_score)

Init correlation: 0.1428
Init MSE: 0.6428
Init test correlation: 0.1502
Init test MSE: 0.3312
ask_flatten_mean_5 0.3321552019960154
ask_flatten_std_5 0.33209220243242804
ask_flatten_skew_5 0.332031915654447
ask_flatten_kurtosis_5 0.6428091322447211
bid_flatten_mean_5 0.6427935603490048
bid_flatten_skew_5 0.3320685692075931
bid_flatten_kurtosis_5 0.3320037473032549
flatten_spread_5_mean 0.33210796339473503
wap_flatten_5 0.6427678402254664
bid_flatten_std_5 0.6423377632419085
ask_flatten_mean_10 0.33212082073242516
ask_flatten_median_10 0.3320779180775223
ask_flatten_std_10 0.3320004297867703
ask_flatten_iqr_10 0.3318913539396287
ask_flatten_skew_10 0.3319032966209914
ask_flatten_kurtosis_10 0.6422866813282984
bid_flatten_mean_10 0.33192402578218994
bid_flatten_median_10 0.3318660697421667
bid_flatten_iqr_10 0.6421884359271357
bid_flatten_skew_10 0.6421483776155109
bid_flatten_kurtosis_10 0.6420894123513163
flatten_spread_10_mean 0.331882226990282
flatten_spread_10_median 0.3318300180100

In [77]:
usecols

['ask_rate_0',
 'mid_price',
 'mid_price_log',
 'ask_len',
 'bid_len',
 'wap0',
 'wap1',
 'len_ratio',
 'volume_imbalance',
 'increased_ask_counts',
 'decreased_ask_counts',
 'increased_bid_counts',
 'decreased_bid_counts',
 'ask_flatten_len_5',
 'ask_flatten_len_5',
 'ask_flatten_len_5',
 'ask_flatten_kurtosis_5',
 'bid_flatten_mean_5',
 'wap_flatten_5',
 'bid_flatten_std_5',
 'ask_flatten_kurtosis_10',
 'bid_flatten_iqr_10',
 'bid_flatten_skew_10',
 'bid_flatten_kurtosis_10',
 'wap_flatten_10',
 'bid_flatten_std_10',
 'ask_flatten_skew_15',
 'bid_flatten_kurtosis_15',
 'wap_flatten_15',
 'ask_flatten_median_50',
 'ask_flatten_skew_50',
 'bid_flatten_mean_50',
 'bid_flatten_kurtosis_50',
 'ask_flatten_mean_100',
 'ask_flatten_std_100',
 'ask_flatten_iqr_100',
 'ask_flatten_skew_100',
 'bid_flatten_len_100',
 'bid_flatten_skew_100']

In [80]:
from xtx.time_folds import TimeFolds


time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)
time_folds.fit(base_features[base_columns], data.y)
ridge_eval(time_folds, 3, 100)

Init correlation: 0.1404
Init MSE: 0.6430
Init test correlation: 0.1520
Init test MSE: 0.3322


(0.6430450248193251, 0.3321552019960154)

In [108]:
from xtx.time_folds import TimeFolds


time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)
time_folds.fit(base_features[usecols], data.y)
ridge_eval(time_folds, 3, 100)

Init correlation: 0.1527
Init MSE: 0.6407
Init test correlation: 0.1537
Init test MSE: 0.3325


(0.6407272279996943, 0.3324579275214507)

['ask_rate_0',
 'mid_price',
 'mid_price_log',
 'ask_len',
 'bid_len',
 'wap0',
 'wap1',
 'len_ratio',
 'volume_imbalance',
 'increased_ask_counts',
 'decreased_ask_counts',
 'increased_bid_counts',
 'decreased_bid_counts',
 'bid_flatten_mean_5',
 'wap_flatten_10',
 'ask_flatten_skew_15',
 'bid_flatten_kurtosis_15',
 'wap_flatten_15',
 'ask_flatten_skew_50',
 'bid_flatten_mean_50',
 'bid_flatten_kurtosis_50',
 'ask_flatten_mean_100',
 'ask_flatten_iqr_100',
 'bid_flatten_skew_100']

In [27]:
# merged_features = pd.concat((base_features.reset_index(drop=True), topk_features.reset_index(drop=True)), axis=1)
# merged_features.shape
# merged_features.sample(5)

In [28]:
from xtx.time_folds import TimeFolds


time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)


In [29]:
time_folds.fit(base_features, data.y)
ridge_eval(time_folds, 3, 100)

Init correlation: 0.1381
Init MSE: 0.6434
Init test correlation: 0.1515
Init test MSE: 0.3319


In [41]:
time_folds.fit(topk_features, data.y)
ridge_eval(time_folds, 3, 100)

Init correlation: 0.1356
Init MSE: 0.6437
Init test correlation: 0.1419
Init test MSE: 0.3314


In [7]:
# base_features['ask_size_top3cols_ratio'] = data[ask_size_cols[:3]].sum(1) / base_features['ask_len']
# base_features['bid_size_top3cols_ratio'] = data[bid_size_cols[:3]].sum(1) / base_features['bid_len']




In [41]:
from xtx.time_folds import TimeFolds

time_folds = TimeFolds(n_folds=5, minifold_size=60000, neutral_ratio=0.1, test_ratio=0.2)
time_folds.fit(base_features, data.y)

fold_id = 3
train_data = time_folds.get_train_data(fold_id)
train_target = time_folds.get_train_target(fold_id)
valid_data = time_folds.get_valid_data(fold_id)
valid_target = time_folds.get_valid_target(fold_id)
test_data = time_folds.get_test_data()
test_target = time_folds.get_test_target()

print(train_data.shape, valid_data.shape, test_data.shape)

(2258133, 26) (431991, 26) (699533, 26)


In [9]:
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

train_target = train_target.loc[train_data.index]
valid_target = valid_target.loc[valid_data.index]
test_target = test_target.loc[test_data.index]

In [10]:
from sklearn.preprocessing import RobustScaler, StandardScaler
scaler = RobustScaler().fit(train_data)
train_data_norm = scaler.transform(train_data)
valid_data_norm = scaler.transform(valid_data)
test_data_norm = scaler.transform(test_data.fillna(0))

In [11]:
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error


In [12]:
model = Ridge(alpha=100)

model.fit(train_data_norm, train_target)
predicted = model.predict(valid_data_norm)
corr_score = np.corrcoef(predicted, valid_target)[0,1]
mse_score = mean_squared_error(predicted, valid_target)

test_predicted = model.predict(test_data_norm)
test_corr_score = np.corrcoef(test_predicted, test_target)[0,1]
test_mse_score = mean_squared_error(test_predicted, test_target)
print(f'Init correlation: {corr_score:.6f}')
print(f'Init MSE: {mse_score:.6f}')
print(f'Init test correlation: {test_corr_score:.6f}')
print(f'Init test MSE: {test_mse_score:.6f}')

Init correlation: 0.141755
Init MSE: 0.642837
Init test correlation: 0.151614
Init test MSE: 0.332012
