In [1]:
from pathlib import Path

from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GroupKFold
from optiver_features import *

In [2]:
# Aggregations requested per order-book column. The keys (wap1, wap2, ...) are
# names brought into scope by `from optiver_features import *`.
book_feature_dict = {
    wap1: [np.mean, np.std, 'nunique'],
    wap2: [np.mean, np.std],
    log_return1: [np.std],
    log_return2: [np.std],
    ask_spread: [np.mean, np.std],
    price_spread: [np.mean, np.std],
    total_volume: [np.mean, np.std],
}

# Aggregations requested per trade column.
trade_feature_dict = {
    log_return_price: [np.std, np.mean],
    'seconds_in_bucket': [np.size],
    'size': [np.sum],
    'order_count': [np.sum],
}

# The full (0, 600) window first, followed by six consecutive 100-wide windows.
time_windows = [(0, 600)] + [(start, start + 100) for start in range(0, 600, 100)]

# Per-window feature names that get aggregated again per time_id and stock_id.
agg_cols = [
    'log_return_price_std',
    'log_return1_std',
    'log_return2_std',
    'size_sum',
    'order_count_sum',
]
time_id_features = [
    f'{col}_{start}_{end}'
    for start, end in time_windows
    for col in agg_cols
]
time_id_aggregations = ['mean', 'std', 'min']

# The stock-level aggregation reuses the very same lists as the time-level one.
stock_id_features = time_id_features
stock_id_aggregations = time_id_aggregations

ofg = OptiverFeatureGenerator(
    book_feature_dict,
    trade_feature_dict,
    time_windows,
    time_id_features,
    time_id_aggregations,
    stock_id_features,
    stock_id_aggregations,
)


In [3]:
# The generated feature frame is cached on disk as a feather file. Reuse the
# cache when it exists; otherwise rebuild the frame with the feature generator
# and save it, so the notebook also runs top-to-bottom on a machine that does
# not have the cached file yet.
# NOTE(review): generate_train_df() is presumably expensive - confirm before
# deleting the cache file.
FEATURE_CACHE = 'train_351cols.feather'
if Path(FEATURE_CACHE).exists():
    train_df = pd.read_feather(FEATURE_CACHE)
else:
    train_df = ofg.generate_train_df()
    train_df.to_feather(FEATURE_CACHE)

In [4]:
# Replace every missing value in the feature frame with 0.
train_df = train_df.fillna(value=0)


In [5]:
# Cast the per-window `wap1_nunique_*` columns to float32. The column names are
# derived from `time_windows` instead of being listed by hand, so this cell
# cannot drift out of sync with the window configuration.
# NOTE(review): presumably the cast is there so cont_cat_split treats these
# count columns as continuous rather than categorical - confirm.
for start, end in time_windows:
    col = f'wap1_nunique_{start}_{end}'
    train_df[col] = train_df[col].astype(np.float32)

In [6]:
# Group-aware splitter; 5 folds is also the scikit-learn default.
kfold = GroupKFold(n_splits=5)


In [7]:
# Split the columns of train_df (excluding the 'target' dependent variable)
# into continuous and categorical name lists.
cont_nn, cat_nn = cont_cat_split(
    train_df,
    max_card=9000,
    dep_var='target',
)

In [8]:
# Drop the identifier columns from the categorical feature list. Filtering
# (rather than calling list.remove) keeps this cell safe to re-run: a second
# run is a no-op instead of raising ValueError, and no (None, None) tuple is
# echoed as cell output.
cat_nn = [col for col in cat_nn if col not in ('time_id', 'row_id')]

(None, None)

In [9]:
# Display the categorical columns that remain after removing the id columns.
cat_nn

['stock_id']

In [10]:
def rmspe(preds, targs):
    """Root mean squared percentage error.

    Computes sqrt(mean(((targs - preds) / targs) ** 2)) over all elements.
    The error is relative to `targs`, so `targs` must be non-zero for the
    result to be finite.
    """
    relative_error = (targs - preds) / targs
    mean_squared = relative_error.pow(2).mean()
    return mean_squared.sqrt()

In [21]:
def train_fold(trn_idx, val_idx, fname):
    """Train the tabular network on one cross-validation fold and save it.

    Reads the notebook-level `train_df`, `cat_nn` and `cont_nn`.

    Args:
        trn_idx: Row indices of `train_df` used for training.
        val_idx: Row indices of `train_df` used for validation.
        fname: Name handed to `save` for the trained model.

    Returns:
        The `value` of the learner's first recorded metric once fitting has
        finished (presumably the validation RMSPE of the last epoch - confirm).
    """
    fold_splits = [list(trn_idx), list(val_idx)]
    tab_data = TabularPandas(train_df, [Categorify, Normalize], cat_nn, cont_nn,
                             splits=fold_splits, y_names='target')
    loaders = tab_data.dataloaders(1024)

    model_config = {'lin_first': False, 'embed_p': .1, 'ps': [0, .4, .1]}
    learner = tabular_learner(
        loaders,
        y_range=(0, .1),
        layers=[800, 400, 200],
        config=model_config,
        n_out=1,
        loss_func=rmspe,
        metrics=AccumMetric(rmspe),
        opt_func=ranger,
    )
    learner.fit_flat_cos(30, 5e-3, wd=.2)
    learner.save(fname)
    return learner.recorder.metrics[0].value

In [22]:
# Train one model per fold, grouping rows by time_id so that a time_id never
# appears in both the training and validation part of the same fold.
res = []
fold_iter = kfold.split(train_df, groups=train_df.time_id)
for fold_no, (trn_idx, val_idx) in enumerate(fold_iter):
    # Saved model names are numbered from 0, printed fold numbers from 1.
    fold_score = train_fold(trn_idx, val_idx, f'tuned_{fold_no}')
    res.append(fold_score)
    print('fold', fold_no + 1, fold_score)
np.mean(res), res

epoch,train_loss,valid_loss,rmspe,time
0,0.281516,0.231522,0.235145,00:03
1,0.269182,0.308024,0.31473,00:03
2,0.25991,0.246854,0.249282,00:03
3,0.247678,0.228876,0.23183,00:03
4,0.244717,0.232611,0.235814,00:03
5,0.244125,0.223966,0.226567,00:03
6,0.234166,0.223567,0.225835,00:03
7,0.235046,0.222901,0.225362,00:03
8,0.241038,0.224083,0.229653,00:03
9,0.234901,0.22236,0.224928,00:03


fold 1 TensorBase(0.2209)


epoch,train_loss,valid_loss,rmspe,time
0,0.275395,0.311563,0.315203,00:03
1,0.26282,0.23558,0.237671,00:03
2,0.252615,0.231785,0.23439,00:03
3,0.244919,0.233676,0.238279,00:03
4,0.241398,0.22954,0.232695,00:03
5,0.239143,0.224583,0.232297,00:03
6,0.232087,0.236211,0.239182,00:03
7,0.236386,0.229647,0.23438,00:03
8,0.228343,0.23274,0.235999,00:03
9,0.228221,0.222779,0.228449,00:03


fold 2 TensorBase(0.2270)


epoch,train_loss,valid_loss,rmspe,time
0,0.280427,0.258523,0.264974,00:03
1,0.269019,0.252232,0.254674,00:03
2,0.264711,0.230016,0.232821,00:03
3,0.249257,0.228571,0.230356,00:03
4,0.243478,0.225821,0.227649,00:03
5,0.2446,0.216548,0.218501,00:03
6,0.238319,0.215574,0.217405,00:03
7,0.242529,0.218963,0.222341,00:03
8,0.232098,0.221724,0.223195,00:03
9,0.230302,0.215843,0.217435,00:03


fold 3 TensorBase(0.2151)


epoch,train_loss,valid_loss,rmspe,time
0,0.282513,0.281296,0.282774,00:03
1,0.264172,0.267296,0.273161,00:03
2,0.252598,0.230223,0.232043,00:03
3,0.252108,0.242944,0.244111,00:03
4,0.251977,0.223574,0.22566,00:03
5,0.235348,0.218999,0.221081,00:03
6,0.240142,0.218706,0.220915,00:03
7,0.237702,0.2173,0.219355,00:03
8,0.240988,0.233369,0.235401,00:03
9,0.234832,0.233693,0.238539,00:03


fold 4 TensorBase(0.2133)


epoch,train_loss,valid_loss,rmspe,time
0,0.280025,0.244757,0.248609,00:03
1,0.257517,0.222204,0.223896,00:03
2,0.255546,0.226088,0.227904,00:03
3,0.262454,0.228685,0.231033,00:03
4,0.249413,0.211676,0.213752,00:02
5,0.246076,0.22614,0.228104,00:02
6,0.244986,0.232149,0.23333,00:03
7,0.241172,0.216714,0.218319,00:03
8,0.237953,0.228701,0.231126,00:03
9,0.240352,0.245036,0.24644,00:02


fold 5 TensorBase(0.2127)


(0.2178067,
 [TensorBase(0.2209),
  TensorBase(0.2270),
  TensorBase(0.2151),
  TensorBase(0.2133),
  TensorBase(0.2127)])