In [1]:
from fastai.vision.all import *
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold

In [2]:
# Directory that later cells read `train_with_wap.csv` and `torch_data.pth` from.
PATH = Path('../input/optiver-realized-volatility-prediction')





In [3]:
# Load the feature table from a Feather file in the current working directory.
train_ftrs = pd.read_feather('train_24cols.feather')

In [4]:
# Attach the `offset` column stored in train_with_wap.csv. Only that column is
# parsed (`usecols`) instead of loading the whole CSV just to discard the rest.
# NOTE(review): the assignment aligns on index labels, so it assumes both tables
# share the same row index/order — TODO confirm.
train_ftrs['offset'] = pd.read_csv(PATH/'train_with_wap.csv', usecols=['offset'])['offset']

In [5]:
# Missing feature values are replaced by 0 before any further processing.
train_ftrs = train_ftrs.fillna(0)

# Bucket `seconds_in_bucket_size_0_600` into a categorical feature: the exact
# values 0, 1 and 2 each get their own label, everything else shares 'more'.
# All labels are strings so the column keeps one consistent dtype; mixing int
# and str labels in an object column makes the categories impossible to sort
# or to serialise (e.g. back to Feather).
train_ftrs['trade_seconds'] = 'more'
for val in range(3):
    train_ftrs.loc[train_ftrs.seconds_in_bucket_size_0_600 == val, 'trade_seconds'] = str(val)

In [6]:
%%time
torch_data = torch.load(PATH/'torch_data.pth')



means, stds = torch_data.mean(dim=0), torch_data.std(dim=0)
print(means, stds)
torch_data = (torch_data - means) / stds

tensor([9.9968e-01, 1.0003e+00, 7.6999e-04, 7.6673e-04, 9.9948e-01, 1.0005e+00,
        9.5934e-04, 9.2822e-04]) tensor([0.0037, 0.0037, 0.0054, 0.0050, 0.0037, 0.0037, 0.0067, 0.0057])
CPU times: user 14.8 s, sys: 14.7 s, total: 29.5 s
Wall time: 2min 3s


In [7]:
class ReadBatch(ItemTransform):
    """Turn a tabular batch into `(cats, conts, book_window, target)` tensors.

    In addition to the usual categorical / continuous / target tensors, each
    row's `offset` is used to look up its 600x8 window in the module-level
    `torch_data` tensor.
    """
    def encodes(self, to):
        # `offset` is a row position in the flat tensor; integer-dividing by
        # 600 gives the index of the 600-row window that row belongs to.
        window_idx = torch.tensor(to['offset'].to_numpy()).long() // 600
        windows = torch_data.view(-1, 600, 8)
        # (batch, 600, 8) -> (batch, 8, 600)
        book = windows[window_idx, :, :].permute(0, 2, 1)
        batch = (tensor(to.cats).long(), tensor(to.conts).float(), book, tensor(to.targ))
        if to.device is None:
            return batch
        return to_device(batch, to.device)

In [8]:
# Take only the first GroupKFold fold as the train/validation split; grouping
# by `time_id` keeps all rows of one time_id on the same side of the split.
group_kfold = GroupKFold()
fold_iter = group_kfold.split(train_ftrs, groups=train_ftrs.time_id)
trn_idx, val_idx = first(fold_iter)

In [9]:
# Show the column names currently present in the feature table.
train_ftrs.columns

Index(['log_return2_std_0_600', 'stock_id', 'row_id', 'time_id', 'target',
       'log_return_price_std_0_600', 'order_count_sum_0_600',
       'seconds_in_bucket_size_0_600', 'size_sum_0_600',
       'log_return1_std_0_600_min_time', 'log_return1_std_0_600_mean_time',
       'log_return1_std_0_600_min_stock', 'log_return1_std_0_600_mean_stock',
       'log_return1_std_0_600', 'log_return1_std_200_600',
       'log_return1_std_400_600', 'price_spread_mean_0_600',
       'log_return_price_std_0_600_mean_time',
       'log_return_price_std_200_600_mean_time',
       'log_return_price_std_400_600_mean_time',
       'log_return_price_std_0_600_min_time',
       'log_return_price_std_200_600_min_time',
       'log_return_price_std_400_600_min_time', 'total_volume_mean_0_600',
       'offset', 'trade_seconds'],
      dtype='object')

In [35]:
# Columns retained for modelling; every other column of `train_ftrs` is dropped.
cols_to_keep = [
    'stock_id',
    'row_id',
    'time_id',
    'target',
    'log_return_price_std_0_600',
    'order_count_sum_0_600',
    'seconds_in_bucket_size_0_600',
    'size_sum_0_600',
    'log_return1_std_0_600_min_time',
    'log_return1_std_0_600_mean_time',
    'log_return1_std_0_600_min_stock',
    'log_return1_std_0_600_mean_stock',
    'log_return_price_std_0_600_mean_time',
    'log_return_price_std_200_600_mean_time',
    'log_return_price_std_400_600_mean_time',
    'log_return_price_std_0_600_min_time',
    'log_return_price_std_200_600_min_time',
    'log_return_price_std_400_600_min_time',
    'total_volume_mean_0_600',
    'offset',
    'trade_seconds',
]

train_ftrs = train_ftrs.loc[:, cols_to_keep]

In [36]:
# Split the columns into continuous / categorical names (target excluded).
cont_nn, cat_nn = cont_cat_split(train_ftrs, max_card=9000, dep_var='target')
# `offset` is read by ReadBatch as a lookup key, so it is not fed to the model
# as a continuous feature.
cont_nn.remove('offset')
# row_id and time_id are excluded from the categorical inputs.
excluded_cats = ['row_id', 'time_id']
cat_nn = [name for name in cat_nn if name not in excluded_cats]

In [38]:
# Preprocessing applied by TabularPandas, then batches of 1024 rows that are
# converted to tensors (including the book window) by ReadBatch.
procs_nn = [Categorify, FillMissing, Normalize]
splits = [list(trn_idx), list(val_idx)]
to_nn = TabularPandas(
    train_ftrs,
    procs=procs_nn,
    cat_names=cat_nn,
    cont_names=cont_nn,
    y_names='target',
    splits=splits,
)
dls = to_nn.dataloaders(bs=1024, after_batch=ReadBatch)

In [39]:
class ResnetModel(nn.Module):
    """Placeholder module exposing an (initially empty) `conv_layers` slot."""
    def __init__(self):
        # nn.Module must be initialised first: it creates the internal
        # parameter/buffer/sub-module registries. Without this call the object
        # can be constructed but `parameters()`, `state_dict()` or later
        # assigning a layer to `conv_layers` do not work.
        super().__init__()
        self.conv_layers = None

In [46]:
class ConvModel(nn.Module):
    """Network combining categorical embeddings, continuous features and conv features.

    The three feature groups are concatenated along dim 1 and passed through a
    stack of `LinBnDrop` layers ending in a single output squashed by
    `SigmoidRange`.

    Args:
        emb_szs: iterable of `(n_categories, embedding_width)` pairs, one per
            categorical column (same order as the columns of `x_cat`).
        n_cont: number of continuous input features.
        layer_sizes: widths of the hidden linear layers.
        conv_layers: module applied to the raw input; its output is
            concatenated on dim 1 and must have `n_conv_out` features.
        embed_p: dropout probability applied to the concatenated embeddings.
        ps: dropout probability for each hidden layer (one per `layer_sizes` entry).
        n_conv_out: number of features produced by `conv_layers` (default 20).
        conv_p: dropout probability applied to the conv features (default 0.1).
        y_range: `(low, high)` bounds passed to `SigmoidRange` (default `(0, 0.1)`).

    Raises:
        ValueError: if `ps` and `layer_sizes` differ in length.
    """
    def __init__(self, emb_szs, n_cont, layer_sizes, conv_layers, embed_p, ps,
                 n_conv_out=20, conv_p=.1, y_range=(0, 0.1)):
        super().__init__()
        ps, layer_sizes = list(ps), list(layer_sizes)
        # zip() below would silently drop layers on a length mismatch, so fail loudly.
        if len(ps) != len(layer_sizes):
            raise ValueError(f"ps has {len(ps)} entries but layer_sizes has {len(layer_sizes)}")
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.conv_layers = conv_layers
        self.conv_drop = nn.Dropout(conv_p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        # Input width of the head = embeddings + continuous + conv features.
        sizes = [n_emb + n_cont + n_conv_out] + layer_sizes + [1]
        actns = [nn.ReLU() for _ in range(len(sizes)-2)] + [None]
        last = len(actns) - 1
        # The final layer has no batchnorm, no dropout and no activation.
        layers = [LinBnDrop(sizes[i], sizes[i+1], bn=(i != last), p=p, act=a, lin_first=True)
                  for i,(p,a) in enumerate(zip(ps+[0.], actns))]
        layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*layers)

    def forward(self, x_cat, x_cont, x_raw):
        # One embedding lookup per categorical column, concatenated feature-wise.
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x_cont = self.bn_cont(x_cont)
        x_conv = self.conv_layers(x_raw)
        x_conv = self.conv_drop(x_conv)
        x = torch.cat([x, x_cont, x_conv], 1)
        return self.layers(x)

In [41]:
class ResBlock(nn.Module):
    """Residual block: conv -> BN -> ReLU -> conv -> BN, plus the input, then ReLU.

    Both convolutions map `ch` channels to `ch` channels with kernel size 5 and
    replicate padding of 2, so the sequence length is unchanged and the skip
    connection can be added directly.
    """
    def __init__(self, ch):
        super().__init__()
        conv_kwargs = dict(kernel_size=5, padding=2, padding_mode='replicate')
        stages = [
            nn.Conv1d(ch, ch, **conv_kwargs),
            nn.BatchNorm1d(ch),
            nn.ReLU(),
            nn.Conv1d(ch, ch, **conv_kwargs),
            nn.BatchNorm1d(ch),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        # Skip connection first, non-linearity afterwards.
        return F.relu(self.layers(x) + x)

In [47]:
# Reuse the `conv_layers` sub-module of a previously saved model.
# NOTE(review): torch.load unpickles a complete model object here, which can
# execute arbitrary code — only load checkpoints from a trusted source.
pretrained_model = torch.load('models/conv_model.pth', map_location='cpu')
conv_layers = pretrained_model.conv_layers
del pretrained_model  # only the conv stack is kept

In [48]:
def split_2way(model):
    """Split `model` into two parameter groups for the learner.

    Group 0 holds the `conv_layers` parameters; group 1 holds everything else
    that is trainable: the linear head (`layers`), the embeddings (`embeds`)
    and the batchnorm over continuous inputs (`bn_cont`). `bn_cont` has to be
    listed explicitly because parameters missing from every group are never
    handed to the optimizer.
    """
    body = params(model.conv_layers)
    head = params(model.layers) + params(model.embeds) + params(model.bn_cont)
    return L(body, head)
def rmspe(preds, targs):
    """Root mean squared percentage error: sqrt(mean(((targs - preds) / targs)**2))."""
    rel_err = (targs - preds) / targs
    return rel_err.pow(2).mean().sqrt()

In [49]:
# (vocabulary size, embedding width) per categorical input, in the order
# stock_id, trade_seconds.
emb_widths = {'stock_id': 10, 'trade_seconds': 3}
emb_sizes = [(len(dls.train.classes[name]), width) for name, width in emb_widths.items()]
n_cont = len(dls.cont_names)

# Head architecture and regularisation.
layer_sizes = [200,100,100]
embed_p = .2
ps = [.1,.1,0]

model = ConvModel(emb_szs=emb_sizes, n_cont=n_cont, layer_sizes=layer_sizes,
                  conv_layers=conv_layers, embed_p=embed_p, ps=ps)

# RMSPE is used both as the training loss and as the reported metric.
learn = Learner(dls, model, loss_func=rmspe, splitter=split_2way,
                metrics=AccumMetric(rmspe))

In [50]:
# 3 frozen epochs followed by 70 unfrozen epochs, base learning rate 5e-3;
# `lr_mult` sets the learning-rate ratio between the parameter groups.
learn.fine_tune(epochs=70, base_lr=5e-3, freeze_epochs=3, lr_mult=10)

epoch,train_loss,valid_loss,rmspe,time
0,2.461885,0.970326,0.99342,00:06
1,0.287343,0.254659,0.259033,00:06
2,0.294754,0.254282,0.256688,00:06


epoch,train_loss,valid_loss,rmspe,time
0,0.247198,0.247194,0.250201,00:08
1,0.246132,0.240204,0.242984,00:08
2,0.25012,0.242845,0.245445,00:08
3,0.241874,0.234216,0.236897,00:08
4,0.250913,0.242715,0.24525,00:08
5,0.241679,0.240764,0.244151,00:08
6,0.241102,0.236172,0.238868,00:08
7,0.244014,0.241145,0.243819,00:08
8,0.246248,0.238286,0.242008,00:08
9,0.241779,0.266978,0.271143,00:08
