In [1]:
# Automatically reload edited local modules before each cell executes,
# so changes to imported helper files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold
from optuna.integration import FastAIPruningCallback
import optuna

In [3]:
# Number of stock rows grouped under one time_id. The model reshapes every batch to
# (n_time_ids, STOCK_COUNT, n_features), so batches must hold a multiple of this many rows.
STOCK_COUNT = 112

### Hyperparameters - obtained through Optuna study and manual experiments

In [4]:
# Hyperparameters read by train(); each comment names the place where the value is consumed.
config = {
 'block_size': 512,   # width of the linear layers inside ParallelModel / ParallelBlock
 'bottleneck': 64,    # per-stock width used by TimeEncoding before stocks are concatenated
 'emb_p': 0.2,        # dropout probability applied to the stock embeddings
 'emb_size': 26,      # size of the learned embedding vector per stock
 'jit_std': 0.05,     # standard deviation of the Gaussian noise added by Jitter
 'lr': 0.009,         # learning rate passed to fit_flat_cos
 'mask_perc': 15,     # percentage of batch rows zeroed by MaskTfm
 'p0': 0.25,          # p0..p3: dropout probability of each of the four ParallelBlocks
 'p1': 0.25,
 'p2': 0.25,
 'p3': 0.25,
 'time_p0': 0,        # time_p0..time_p3: dropout probability inside each block's TimeEncoding
 'time_p1': 0.25,
 'time_p2': 0.25,
 'time_p3': 0.25,
 'wd': 0.18           # weight decay passed to the Learner
}

### Fast.ai dataloader
Extending fastai's `TabDataLoader` to add augmentations and to keep rows with the same `time_id` together while shuffling.

In [5]:
# Augmentation adding Gaussian noise
class Jitter(ItemTransform):
    """Add zero-mean Gaussian noise to the second element of a batch, in place.

    `jit_std` is the standard deviation of the noise. `split_idx` is set to 0,
    which matches the value `get_dls` assigns to the training dataset
    (presumably so the noise is applied to training batches only).
    """

    def __init__(self, jit_std):
        super().__init__()
        self.split_idx = 0
        self.jit_std = jit_std

    def encodes(self, b):
        # b[1] is perturbed in place; the remaining batch elements are untouched.
        conts = b[1]
        noise = torch.empty_like(conts).normal_(mean=0, std=self.jit_std)
        conts += noise
        b[1] = conts
        return b
    
# Augmentation masking random rows
class MaskTfm(ItemTransform):
    """Zero out a random `mask_perc` percent of the rows of every tensor in a batch.

    The same row indices are used for all batch elements, so a masked row is
    zeroed consistently across them. `split_idx` is set to 0, matching the value
    `get_dls` assigns to the training dataset.
    """

    def __init__(self, mask_perc):
        super().__init__()
        self.split_idx = 0
        self.mask_perc = mask_perc

    def mask(self, x, indices):
        """Set the rows of `x` selected by `indices` to zero (in place) and return `x`."""
        row_ids = torch.tensor(indices, device=x.device)
        x[row_ids] = 0
        return x

    def encodes(self, x):
        n_rows = len(x[0])
        # Integer percentage of the rows, rounded down.
        n_masked = (n_rows * self.mask_perc) // 100
        chosen = np.random.choice(np.arange(n_rows), n_masked, replace=False)
        return [self.mask(part, chosen) for part in x]

class MyDataLoader(TabDataLoader):
    """Tabular dataloader that augments batches and shuffles whole time_id groups.

    Parameters
    ----------
    dataset : the tabular dataset to load from.
    jit_std : standard deviation forwarded to `Jitter`.
    mask_perc : percentage of rows forwarded to `MaskTfm`.
    bs, shuffle, after_batch, num_workers, **kwargs : forwarded to `TabDataLoader`.
        When `after_batch` is None, the default pipeline is the `TransformBlock`
        batch transforms, then `ReadTabBatch`, then `Jitter` and `MaskTfm`.
    """

    def __init__(self, dataset, jit_std, mask_perc, bs=16, shuffle=False, after_batch=None, num_workers=0,  **kwargs):
        if after_batch is None:
            after_batch = (L(TransformBlock().batch_tfms)
                           + ReadTabBatch(dataset)
                           + [Jitter(jit_std), MaskTfm(mask_perc)])
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def shuffle_fn(self, idxs):
        """Shuffle the order of the time_id groups while keeping each group contiguous.

        The indices are viewed as rows of `STOCK_COUNT` consecutive entries and only
        the rows are permuted. `len(idxs)` must be a multiple of `STOCK_COUNT`,
        otherwise the reshape raises `ValueError`.
        """
        # Use the shared constant (previously a literal 112) so the grouping stays in
        # sync with the reshape performed by the model.
        groups = np.array(idxs).reshape(-1, STOCK_COUNT)
        np.random.shuffle(groups)  # permutes along the first axis only
        return groups.reshape(-1).tolist()

def get_dls(train_df, bs, trn_idx, val_idx, jit_std=.13, mask_perc=8):
    """Build the train/validation dataloaders for the tabular model.

    Parameters
    ----------
    train_df : DataFrame containing the features and a 'target' column.
    bs : number of time_ids per batch; each batch holds `bs * STOCK_COUNT` rows.
    trn_idx, val_idx : row indices of the training and validation splits.
    jit_std : standard deviation of the Gaussian jitter augmentation.
    mask_perc : percentage of batch rows zeroed by the masking augmentation.

    Returns
    -------
    The dataloaders produced by `TabularPandas.dataloaders`, with `split_idx`
    set to 0 on the training dataset and 1 on the validation dataset.
    """
    cont_nn, cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
    # Identifier columns are not used as categorical model inputs.
    cat_nn = [x for x in cat_nn if x not in ('row_id', 'time_id')]

    procs_nn = [Categorify, Normalize]
    to_nn = TabularPandas(train_df, procs_nn, cat_nn, cont_nn,
                          splits=[list(trn_idx), list(val_idx)], y_names='target')
    # Honour the `bs` argument (it used to be ignored in favour of a literal 112*100).
    # Batches must contain whole time_ids, hence the STOCK_COUNT multiple.
    dls = to_nn.dataloaders(bs=bs * STOCK_COUNT, shuffle=True, dl_type=MyDataLoader,
                            jit_std=jit_std, mask_perc=mask_perc)
    dls.train_ds.split_idx = 0
    dls.valid_ds.split_idx = 1
    return dls

### The model
It has four blocks, each of which first runs every stock separately through a linear layer, then concatenates all the stocks within a `time_id`, runs the result through another layer, and adds the output back to the per-stock values.

In [6]:
class TimeEncoding(nn.Module):
    """Mix information across all stocks of a time_id and add it back to each stock.

    Each stock is first projected from `inp_size` to `bottleneck` features; the
    projections of all `STOCK_COUNT` stocks are flattened together, mapped back to
    `inp_size` features, and the result is broadcast to every stock and added to
    the input, scaled by a learned scalar `multiplier` (initialised to 0.5).
    """

    def __init__(self, inp_size, bottleneck, p):
        super().__init__()
        self.multiplier = nn.Parameter(torch.tensor(.5))
        self.initial_layers = LinBnDrop(inp_size, bottleneck, act=nn.ReLU(True), p=p, bn=False)

        flat_width = bottleneck * STOCK_COUNT
        self.concat_layers = nn.Sequential(
            nn.BatchNorm1d(flat_width),
            nn.Linear(flat_width, inp_size),
            nn.Tanh(),
        )

    def forward(self, x):
        # x: (n_time_ids, STOCK_COUNT, inp_size)
        per_stock = self.initial_layers(x)
        # Flatten all stocks of one time_id into a single vector.
        flattened = per_stock.view(x.shape[0], -1)
        shared = self.concat_layers(flattened)
        # Broadcast the per-time_id summary back over the stock dimension.
        shared = shared.unsqueeze(1).expand_as(x)
        return x + shared * self.multiplier


class ParallelBlock(nn.Module):
    """Per-stock linear layer followed by a cross-stock `TimeEncoding`.

    The pipeline is Linear -> BatchNorm1d over the stock axis -> Dropout(p) ->
    ReLU -> TimeEncoding. When `do_skip` is true, the output is averaged with
    the block input.
    """

    def __init__(self, block_size, p, time_p, bottleneck, do_skip):
        super().__init__()
        self.do_skip = do_skip
        stages = [
            nn.Linear(block_size, block_size),
            nn.BatchNorm1d(STOCK_COUNT),
            nn.Dropout(p),
            nn.ReLU(True),
            TimeEncoding(block_size, bottleneck, time_p),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        out = self.layers(x)
        if not self.do_skip:
            return out
        # Skip connection: average of the block output and its input.
        return (out + x) / 2
    
class ParallelModel(nn.Module):
    """Model that processes all `STOCK_COUNT` stocks of a time_id together.

    Continuous features are concatenated with a learned per-stock embedding,
    passed through a Linear/BatchNorm/ReLU stem, one `ParallelBlock` per
    `(p, time_p)` pair, and a final Linear layer squashed into (0, 0.1) by
    `SigmoidRange`.

    The number of input rows must be a multiple of `STOCK_COUNT`, ordered so that
    each consecutive run of `STOCK_COUNT` rows belongs to one time_id.
    """

    def __init__(self, inp_size, emb_sz, block_size, ps, bottleneck, time_ps, embed_p, do_skip ):
        super().__init__()

        # One learned embedding vector per stock position, shared by all time_ids.
        self.embeds = nn.Parameter(torch.empty(STOCK_COUNT, emb_sz))
        torch.nn.init.normal_(self.embeds)
        self.embed_drop = nn.Dropout(embed_p)

        stem = [
            nn.Linear(inp_size + emb_sz, block_size),
            nn.BatchNorm1d(STOCK_COUNT),
            nn.ReLU(True),
        ]
        blocks = [
            ParallelBlock(block_size, p, time_p, bottleneck, do_skip)
            for p, time_p in zip(ps, time_ps)
        ]
        head = [nn.Linear(block_size, 1), SigmoidRange(0, .1)]
        self.layers = nn.Sequential(*stem, *blocks, *head)

    def forward(self, x_cat, x_cont): #x_cat is not used in this version
        n_times = x_cont.shape[0] // STOCK_COUNT
        # Repeat the embedding table for every time_id in the batch, then apply dropout.
        stock_emb = self.embed_drop(self.embeds.expand(n_times, *self.embeds.shape))
        feats = x_cont.view(n_times, STOCK_COUNT, -1)
        out = self.layers(torch.cat([feats, stock_emb], dim=2))
        # Back to one row per (time_id, stock) pair.
        return out.view(n_times * STOCK_COUNT, -1)

## Generate data

In [9]:
from optiver_features import *

In [10]:
def fill_missing(train_df):
    """Return a copy of `train_df` with one row for every (time_id, stock_id) pair.

    Pairs absent from the input are inserted, and every missing value in the
    result is replaced by 0. The input frame is not modified.
    """
    keys = ['time_id', 'stock_id']
    full_grid = pd.MultiIndex.from_product(
        [train_df.time_id.unique(), train_df.stock_id.unique()],
        names=keys,
    )
    return (
        train_df.copy()
        .set_index(keys)
        .reindex(full_grid)
        .reset_index()
        .fillna(0)
    )


def tauify(train_df):
    """Replace every column whose name contains 'sum' by sqrt(1 / (value + 1)).

    The frame is modified in place and also returned.
    """
    sum_columns = [name for name in train_df.columns if 'sum' in name]
    for name in sum_columns:
        train_df[name] = np.sqrt(1 / (train_df[name] + 1))
    return train_df

def post_process(train_df, time_windows,  do_tau):
    """Fill in missing (time_id, stock_id) rows and optionally apply `tauify`.

    `time_windows` is accepted but not used by this function.
    """
    filled = fill_missing(train_df)
    return tauify(filled) if do_tau else filled

In [11]:
# Aggregations requested for each order-book feature.
# NOTE(review): the keys (wap1, log_return1, ...) are bare names, presumably feature
# functions brought in by `from optiver_features import *` — confirm.
book_feature_dict = {
    wap1: [np.mean, np.std, 'nunique'],
    wap2: [np.mean, np.std],
    log_return1: [np.std],
    log_return2: [np.std],
    ask_spread: [np.mean, np.std],
    price_spread:[np.mean, np.std],
    total_volume:[np.mean, np.std],
}
# Aggregations requested for each trade feature; the string keys presumably name raw
# trade columns — confirm against OptiverFeatureGenerator.
trade_feature_dict = {
        log_return_price: [np.std, np.mean],
        'seconds_in_bucket':[np.size],
        'size':[np.sum],
        'order_count':[np.sum],
}

# (start, end) windows passed to the feature generator: the full 0-600 range plus six
# consecutive 100-wide slices. Presumably seconds within a bucket — TODO confirm.
time_windows = [(0,600), (0,100), (100,200), (200,300), (300,400), (400, 500), (500,600)]
# Feature generation is cached on disk: reuse the feather file when it exists,
# otherwise generate the training frame and save it for the next run.
cache_name = 'train_141cols.feather'
if Path(cache_name).exists():
    train_df = pd.read_feather(cache_name)
else:
    ofg = OptiverFeatureGenerator(book_feature_dict, trade_feature_dict, time_windows)
    train_df = ofg.generate_train_df()
    train_df.to_feather(cache_name)






In [12]:
# Complete the (time_id, stock_id) grid and apply the tau transform to the 'sum' columns.
# NOTE(review): this rebinds train_df to its own processed version, so re-running this cell
# applies the tau transform a second time; re-run the loading cell above first.
train_df = post_process(train_df, time_windows, True)

## Train

In [7]:
def rmspe(preds, targs):
    """Root mean squared percentage error, ignoring entries whose target is zero.

    Zero targets are excluded both to avoid division by zero and because rows
    zeroed by filling or masking carry a zero target.
    """
    keep = targs != 0
    kept_targs = targs.masked_select(keep)
    kept_preds = preds.masked_select(keep)
    rel_err = (kept_targs - kept_preds) / kept_targs
    return (rel_err ** 2).mean().sqrt()

In [8]:
def train( train_df, trn_idx=None, val_idx=None, save_as=None):
    """Train one `ParallelModel` using the hyperparameters in `config`.

    Parameters
    ----------
    train_df : processed training frame (must contain `time_id` and `target`).
    trn_idx, val_idx : row indices of the split; when `trn_idx` is None, the first
        `GroupKFold` split grouped by `time_id` is used for both.
    save_as : optional name under which the trained learner is saved.

    Returns
    -------
    The element at position 2 of the last row of `learn.recorder.values`
    (presumably the validation rmspe metric of the final epoch).
    """
    if trn_idx is None:
        trn_idx, val_idx = first(GroupKFold().split(train_df, groups = train_df.time_id))

    dls = get_dls(train_df, 100, trn_idx, val_idx,
                  jit_std=config['jit_std'], mask_perc=config['mask_perc'])

    # The config holds one dropout pair (p{i}, time_p{i}) for each of the four blocks.
    n_blocks = 4
    model = ParallelModel(
        inp_size=len(dls.cont_names),
        emb_sz=config['emb_size'],
        block_size=config['block_size'],
        ps=[config[f'p{i}'] for i in range(n_blocks)],
        bottleneck=config['bottleneck'],
        time_ps=[config[f'time_p{i}'] for i in range(n_blocks)],
        embed_p=config['emb_p'],
        do_skip=True,
    )
    learn = Learner(dls, model=model, loss_func=rmspe, metrics=AccumMetric(rmspe),
                    opt_func=ranger, wd=config['wd']).to_fp16()

    learn.fit_flat_cos(50, config['lr'])
    if save_as:
        learn.save(save_as)
    return L(learn.recorder.values).itemgot(2)[-1]
    

def train_cross_valid( train_df, save_as=None):
    """Train one model per `GroupKFold` fold (grouped by `time_id`) and average the scores.

    Parameters
    ----------
    train_df : processed training frame passed through to `train`.
    save_as : optional name prefix; fold `i` is saved as `f'{save_as}{i}'`.

    Returns
    -------
    The mean of the per-fold values returned by `train`.
    """
    splits = GroupKFold().split(train_df, groups = train_df.time_id)
    scores = []
    for idx, (trn_idx, val_idx) in enumerate(splits):
        # f-string instead of `+ str(idx)` so non-str path-like prefixes also work.
        fold_name = f'{save_as}{idx}' if save_as else None
        v = train(train_df, trn_idx, val_idx, fold_name)
        print(f'fold {idx}: {v}')
        scores.append(v)
    # Divide by the number of folds actually run instead of a hard-coded 5.
    return sum(scores) / len(scores)

In [13]:
# Run the grouped cross-validation; the value displayed last is the mean score over the folds.
# NOTE(review): the recorded tables below list 10 epochs per fold while train() calls
# fit_flat_cos(50, ...), and the printed fold scores differ from the last listed epoch —
# the saved output looks truncated or stale; re-run to refresh.
train_cross_valid(train_df)

epoch,train_loss,valid_loss,rmspe,time
0,4.534531,1.013496,1.013539,00:02
1,2.247297,0.997252,0.997252,00:01
2,1.569966,0.997906,0.997907,00:01
3,1.284184,0.973296,0.973299,00:01
4,1.139491,0.965903,0.965944,00:01
5,1.021317,0.918074,0.918117,00:01
6,0.801788,0.321331,0.32164,00:01
7,0.581703,0.247587,0.247607,00:01
8,0.454365,0.269891,0.269928,00:01
9,0.375487,0.257723,0.257852,00:01


fold 0: 0.21345092356204987


epoch,train_loss,valid_loss,rmspe,time
0,3.848418,0.998772,0.998771,00:01
1,2.004242,0.991601,0.991602,00:01
2,1.41447,0.802018,0.803283,00:01
3,0.993275,0.580724,0.59584,00:01
4,0.770235,0.529625,0.530277,00:01
5,1.122985,0.730691,0.734347,00:01
6,0.933124,0.732115,0.732832,00:01
7,0.783011,0.638464,0.639638,00:01
8,0.629196,0.418896,0.419288,00:01
9,0.559906,0.569542,0.570762,00:01


fold 1: 0.21631331741809845


epoch,train_loss,valid_loss,rmspe,time
0,4.190659,1.017263,1.017282,00:01
1,2.106608,0.938852,0.93889,00:01
2,1.420635,0.75064,0.750881,00:01
3,1.02951,0.520579,0.522331,00:01
4,0.787875,0.47164,0.474175,00:01
5,0.638473,0.492951,0.494961,00:01
6,0.543182,0.359556,0.360928,00:01
7,0.563737,0.699376,0.70533,00:01
8,0.669612,0.782597,0.783001,00:01
9,0.669406,0.679258,0.679858,00:01


fold 2: 0.21415720880031586


epoch,train_loss,valid_loss,rmspe,time
0,4.73622,0.999327,0.999327,00:01
1,2.318926,0.993969,0.993969,00:01
2,1.603034,0.998505,0.998505,00:01
3,1.30186,0.999325,0.999326,00:01
4,1.156708,0.993509,0.993509,00:01
5,1.081537,0.986581,0.986587,00:01
6,1.019661,0.999481,0.999481,00:01
7,0.934426,0.804431,0.804746,00:01
8,0.828435,0.563984,0.564906,00:01
9,0.694778,0.509335,0.5094,00:01


fold 3: 0.2167339026927948


epoch,train_loss,valid_loss,rmspe,time
0,4.007535,0.999969,0.999969,00:01
1,2.063947,0.99955,0.99955,00:01
2,1.486212,0.998683,0.998683,00:01
3,1.242823,0.995714,0.995715,00:01
4,1.12571,0.994275,0.994276,00:01
5,1.06445,0.988616,0.988617,00:01
6,1.006266,0.858625,0.858728,00:01
7,0.859245,0.674827,0.675752,00:01
8,0.645569,0.314393,0.314909,00:01
9,0.506109,0.336202,0.336374,00:01


fold 4: 0.2164805382490158


0.21542717814445494