In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold
from optuna.integration import FastAIPruningCallback
import optuna

In [30]:
# Number of rows that make up one time_id group. BN and TimeEncoding reshape a batch to
# (n_groups, STOCK_COUNT * features), so every batch must contain a multiple of STOCK_COUNT rows.
# Presumably equal to the number of distinct stock_id values in the training data -- TODO confirm.
STOCK_COUNT = 112



In [51]:
def fill_missing(train_df):
    """Return a copy of `train_df` holding one row for every (time_id, stock_id) pair.

    The pairs are the Cartesian product of the time_id and stock_id values observed in
    the input (in order of first appearance). Pairs that were absent are inserted, and
    every missing value in the result -- including all columns of the inserted rows --
    is replaced with 0. The input frame itself is not modified.
    """
    key_cols = ['time_id', 'stock_id']
    full_grid = pd.MultiIndex.from_product(
        [train_df.time_id.unique(), train_df.stock_id.unique()],
        names=key_cols,
    )
    keyed = train_df.copy().set_index(key_cols)
    return keyed.reindex(full_grid).reset_index().fillna(0)

def subtract_windows(df, time_windows):
    """Overwrite each sub-window column with (full-window value - sub-window value).

    For every (start, end) in `time_windows[1:]`, each column whose name ends with
    '{start}_{end}' is replaced by the column with the same prefix and the suffix
    '0_600', minus itself. `time_windows[0]` is skipped. `df` is mutated in place
    and also returned.
    """
    for start, end in time_windows[1:]:
        suffix = f'{start}_{end}'
        for col in df.columns:
            if not col.endswith(suffix):
                continue
            # Same feature prefix, measured over the hard-coded 0-600 reference window.
            reference_col = col[:len(col) - len(suffix)] + '0_600'
            df[col] = df[reference_col] - df[col]
    return df

def append_trade_count(train_df, time_windows):
    """Add a 'number_trades_{start}_{end}' column for every window, in place.

    The new column holds 0, 1 or 2 where 'seconds_in_bucket_size_{start}_{end}' equals
    that value, and the string 'more' for every other row. Returns the mutated frame.
    """
    for start, end in time_windows:
        size_col = f'seconds_in_bucket_size_{start}_{end}'
        count_col = f'number_trades_{start}_{end}'
        # Default label first; exact small counts are written over it below.
        train_df[count_col] = 'more'
        for n_trades in (0, 1, 2):
            train_df.loc[train_df[size_col] == n_trades, count_col] = n_trades
    return train_df

def tauify(train_df):
    """Replace every column whose name contains 'sum' with sqrt(1 / (value + 1)), in place.

    Columns without 'sum' in their name are left untouched. Returns the mutated frame.
    """
    for col in train_df.columns:
        if 'sum' not in col:
            continue
        shifted = train_df[col] + 1
        train_df[col] = np.sqrt(1 / shifted)
    return train_df

def post_process(train_df, time_windows, do_subtract, do_append, do_tau):
    """Fill missing (time_id, stock_id) rows, then apply the optional feature transforms.

    `fill_missing` always runs first. After that, `subtract_windows`,
    `append_trade_count` and `tauify` are applied in that order, each only when its
    flag (`do_subtract`, `do_append`, `do_tau`) is truthy. Returns the processed frame.
    """
    processed = fill_missing(train_df)
    optional_steps = (
        (do_subtract, lambda frame: subtract_windows(frame, time_windows)),
        (do_append, lambda frame: append_trade_count(frame, time_windows)),
        (do_tau, tauify),
    )
    for enabled, step in optional_steps:
        if enabled:
            processed = step(processed)
    return processed

In [32]:
class Jitter(ItemTransform):
    """Batch transform that adds zero-mean Gaussian noise to the second element of a batch.

    `split_idx` is set to 0; in fastai this is meant to restrict a transform to the
    training subset (get_dls tags the training dataset with split_idx 0).
    """

    def __init__(self, jit_std):
        super().__init__()
        self.split_idx = 0
        self.jit_std = jit_std  # standard deviation of the added noise

    def encodes(self, b):
        # b[1] is presumably the continuous-feature tensor from ReadTabBatch -- TODO confirm.
        noise = torch.empty_like(b[1]).normal_(0, self.jit_std)
        b[1] += noise
        return b

class MaskTfm(ItemTransform):
    """Batch transform that zeroes a random `mask_perc` percent of the rows of a batch.

    The same randomly chosen row positions are zeroed in every element of the batch.
    `split_idx` is set to 0; in fastai this is meant to restrict a transform to the
    training subset (get_dls tags the training dataset with split_idx 0).
    """

    def __init__(self, mask_perc):
        super().__init__()
        self.split_idx = 0
        self.mask_perc = mask_perc  # percentage (0-100) of rows to zero out

    def mask(self, x, indices):
        """Zero the rows of tensor `x` at positions `indices` in place and return `x`."""
        rows = torch.tensor(indices, device=x.device)
        x[rows] = 0
        return x

    def encodes(self, x):
        batch_len = len(x[0])
        n_masked = (batch_len * self.mask_perc) // 100
        # Draw the row positions once so every element of the batch is masked consistently.
        chosen = np.random.choice(np.arange(batch_len), n_masked, replace=False)
        masked = []
        for part in x:
            masked.append(self.mask(part, chosen))
        return masked

class MyDataLoader(TabDataLoader):
    """Tabular data loader that augments batches and shuffles whole groups of rows.

    Unless `after_batch` is supplied, the batch pipeline is the default TransformBlock
    batch transforms, followed by ReadTabBatch(dataset), Jitter(jit_std) and
    MaskTfm(mask_perc).
    """

    def __init__(self, dataset, jit_std, mask_perc, bs=16, shuffle=False, after_batch=None, num_workers=0,  **kwargs):
        if after_batch is None:
            after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset) + [Jitter(jit_std), MaskTfm(mask_perc)]
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def shuffle_fn(self, idxs):
        """Shuffle blocks of STOCK_COUNT consecutive indices, keeping the order inside each block.

        The model reshapes batches into groups of STOCK_COUNT rows, so rows of one group
        must stay adjacent; the shared constant is used instead of a literal 112 so the
        loader and the model cannot drift apart. `len(idxs)` must be a multiple of
        STOCK_COUNT, otherwise the reshape raises.
        """
        idxs = np.array(idxs).reshape(-1, STOCK_COUNT)
        # np.random.shuffle permutes along the first axis only, i.e. whole blocks.
        np.random.shuffle(idxs)
        return idxs.reshape(-1).tolist()

def get_dls(train_df, bs, trn_idx, val_idx, jit_std=.13, mask_perc=8):
    """Build fastai tabular DataLoaders for `train_df` using MyDataLoader.

    Args:
        train_df: feature frame containing a 'target' column.
        bs: batch size expressed in groups of STOCK_COUNT rows; the loaders receive
            `STOCK_COUNT * bs` rows per batch. (Previously this argument was ignored and
            112*100 was hard-coded; the visible caller passes 100, so it is unchanged there.)
        trn_idx, val_idx: row positions of the training and validation splits.
        jit_std: noise standard deviation forwarded to Jitter.
        mask_perc: percentage of rows forwarded to MaskTfm.

    Returns:
        The DataLoaders, with the training dataset tagged split_idx=0 and the
        validation dataset tagged split_idx=1.
    """
    cont_nn, cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
    # row_id and time_id are never fed to the model as categorical features.
    cat_nn = [name for name in cat_nn if name not in ('row_id', 'time_id')]

    procs_nn = [Categorify, Normalize]
    to_nn = TabularPandas(train_df, procs_nn, cat_nn, cont_nn, splits=[list(trn_idx), list(val_idx)], y_names='target')
    dls = to_nn.dataloaders(bs=STOCK_COUNT * bs, shuffle=True, dl_type=MyDataLoader, jit_std=jit_std, mask_perc=mask_perc)
    dls.train_ds.split_idx = 0
    dls.valid_ds.split_idx = 1
    return dls

In [33]:
#train_df = pd.read_feather('train_24cols.feather')
# train_df = pd.read_feather('train_126ftrs.feater')
# train_df = fill_missing(train_df)
# train_df = append_trade_count(train_df)


In [53]:
class TimeEncoding(nn.Module):
    """Add a per-group summary vector, scaled by `multiplier`, to every row of the group.

    Each row is projected to `bottleneck` features; the projections of STOCK_COUNT
    consecutive rows are concatenated, passed through BatchNorm -> Linear -> Tanh to
    produce one `inp_size` vector per group, and that vector is added (times
    `multiplier`) to each row of the group. Assumes the batch is laid out as
    consecutive groups of STOCK_COUNT rows -- TODO confirm against the data loader.
    """

    def __init__(self, inp_size, bottleneck, p, multiplier):
        super().__init__()
        self.multiplier = multiplier  # fixed scalar, not a learnable parameter
        self.initial_layers = LinBnDrop(inp_size, bottleneck, act=nn.ReLU(True), p=p, bn=False)

        group_width = bottleneck * STOCK_COUNT
        self.concat_layers = nn.Sequential(
            nn.BatchNorm1d(group_width),
            nn.Linear(group_width, inp_size),
            nn.Tanh(),
        )

    def forward(self, x):
        per_row = self.initial_layers(x)
        n_groups = per_row.shape[0] // STOCK_COUNT
        # One flattened vector per group -> one summary vector per group.
        per_group = self.concat_layers(per_row.view(n_groups, -1))

        # Repeat each group's summary for all STOCK_COUNT rows of that group.
        repeated = per_group.view(n_groups, 1, -1).expand(n_groups, STOCK_COUNT, -1)
        repeated = repeated.contiguous().view(n_groups * STOCK_COUNT, -1)

        return x + repeated * self.multiplier

class BN(nn.Module):
    """BatchNorm1d applied over groups of STOCK_COUNT rows flattened into one vector.

    The input is viewed as (-1, STOCK_COUNT * features), normalised, and viewed back
    to its original shape.
    """

    def __init__(self, features):
        super().__init__()
        self.num_features = features
        self.bn = nn.BatchNorm1d(STOCK_COUNT * self.num_features)

    def forward(self, x):
        flattened = x.view(-1, STOCK_COUNT * self.num_features)
        return self.bn(flattened).view(x.shape)

class ParallelBlock(nn.Module):
    """Linear -> BN -> Dropout -> ReLU -> TimeEncoding block with an optional averaged skip.

    When `do_skip` is truthy the output is the mean of the block output and its input;
    otherwise the block output is returned unchanged.
    """

    def __init__(self, block_size, p, time_p, bottleneck, multiplier, do_skip):
        super().__init__()
        self.do_skip = do_skip
        stages = [
            nn.Linear(block_size, block_size),
            BN(block_size),
            nn.Dropout(p),
            nn.ReLU(True),
            TimeEncoding(block_size, bottleneck, time_p, multiplier),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        out = self.layers(x)
        if not self.do_skip:
            return out
        return (out + x) / 2
    
class ParallelModel(nn.Module):
    """Tabular model: embeddings + continuous inputs -> stem -> ParallelBlocks -> scalar in (0, 0.1).

    Args:
        inp_size: number of continuous input features.
        emb_szs: iterable of (cardinality, embedding_dim) pairs, one per categorical column.
        block_size: hidden width used by the stem and every ParallelBlock.
        ps, time_ps, multipliers: per-block dropout, TimeEncoding dropout and
            TimeEncoding multiplier; one ParallelBlock is created per zipped triple.
        bottleneck: TimeEncoding bottleneck width.
        embed_p: dropout applied to the concatenated embeddings.
        do_skip: forwarded to every ParallelBlock.
    """

    def __init__(self, inp_size, emb_szs, block_size, ps, bottleneck, time_ps, multipliers, embed_p, do_skip ):
        super().__init__()

        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni, nf in emb_szs])
        self.embed_drop = nn.Dropout(embed_p)
        n_emb = sum(emb.embedding_dim for emb in self.embeds)

        stem = [nn.Linear(inp_size + n_emb, block_size), BN(block_size), nn.ReLU(True)]
        blocks = [
            ParallelBlock(block_size, p, time_p, bottleneck, multiplier, do_skip)
            for p, time_p, multiplier in zip(ps, time_ps, multipliers)
        ]
        # Final projection to one value, squashed into the range (0, 0.1).
        head = [nn.Linear(block_size, 1), SigmoidRange(0, .1)]
        self.layers = nn.Sequential(*stem, *blocks, *head)

    def forward(self, x_cat, x_cont):
        embedded = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)], 1)
        embedded = self.embed_drop(embedded)
        # Continuous features first, embeddings second.
        return self.layers(torch.cat([x_cont, embedded], dim=1))

In [35]:
def rmspe(preds, targs):
    """Root mean squared percentage error, computed over entries with a non-zero target.

    Entries whose target equals 0 are excluded, which also avoids a division by zero.

    Args:
        preds: tensor of predictions, broadcastable against `targs`.
        targs: tensor of targets.

    Returns:
        A scalar tensor: sqrt(mean(((targs - preds) / targs) ** 2)) over the kept entries.

    Raises:
        ValueError: if the result is NaN, e.g. when no target is non-zero or when the
            inputs contain NaN. (ValueError is an Exception subclass, so callers that
            catch Exception keep working.)
    """
    mask = targs != 0
    targs, preds = torch.masked_select(targs, mask), torch.masked_select(preds, mask)
    rel_err = (targs - preds) / targs
    res = (rel_err ** 2).mean().sqrt()
    if torch.isnan(res):
        # Summarise instead of printing whole tensors, which can be arbitrarily large.
        raise ValueError(
            f'rmspe is NaN ({targs.numel()} non-zero targets; '
            f'NaN in preds: {bool(torch.isnan(preds).any())}, '
            f'NaN in targs: {bool(torch.isnan(targs).any())})'
        )
    return res

In [71]:
def train(trial, train_df, trn_idx, val_idx, save_as=None):
    """Train one ParallelModel with hyper-parameters drawn from `trial` and score it.

    Args:
        trial: Optuna trial used for every `suggest_*` call and for the pruning callback.
        train_df: already post-processed feature frame.
        trn_idx, val_idx: row positions of the split; when `trn_idx` is None the first
            GroupKFold split grouped by time_id is used instead.
        save_as: if truthy, the learner is saved under this name after training.

    Returns:
        The mean of the last five recorded values at index 2 of `learn.recorder.values`
        (presumably the rmspe metric, after train and valid loss -- TODO confirm).
    """
    n_blocks = 4

    # Augmentation strength.
    jit_std = trial.suggest_float('jit_std', 0, .1)
    mask_perc = trial.suggest_int('mask_perc', 5, 20)

    if trn_idx is None:
        trn_idx, val_idx = first(GroupKFold().split(train_df, groups=train_df.time_id))
    dls = get_dls(train_df, 100, trn_idx, val_idx, jit_std=jit_std, mask_perc=mask_perc)
    inp_size = len(dls.cont_names)

    # Architecture. The order of the suggest_* calls is kept stable on purpose.
    do_skip = True
    emb_size = trial.suggest_int('emb_size', 3, 30)
    emb_sizes = []
    for c_name, c_vals in dls.train.classes.items():
        # Only stock_id gets the tuned embedding width; other categoricals use 3.
        emb_sizes.append((len(c_vals), emb_size if c_name == 'stock_id' else 3))
    emb_p = trial.suggest_float('emb_p', 0, .5)
    block_size = trial.suggest_int('block_size', 50, 1000)
    ps = [trial.suggest_float(f'p{i}', 0, .8) for i in range(n_blocks)]
    bottleneck = trial.suggest_int('bottleneck', 5, 100)
    time_ps = [trial.suggest_float(f'time_p{i}', 0, .5) for i in range(n_blocks)]
    multipliers = [trial.suggest_float(f'multiplier{i}', .01, .5) for i in range(n_blocks)]

    # Optimiser settings.
    lr = trial.suggest_float('lr', 1e-3, 1e-2)
    wd = trial.suggest_float('wd', 0, .2)

    model = ParallelModel(inp_size, emb_sizes, block_size, ps, bottleneck, time_ps, multipliers, emb_p, do_skip)
    learn = Learner(
        dls,
        model=model,
        loss_func=rmspe,
        metrics=AccumMetric(rmspe),
        opt_func=ranger,
        cbs=FastAIPruningCallback(trial, 'rmspe'),
        wd=wd,
    ).to_fp16()
    learn.fit_flat_cos(70, lr)
    if save_as:
        learn.save(save_as)

    last5 = L(learn.recorder.values).itemgot(2)[-5:]
    return np.mean(last5)

def train_cross_valid(trial, train_df, save_as=None):
    """Cross-validate `train` over GroupKFold splits grouped by time_id.

    The frame is first run through `post_process` with only the tau transform enabled.
    Relies on the module-level `time_windows` being defined before this is called.

    Args:
        trial: Optuna trial forwarded to `train` for every fold.
        train_df: raw feature frame (post-processed here).
        save_as: optional name prefix; fold `i` is saved as `save_as + str(i)`.

    Returns:
        The mean fold score. The sum is divided by the number of folds actually run
        rather than a hard-coded 5, so the result stays correct if the splitter changes.
    """
    do_subtract = False
    do_append = False
    do_tau = True
    train_df = post_process(train_df, time_windows, do_subtract, do_append, do_tau)
    splits = GroupKFold().split(train_df, groups=train_df.time_id)

    total, n_folds = 0, 0
    for idx, (trn_idx, val_idx) in enumerate(splits):
        fold_name = save_as + str(idx) if save_as else None
        score = train(trial, train_df, trn_idx, val_idx, fold_name)
        print(f'fold {idx}: {score}')
        total += score
        n_folds += 1
    return total / n_folds

## Generate data

In [37]:
from optiver_features import *

In [38]:
%%time
# book_feature_dict = {
#     wap1: [np.mean, np.std, 'nunique'],
#     wap2: [np.mean, np.std],
#     log_return1: [np.std],
#     log_return2: [np.std],
#     ask_spread: [np.mean, np.std],
#     price_spread:[np.mean, np.std],
#     total_volume:[np.mean, np.std],
# }
# trade_feature_dict = {
#         log_return_price: [np.std, np.mean],
#         'seconds_in_bucket':[np.size],
#         'size':[np.sum],
#         'order_count':[np.sum],
# }

# (start, end) windows; feature columns carry the suffix '{start}_{end}'. The first entry is
# skipped by subtract_windows, which subtracts every other window from the '0_600' column.
# Units are presumably seconds within a bucket -- TODO confirm against the feature generator.
time_windows = [(0,600), (0,100), (100,200), (200,300), (300,400), (400, 500), (500,600)]
# ofg = OptiverFeatureGenerator(book_feature_dict, trade_feature_dict, time_windows)
# train_df = ofg.generate_train_df()






CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [39]:
#train_df.to_feather('train_141cols.feather')

In [55]:
# Load the cached feature table from the working directory (presumably the frame written by
# the commented-out to_feather call in the previous cell -- TODO confirm).
train_df = pd.read_feather('train_141cols.feather')

## Train

In [66]:
# Open the 'four_blocks' study stored in the local SQLite file optuna.db, reusing it if it
# already exists (load_if_exists=True). NopPruner means no trial is ever pruned; sampler=None
# leaves the choice of sampler to Optuna's default.
study = optuna.create_study(direction="minimize", study_name = 'four_blocks', storage='sqlite:///optuna.db', load_if_exists=True, pruner=optuna.pruners.NopPruner(), sampler=None)
#study.optimize(functools.partial(train, train_df=train_gen))

[32m[I 2021-09-23 13:47:50,597][0m Using an existing study with name 'four_blocks' instead of creating a new one.[0m


In [67]:
# Best trial stored in the study; with direction="minimize" this is the lowest recorded value.
best = study.best_trial

In [68]:
best.params

{'block_size': 566,
 'bottleneck': 89,
 'emb_p': 0.22892755376131763,
 'emb_size': 29,
 'jit_std': 0.044428121554388224,
 'lr': 0.0090050544233066,
 'mask_perc': 15,
 'multiplier0': 0.36610514833346075,
 'multiplier1': 0.4699461659624539,
 'multiplier2': 0.19267216205111673,
 'multiplier3': 0.1103899309385393,
 'p0': 0.7779388520646903,
 'p1': 0.4268044282083482,
 'p2': 0.017189932979278854,
 'p3': 0.6956976227194849,
 'time_p0': 0.040130990403726904,
 'time_p1': 0.2795015767348503,
 'time_p2': 0.2534561744168111,
 'time_p3': 0.35356055022302363,
 'wd': 0.1755624424764211}

In [64]:
# Hand-copied snapshot of the best trial's hyper-parameters (same values as the best.params
# output shown above).
# NOTE(review): not referenced by any visible cell -- the cross-validation run below passes
# best.params directly, so edits made here currently have no effect.
my_params = {
 'block_size': 566,
 'bottleneck': 89,
 'emb_p': 0.22892755376131763,
 'emb_size': 29,
 'jit_std': 0.044428121554388224,
 'lr': 0.0090050544233066,
 'mask_perc': 15,
 'multiplier0': 0.36610514833346075,
 'multiplier1': 0.4699461659624539,
 'multiplier2': 0.19267216205111673,
 'multiplier3': 0.1103899309385393,
 'p0': 0.7779388520646903,
 'p1': 0.4268044282083482,
 'p2': 0.017189932979278854,
 'p3': 0.6956976227194849,
 'time_p0': 0.040130990403726904,
 'time_p1': 0.2795015767348503,
 'time_p2': 0.2534561744168111,
 'time_p3': 0.35356055022302363,
 'wd': 0.1755624424764211
}

In [72]:
# Build a standalone trial carrying the best trial's parameters and distributions so that the
# trial.suggest_* calls inside train() presumably return these stored values instead of
# sampling new ones -- TODO confirm against the Optuna FrozenTrial docs.
# value=42 looks like an arbitrary placeholder; it is not read by the visible code.
my_trial = optuna.create_trial(value=42, params=best.params, distributions=best.distributions)

# Cross-validate with those parameters; each fold's learner is saved as 'parallel_mypar_<fold index>'.
train_cross_valid(my_trial, train_df, 'parallel_mypar_')

  my_trial = optuna.create_trial(value=42, params=best.params, distributions=best.distributions)


epoch,train_loss,valid_loss,rmspe,time
0,3.765806,1.215484,1.216401,00:01
1,1.689618,0.486766,0.487729,00:01
2,1.047471,0.460145,0.461112,00:02
3,0.764163,0.3832,0.384322,00:02
4,0.605121,0.349346,0.350106,00:02
5,0.517316,0.343048,0.343691,00:02
6,0.462092,0.337556,0.338395,00:02
7,0.417447,0.299483,0.299945,00:02
8,0.386012,0.293163,0.293611,00:02
9,0.3604,0.272703,0.273178,00:02


fold 0: 0.2150051474571228


epoch,train_loss,valid_loss,rmspe,time
0,3.574295,1.131712,1.132868,00:01
1,1.634078,0.474408,0.47608,00:02
2,1.021445,0.379375,0.380938,00:02
3,0.745722,0.401821,0.402668,00:02
4,0.603308,0.371671,0.372479,00:02
5,0.518826,0.33214,0.333131,00:02
6,0.454876,0.294509,0.295657,00:02
7,0.41182,0.300437,0.300919,00:02
8,0.378499,0.277239,0.277628,00:02
9,0.348708,0.274463,0.27484,00:02


fold 1: 0.21646390855312347


epoch,train_loss,valid_loss,rmspe,time
0,3.844987,0.712572,0.712979,00:02
1,1.727251,0.518726,0.519522,00:02
2,1.053324,0.401045,0.402766,00:02
3,0.758735,0.407556,0.408809,00:02
4,0.600527,0.392913,0.394037,00:02
5,0.506841,0.336759,0.33804,00:02
6,0.44792,0.298682,0.299479,00:02
7,0.408142,0.304506,0.305463,00:02
8,0.368353,0.260975,0.26134,00:02
9,0.329153,0.244961,0.245491,00:02


fold 2: 0.2166481077671051


epoch,train_loss,valid_loss,rmspe,time
0,3.406065,0.650405,0.651112,00:02
1,1.57475,0.497356,0.498488,00:02
2,0.990449,0.434349,0.435955,00:02
3,0.732098,0.408905,0.41031,00:02
4,0.596735,0.406874,0.408146,00:02
5,0.515563,0.34814,0.349941,00:02
6,0.457358,0.335421,0.336656,00:02
7,0.416824,0.317091,0.317828,00:02
8,0.37702,0.300334,0.300671,00:02
9,0.338351,0.250488,0.2506,00:02


fold 3: 0.21314649879932404


epoch,train_loss,valid_loss,rmspe,time
0,3.372767,0.720783,0.72118,00:01
1,1.594522,0.551973,0.553439,00:01
2,1.015064,0.434547,0.435789,00:02
3,0.765822,0.480315,0.481179,00:01
4,0.634175,0.448386,0.449601,00:02
5,0.557496,0.388683,0.390308,00:02
6,0.506921,0.354522,0.35597,00:02
7,0.457043,0.339091,0.340724,00:02
8,0.418873,0.314443,0.315398,00:02
9,0.370813,0.253625,0.254094,00:02


fold 4: 0.21366323232650758


0.21498537898063658