## TODO

- tau features

- subtract windows

-skip connection

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold
from optuna.integration import FastAIPruningCallback
import optuna

In [3]:
STOCK_COUNT = 112
FEATURE_COUNT = 144#20


In [4]:
def fill_missing(train_df):
    all_times = train_df.time_id.unique()
    all_stocks = train_df.stock_id.unique()
    filled_df = train_df.copy()
    filled_df=filled_df.set_index(['time_id', 'stock_id'])
    new_index = pd.MultiIndex.from_product([all_times, all_stocks], names = ['time_id', 'stock_id'])
    filled_df = filled_df.reindex(new_index).reset_index()
    filled_df = filled_df.fillna(0)
    return filled_df

In [48]:
def append_trade_count(train_df):
    for s,e in time_windows:
        train_df[f'number_trades_{s}_{e}'] = 'more'
        for val in range(3): train_df.loc[train_df[f'seconds_in_bucket_size_{s}_{e}']==val, f'number_trades_{s}_{e}'] = val
    return train_df

In [5]:
class Jitter(ItemTransform):
    def __init__(self, jit_std):
            super().__init__()
            self.split_idx = 0
            self.jit_std = jit_std
            
    def encodes(self, b):
        #print('doing jitter ', self.jit_std)
        jitter = torch.empty_like(b[1]).normal_(0, self.jit_std)
        b[1] += jitter
        return b

class MaskTfm(ItemTransform):
    
    def __init__(self, mask_perc):
        super().__init__()
        self.split_idx = 0
        self.mask_perc = mask_perc
    
    def mask(self, x, indices):
        x[torch.tensor(indices, device=x.device)] = 0
        return x
    
    def encodes(self, x):
        #print('doing mask', self.mask_perc)
        n = len(x[0])
        to_mask = (n * self.mask_perc) // 100
        indices = np.random.choice(np.array(range(n)), to_mask, replace=False)
        x = [self.mask(y, indices) for y in x]
        
        return x

class MyDataLoader(TabDataLoader):
    def __init__(self, dataset, jit_std, mask_perc, bs=16, shuffle=False, after_batch=None, num_workers=0,  **kwargs):
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset) + [Jitter(jit_std), MaskTfm(mask_perc)]
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def shuffle_fn(self, idxs):
        idxs = np.array(idxs).reshape(-1,112)
        np.random.shuffle(idxs)
        return idxs.reshape(-1).tolist()

def get_dls(train_df, bs, trn_idx, val_idx, jit_std=.13, mask_perc=8):
    cont_nn,cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
    cat_nn=[x for x in cat_nn if not x in ['row_id', 'time_id']]
    
    procs_nn = [Categorify, Normalize]
    to_nn = TabularPandas(train_df, procs_nn, cat_nn, cont_nn, splits=[list(trn_idx), list(val_idx)], y_names='target')
    dls = to_nn.dataloaders(bs=112*100, shuffle=True, dl_type = MyDataLoader, jit_std=jit_std, mask_perc=mask_perc)
    dls.train_ds.split_idx=0
    dls.valid_ds.split_idx=1
    return dls

In [6]:
#train_df = pd.read_feather('train_24cols.feather')
train_df = pd.read_feather('train_126ftrs.feater')
train_df = fill_missing(train_df)
train_df = append_trade_count(train_df)


In [74]:
class TimeEncoding(nn.Module):
    def __init__(self, inp_size, bottleneck, p, multiplier):
        super().__init__()
        self.multiplier  = multiplier#nn.Parameter(torch.tensor(multiplier)) 
        self.initial_layers = LinBnDrop(inp_size, bottleneck, act=nn.ReLU(True), p=p, bn=False)
        
        self.concat_layers = nn.Sequential(
            nn.BatchNorm1d(bottleneck * STOCK_COUNT),
            nn.Linear(bottleneck * STOCK_COUNT, inp_size),
            nn.Tanh()
        )
        
    def forward(self, x):
        y = self.initial_layers(x)
        times = y.shape[0] // STOCK_COUNT
        y = y.view(times, -1)
        y = self.concat_layers(y)
   
        y = y.view(times,1,-1).expand(times,STOCK_COUNT,-1).contiguous().view(times*STOCK_COUNT, -1)
        
        return x + y * self.multiplier

class BN(nn.Module):
    def __init__(self, features):
        super().__init__()
        self.num_features = features
        self.bn = nn.BatchNorm1d(STOCK_COUNT * self.num_features)
    def forward(self, x):
        sh = x.shape
        x = x.view(-1, STOCK_COUNT * self.num_features)
        x = self.bn(x)
        return x.view(*sh)
    
class ParallelModel(nn.Module):
    def __init__(self, inp_size, emb_szs, lin_sizes, ps, bottleneck, time_ps, multipliers, embed_p ):
        super().__init__()
        
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.embed_drop = nn.Dropout(embed_p)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        
        lin_sizes = [inp_size+n_emb] + lin_sizes
        layers = []
        for n_in, n_out, p, time_p, multiplier in zip(lin_sizes, lin_sizes[1:], ps, time_ps, multipliers):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(BN(n_out ))
            if p: layers.append(nn.Dropout(p))
            
            layers.append(nn.ReLU(True))
            
            layers.append(TimeEncoding(n_out, bottleneck, time_p, multiplier))
        layers.append(LinBnDrop(lin_sizes[-1], 1, bn=False))
        layers.append(SigmoidRange(0, .1))
        self.layers = nn.Sequential(*layers)
    
    
    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = self.embed_drop(x)
        x = torch.cat([x_cont, x], dim=1)
        for l in self.layers.children():
            #print(x.shape, x.mean(), x.std())
            x = l(x)
        return x#self.layers(x)

In [8]:
def rmspe(preds, targs):
    mask = targs != 0
    targs, preds = torch.masked_select(targs, mask), torch.masked_select(preds, mask)
    x = (targs-preds)/targs
    res= (x**2).mean().sqrt()
    if torch.isnan(res): 
        print(targs)
        print(preds)
        raise Exception('fck loss is nan')
    return res

In [79]:
def train(trial, train_df, trn_idx, val_idx, save_as=None):
    inp_size = FEATURE_COUNT
    jit_std=trial.suggest_float('jit_std', 0, .5)
    mask_perc=trial.suggest_int('mask_perc', 0, 20)
    
    dls = get_dls(train_df, 100, trn_idx, val_idx, jit_std=jit_std, mask_perc = mask_perc)
    emb_size = trial.suggest_int('emb_size', 3, 30)
    emb_sizes = [(len(c_vals), emb_size if c_name == 'stock_id' else 3) for c_name, c_vals in dls0.train.classes.items()]
    emb_p = trial.suggest_float(f'emb_p', 0, .5)
    max_sizes = [2000, 1000, 500]
    lin_sizes = [500,500,500]#[trial.suggest_int(f'lin_size{i}', 10, ms) for i, ms in enumerate(max_sizes)]
    ps = [0]+[trial.suggest_float(f'p{i}', 0, .8) for i in range(1,3)]
    
    bottleneck = trial.suggest_int('bottleneck', 5, 100)
    time_ps = [trial.suggest_float(f'time_p{i}', 0, .5) for i in range(3)]
    multipliers = [trial.suggest_float(f'multiplier{i}', .01, .5) for i in range(3)]
    lr = float(trial.suggest_float('lr', 1e-3, 1e-2))

    
    
    model = ParallelModel(inp_size, emb_sizes, lin_sizes, ps, bottleneck, time_ps, multipliers, emb_p)
    #bx1, bx2, by = dls.one_batch()
    print(model)
    learn = Learner(dls,model = model, loss_func=rmspe, metrics=AccumMetric(rmspe), opt_func=ranger,
        cbs = FastAIPruningCallback(trial, 'rmspe')).to_fp16()
    # with learn.no_bar():
    #     with learn.no_logging():    
    learn.fit_flat_cos(70, lr)
    if save_as:
        learn.save(save_as)
    last5 = L(learn.recorder.values).itemgot(2)[-5:]
    return np.mean(last5)

def train_cross_valid(trial, train_df, save_as=None):
    res = 0
    splits = GroupKFold().split(train_df, groups = train_df.time_id)
    for idx, (trn_idx, val_idx) in enumerate(splits):
        v = train(trial, train_df, trn_idx, val_idx, save_as + str(idx) if save_as else None)
        print(f'fold {idx}: {v}')
        res +=v;
    return res/5

## Generate data

In [10]:
from optiver_features import *

In [32]:
# book_feature_dict = {
#     wap1: [np.mean, np.std, 'nunique'],
#     wap2: [np.mean, np.std],
#     log_return1: [np.std],
#     log_return2: [np.std],
#     ask_spread: [np.mean, np.std],
#     price_spread:[np.mean, np.std],
#     total_volume:[np.mean, np.std],
# }
# trade_feature_dict = {
#         log_return_price: [np.std, np.mean],
#         'seconds_in_bucket':[np.size],
#         'size':[np.sum],
#         'order_count':[np.sum],
# }

time_windows = [(0,600), (0,100), (100,200), (200,300), (300,400), (400, 500), (500,600)]
# ofg = OptiverFeatureGenerator(book_feature_dict, trade_feature_dict, time_windows)
# #train_gen = ofg.generate_train_df()


# train_gen.to_feather('train_126ftrs.feater')

# train_gen = fill_missing(train_gen)

# FEATURE_COUNT = len(train_gen.columns) - 4

# FEATURE_COUNT

In [81]:
[c in train_df.columns if 'sum' in c]

SyntaxError: invalid syntax (2435866549.py, line 1)

In [49]:
train_df = pd.read_feather('train_126ftrs.feater')

In [50]:
train_df = fill_missing(train_df)

In [51]:
train_df = append_trade_count(train_df)

In [56]:
trn_idx0, val_idx0 = first(GroupKFold().split(train_df, groups = train_df.time_id))

dls0 = get_dls(train_df, 100, trn_idx0, val_idx0)

In [57]:
bx1, bx2, by = dls0.one_batch()

In [61]:
bx1[:,1].max()

tensor(4)

## Train

In [12]:
FEATURE_COUNT = len(train_df.columns) - 4

In [13]:
study = optuna.create_study(direction="minimize", study_name = 'train_126ftrs', storage='sqlite:///optuna.db', load_if_exists=True, pruner=optuna.pruners.NopPruner(), sampler=None)
#study.optimize(functools.partial(train, train_df=train_gen))

[32m[I 2021-09-22 13:04:28,641][0m Using an existing study with name 'train_126ftrs' instead of creating a new one.[0m


In [14]:
trials = [t for t in study.trials if t.value is not None]

In [15]:
trials =sorted(trials,key = lambda x: x.value)

In [16]:
best = study.best_trial

In [17]:
best.params

{'bottleneck': 46,
 'emb_p': 0.3454992150001027,
 'emb_size': 4,
 'jit_std': 0.03325574203203305,
 'lr': 0.008414558376176162,
 'mask_perc': 11,
 'multiplier0': 0.18052589333779923,
 'multiplier1': 0.29552707702438435,
 'multiplier2': 0.040720708464957234,
 'p1': 0.5240166648872739,
 'p2': 0.2709588165106582,
 'time_p0': 0.26394511771559187,
 'time_p1': 0.2605308389420442,
 'time_p2': 0.11181420784350192}

In [77]:
my_params = {
 'bottleneck': 50,
 'emb_p': 0.3,
 'emb_size': 10,
 'jit_std': 0.03,
 'lr': 0.008,
 'mask_perc': 10,
 'multiplier0': 0.2,
 'multiplier1': 0.3,
 'multiplier2': 0.05,
 'p1': 0.5,
 'p2': 0.25,
 'time_p0': 0.25,
 'time_p1': 0.25,
 'time_p2': 0.1
}

In [80]:
my_trial = optuna.create_trial(value=42, params=my_params, distributions=best.distributions)

train_cross_valid(my_trial, train_df, '126feats')

  my_trial = optuna.create_trial(value=42, params=my_params, distributions=best.distributions)


ParallelModel(
  (embeds): ModuleList(
    (0): Embedding(113, 10)
    (1): Embedding(5, 3)
    (2): Embedding(5, 3)
    (3): Embedding(5, 3)
    (4): Embedding(5, 3)
    (5): Embedding(5, 3)
    (6): Embedding(5, 3)
    (7): Embedding(5, 3)
  )
  (embed_drop): Dropout(p=0.3, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=157, out_features=500, bias=True)
    (1): BN(
      (bn): BatchNorm1d(56000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ReLU(inplace=True)
    (3): TimeEncoding(
      (initial_layers): LinBnDrop(
        (0): Dropout(p=0.25, inplace=False)
        (1): Linear(in_features=500, out_features=50, bias=True)
        (2): ReLU(inplace=True)
      )
      (concat_layers): Sequential(
        (0): BatchNorm1d(5600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Linear(in_features=5600, out_features=500, bias=True)
        (2): Tanh()
      )
    )
    (4): Linear(in_features=500, out_featu

epoch,train_loss,valid_loss,rmspe,time
0,3.363687,0.999562,1.000676,00:01
1,1.587448,0.505076,0.506032,00:01
2,1.019678,0.465959,0.466538,00:01
3,0.766272,0.46972,0.470417,00:01
4,0.619756,0.411492,0.412269,00:01
5,0.529581,0.397671,0.39844,00:01
6,0.490935,0.336047,0.336247,00:01
7,0.451509,0.291243,0.291358,00:01
8,0.407944,0.302069,0.302219,00:01
9,0.386106,0.259619,0.259777,00:01


fold 0: 0.22144209444522858
ParallelModel(
  (embeds): ModuleList(
    (0): Embedding(113, 10)
    (1): Embedding(5, 3)
    (2): Embedding(5, 3)
    (3): Embedding(5, 3)
    (4): Embedding(5, 3)
    (5): Embedding(5, 3)
    (6): Embedding(5, 3)
    (7): Embedding(5, 3)
  )
  (embed_drop): Dropout(p=0.3, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=157, out_features=500, bias=True)
    (1): BN(
      (bn): BatchNorm1d(56000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ReLU(inplace=True)
    (3): TimeEncoding(
      (initial_layers): LinBnDrop(
        (0): Dropout(p=0.25, inplace=False)
        (1): Linear(in_features=500, out_features=50, bias=True)
        (2): ReLU(inplace=True)
      )
      (concat_layers): Sequential(
        (0): BatchNorm1d(5600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Linear(in_features=5600, out_features=500, bias=True)
        (2): Tanh()
      )
    )
    (4): Linea

epoch,train_loss,valid_loss,rmspe,time
0,3.620982,1.41617,1.417476,00:01
1,1.663177,0.580185,0.580799,00:01
2,1.062463,0.431524,0.432444,00:01
3,0.793129,0.469967,0.470658,00:01
4,0.645757,0.350138,0.350945,00:01
5,0.551153,0.334546,0.335354,00:01
6,0.488843,0.288077,0.28847,00:01
7,0.438486,0.335114,0.335478,00:01
8,0.395996,0.354491,0.355824,00:01
9,0.370553,0.247459,0.247909,00:01


fold 1: 0.21472090780735015
ParallelModel(
  (embeds): ModuleList(
    (0): Embedding(113, 10)
    (1): Embedding(5, 3)
    (2): Embedding(5, 3)
    (3): Embedding(5, 3)
    (4): Embedding(5, 3)
    (5): Embedding(5, 3)
    (6): Embedding(5, 3)
    (7): Embedding(5, 3)
  )
  (embed_drop): Dropout(p=0.3, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=157, out_features=500, bias=True)
    (1): BN(
      (bn): BatchNorm1d(56000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ReLU(inplace=True)
    (3): TimeEncoding(
      (initial_layers): LinBnDrop(
        (0): Dropout(p=0.25, inplace=False)
        (1): Linear(in_features=500, out_features=50, bias=True)
        (2): ReLU(inplace=True)
      )
      (concat_layers): Sequential(
        (0): BatchNorm1d(5600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Linear(in_features=5600, out_features=500, bias=True)
        (2): Tanh()
      )
    )
    (4): Linea

epoch,train_loss,valid_loss,rmspe,time
0,3.59712,1.53119,1.533695,00:01
1,1.647258,0.792155,0.79492,00:01
2,1.021318,0.592433,0.594447,00:01
3,0.713762,0.319666,0.322031,00:01
4,0.555797,0.294352,0.298053,00:01
5,0.465369,0.4347,0.440871,00:01
6,0.403544,0.531955,0.536381,00:01
7,0.3825,0.272276,0.272775,00:01
8,0.363087,0.244213,0.244935,00:01
9,0.346126,0.374601,0.375556,00:01


fold 2: 0.22308090925216675
ParallelModel(
  (embeds): ModuleList(
    (0): Embedding(113, 10)
    (1): Embedding(5, 3)
    (2): Embedding(5, 3)
    (3): Embedding(5, 3)
    (4): Embedding(5, 3)
    (5): Embedding(5, 3)
    (6): Embedding(5, 3)
    (7): Embedding(5, 3)
  )
  (embed_drop): Dropout(p=0.3, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=157, out_features=500, bias=True)
    (1): BN(
      (bn): BatchNorm1d(56000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ReLU(inplace=True)
    (3): TimeEncoding(
      (initial_layers): LinBnDrop(
        (0): Dropout(p=0.25, inplace=False)
        (1): Linear(in_features=500, out_features=50, bias=True)
        (2): ReLU(inplace=True)
      )
      (concat_layers): Sequential(
        (0): BatchNorm1d(5600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Linear(in_features=5600, out_features=500, bias=True)
        (2): Tanh()
      )
    )
    (4): Linea

epoch,train_loss,valid_loss,rmspe,time
0,3.682256,1.491436,1.492689,00:01
1,1.686116,0.51387,0.51461,00:01
2,1.054144,0.630226,0.630883,00:01
3,0.784485,0.501129,0.5017,00:01
4,0.644033,0.574008,0.574914,00:01
5,0.55141,0.352085,0.352952,00:01
6,0.49514,0.374499,0.375452,00:01
7,0.451926,0.286448,0.286547,00:01
8,0.412504,0.299397,0.299517,00:01
9,0.381869,0.256179,0.256227,00:01


fold 3: 0.21472125351428986
ParallelModel(
  (embeds): ModuleList(
    (0): Embedding(113, 10)
    (1): Embedding(5, 3)
    (2): Embedding(5, 3)
    (3): Embedding(5, 3)
    (4): Embedding(5, 3)
    (5): Embedding(5, 3)
    (6): Embedding(5, 3)
    (7): Embedding(5, 3)
  )
  (embed_drop): Dropout(p=0.3, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=157, out_features=500, bias=True)
    (1): BN(
      (bn): BatchNorm1d(56000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ReLU(inplace=True)
    (3): TimeEncoding(
      (initial_layers): LinBnDrop(
        (0): Dropout(p=0.25, inplace=False)
        (1): Linear(in_features=500, out_features=50, bias=True)
        (2): ReLU(inplace=True)
      )
      (concat_layers): Sequential(
        (0): BatchNorm1d(5600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Linear(in_features=5600, out_features=500, bias=True)
        (2): Tanh()
      )
    )
    (4): Linea

epoch,train_loss,valid_loss,rmspe,time
0,3.406003,1.022375,1.024101,00:01
1,1.624529,0.899889,0.900974,00:01
2,1.078691,0.460917,0.462378,00:01
3,0.796457,0.594251,0.594706,00:01
4,0.654416,0.451649,0.452413,00:01
5,0.556703,0.362784,0.363866,00:01
6,0.476852,0.303941,0.304519,00:01
7,0.444407,0.302711,0.303012,00:01
8,0.428674,0.269548,0.270072,00:01
9,0.407448,0.270461,0.270975,00:01


fold 4: 0.217070135474205


0.21820706009864804