In [None]:
## TODO
- nans
- Gaussian Noise

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold
from optuna.integration import FastAIPruningCallback
import optuna

In [3]:
# Number of rows that make up one time step: the model code reshapes batches
# into groups of STOCK_COUNT rows (one row per stock).
STOCK_COUNT = 112
# Width of the continuous input passed to ParallelModel by train().
# NOTE: this value is overwritten further down from the generated feature frame.
FEATURE_COUNT = 144#20


In [33]:
def fill_missing(train_df):
    """Return a copy of `train_df` with one row per (time_id, stock_id) pair.

    The full grid is the Cartesian product of the unique `time_id` and
    `stock_id` values found in the frame. Pairs that were absent are added,
    and every missing value in the result (new rows included) is set to 0.
    The input frame is left untouched.
    """
    time_ids = train_df.time_id.unique()
    stock_ids = train_df.stock_id.unique()
    full_grid = pd.MultiIndex.from_product(
        [time_ids, stock_ids], names=['time_id', 'stock_id']
    )
    indexed = train_df.copy().set_index(['time_id', 'stock_id'])
    # Reindexing onto the full grid creates NaN rows for absent pairs.
    completed = indexed.reindex(full_grid).reset_index()
    return completed.fillna(0)

In [5]:
class MaskTfm(ItemTransform):
    """Zero out a random tenth of the positions along dim 0 of a batch.

    Masking is only active when the transform is called with `split_idx == 0`
    (the value `get_dls` assigns to the training dataset); for any other
    split the batch is returned unchanged.
    """
    # Default used if `encodes` is ever reached before `__call__` has set the flag.
    # (Was misspelled `do_tranform`, so the default never applied.)
    do_transform = False

    def mask(self, x, indices):
        """Set `x[indices]` to 0 in place (indexing along dim 0) and return `x`."""
        x[torch.tensor(indices, device=x.device)] = 0
        return x

    def __call__(self, b, split_idx=None, **kwargs):
        # Remember which split this call is for so `encodes` can decide whether to mask.
        self.do_transform = (split_idx == 0)
        return super().__call__(b, split_idx=split_idx, **kwargs)

    def encodes(self, x):
        """Mask the same random positions in every element of `x`.

        `x` is indexed and iterated, so it is presumably the batch tuple of
        tensors sharing the same first dimension — TODO confirm.
        Returns a list of the (in-place modified) elements.
        """
        if not self.do_transform: return x
        n = len(x[0])
        # Pick n//10 distinct positions; the same ones are zeroed in every element.
        indices = np.random.choice(n, n//10, replace=False)
        return [self.mask(y, indices) for y in x]

class MyDataLoader(TabDataLoader):
    """TabDataLoader that shuffles whole groups of STOCK_COUNT rows and masks batches.

    When no `after_batch` is supplied, the pipeline is the default
    `TransformBlock` batch transforms, then `ReadTabBatch`, then `MaskTfm`.
    """
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset) + [MaskTfm()]
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def shuffle_fn(self, idxs):
        """Shuffle `idxs` in groups of STOCK_COUNT consecutive entries.

        The order inside each group is preserved; only the groups are permuted.
        `len(idxs)` must be a multiple of STOCK_COUNT, otherwise the reshape raises.
        """
        # Use the shared constant instead of a literal 112 so the loader
        # stays in step with the model's grouping.
        idxs = np.array(idxs).reshape(-1, STOCK_COUNT)
        np.random.shuffle(idxs)  # permutes rows (groups) only
        return idxs.reshape(-1).tolist()

def get_dls(train_df, bs, trn_idx, val_idx):
    """Build fastai tabular DataLoaders for one train/validation split.

    Args:
        train_df: frame containing a 'target' column plus feature columns.
        bs: number of STOCK_COUNT-row groups per batch; the row batch size is
            `STOCK_COUNT * bs`. (Previously this argument was ignored and the
            batch size was fixed at 112*100, which is what `bs=100` yields.)
        trn_idx: row positions of the training split.
        val_idx: row positions of the validation split.

    Returns:
        DataLoaders using `MyDataLoader`, with `split_idx` set to 0 on the
        training dataset and 1 on the validation dataset.
    """
    cont_nn, cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
    # Identifier columns must not be fed to the model as categorical features.
    cat_nn = [x for x in cat_nn if x not in ['row_id', 'time_id']]

    procs_nn = [Categorify, Normalize]
    to_nn = TabularPandas(train_df, procs_nn, cat_nn, cont_nn, splits=[list(trn_idx), list(val_idx)], y_names='target')
    dls = to_nn.dataloaders(bs=STOCK_COUNT*bs, shuffle=True, dl_type=MyDataLoader)
    # Set explicitly so split-aware transforms (MaskTfm) can tell train from valid.
    dls.train_ds.split_idx = 0
    dls.valid_ds.split_idx = 1
    return dls

In [6]:
# Load the precomputed training feature table (path is relative to the working directory).
train_df = pd.read_csv('train_with_features_NO_ST.csv')

In [7]:
#train_df = pd.read_feather('train_24cols.feather')
# Complete the (time_id, stock_id) grid; rows that were absent are added filled with 0.
# NOTE: this rebinds train_df, so the raw frame from the previous cell is no longer available.
train_df = fill_missing(train_df)
# Take only the first GroupKFold split; grouping by time_id keeps all rows of a
# time_id on the same side of the split.
trn_idx0, val_idx0 = first(GroupKFold().split(train_df, groups = train_df.time_id))

# DataLoaders for that single split.
dls0 = get_dls(train_df, 100, trn_idx0, val_idx0)

In [8]:
class TimeEncoding(nn.Module):
    """Residual block that adds a per-time-step summary to every row.

    Each row is first projected to `bottleneck` features. The projections of
    STOCK_COUNT consecutive rows are concatenated, passed through
    BatchNorm -> Linear -> Tanh to get one `inp_size` vector per group, and
    that vector (scaled by `multiplier`) is added back to each row of the group.
    Assumes the batch holds whole groups of STOCK_COUNT rows in order — TODO confirm
    against the data loader.
    """
    def __init__(self, inp_size, bottleneck, p, multiplier):
        super().__init__()
        # Fixed scale for the residual branch; stored as given, not as an nn.Parameter.
        self.multiplier = multiplier
        self.initial_layers = LinBnDrop(inp_size, bottleneck, act=nn.ReLU(True), p=p, bn=False)

        flat_width = bottleneck * STOCK_COUNT
        self.concat_layers = nn.Sequential(
            nn.BatchNorm1d(flat_width),
            nn.Linear(flat_width, inp_size),
            nn.Tanh(),
        )

    def forward(self, x):
        per_row = self.initial_layers(x)
        n_times = per_row.shape[0] // STOCK_COUNT
        # One flattened vector per group of STOCK_COUNT rows.
        per_time = self.concat_layers(per_row.view(n_times, -1))
        # Repeat each group's vector STOCK_COUNT times so it lines up with x row-for-row.
        tiled = per_time.view(n_times, 1, -1).expand(n_times, STOCK_COUNT, -1).contiguous()
        return x + tiled.view(n_times * STOCK_COUNT, -1) * self.multiplier

class BN(nn.Module):
    """BatchNorm1d applied over STOCK_COUNT rows flattened into one sample.

    The input is viewed as (-1, STOCK_COUNT * features), normalised, and
    viewed back to its original shape, so each (row-in-group, feature)
    pair gets its own normalisation statistics.
    """
    def __init__(self, features):
        super().__init__()
        self.num_features = features
        self.bn = nn.BatchNorm1d(STOCK_COUNT * features)

    def forward(self, x):
        original_shape = x.shape
        grouped = x.view(-1, STOCK_COUNT * self.num_features)
        return self.bn(grouped).view(*original_shape)
    
class ParallelModel(nn.Module):
    """MLP over rows grouped by time step, with a learned per-stock embedding.

    Args:
        inp_size: number of continuous input features per row.
        emb_size: width of the learned embedding (one vector per stock slot).
        lin_sizes: output widths of the hidden Linear layers.
        ps: dropout probability per hidden layer (a falsy value adds no Dropout).
        bottleneck: bottleneck width passed to every TimeEncoding block.
        time_ps: dropout probability for each TimeEncoding block.
        multiplier: residual scale passed to every TimeEncoding block.

    The number of hidden stages is the shortest of `lin_sizes`, `ps` and
    `time_ps` (they are zipped together). The output goes through
    SigmoidRange(0, .1).
    """
    def __init__(self, inp_size, emb_size, lin_sizes, ps, bottleneck, time_ps, multiplier ):
        super().__init__()

        # One embedding row per position inside a STOCK_COUNT-row group.
        self.stock_emb = nn.Parameter(torch.empty(STOCK_COUNT, emb_size))
        nn.init.normal_(self.stock_emb)

        widths = [inp_size + emb_size, *lin_sizes]
        stack = []
        for n_in, n_out, drop_p, time_p in zip(widths, widths[1:], ps, time_ps):
            stack += [nn.Linear(n_in, n_out), BN(n_out)]
            if drop_p:
                stack.append(nn.Dropout(drop_p))
            stack += [nn.ReLU(True), TimeEncoding(n_out, bottleneck, time_p, multiplier)]
        # Regression head: single output squashed into (0, 0.1).
        stack += [LinBnDrop(widths[-1], 1, bn=False), SigmoidRange(0, .1)]
        self.layers = nn.Sequential(*stack)

    def forward(self, x_cat, x_cont):
        # x_cat is only used for its batch length; the stock embedding is tiled
        # once per STOCK_COUNT-row group and appended to the continuous features.
        n_times = x_cat.shape[0] // STOCK_COUNT
        tiled_emb = self.stock_emb.repeat(n_times, 1)
        features = torch.cat([x_cont, tiled_emb], dim=1)
        return self.layers(features)

In [9]:
def rmspe(preds, targs):
    """Root mean squared percentage error over the entries whose target is non-zero.

    Entries with `targs == 0` are dropped from both tensors before the error
    is computed. If the result is NaN (for example when no entry survives the
    mask), the kept targets and predictions are printed and an Exception is raised.
    """
    keep = targs != 0
    kept_targs = torch.masked_select(targs, keep)
    kept_preds = torch.masked_select(preds, keep)
    pct_err = (kept_targs - kept_preds) / kept_targs
    res = pct_err.pow(2).mean().sqrt()
    if torch.isnan(res):
        print(kept_targs)
        print(kept_preds)
        raise Exception('fck loss is nan')
    return res

In [10]:
def train(trial, dls, save_as=None):
    """Train one ParallelModel with hyper-parameters drawn from `trial`.

    Args:
        trial: Optuna trial (or frozen trial) supplying the `suggest_*` values.
        dls: DataLoaders to train and validate on.
        save_as: if truthy, name under which the trained learner is saved.

    Returns:
        Mean over the last five epochs of the value at index 2 of each
        `learn.recorder.values` entry (with the single metric configured here
        that is presumably the rmspe column — confirm against the epoch log).
    """
    # Keep the suggest_* calls in this exact order: samplers may depend on it.
    emb_size = trial.suggest_int('emb_size', 3, 30)
    lin_sizes = [
        trial.suggest_int(f'lin_size{i}', 10, upper)
        for i, upper in enumerate([2000, 1000, 500])
    ]
    # The first hidden layer never gets dropout; only p1 and p2 are tuned.
    ps = [0] + [trial.suggest_float(f'p{i}', 0, .8) for i in (1, 2)]

    bottleneck = trial.suggest_int('bottleneck', 5, 100)
    time_ps = [trial.suggest_float(f'time_p{i}', 0, .5) for i in range(3)]
    multiplier = trial.suggest_float('multiplier', .01, .5)
    lr = float(trial.suggest_float('lr', 1e-3, 1e-2))

    model = ParallelModel(FEATURE_COUNT, emb_size, lin_sizes, ps, bottleneck, time_ps, multiplier)
    pruning_cb = FastAIPruningCallback(trial, 'rmspe')
    learn = Learner(
        dls,
        model=model,
        loss_func=rmspe,
        metrics=AccumMetric(rmspe),
        opt_func=ranger,
        cbs=pruning_cb,
    ).to_fp16()

    learn.fit_flat_cos(50, lr)
    if save_as:
        learn.save(save_as)

    metric_per_epoch = L(learn.recorder.values).itemgot(2)
    return np.mean(metric_per_epoch[-5:])

def train_cross_valid(trial, dlss, save_as=None):
    """Train once per fold and return the mean fold score.

    Args:
        trial: Optuna trial passed through to `train` for every fold.
        dlss: iterable of DataLoaders, one per fold.
        save_as: optional name prefix; fold `idx` is saved as `save_as + str(idx)`.

    Returns:
        Average of the values returned by `train` over all folds.

    Raises:
        ValueError: if `dlss` yields no folds.
    """
    total = 0
    n_folds = 0
    for idx, dls in enumerate(dlss):
        v = train(trial, dls, save_as + str(idx) if save_as else None)
        print(f'fold {idx}: {v}')
        total += v
        n_folds += 1
    if n_folds == 0:
        raise ValueError('train_cross_valid needs at least one fold in dlss')
    # Divide by the folds actually run rather than a hard-coded 5.
    return total / n_folds

## Generate data

In [11]:
from optiver_features import *

aks_spread
c_sum
size_sum
realized_vol_trade_sum

In [35]:
# Aggregations to compute per order-book feature. The un-quoted keys (wap1,
# log_return1, ...) are names that are not defined in this notebook — presumably
# they come from the `optiver_features` star import; confirm there.
book_feature_dict = {
    wap1: [np.mean, np.std, 'nunique'],
    wap2: [np.mean, np.std],
    log_return1: [np.std],
    log_return2: [np.std],
    ask_spread: [np.mean, np.std],
    price_spread:[np.mean, np.std],
    total_volume:[np.mean, np.std],
}
# Aggregations to compute per trade feature; string keys look like raw column names.
trade_feature_dict = {
        log_return_price: [np.std, np.mean],
        'seconds_in_bucket':[np.size],
        'size':[np.sum],
        'order_count':[np.sum],
}

# One full-range window plus six 100-wide sub-windows.
# Presumably (start, end) bounds on seconds_in_bucket — TODO confirm in OptiverFeatureGenerator.
time_windows = [(0,600), (0,100), (100,200), (200,300), (300,400), (400, 500), (500,600)]
ofg = OptiverFeatureGenerator(book_feature_dict, trade_feature_dict, time_windows)
train_gen = ofg.generate_train_df()


In [38]:
# Complete the (time_id, stock_id) grid of the generated features; added rows are all zeros.
# NOTE: rebinds train_gen, so re-running this cell operates on the already filled frame.
train_gen = fill_missing(train_gen)

In [39]:
# Overrides the FEATURE_COUNT constant set near the top of the notebook; train() reads
# this global when building the model.
# The 4 subtracted columns are presumably the non-feature ones (e.g. time_id, stock_id,
# target, row_id) — TODO confirm against train_gen.columns.
FEATURE_COUNT = len(train_gen.columns) - 4

In [40]:
FEATURE_COUNT

126

## Train

In [41]:
# Load the previously run Optuna study 'parallel_no_st2' from the local SQLite file optuna.db.
study = optuna.load_study('parallel_no_st2','sqlite:///optuna.db' )

In [17]:
# study = optuna.create_study(direction="minimize", study_name = 'parallel_no_st', storage='sqlite:///optuna.db', load_if_exists=True, pruner=optuna.pruners.NopPruner(), sampler=None)
# study.optimize(functools.partial(train, dls=dls))

In [42]:
# Keep only trials that have a recorded objective value.
trials = [t for t in study.trials if t.value is not None]

In [43]:
# Order trials by objective value, ascending (lowest value first).
trials =sorted(trials,key = lambda x: x.value)

In [44]:
# One DataLoaders object per GroupKFold split (default number of splits), grouped by time_id.
dlss = [get_dls(train_gen,100, trn_idx, val_idx) for trn_idx, val_idx in GroupKFold().split(train_gen, groups = train_gen.time_id)]

In [45]:
# Best trial of the loaded study; its params and distributions are reused below.
best = study.best_trial

In [46]:
best.params

{'bottleneck': 10,
 'emb_size': 5,
 'lin_size0': 446,
 'lin_size1': 477,
 'lin_size2': 434,
 'lr': 0.006451237192100813,
 'multiplier': 0.12849162558259716,
 'p1': 0.6252926917957898,
 'p2': 0.02656541478851094,
 'time_p0': 0.45265688727038544,
 'time_p1': 0.02835514720594759,
 'time_p2': 0.21139460201890878}

In [47]:
# Hand-picked hyper-parameters using the same keys that train() requests from a trial.
# The values appear to be rounded versions of best.params — confirm intent.
my_params = {
 'bottleneck': 10,
 'emb_size': 5,
 'lin_size0': 450,
 'lin_size1': 450,
 'lin_size2': 450,
 'lr': 0.006,
 'multiplier': 0.15,
 'p1': 0.5,
 'p2': 0.05,
 'time_p0': .45,
 'time_p1': 0.05,
 'time_p2': 0.15 
}

In [48]:
# Wrap the hand-picked params in a trial object so train() can read them through
# suggest_*; the distributions are borrowed from the best study trial.
# value=42 is presumably just a placeholder — TODO confirm it is never used as a score.
my_trial = optuna.create_trial(value=42, params=my_params, distributions=best.distributions)

# Cross-validate these params; each fold's learner is saved as '91feats' + fold index.
train_cross_valid(my_trial, dlss, '91feats')

  my_trial = optuna.create_trial(value=42, params=my_params, distributions=best.distributions)


epoch,train_loss,valid_loss,rmspe,time
0,3.462488,3.530585,3.535204,00:01
1,1.570039,2.174784,2.184813,00:00
2,0.964937,1.107624,1.111014,00:00
3,0.69675,0.510773,0.511114,00:00
4,0.544971,0.714438,0.715582,00:00
5,0.447234,0.370449,0.370977,00:00
6,0.381126,0.41672,0.420987,00:00
7,0.341221,0.284039,0.285192,00:00
8,0.31161,0.282303,0.282867,00:00
9,0.301482,0.266256,0.26636,00:00


fold 0: 0.2225315749645233


epoch,train_loss,valid_loss,rmspe,time
0,4.029916,3.241901,3.243562,00:00
1,1.773963,1.37543,1.379536,00:00
2,1.054403,1.005057,1.01202,00:00
3,0.721675,0.591306,0.591508,00:00
4,0.542163,0.397156,0.397275,00:00
5,0.442506,0.540431,0.543427,00:00
6,0.379063,0.331828,0.332273,00:00
7,0.343202,0.252919,0.253004,00:00
8,0.324129,0.333349,0.333907,00:00
9,0.310607,0.317135,0.318559,00:00


fold 1: 0.2135338544845581


epoch,train_loss,valid_loss,rmspe,time
0,3.173623,3.265737,3.273388,00:00
1,1.481855,0.669103,0.676924,00:00
2,0.93804,0.540822,0.541096,00:00
3,0.699085,0.65757,0.660124,00:00
4,0.570408,0.550355,0.553868,00:00
5,0.484216,0.345047,0.348414,00:00
6,0.406545,0.351204,0.352394,00:00
7,0.359324,0.418696,0.420397,00:00
8,0.324563,0.260789,0.2625,00:00
9,0.309751,0.345806,0.347457,00:00


fold 2: 0.221008563041687


epoch,train_loss,valid_loss,rmspe,time
0,3.251262,2.05236,2.054511,00:00
1,1.485987,1.767344,1.778854,00:00
2,0.936065,0.868114,0.870238,00:00
3,0.654609,0.503943,0.504526,00:00
4,0.509487,0.298666,0.299184,00:00
5,0.431228,0.489038,0.492311,00:00
6,0.380605,0.307274,0.307417,00:00
7,0.35879,0.361034,0.361133,00:00
8,0.350217,0.282474,0.282629,00:00
9,0.318559,0.33156,0.331651,00:00


fold 3: 0.2145466536283493


epoch,train_loss,valid_loss,rmspe,time
0,3.33348,1.950485,1.951169,00:00
1,1.559221,1.557906,1.56411,00:00
2,0.98084,0.740064,0.742442,00:00
3,0.687864,0.401948,0.402405,00:00
4,0.525713,0.517713,0.5197,00:00
5,0.440976,0.311241,0.311617,00:00
6,0.399216,0.400121,0.40177,00:00
7,0.358478,0.280835,0.281129,00:00
8,0.324671,0.299614,0.300149,00:00
9,0.304429,0.258933,0.259272,00:00


fold 4: 0.2169355869293213


0.21771124660968783

In [None]:
# Cross-validate the ten best-scoring study trials (trials is sorted ascending by value).
# Slicing instead of indexing range(10) avoids an IndexError when fewer than
# ten trials have a recorded value.
for i, trial in enumerate(trials[:10]):
    r = train_cross_valid(trial, dlss)
    print('trial', i,':',r)