In [1]:
from fastai.vision.all import *
from fastai.tabular.all import *
from sklearn.model_selection import KFold, GroupKFold
import tqdm

In [2]:
# Root directory of the competition dataset; every input file below is resolved against it.
PATH = Path('../input/optiver-realized-volatility-prediction')



# Training table read from train.csv under PATH (its stock_id column is referenced by the
# commented-out preprocessing cell further down).
train_df = pd.read_csv(PATH/'train.csv')


In [3]:
def fix_offsets(data_df):
    """Shift ``seconds_in_bucket`` so that every ``time_id`` starts at second 0.

    The smallest ``seconds_in_bucket`` of each ``time_id`` is joined back onto the
    rows as a new ``offset`` column and subtracted from ``seconds_in_bucket``.

    Returns a new frame (the input is not modified) that still carries the
    ``offset`` column.
    """
    first_second = (
        data_df.groupby(['time_id'])
        .agg({'seconds_in_bucket': 'min'})
        .rename(columns={'seconds_in_bucket': 'offset'})
    )
    shifted = data_df.join(first_second, on='time_id')
    shifted['seconds_in_bucket'] = shifted['seconds_in_bucket'] - shifted['offset']
    return shifted
def ffill(data_df):
    """Expand every ``time_id`` to a dense grid of seconds 0..599, forward-filling gaps.

    The frame is indexed by (time_id, seconds_in_bucket), reindexed onto the full
    product of the observed time_ids and ``np.arange(0, 600)`` with
    ``method='ffill'``, and returned with the two keys restored as columns.
    """
    keys = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(keys)
    full_grid = pd.MultiIndex.from_product(
        [indexed.index.levels[0], np.arange(0, 600)],
        names=keys,
    )
    dense = indexed.reindex(full_grid, method='ffill')
    return dense.reset_index()

def load_data(fname):
    """Load one stock's book parquet and return it as a dense float32 tensor.

    Args:
        fname: path of the form ``.../stock_id=<id>``; the text after the first
            ``=`` is used as the stock id.

    Returns:
        (data, row_ids): ``data`` holds the eight level-1/level-2 price and size
        columns with 600 rows per time_id (after ``fix_offsets`` and ``ffill``);
        ``row_ids`` is a list of ``'<stock_id>-<time_id>'`` strings in the order
        the time_ids first appear in the file. No normalisation is applied here.
    """
    book = pd.read_parquet(fname)
    stock_id = str(fname).split('=')[1]
    row_ids = [f'{stock_id}-{time_id}' for time_id in book.time_id.unique()]

    dense_book = ffill(fix_offsets(book))
    feature_cols = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1',
                    'bid_price2', 'ask_price2', 'bid_size2', 'ask_size2']
    values = dense_book[feature_cols].to_numpy().astype('float32')
    return torch.tensor(values), row_ids

In [4]:
# Precomputed per-row feature table; its columns are listed by the `train_ftrs.columns` cell below.
train_ftrs = pd.read_feather('train_24cols.feather')

In [5]:
# One-off preprocessing, kept commented out: runs load_data for every stock in
# train_df, concatenates the resulting tensors and caches them as torch_data2.pth
# (the file read back by the torch.load cell below). The last line checks that the
# generated row ids match train_ftrs.row_id element by element.
# all_data =[]
# all_id=[]
# for stock_id in tqdm.tqdm_notebook(train_df.stock_id.unique()):
#     data, ids = load_data(PATH/f'book_train.parquet/stock_id={stock_id}')
#     all_data.append(data)
#     all_id.append(ids)

# all_data=torch.cat(all_data, dim=0)

# torch.save(all_data, PATH/'torch_data2.pth')

# chained_ids = list(itertools.chain(*all_id))

# (train_ftrs.row_id ==chained_ids).all()

In [6]:
def add_target_bin(train_df, n_bins=10):
    """Add a ``target_bin`` column holding an (approximately) equal-frequency class label.

    The sorted targets are cut into consecutive chunks of ``ceil(len / n_bins)``
    values. The upper edge of a bin is the first value of the following chunk, and
    the last bin is open-ended, so every target receives a label in
    ``0 .. number_of_bins - 1`` regardless of its magnitude.

    Args:
        train_df: frame with a numeric ``target`` column. It is modified in place.
        n_bins: requested number of bins (default 10). Fewer bins are produced when
            the chunk size does not divide the number of rows evenly.

    Returns:
        The same ``train_df`` object, with ``target_bin`` added.

    Raises:
        ValueError: if ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError('n_bins must be at least 1')

    sorted_targets = np.sort(train_df.target.to_numpy())
    n_rows = len(sorted_targets)
    if n_rows == 0:
        # Nothing to bin; avoid a zero step in range() below.
        train_df['target_bin'] = np.zeros(0, dtype=np.int64)
        return train_df

    step = (n_rows + n_bins - 1) // n_bins
    # Upper edge of every bin except the last = first value of the next chunk.
    edges = [sorted_targets[j] for j in range(step, n_rows, step)]
    # Open-ended last bin: a finite sentinel would break monotonicity (and push
    # large targets into an extra class) whenever targets reach that sentinel.
    edges.append(np.inf)

    train_df['target_bin'] = np.digitize(train_df.target, edges)
    return train_df

In [7]:
%%time
# Load the cached tensor written by the (commented-out) preprocessing cell above.
torch_data = torch.load(PATH/'torch_data2.pth')

CPU times: user 0 ns, sys: 3.2 s, total: 3.2 s
Wall time: 2min 3s


In [8]:
# Squash columns 2, 3, 6 and 7 with x -> sqrt(1 / (1 + x)). In load_data's column
# order these are bid_size1, ask_size1, bid_size2 and ask_size2 (assuming torch_data
# was produced by load_data).
# NOTE: torch_data is overwritten in place, so re-running this cell applies the
# transform again on already-transformed values.
for c in [2,3,6,7]:
    torch_data[:,c] = (1 /  (1+torch_data[:,c])).sqrt()

In [9]:
# Standardise each column with its mean and standard deviation taken over all rows.
# NOTE: torch_data is rebound to the standardised tensor, so re-running this cell
# recomputes the statistics from already-standardised data.
means, stds = torch_data.mean(dim=0), torch_data.std(dim=0)
print(means, stds)
torch_data = (torch_data - means) / stds

tensor([0.9997, 1.0003, 0.1468, 0.1440, 0.9995, 1.0005, 0.1495, 0.1459]) tensor([0.0037, 0.0037, 0.1732, 0.1694, 0.0037, 0.0037, 0.1745, 0.1697])


In [10]:
#torch.save(stds, 'models/conv_stds.pth')

In [11]:
# First row of every 600-row block in torch_data (load_data's ffill emits 600 rows per time_id).
offset = list(range(0, len(torch_data), 600))

In [12]:
# Attach each feature row's block start in torch_data. This assumes train_ftrs rows are
# in the same order as the blocks; the commented-out row_id comparison in the
# preprocessing cell is the check for that.
train_ftrs['offset']=offset

In [13]:
# Display the available columns before selecting the subset to keep.
train_ftrs.columns

Index(['log_return2_std_0_600', 'stock_id', 'row_id', 'time_id', 'target',
       'log_return_price_std_0_600', 'order_count_sum_0_600',
       'seconds_in_bucket_size_0_600', 'size_sum_0_600',
       'log_return1_std_0_600_min_time', 'log_return1_std_0_600_mean_time',
       'log_return1_std_0_600_min_stock', 'log_return1_std_0_600_mean_stock',
       'log_return1_std_0_600', 'log_return1_std_200_600',
       'log_return1_std_400_600', 'price_spread_mean_0_600',
       'log_return_price_std_0_600_mean_time',
       'log_return_price_std_200_600_mean_time',
       'log_return_price_std_400_600_mean_time',
       'log_return_price_std_0_600_min_time',
       'log_return_price_std_200_600_min_time',
       'log_return_price_std_400_600_min_time', 'total_volume_mean_0_600',
       'offset'],
      dtype='object')

In [14]:
# Identifier, target and feature columns retained for modelling, plus the 'offset'
# pointer into torch_data. Every other column of train_ftrs is dropped.
cols_to_keep = [
    'stock_id', 'row_id', 'time_id', 'target',
    'log_return_price_std_0_600', 'order_count_sum_0_600',
    'seconds_in_bucket_size_0_600', 'size_sum_0_600',
    'log_return1_std_0_600_min_time', 'log_return1_std_0_600_mean_time',
    'log_return1_std_0_600_min_stock', 'log_return1_std_0_600_mean_stock',
    'log_return_price_std_0_600_mean_time',
    'log_return_price_std_200_600_mean_time',
    'log_return_price_std_400_600_mean_time',
    'log_return_price_std_0_600_min_time',
    'log_return_price_std_200_600_min_time',
    'log_return_price_std_400_600_min_time', 'total_volume_mean_0_600',
    'offset',
]

# Take an explicit copy: a later cell assigns a new column to this frame, and assigning
# to a column-subset selection triggers pandas' SettingWithCopyWarning.
train_ftrs = train_ftrs[cols_to_keep].copy()

In [15]:
# Add the 'target_bin' class label; get_dls uses it as the target when target_category is True.
train_ftrs = add_target_bin(train_ftrs)

In [16]:
# Replace every missing value in the feature table with 0.
train_ftrs = train_ftrs.fillna(0)

In [17]:
#TODO: tau?

In [18]:
class ReadBatch(ItemTransform):
    """Batch transform that pairs tabular features with their raw book sequences.

    Used as ``after_batch`` in ``get_dls``. For every row of the incoming batch it
    looks up the matching 600-step block in the module-level ``torch_data`` tensor
    through the row's ``offset`` column.
    """

    def encodes(self, to):
        """Build ``(cats, conts, book_data, targets)`` for one batch.

        ``to`` must expose an ``offset`` column plus ``cats``, ``conts``, ``targ``
        and ``device`` (presumably a TabularPandas slice -- confirm against fastai).
        ``book_data`` is returned as (batch, 8, 600), i.e. channels first, which is
        the layout the Conv1d layers consume.
        """
        book_offsets = torch.tensor(to['offset'].to_numpy()).long()
        # Offsets are row positions in torch_data; convert to block indices.
        # torch.div with an explicit rounding mode replaces the deprecated tensor `//`.
        block_idx = torch.div(book_offsets, 600, rounding_mode='floor')
        book_data = torch_data.view(-1, 600, 8)[block_idx, :, :]
        book_data = book_data.permute(0, 2, 1)
        res = (tensor(to.cats).long(), tensor(to.conts).float(), book_data)
        res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

In [19]:
# First GroupKFold split, grouped by time_id so rows sharing a time_id stay on one side of the split.
trn_idx, val_idx = first(GroupKFold().split(train_ftrs, groups = train_ftrs.time_id))

## Pretraining with CrossEntropy

In [20]:
def get_dls(train_ftrs, trn_idx, val_idx, target_category):
    """Build fastai tabular DataLoaders whose batches also carry the raw book data.

    Args:
        train_ftrs: feature frame containing both ``target`` and ``target_bin``.
        trn_idx, val_idx: row positions of the training / validation split.
        target_category: if truthy, predict ``target_bin``; otherwise ``target``.
            The column that is not used as target is dropped.

    Returns:
        DataLoaders with batch size 1024 and ``ReadBatch`` as ``after_batch``.
    """
    if target_category:
        target, unused = 'target_bin', 'target'
    else:
        target, unused = 'target', 'target_bin'
    train_ftrs = train_ftrs.drop(unused, axis=1)

    cont_nn, cat_nn = cont_cat_split(train_ftrs, max_card=9000, dep_var=target)
    # 'offset' only points into torch_data for ReadBatch; it is not a model input.
    cont_nn.remove('offset')
    # Identifier columns are not model inputs either.
    cat_nn = [name for name in cat_nn if name not in ('row_id', 'time_id')]

    to_nn = TabularPandas(
        train_ftrs,
        [Categorify, Normalize],
        cat_nn,
        cont_nn,
        splits=[list(trn_idx), list(val_idx)],
        y_names=target,
    )
    return to_nn.dataloaders(1024, after_batch=ReadBatch)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448278899/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [22]:
class ResBlock(nn.Module):
    """1-D residual block: conv-BN-ReLU-conv-BN plus identity skip, followed by ReLU.

    Both convolutions keep the channel count ``ch`` and the sequence length
    (kernel 5, padding 2, replicate padding).
    """

    def __init__(self, ch):
        super().__init__()

        def conv_bn():
            # One length-preserving convolution followed by batch norm.
            return [
                nn.Conv1d(ch, ch, kernel_size=5, padding=2, padding_mode='replicate'),
                nn.BatchNorm1d(ch),
            ]

        self.layers = nn.Sequential(*conv_bn(), nn.ReLU(), *conv_bn())

    def forward(self, x):
        return F.relu(self.layers(x) + x)

class ResnetModel(nn.Module):
    """1-D ResNet over the raw book sequence (8 channels x 600 steps) with a linear head.

    Args:
        num_outputs: number of outputs of the final linear layer.
        chan: channel width of the convolutional trunk.
        conv_depth: number of stages; each stage is ``res_width`` ResBlocks
            followed by ``AvgPool1d(3, padding=1)``.
        res_width: ResBlocks per stage.
        p: dropout probability applied to the flattened convolutional features.
        do_sigmoid: if True, squash the output to (0, 0.1) and flatten it to 1-D.
    """

    def __init__(self, num_outputs, chan=20, conv_depth=6, res_width=1,p=.1, do_sigmoid = False):
        super().__init__()
        self.do_sigmoid = do_sigmoid
        layers = [nn.Conv1d(8, chan, kernel_size=1), nn.BatchNorm1d(chan), nn.ReLU()]

        for _ in range(conv_depth):
            layers += [ResBlock(chan) for _ in range(res_width)]
            layers += [nn.AvgPool1d(3, padding=1)]
        layers += [Flatten(), nn.Dropout(p)]
        self.conv_layers = nn.Sequential(*layers)
        self.classifier = nn.Linear(self._conv_out_features(), num_outputs)

    def _conv_out_features(self):
        """Return the flattened feature width of ``conv_layers`` for an (8, 600) input."""
        # Probe in eval mode without gradients: a training-mode pass over a dummy
        # batch would update the BatchNorm running statistics before any real data.
        self.conv_layers.eval()
        with torch.no_grad():
            n_features = self.conv_layers(torch.ones(1, 8, 600)).shape[1]
        self.conv_layers.train()
        return n_features

    def forward(self, x_cat, x_cont, x_raw):
        """Predict from ``x_raw`` only; ``x_cat`` and ``x_cont`` are accepted but unused."""
        feat = self.conv_layers(x_raw)
        res = self.classifier(feat)
        if self.do_sigmoid:
            res = sigmoid_range(res, 0, .1).view(-1)
        return res

class ConvFeatModel(nn.Module):
    """Regression model combining embeddings, continuous features and conv features.

    Args:
        emb_szs: iterable of ``(n_categories, embedding_dim)``, one per categorical input.
        n_cont: number of continuous features.
        layer_sizes: hidden sizes of the fully connected head.
        conv_layers: module mapping a (batch, 8, 600) tensor to flat features
            (here: the trunk returned by ``pretrain``).
        embed_p: dropout probability on the concatenated embeddings.
        ps: dropout probability for each hidden layer of the head.

    The output goes through ``SigmoidRange(0, 0.1)``.
    """

    def __init__(self, emb_szs, n_cont, layer_sizes, conv_layers, embed_p,ps):
        super().__init__()
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.conv_layers = conv_layers
        self.bn_cont = nn.BatchNorm1d(n_cont)
        conv_out = self._conv_out_features()
        n_emb = sum(e.embedding_dim for e in self.embeds)
        sizes = [n_emb + n_cont + conv_out] + list(layer_sizes) + [1]
        actns = [nn.ReLU() for _ in range(len(sizes)-2)] + [None]
        # The final (output) layer gets no batch norm, no dropout and no activation.
        layers = [LinBnDrop(sizes[i], sizes[i+1], bn = (i!=len(actns)-1), p=p, act=a, lin_first=False)
                       for i,(p,a) in enumerate(zip(list(ps)+[0.],actns))]
        layers.append(SigmoidRange(0, 0.1))
        self.layers = nn.Sequential(*layers)

    def _conv_out_features(self):
        """Return the flat feature width of ``conv_layers`` for an (8, 600) input.

        The probe runs on whatever device the trunk already lives on (CPU if it has
        no parameters) instead of forcing CUDA, and in eval mode without gradients
        so the BatchNorm statistics of a pretrained trunk are left untouched. The
        train/eval flag of every submodule is restored afterwards.
        """
        first_param = next(self.conv_layers.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        saved_modes = [(m, m.training) for m in self.conv_layers.modules()]
        self.conv_layers.eval()
        try:
            with torch.no_grad():
                probe = torch.ones(1, 8, 600, device=device)
                n_features = self.conv_layers(probe).shape[1]
        finally:
            for module, was_training in saved_modes:
                module.training = was_training
        return n_features

    def forward(self, x_cat, x_cont, x_raw):
        """Concatenate embedded categoricals, normalised continuous and conv features, then apply the head."""
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x_cont = self.bn_cont(x_cont)
        x_conv = self.conv_layers(x_raw)
        x = torch.cat([x, x_cont, x_conv], 1)
        return self.layers(x)

In [26]:
def CE_loss(inp, tar):
    """Cross-entropy between logits ``inp`` and class labels ``tar``.

    ``tar`` may have any shape (e.g. a (batch, 1) column) and any numeric dtype;
    it is flattened and cast to int64 before the loss is computed.
    """
    class_idx = tar.view(-1).long()
    return F.cross_entropy(inp, class_idx)

In [27]:
def pretrain(dls):
    """Pretrain the convolutional trunk as a 10-class classifier and return it.

    A ``ResnetModel`` with 10 outputs is trained on ``dls`` with cross-entropy for
    12 one-cycle epochs (max learning rate 1e-3). Only the trained ``conv_layers``
    are returned; the classification head is discarded.
    """
    trunk_config = dict(chan=64, conv_depth=6, res_width=2, p=.4)
    n_epochs = 12
    classifier = ResnetModel(10, **trunk_config)
    learn = Learner(dls, classifier, metrics=[accuracy], loss_func=CE_loss)
    learn.fit_one_cycle(n_epochs, 1e-3)
    return learn.model.conv_layers

epoch,train_loss,valid_loss,accuracy,time
0,1.64953,1.649736,0.343409,01:10
1,1.513815,1.533154,0.38718,01:04
2,1.464689,1.55924,0.376491,01:04
3,1.444468,1.448902,0.407101,01:05
4,1.430218,1.432181,0.416205,01:05
5,1.420035,1.433318,0.415261,01:05
6,1.406995,1.423563,0.417033,01:04
7,1.396461,1.415531,0.418467,01:06
8,1.389022,1.397896,0.42785,01:07
9,1.383045,1.389751,0.43024,01:05


In [33]:
def split_2way(model):
    """Learner splitter: parameter group 1 is the conv trunk, group 2 the head plus embeddings."""
    trunk_params = params(model.conv_layers)
    head_params = params(model.layers) + params(model.embeds)
    return L(trunk_params, head_params)
def rmspe(preds, targs):
    """Root mean squared percentage error: sqrt(mean(((targs - preds) / targs) ** 2))."""
    relative_error = (targs - preds) / targs
    squared = relative_error ** 2
    return squared.mean().sqrt()

In [62]:
def train(trn_idx, val_idx, fname):
    """Train one cross-validation fold end to end and save the fitted model.

    Steps: build regression and classification DataLoaders from the module-level
    ``train_ftrs`` for the given split, pretrain the conv trunk on the binned
    target, then fine-tune a ``ConvFeatModel`` on the real target with RMSPE as
    the loss.

    Args:
        trn_idx, val_idx: row positions of the training / validation rows.
        fname: path the whole fitted model is written to with ``torch.save``.

    Returns:
        Entry 2 of the recorder's values for the last epoch -- with a single metric
        this is presumably [train_loss, valid_loss, rmspe], i.e. the final RMSPE
        metric; confirm against fastai's Recorder.
    """
    dls_reg = get_dls(train_ftrs, trn_idx, val_idx, False)
    dls_cat = get_dls(train_ftrs, trn_idx, val_idx, True)

    # Stage 1: classification pretraining of the convolutional trunk.
    conv_layers = pretrain(dls_cat)

    # Stage 2: regression model on top of the pretrained trunk.
    model = ConvFeatModel(
        emb_szs=[(len(dls_reg.train.classes['stock_id']), 14)],
        n_cont=len(dls_reg.cont_names),
        layer_sizes=[1024, 512, 256],
        conv_layers=conv_layers,
        embed_p=.05,
        ps=[0, .4, 0],
    )
    learn = Learner(
        dls_reg,
        model,
        loss_func=rmspe,
        splitter=split_2way,
        metrics=AccumMetric(rmspe),
    )
    learn.fine_tune(70, 3e-3, freeze_epochs=3, lr_mult=50, wd=.25)

    torch.save(learn.model, fname)
    last_epoch_metrics = L(learn.recorder.values).itemgot(2)
    return last_epoch_metrics[-1]

In [64]:
# Average validation RMSPE over GroupKFold's folds (grouped by time_id).
# Fold 0 is skipped in the loop; `res` starts from a hard-coded 0.216926, presumably
# fold 0's validation RMSPE from a separate run -- TODO confirm the provenance.
res = 0.216926
for i, (trn_idx, val_idx) in enumerate(GroupKFold().split(train_ftrs, groups = train_ftrs.time_id)):
    if i==0: continue
    print('starting fold', i)
    res += train(trn_idx, val_idx, f'models/conv_feat_{i}.pth')
# Divide by 5: the hard-coded starting value plus the four folds trained above
# (GroupKFold defaults to 5 splits).
res / 5

starting fold 1


epoch,train_loss,valid_loss,accuracy,time
0,1.676432,1.626696,0.349925,00:33
1,1.532022,1.495707,0.393405,00:32
2,1.466167,1.4399,0.411543,00:33
3,1.444816,1.428299,0.417616,00:33
4,1.427851,1.469001,0.404152,00:32
5,1.41845,1.42845,0.416858,00:33
6,1.409114,1.406151,0.423747,00:33
7,1.396661,1.40775,0.423258,00:33
8,1.387068,1.392347,0.431347,00:33
9,1.381194,1.391143,0.429261,00:33


epoch,train_loss,valid_loss,rmspe,time
0,0.272148,0.254115,0.256782,00:27
1,0.272667,0.255982,0.261184,00:27
2,0.26026,0.234602,0.236823,00:27


epoch,train_loss,valid_loss,rmspe,time
0,0.225661,0.235189,0.237647,00:34
1,0.223664,0.232178,0.235607,00:33
2,0.223192,0.227173,0.230555,00:33
3,0.223619,0.230714,0.235362,00:33
4,0.221735,0.227579,0.23203,00:33
5,0.223988,0.229161,0.235207,00:33
6,0.222476,0.230457,0.236263,00:33
7,0.224233,0.23602,0.241067,00:33
8,0.222982,0.230641,0.23789,00:33
9,0.222155,0.225615,0.227839,00:33


starting fold 2


epoch,train_loss,valid_loss,accuracy,time
0,1.652648,1.565214,0.368627,00:32
1,1.501983,1.468224,0.394318,00:31
2,1.460203,1.454754,0.395659,00:31
3,1.442204,1.444055,0.399378,00:31
4,1.427605,1.42574,0.408831,00:31
5,1.420143,1.408239,0.416537,00:31
6,1.407269,1.402045,0.417049,00:31
7,1.405617,1.397282,0.421269,00:31
8,1.390002,1.381348,0.431422,00:31
9,1.385357,1.372697,0.432227,00:31


epoch,train_loss,valid_loss,rmspe,time
0,0.278421,0.259034,0.262084,00:25
1,0.268858,0.303047,0.305053,00:25
2,0.263334,0.229228,0.232473,00:25


epoch,train_loss,valid_loss,rmspe,time
0,0.231125,0.222674,0.224491,00:32
1,0.229132,0.218456,0.220344,00:32
2,0.227123,0.219812,0.221853,00:32
3,0.231652,0.222219,0.224297,00:32
4,0.229233,0.220485,0.222535,00:32
5,0.228239,0.222611,0.224431,00:32
6,0.232379,0.222939,0.226233,00:32
7,0.224774,0.217249,0.219115,00:32
8,0.226625,0.216602,0.218541,00:32
9,0.226062,0.219949,0.222175,00:32


starting fold 3


epoch,train_loss,valid_loss,accuracy,time
0,1.683511,1.632064,0.350815,00:31
1,1.511672,1.457437,0.400508,00:31
2,1.464432,1.652645,0.374537,00:31
3,1.445507,1.463521,0.398107,00:31
4,1.429974,1.417726,0.414345,00:31
5,1.42328,1.402986,0.422435,00:31
6,1.411054,1.392468,0.421549,00:31
7,1.39751,1.376879,0.430443,00:31
8,1.391726,1.370427,0.433299,00:31
9,1.386607,1.372704,0.434908,00:31


epoch,train_loss,valid_loss,rmspe,time
0,0.27533,0.243802,0.246171,00:25
1,0.270051,0.245127,0.249294,00:25
2,0.260354,0.22342,0.226036,00:25


epoch,train_loss,valid_loss,rmspe,time
0,0.231215,0.219736,0.221663,00:32
1,0.229873,0.220046,0.222195,00:32
2,0.228228,0.221321,0.223193,00:32
3,0.226023,0.21703,0.21909,00:32
4,0.229112,0.228921,0.230685,00:32
5,0.22898,0.216983,0.219189,00:32
6,0.225297,0.214285,0.216422,00:32
7,0.227437,0.223356,0.22511,00:32
8,0.229086,0.217176,0.219141,00:32
9,0.235043,0.216971,0.219614,00:32


starting fold 4


epoch,train_loss,valid_loss,accuracy,time
0,1.659894,1.56192,0.362635,00:33
1,1.507569,1.439054,0.404145,00:34
2,1.471946,1.433936,0.403714,00:33
3,1.454679,1.412001,0.41234,00:33
4,1.436234,1.430741,0.41002,00:33
5,1.428078,1.38062,0.425104,00:33
6,1.413073,1.379793,0.425606,00:33
7,1.410527,1.365341,0.429802,00:33
8,1.395147,1.363253,0.43211,00:33
9,1.389768,1.349844,0.435164,00:33


epoch,train_loss,valid_loss,rmspe,time
0,0.284618,0.275193,0.276918,00:26
1,0.264477,0.248613,0.249991,00:26
2,0.256493,0.224934,0.229951,00:27


epoch,train_loss,valid_loss,rmspe,time
0,0.227865,0.216071,0.217898,00:35
1,0.231643,0.21665,0.218911,00:33
2,0.227819,0.216811,0.21884,00:34
3,0.2389,0.222813,0.224283,00:34
4,0.228149,0.216241,0.218073,00:33
5,0.225785,0.217735,0.219459,00:33
6,0.22904,0.218335,0.220867,00:34
7,0.226718,0.214826,0.216584,00:35
8,0.227215,0.217459,0.219207,00:33
9,0.22977,0.229846,0.231756,00:33


0.21658809313793181

In [37]:
# NOTE(review): `learn` is not created by any top-level cell shown in this notebook
# (train() only builds a local one), and this cell's execution count (37) predates the
# cross-validation cells above -- it depends on kernel state from cells no longer present.
learn.fine_tune(70,3e-3, freeze_epochs=3, lr_mult=50, wd = .25)

epoch,train_loss,valid_loss,rmspe,time
0,0.275173,0.262362,0.265055,00:28
1,0.265152,0.25296,0.255931,00:28
2,0.258208,0.24692,0.249237,00:28


epoch,train_loss,valid_loss,rmspe,time
0,0.22698,0.22715,0.22965,00:35
1,0.231793,0.229917,0.232149,00:35
2,0.229059,0.239129,0.241302,00:35
3,0.223637,0.229196,0.231194,00:35
4,0.226565,0.22665,0.22898,00:35
5,0.227131,0.222259,0.224526,00:34
6,0.223892,0.222197,0.224238,00:34
7,0.225444,0.223182,0.225492,00:34
8,0.224455,0.221668,0.223891,00:34
9,0.232144,0.225249,0.227932,00:34


In [39]:
# Save the fold-0 model next to the conv_feat_{i}.pth files written by train().
# NOTE(review): relies on a top-level `learn` that no cell shown here defines.
torch.save(learn.model, 'models/conv_feat_0.pth')

In [47]:
# Per-stock RMSPE of the fitted model over every row of train_ftrs for that stock
# (no train/validation split is applied here).
# NOTE(review): `dls_reg` and `model` are not defined by any top-level cell shown in this
# notebook (train() only creates them locally); they must come from earlier kernel state.
# Inference must run in eval mode: otherwise dropout stays active and BatchNorm uses
# per-batch statistics, which distorts the reported errors.
model.eval()
for stock_id in train_ftrs.stock_id.unique():
    train_ftrs_stock = train_ftrs[train_ftrs.stock_id==stock_id].copy()
    test_dl = dls_reg.test_dl(train_ftrs_stock)
    preds,targs=[],[]
    for batch in test_dl:
        bx1, bx2, bx3, by = [x.cuda() for x in batch]
        with torch.no_grad():
            pred = model(bx1,bx2,bx3)
        preds.append(pred)
        targs.append(by)
    res = rmspe(torch.cat(preds, dim=0), torch.cat(targs))
    print(stock_id, res.item())

0 0.23483806848526
1 0.18679066002368927
2 0.17933864891529083
3 0.20766259729862213
4 0.22986364364624023
5 0.22697193920612335
6 0.19317428767681122
7 0.22093327343463898
8 0.2011977732181549
9 0.23397405445575714
10 0.16617223620414734
11 0.21619702875614166
13 0.18527881801128387
14 0.16212579607963562
15 0.18375582993030548
16 0.24070972204208374
17 0.18411044776439667
18 0.271755576133728
19 0.21554110944271088
20 0.1698087602853775
21 0.21279124915599823
22 0.19733291864395142
23 0.19263514876365662
26 0.18502844870090485
27 0.24918240308761597
28 0.20077313482761383
29 0.16698721051216125
30 0.23943471908569336
31 0.3952319025993347
32 0.186578631401062
33 0.2664206922054291
34 0.17956489324569702
35 0.16802597045898438
36 0.17504224181175232
37 0.2769724726676941
38 0.2212161272764206
39 0.19005519151687622
40 0.22624565660953522
41 0.20023591816425323
42 0.20268729329109192
43 0.18926960229873657
44 0.16871188580989838
46 0.2005273401737213
47 0.20172810554504395
48 0.1921295