In [None]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
import sys, os


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a DataFrame.

    Args:
        mode: Base name of the CSV file without its extension; the notebook
            uses "train" / "test".
        path: Directory that contains the CSV file.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    csv_location = "{}/{}.csv".format(path, mode)
    table = pd.read_csv(csv_location)
    return table

# Load the test index table, print its shape and largest stock_id as a quick
# sanity check, then display the first rows.
df = load_data(mode="test")
print(df.shape, df.stock_id.max())
df.head()

In [None]:
# Predictions are divided by SCALE before they are written out.
SCALE = 100
# Root directory of the competition data.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"

# Every entry two levels below book_test.parquet (sub-directory / file).
order_book_paths = glob.glob(PATH + "/book_test.parquet/*/*")
len(order_book_paths)

In [None]:
# Every entry two levels below trade_test.parquet (sub-directory / file).
trade_paths = glob.glob(PATH + "/trade_test.parquet/*/*")
len(trade_paths)

In [None]:
# order_books[stock_id][time_id] -> order book rows of that time_id, with the
# index reset to 0..n-1.
order_books = dict()


for path in tqdm(order_book_paths):
    # The stock id is the text between the first "=" in the path and the
    # following "/".
    stock_id = int(path.split("=")[1].split("/")[0])
    book_df = pd.read_parquet(path)

    # A single groupby pass replaces one full boolean-mask scan of the frame
    # per time_id. sort=False keeps the groups in first-appearance order,
    # which is the order unique() produced before.
    order_books[stock_id] = {
        time_id: bucket.reset_index(drop=True)
        for time_id, bucket in book_df.groupby("time_id", sort=False)
    }

In [None]:
# trades[stock_id][time_id] -> trade rows of that time_id, with the index
# reset to 0..n-1.
trades = dict()


for path in tqdm(trade_paths):
    # The stock id is the text between the first "=" in the path and the
    # following "/".
    stock_id = int(path.split("=")[1].split("/")[0])
    trade_df = pd.read_parquet(path)

    # A single groupby pass replaces one full boolean-mask scan of the frame
    # per time_id. sort=False keeps the groups in first-appearance order,
    # which is the order unique() produced before.
    trades[stock_id] = {
        time_id: bucket.reset_index(drop=True)
        for time_id, bucket in trade_df.groupby("time_id", sort=False)
    }

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


# Normalisation statistics for the order book features, one entry per column
# in the order of OptiverDataset.order_features.
means_order = torch.FloatTensor([
    0.9997,    # bid_price1
    1.0003,    # ask_price1
    769.9902,  # bid_size1
    766.7346,  # ask_size1
    0.9995,    # bid_price2
    1.0005,    # ask_price2
    959.3417,  # bid_size2
    928.2203,  # ask_size2
    300,       # seconds_in_bucket
])
stds_order = torch.FloatTensor([
    3.6881e-03,  # bid_price1
    3.6871e-03,  # ask_price1
    5.3541e+03,  # bid_size1
    4.9549e+03,  # ask_size1
    3.7009e-03,  # bid_price2
    3.6991e-03,  # ask_price2
    6.6838e+03,  # bid_size2
    5.7353e+03,  # ask_size2
    300,         # seconds_in_bucket
])

# Normalisation statistics for the trade features, in the order of
# OptiverDataset.trade_features: seconds_in_bucket, price, size, order_count.
means_trade = torch.FloatTensor([300, 1.0, 100, 3.0])
stds_trade = torch.FloatTensor([300, 0.004, 153, 3.5])



class OptiverDataset(Dataset):
    """Per-row dataset that pairs each (stock_id, time_id) with its sequences.

    Each item is ``(X1, X2, stock, target)``:

    * ``X1`` - normalised order book features, shape ``(seq_len, 9)``
    * ``X2`` - normalised trade features, shape ``(seq_len, 4)``
    * ``stock`` - ``LongTensor`` holding the row's ``stock_id``
    * ``target`` - ``FloatTensor([0.0])`` placeholder

    The sequences are read from the module-level ``order_books`` and
    ``trades`` dictionaries and normalised with the module-level mean/std
    tensors.

    NOTE(review): the target is always the constant 0.0, while ``loss_func``
    and ``rmspe`` in this notebook divide by the target. A dataset used for
    training therefore has to supply real targets.
    """

    def __init__(self, df, aug=False):
        """Store the index table (with a fresh 0..n-1 index) and feature lists.

        Args:
            df: DataFrame with at least ``stock_id`` and ``time_id`` columns.
            aug: Stored on the instance; no method of this class reads it.
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.aug = aug
        self.seq_len = 600
        self.order_features = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1','bid_price2', 
                         'ask_price2', 'bid_size2', 'ask_size2', "seconds_in_bucket"]
        self.trade_features = ["seconds_in_bucket", "price", "size", "order_count"]
        
    
    def extract_features(self, data_dict, stock_id, time_id, features, means, stds):
        """Build a right-aligned, normalised ``(seq_len, len(features))`` tensor.

        Args:
            data_dict: Mapping ``stock_id -> {time_id -> DataFrame}``.
            stock_id: Outer key to look up.
            time_id: Inner key to look up.
            features: Column names to extract, in output order.
            means: Per-feature means subtracted from the values.
            stds: Per-feature standard deviations the result is divided by.

        Returns:
            A float tensor filled with -1 except for the trailing rows, which
            hold ``(values - means) / stds``. If the stock/time pair is
            missing or its frame is empty, the whole tensor is -1. If the
            frame has more than ``seq_len`` rows, only the last ``seq_len``
            rows are kept.

        Raises:
            KeyError: If the frame exists but lacks one of ``features``. This
                used to be swallowed silently and is now surfaced.
        """
        X = -torch.ones((self.seq_len, len(features)))

        # Only "no data for this stock/time" is treated as a normal case;
        # any other problem is a real error and must not be hidden.
        frame = data_dict.get(stock_id, {}).get(time_id)
        if frame is None or len(frame) == 0:
            return X

        # Keep the most recent rows so the right-aligned assignment always fits.
        feature_array = frame[features].values[-self.seq_len:]
        X[-feature_array.shape[0]:] = (torch.FloatTensor(feature_array) - means)/stds
        return X


    def __getitem__(self, index):
        """Return ``(X1, X2, stock, target)`` for the row at ``index``."""
        row = self.df.iloc[index]
        
        X1 = self.extract_features(order_books, row.stock_id, row.time_id, self.order_features,
                                  means_order, stds_order)
        # Missing trade data already yields the -1 padding inside
        # extract_features, so no extra fallback is needed here.
        X2 = self.extract_features(trades, row.stock_id, row.time_id, self.trade_features,
                                  means_trade, stds_trade)
        target = torch.FloatTensor([0.0])
        stock = torch.LongTensor([row.stock_id])
        return X1, X2, stock, target

    def __len__(self):
        """Number of rows in the index table."""
        return self.df.shape[0]
    
# Wrap the index table in the dataset and fetch the item at index 1 as a
# smoke test.
ds = OptiverDataset(df=df)
ds[1]

In [None]:
class ConvBlock(nn.Module):
    """1-D convolution followed by batch normalisation and ReLU."""

    def __init__(self, in_dim, out_dim, kernel_size, stride=1):
        """
        Args:
            in_dim: Number of input channels.
            out_dim: Number of output channels.
            kernel_size: Convolution kernel size.
            stride: Convolution stride.
        """
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.lin = nn.Conv1d(in_channels=in_dim, out_channels=out_dim,
                             kernel_size=kernel_size, stride=stride)
        self.bn = nn.BatchNorm1d(num_features=out_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU to ``x``."""
        return self.activation(self.bn(self.lin(x)))
        

class SubModel(nn.Module):
    """Convolutional encoder for one sequence input plus a stock vector.

    For an input of shape ``(batch, seq, in_dim)`` and a stock tensor that can
    be repeated along the last axis, ``forward`` returns ``(batch, 8)`` when
    the final convolutions reduce the pooled length of 8 down to 1.
    """

    def __init__(self, in_dim):
        """
        Args:
            in_dim: Number of features per time step of the sequence input.
        """
        super().__init__()
        self.convs1 = nn.Sequential(
            ConvBlock(in_dim, 16, 3),
            ConvBlock(16, 32, 3),
        )
        # 36 input channels: the 32 produced by convs1 plus the channels of
        # the stock tensor concatenated in forward().
        self.stock_conv = ConvBlock(36, 64, 4, stride=4)
        self.avg_pool = nn.AdaptiveAvgPool1d(8)
        self.max_pool = nn.AdaptiveMaxPool1d(8)
        # 128 input channels: 64 average-pooled + 64 max-pooled.
        self.convs2 = nn.Sequential(
            ConvBlock(128, 128, 2, stride=2),
            ConvBlock(128, 32, 2, stride=2),
            ConvBlock(32, 8, 2, stride=2),
        )

    def forward(self, x, s):
        """Encode ``x`` conditioned on ``s``.

        Args:
            x: Sequence tensor; axes 1 and 2 are swapped so that features
                become channels before the convolutions.
            s: Stock tensor, repeated along its last axis to the length of
                the convolved sequence and concatenated as extra channels.
        """
        seq_features = self.convs1(x.transpose(2, 1))
        n_steps = seq_features.shape[2]
        stock_channels = s.repeat(1, 1, n_steps)
        mixed = self.stock_conv(torch.cat([seq_features, stock_channels], dim=1))
        pooled = torch.cat([self.avg_pool(mixed), self.max_pool(mixed)], dim=1)
        return self.convs2(pooled).squeeze(-1)
    
    
class Model(nn.Module):
    """Two SubModel encoders (order book, trades) joined by a linear head."""

    def __init__(self):
        super().__init__()
        # Construction order and attribute names are kept as-is so parameter
        # initialisation and state_dict keys do not change.
        self.order_model = SubModel(in_dim=9)
        self.trade_model = SubModel(in_dim=4)
        # Input of size 16: the two encoder outputs concatenated.
        self.top = nn.Linear(16, 1)
        self.stock_emb = nn.Embedding(127, 4)

    def forward(self, inputs):
        """Predict one value per sample.

        Args:
            inputs: Triple ``(x1, x2, s)`` of order book sequence, trade
                sequence and stock index tensor.
        """
        order_seq, trade_seq, stock_idx = inputs
        # Swap the last two axes of the embedding so its features act as
        # channels inside the sub-models.
        stock_vec = self.stock_emb(stock_idx).transpose(2, 1)

        order_code = self.order_model(order_seq, stock_vec)
        trade_code = self.trade_model(trade_seq, stock_vec)
        joined = torch.cat([order_code, trade_code], dim=1)
        return self.top(joined)
    
    


In [None]:
def read_data(data, device=None):
    """Move a batch to ``device`` and split it into inputs and target.

    Args:
        data: Indexable batch whose last element is the target and whose
            preceding elements are the model inputs.
        device: Destination device. ``None`` (the default) selects CUDA when
            it is available and falls back to the CPU otherwise, so existing
            ``read_data(batch)`` calls behave as before on a GPU machine.

    Returns:
        ``(inputs, target)`` where ``inputs`` is a tuple of tensors and
        ``target`` is a tensor, all on ``device``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tuple(d.to(device) for d in data[:-1])
    target = data[-1].to(device)
    return inputs, target

def inference(model, loader, num_folds=5,
              weights_path="/kaggle/input/optiver-nn/optiver_nn_v01_{fold}.pth"):
    """Average the predictions of ``num_folds`` checkpoints over ``loader``.

    Args:
        model: Network whose architecture matches the checkpoints. It is put
            in eval mode and its weights are overwritten with each fold's
            state dict in turn.
        loader: Iterable of batches accepted by ``read_data``.
        num_folds: Number of checkpoints to average; must be at least 1.
        weights_path: Checkpoint path template; ``{fold}`` is replaced by the
            fold index ``0 .. num_folds - 1``.

    Returns:
        1-D numpy array with the fold-averaged prediction for every sample,
        in loader order. Empty if the loader yields no batches.

    Raises:
        ValueError: If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError(f"num_folds must be >= 1, got {num_folds}")

    model.eval()
    
    tbar = tqdm(loader, file=sys.stdout)
    
    preds = []

    # Load every checkpoint once, directly onto the device the model lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model_weights = {i: torch.load(weights_path.format(fold=i), map_location=device)
                     for i in range(num_folds)}

    with torch.no_grad():
        for data in tbar:
            # The target element of the batch is not needed for inference.
            inputs, _ = read_data(data)

            model.load_state_dict(model_weights[0])
            pred = model(inputs)/num_folds
            for i in range(1, num_folds):
                model.load_state_dict(model_weights[i])
                pred += model(inputs)/num_folds

            preds.append(pred.detach().cpu().numpy().ravel())

    if not preds:
        # np.concatenate rejects an empty list; an empty loader gives an
        # empty prediction vector instead.
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(preds)

# DataLoader settings for inference.
NW = 4    # num_workers
BS = 256  # batch_size
loader = DataLoader(
    dataset=ds,
    batch_size=BS,
    shuffle=False,
    num_workers=NW,
    pin_memory=False,
    drop_last=False,
)


# Build the network on the GPU and run the fold-averaged inference.
model = Model().cuda()

y = inference(model, loader)

In [None]:
# Negative model outputs are clipped to zero, then the SCALE factor is undone.
df["target"] = np.clip(y, 0.0, None) / SCALE

# Only row_id and target go into the submission file.
df[["row_id", "target"]].to_csv("submission.csv", index=False)

In [None]:
# Show the first five rows, which now include the written target column.
df.head(5)

## Required functions for training

Training 1 fold takes 1 hour on a laptop with an Nvidia Quadro RTX 5000 GPU.

In [None]:
def adjust_lr(optimizer, epoch):
    """Set the step-wise learning rate for ``epoch`` on every param group.

    Schedule: 5e-5 for ``epoch < 1``, 1e-3 for ``epoch < 10``, 1e-4 for
    ``epoch < 27`` and 1e-5 afterwards.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        epoch: Zero-based epoch index.

    Returns:
        The learning rate that was written to the param groups.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (10, 1e-3), (27, 1e-4))
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            lr = rate
            break
    else:
        lr = 1e-5

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Create an Adam optimizer over the trainable parameters of ``net``.

    Parameters with ``requires_grad == False`` are left out. The initial
    learning rate is 3e-4 with betas (0.9, 0.999) and eps 1e-08.
    """
    trainable_params = [p for p in net.parameters() if p.requires_grad]
    return torch.optim.Adam(trainable_params, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, with predictions clipped at zero.

    Args:
        y_true: Array of reference values (used as the divisor).
        y_pred: Array of predictions; negative entries are treated as 0.
    """
    clipped_pred = np.clip(y_pred, 0, None)
    relative_error = (y_true - clipped_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))


def loss_func(y_pred, y_true):
    """Mean squared percentage error between ``y_pred`` and ``y_true``.

    The error of every element is divided by the corresponding ``y_true``
    value before squaring and averaging.
    """
    relative_error = (y_true - y_pred) / y_true
    return relative_error.pow(2).mean()


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` without gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        val_loader: Iterable of batches accepted by ``read_data``.

    Returns:
        ``(labels, preds)`` as flat numpy arrays in loader order.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)

    pred_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)

            output = model(inputs)

            pred_chunks.append(output.detach().cpu().numpy().ravel())
            label_chunks.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(label_chunks), np.concatenate(pred_chunks)



def train(model, train_loader, val_loader, epochs):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Each epoch sets the learning rate via ``adjust_lr``, optimises
    ``loss_func`` over ``train_loader``, then prints the train and validation
    RMSPE.

    Args:
        model: Network to train (modified in place).
        train_loader: Iterable of training batches accepted by ``read_data``.
        val_loader: Iterable of validation batches.
        epochs: Number of epochs to run.

    Returns:
        ``(model, val_preds)`` where ``val_preds`` are the validation
        predictions of the last epoch.
    """
    optimizer = get_optimizer(model)

    for epoch in range(epochs):
        model.train()
        progress = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, epoch)

        batch_losses, pred_chunks, label_chunks = [], [], []

        for batch in progress:
            inputs, target = read_data(batch)

            optimizer.zero_grad()
            pred = model(inputs)

            loss = loss_func(pred, target)
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.detach().cpu().item())
            pred_chunks.append(pred.detach().cpu().numpy().ravel())
            label_chunks.append(target.detach().cpu().numpy().ravel())

            # Running mean of the batch losses seen so far in this epoch.
            avg_loss = np.round(np.mean(batch_losses), 4)

            progress.set_description(f"Epoch {epoch+1} Loss: {avg_loss} lr: {lr}")

        val_labels, val_preds = validate(model, val_loader)
        val_metric = np.round(rmspe(val_labels, val_preds), 4)

        train_labels = np.concatenate(label_chunks)
        train_preds = np.concatenate(pred_chunks)
        train_metric = np.round(rmspe(train_labels, train_preds), 4)

        print(f"Epoch {epoch+1}\n Train metric: {train_metric}\nValidation metric: {val_metric}\n")
    return model, val_preds



def kfold_train(BS=512, NW=8, NUM_FOLDS=5, EPOCHS=30):
    """Train one model per fold and write the out-of-fold predictions.

    Rows of the module-level ``df`` are assigned to folds by
    ``time_id % NUM_FOLDS``. For every fold a fresh ``Model`` is trained on
    the other folds, its state dict is saved to
    ``models/optiver_nn_v01_<fold>.pth`` and its validation predictions are
    stored at the fold's row positions. At the end the predictions, divided
    by ``SCALE``, are added to ``df`` as column ``nn_pred`` and written to
    ``cache/optiver_nn_v01_oof.csv``.

    NOTE(review): ``OptiverDataset`` as defined in this notebook always
    returns a target of 0.0, and ``loss_func`` divides by the target; a
    dataset with real targets is required before running this.

    Args:
        BS: Batch size for both loaders.
        NW: Number of DataLoader worker processes.
        NUM_FOLDS: Number of folds.
        EPOCHS: Epochs to train each fold (previously fixed at 30).
    """
    # Create the output directories up front so that a missing folder cannot
    # make torch.save / to_csv fail after a whole fold has been trained.
    os.makedirs("models", exist_ok=True)
    os.makedirs("cache", exist_ok=True)

    oof_preds = np.zeros(df.shape[0])

    for fold in range(NUM_FOLDS):
        print(f"Fold {fold + 1}")
        train_ind = np.where(df["time_id"].values % NUM_FOLDS != fold)[0]
        val_ind = np.where(df["time_id"].values % NUM_FOLDS == fold)[0]

        train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]


        train_ds = OptiverDataset(train_df, aug=False)
        val_ds = OptiverDataset(val_df, aug=False)

        train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, num_workers=NW,
                                  pin_memory=False, drop_last=True)
        # The validation loader is neither shuffled nor truncated, so its
        # predictions line up with val_ind.
        val_loader = DataLoader(val_ds, batch_size=BS, shuffle=False, num_workers=NW,
                                  pin_memory=False, drop_last=False)

        model = Model()
        model.cuda()
        model, val_preds = train(model, train_loader, val_loader, epochs=EPOCHS)

        oof_preds[val_ind] = val_preds

        torch.save(model.state_dict(), f"models/optiver_nn_v01_{fold}.pth")
        
    df["nn_pred"] = oof_preds/SCALE
    df.to_csv("cache/optiver_nn_v01_oof.csv", index=False, columns=["stock_id", "time_id", "nn_pred"])