In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for name in filenames:
        full_path = os.path.join(dirname, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
import sys, os


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a DataFrame.

    Args:
        mode: File stem of the CSV to read, e.g. "train" or "test".
        path: Directory that contains the CSV file.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(f'{path}/{mode}.csv')

# Rows to predict: the "test" CSV of the competition dataset.
df = load_data(mode="test")
# Quick sanity output: frame shape and the largest stock_id present.
print(df.shape, df.stock_id.max())
df.head()

In [None]:
# Divisor applied to the model output when the submission is written
# (the clipped predictions are divided by SCALE further down).
SCALE = 100
# Root directory of the competition dataset.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"

# Every entry two directory levels below book_test.parquet.
order_book_paths = glob.glob(f'{PATH}/book_test.parquet/*/*')
# Cell output: number of order-book paths found.
len(order_book_paths)

In [None]:
# Every entry two directory levels below trade_test.parquet.
trade_paths = glob.glob(f'{PATH}/trade_test.parquet/*/*')
# Cell output: number of trade paths found.
len(trade_paths)

In [None]:
# Nested lookup table: order_books[stock_id][time_id] -> DataFrame holding the
# order-book rows of that bucket, re-indexed from 0.
order_books = dict()


for path in tqdm(order_book_paths):
    # The stock id is the text between the first "=" in the path and the
    # next "/" after it.
    stock_id = int(path.split("=")[1].split("/")[0])
    book_df = pd.read_parquet(path)

    # Split the frame in a single groupby pass instead of re-scanning the
    # whole frame with a boolean mask once per time_id. sort=False keeps the
    # groups in first-appearance order, matching book_df.time_id.unique().
    books_by_time = {
        time_id: group.reset_index(drop=True)
        for time_id, group in book_df.groupby("time_id", sort=False)
    }

    order_books[stock_id] = books_by_time

In [None]:
# Nested lookup table: trades[stock_id][time_id] -> DataFrame holding the
# trade rows of that bucket, re-indexed from 0.
trades = dict()


for path in tqdm(trade_paths):
    # The stock id is the text between the first "=" in the path and the
    # next "/" after it.
    stock_id = int(path.split("=")[1].split("/")[0])
    trade_df = pd.read_parquet(path)

    # Split the frame in a single groupby pass instead of re-scanning the
    # whole frame with a boolean mask once per time_id. sort=False keeps the
    # groups in first-appearance order, matching trade_df.time_id.unique().
    trade_by_time = {
        time_id: group.reset_index(drop=True)
        for time_id, group in trade_df.groupby("time_id", sort=False)
    }

    trades[stock_id] = trade_by_time

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


# Per-feature standardisation constants. Each vector is subtracted from /
# divided into a feature matrix element-wise, so the entries follow the column
# order of OptiverDataset.order_features and OptiverDataset.trade_features.
means_order = torch.tensor(
    [0.9997, 1.0003, 769.9902, 766.7346, 0.9995, 1.0005, 959.3417, 928.2203, 300],
    dtype=torch.float32,
)
stds_order = torch.tensor(
    [3.6881e-03, 3.6871e-03, 5.3541e+03, 4.9549e+03, 3.7009e-03, 3.6991e-03,
     6.6838e+03, 5.7353e+03, 300],
    dtype=torch.float32,
)

means_trade = torch.tensor([300, 1.0, 100, 3.0], dtype=torch.float32)
stds_trade = torch.tensor([300, 0.004, 153, 3.5], dtype=torch.float32)



class OptiverDataset(Dataset):
    """Serve one (order-book, trade, stock, target) sample per row of ``df``.

    Samples are looked up in the module-level ``order_books`` and ``trades``
    dictionaries (``data[stock_id][time_id] -> DataFrame``) and standardised
    with the module-level ``means_*`` / ``stds_*`` tensors.
    """

    def __init__(self, df, aug=False):
        """Store the sample index frame and the feature column lists.

        Args:
            df: Frame with one row per sample; ``stock_id`` and ``time_id``
                are read from each row.
            aug: Stored on the instance; no method of this class reads it.
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.aug = aug
        # Fixed number of rows in every returned feature matrix.
        self.seq_len = 600
        self.order_features = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1','bid_price2', 
                         'ask_price2', 'bid_size2', 'ask_size2', "seconds_in_bucket"]
        self.trade_features = ["seconds_in_bucket", "price", "size", "order_count"]

    def extract_features(self, data_dict, stock_id, time_id, features, means, stds):
        """Build a right-aligned, standardised ``(seq_len, len(features))`` matrix.

        Args:
            data_dict: Nested mapping ``data_dict[stock_id][time_id] -> DataFrame``.
            stock_id: Outer key into ``data_dict``.
            time_id: Inner key into ``data_dict[stock_id]``.
            features: Column names to extract, in output column order.
            means: Tensor subtracted from the extracted values.
            stds: Tensor the centred values are divided by.

        Returns:
            A float tensor filled with -1 whose last rows hold the standardised
            features. It is all -1 when the stock/time pair is absent from
            ``data_dict`` or its frame has no rows. Frames longer than
            ``seq_len`` contribute only their last ``seq_len`` rows.

        Raises:
            KeyError: If a name in ``features`` is not a column of the frame.
                This is a configuration error, so it is no longer swallowed.
        """
        X = -torch.ones((self.seq_len, len(features)))

        # Only a missing stock/time bucket is an expected condition; any other
        # failure should surface instead of silently producing padding.
        try:
            frame = data_dict[stock_id][time_id]
        except KeyError:
            return X

        # Keep at most the last seq_len rows so the assignment below always fits.
        feature_array = frame[features].values[-self.seq_len:]
        n_rows = feature_array.shape[0]
        if n_rows == 0:
            # X[-0:] would address the whole tensor, so return padding explicitly.
            return X

        X[-n_rows:] = (torch.FloatTensor(feature_array) - means) / stds
        return X

    def __getitem__(self, index):
        """Return ``(order_features, trade_features, stock_id, target)`` for one row.

        The target is a constant 0.0 placeholder tensor; the stock id is
        returned as a one-element long tensor.
        """
        row = self.df.iloc[index]

        X1 = self.extract_features(order_books, row.stock_id, row.time_id, self.order_features,
                                   means_order, stds_order)
        # extract_features already returns padding when no trades exist for the
        # bucket, so no additional fallback is needed here.
        X2 = self.extract_features(trades, row.stock_id, row.time_id, self.trade_features,
                                   means_trade, stds_trade)
        target = torch.FloatTensor([0.0])
        stock = torch.LongTensor([row.stock_id])
        return X1, X2, stock, target

    def __len__(self):
        """Number of samples, i.e. rows of the index frame."""
        return self.df.shape[0]
    
ds = OptiverDataset(df)
# Smoke-test a single sample so feature-extraction problems show up before
# inference. The index is clamped so a one-row frame does not raise IndexError,
# and nothing is fetched when the frame is empty.
ds[min(1, len(ds) - 1)] if len(ds) else None

In [None]:
class ConvBlock(nn.Module):
    """1-D convolution followed by batch normalisation and ReLU."""

    def __init__(self, in_dim, out_dim, kernel_size, stride=1):
        """
        Args:
            in_dim: Number of input channels.
            out_dim: Number of output channels.
            kernel_size: Convolution kernel width.
            stride: Convolution stride.
        """
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as they are.
        self.lin = nn.Conv1d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=kernel_size,
            stride=stride,
        )
        self.bn = nn.BatchNorm1d(num_features=out_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU to a ``(batch, channels, length)`` input."""
        return self.activation(self.bn(self.lin(x)))
        

class SubModel(nn.Module):
    """Convolutional encoder for one sequence input, conditioned on a stock vector."""

    def __init__(self, in_dim):
        """
        Args:
            in_dim: Number of features per time step of the input sequence.
        """
        super().__init__()
        self.convs1 = nn.Sequential(
            ConvBlock(in_dim, 16, 3),
            ConvBlock(16, 32, 3),
        )
        # 36 input channels: the 32 channels produced by convs1 plus the
        # channels of the stock vector concatenated in forward().
        self.stock_conv = ConvBlock(36, 64, 4, stride=4)
        self.avg_pool = nn.AdaptiveAvgPool1d(8)
        self.max_pool = nn.AdaptiveMaxPool1d(8)
        # 128 input channels: the avg-pooled and max-pooled 64-channel maps stacked.
        # Three kernel-2 / stride-2 blocks reduce the pooled length 8 -> 4 -> 2 -> 1.
        self.convs2 = nn.Sequential(
            ConvBlock(128, 128, 2, stride=2),
            ConvBlock(128, 32, 2, stride=2),
            ConvBlock(32, 8, 2, stride=2),
        )

    def forward(self, x, s):
        """Encode ``x`` (batch, length, features) together with stock vector ``s``.

        ``s`` is tiled along the length axis and concatenated to the
        convolved sequence on the channel axis before further convolution.
        """
        seq_feats = self.convs1(x.transpose(2, 1))
        stock_tiled = s.repeat(1, 1, seq_feats.shape[2])
        mixed = self.stock_conv(torch.cat([seq_feats, stock_tiled], dim=1))
        pooled = torch.cat([self.avg_pool(mixed), self.max_pool(mixed)], dim=1)
        return self.convs2(pooled).squeeze(-1)
    
    
class Model(nn.Module):
    """Two-branch network: order-book and trade encoders joined by a linear head."""

    def __init__(self):
        super().__init__()
        # in_dim values match the lengths of the order/trade feature lists
        # used by the dataset (9 and 4 columns respectively).
        self.order_model = SubModel(in_dim=9)
        self.trade_model = SubModel(in_dim=4)
        self.top = nn.Linear(16, 1)
        self.stock_emb = nn.Embedding(127, 4)

    def forward(self, inputs):
        """Predict one value per sample.

        Args:
            inputs: Triple ``(order_sequence, trade_sequence, stock_ids)``.

        Returns:
            Output of the final linear layer applied to the concatenated
            branch encodings.
        """
        order_seq, trade_seq, stock_ids = inputs
        # Move the embedding dimension onto the channel axis for the conv branches.
        stock_vec = self.stock_emb(stock_ids).transpose(2, 1)

        order_enc = self.order_model(order_seq, stock_vec)
        trade_enc = self.trade_model(trade_seq, stock_vec)
        joined = torch.cat([order_enc, trade_enc], dim=1)
        return self.top(joined)

In [None]:
def read_data(data):
    """Move a batch to the GPU and split it into model inputs and target.

    Args:
        data: Sequence of objects exposing ``.cuda()``; the last element is
            the target, everything before it is model input.

    Returns:
        ``(inputs, target)`` where ``inputs`` is a tuple of the moved leading
        elements and ``target`` is the moved last element.
    """
    inputs = tuple(item.cuda() for item in data[:-1])
    target = data[-1].cuda()
    return inputs, target

def inference(model, loader):
    """Run ``model`` over every batch of ``loader`` and collect flat predictions.

    Args:
        model: Module called with the input tuple returned by ``read_data``.
        loader: Iterable of batches accepted by ``read_data``.

    Returns:
        1-D NumPy array with the raveled predictions of all batches,
        concatenated in loader order.
    """
    model.eval()

    collected = []
    progress = tqdm(loader, file=sys.stdout)

    # Gradients are not needed for prediction.
    with torch.no_grad():
        for batch in progress:
            inputs, _target = read_data(batch)
            output = model(inputs)
            collected.append(output.detach().cpu().numpy().ravel())

    return np.concatenate(collected)

# DataLoader settings and the number of per-fold checkpoints to average.
NW = 4
BS = 256
NUM_FOLDS = 5
loader = DataLoader(
    ds,
    batch_size=BS,
    shuffle=False,
    num_workers=NW,
    pin_memory=False,
    drop_last=False,
)


model = Model()
model = model.cuda()
model.eval()

# Average the predictions of all fold checkpoints: each fold contributes
# prediction / NUM_FOLDS, accumulated in fold order.
y = None
for i in range(NUM_FOLDS):
    model.load_state_dict(torch.load(f"/kaggle/input/models/optiver_nn_v01_{i}.pth"))
    fold_pred = inference(model, loader) / NUM_FOLDS
    y = fold_pred if y is None else y + fold_pred

In [None]:
# Clamp the averaged predictions at zero from below, divide by SCALE, and
# store the result on the frame.
df["target"] = np.clip(y, a_min=0.0, a_max=None) / SCALE

# Only row_id and target go into the submission file.
df.to_csv("submission.csv", columns=["row_id", "target"], index=False)


In [None]:
# Show the first rows of the frame, which now includes the "target" column.
df.head()
