In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pickle

import torch
import torch.nn as nn

import matplotlib.pyplot as plt

Training: https://www.kaggle.com/narendra/optiver-baseline-gru/output?scriptVersionId=71057694

In [None]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


In [None]:
# Width, in units of `seconds_in_bucket`, of one aggregation bucket.
BUCKET_INTERVAL = 30


class config:
    """Static settings shared by the dataset and the model."""

    # Number of rows in each input sequence. Derived from BUCKET_INTERVAL so it
    # always matches the `600 // BUCKET_INTERVAL` bucket loop used during
    # preprocessing (20 with the current interval).
    num_buckets = 600 // BUCKET_INTERVAL
    # Width of one per-bucket feature vector.
    num_features = 26
    # Not referenced by the cells visible in this notebook.
    epochs = 50

In [None]:
# Columns that are averaged within each bucket.
mean_columns = [
    'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
    'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
    'wap', 'wap2', 'price_spread1', 'total_volume1', 'volume_spread1',
    'price_spread2', 'total_volume2', 'volume_spread2', 'total_volume',
]

# Columns for which a per-bucket standard deviation is computed.
std_columns = [
    'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
    'wap', 'wap2', 'price_spread1',
]

# Same column set as `mean_columns`, kept as independent list objects.
min_columns = list(mean_columns)
max_columns = list(mean_columns)

# Order of the values inside one per-bucket feature vector:
# all means, then all standard deviations, then the two volatility columns.
feature_columns = (
    ['mean_' + name for name in mean_columns]
    + ['std_' + name for name in std_columns]
    + ['rv1', 'rv2']
)

In [None]:
def calculate_wap(df):
    """Return the level-1 weighted average price for every row of `df`.

    Each price is weighted by the size quoted on the opposite side:
    (ask_price1 * bid_size1 + bid_price1 * ask_size1) / (ask_size1 + bid_size1).
    `df` itself is not modified.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = (ask_px * bid_sz) + (bid_px * ask_sz)
    return cross_weighted / (ask_sz + bid_sz)

def calculate_wap2(df):
    """Return the level-2 weighted average price for every row of `df`.

    Same formula as the level-1 version, applied to the `*2` columns:
    (ask_price2 * bid_size2 + bid_price2 * ask_size2) / (ask_size2 + bid_size2).
    `df` itself is not modified.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = (ask_px * bid_sz) + (bid_px * ask_sz)
    return cross_weighted / (ask_sz + bid_sz)


def calculate_realized_volatility(s):
    """Return sqrt(sum(diff(log(s)) ** 2)) for a sequence of prices `s`."""
    log_returns = np.diff(np.log(s))
    return np.sqrt((log_returns ** 2).sum())
    

def get_features(df):
    """Add the derived per-row columns to `df` in place and return it.

    Columns are added in this order: wap, wap2, then for each book level
    price_spread<level> (ask/bid price ratio), volume_spread<level>
    (ask/bid size ratio) and total_volume<level> (ask + bid size), and
    finally total_volume (sum of both levels).
    """
    df['wap'] = calculate_wap(df)
    df['wap2'] = calculate_wap2(df)

    for level in ('1', '2'):
        ask_px, bid_px = df['ask_price' + level], df['bid_price' + level]
        ask_sz, bid_sz = df['ask_size' + level], df['bid_size' + level]

        df['price_spread' + level] = ask_px / bid_px
        df['volume_spread' + level] = ask_sz / bid_sz
        df['total_volume' + level] = ask_sz + bid_sz

    df['total_volume'] = df['total_volume1'] + df['total_volume2']

    return df


def get_realized_volatility(s):
    """Return the square root of the summed squared consecutive differences of `s`.

    Unlike `calculate_realized_volatility`, no logarithm is taken here; the
    caller is expected to pass values that are already in the desired scale.
    """
    steps = np.diff(s)
    return np.sqrt(np.sum(steps ** 2))


def get_aggregated_features(df):
    """Aggregate `df` to one row per (stock_id, time_id, bucket_num).

    The result holds `mean_<col>` for every entry of the module-level
    `mean_columns`, `std_<col>` for every entry of `std_columns`, and the
    `rv1` / `rv2` columns reduced with `get_realized_volatility`. Any value
    still missing afterwards is replaced by 1e-10.
    """
    group_keys = ['stock_id', 'time_id', 'bucket_num']
    grouped = df.groupby(group_keys)

    mean_df = grouped[mean_columns].mean().add_prefix("mean_").reset_index()
    std_df = grouped[std_columns].std().add_prefix("std_").reset_index()

    # Both frames share only the group keys, so the default merge joins on them.
    agg_df = mean_df.merge(std_df)

    for rv_col in ('rv1', 'rv2'):
        rv_df = grouped[[rv_col]].agg(get_realized_volatility).reset_index()
        if rv_df.shape[0] == 0:
            # Nothing to aggregate: fall back to a tiny constant.
            agg_df[rv_col] = 1e-10
        else:
            agg_df = agg_df.merge(rv_df)

    agg_df.fillna(1e-10, inplace=True)
    return agg_df
    
def get_default_values(df):
    """Return a one-row frame of placeholder book values based on `df`'s first row.

    The four price columns are set to 1e-9 and the four size columns to 1;
    every other column keeps the value from the first row of `df`.

    The first row is copied explicitly before it is overwritten, so the
    caller's frame is never modified and pandas does not emit a
    SettingWithCopyWarning for assigning into a slice.
    """
    min_df = df.head(1).copy()
    min_df[['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']] = 1e-9
    min_df[['bid_size1', 'ask_size1', 'bid_size2', 'ask_size2']] = 1
    return min_df


def preprocess_book_data(book_folder):
    """Turn every per-stock parquet entry in `book_folder` into bucket sequences.

    Each directory entry is expected to be named `...=<stock_id>`; the integer
    after the last '=' becomes the `stock_id` column.

    Returns a DataFrame with one row per (stock_id, time_id) and the columns
    `stock_id`, `time_id`, `bucket_num` (array of the bucket ids that had
    data) and `bucket_features` (array with one `feature_columns`-ordered
    vector per bucket id). An empty frame with those columns is returned when
    the folder yields no rows at all.

    Raises ValueError if an entry name does not end in an integer stock id.
    """
    all_df = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filepath in sorted(os.listdir(book_folder)):
        path = os.path.join(book_folder, filepath)
        stock_id = int(filepath.split('=')[-1])

        df = pd.read_parquet(path)
        df['stock_id'] = stock_id
        df['bucket_num'] = df.seconds_in_bucket // BUCKET_INTERVAL
        df = get_features(df)

        # Log prices; get_realized_volatility later differences these per bucket.
        # The 1e-9 offset keeps the logarithm finite for a zero price.
        df['rv1'] = np.log(df['wap'] + 1e-9)
        df['rv2'] = np.log(df['wap2'] + 1e-9)

        stock_df = []
        for bucket_num in range(600 // BUCKET_INTERVAL):
            bucket_df = df[df.bucket_num == bucket_num].copy()
            if bucket_df.shape[0] == 0:
                continue
            stock_df.append(get_aggregated_features(bucket_df))

        if not stock_df:
            # No rows for this stock: pd.concat([]) would raise, so skip it.
            continue

        stock_df = pd.concat(stock_df)

        stock_df.fillna(0.0, inplace=True)
        # Pack the flat feature columns into one vector per (time_id, bucket).
        stock_df['bucket_features'] = stock_df[feature_columns].values.tolist()
        stock_df['bucket_features'] = stock_df['bucket_features'].apply(np.array)
        stock_df.drop(columns=feature_columns, inplace=True)

        # Collect the buckets of each time_id into parallel arrays.
        stock_df = stock_df.groupby(['stock_id', 'time_id'])[['bucket_num', 'bucket_features']].agg(list).reset_index()
        stock_df['bucket_num'] = stock_df['bucket_num'].apply(np.array)
        stock_df['bucket_features'] = stock_df['bucket_features'].apply(np.array)

        all_df.append(stock_df)

    if not all_df:
        # Empty folder (or only empty files): return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['stock_id', 'time_id', 'bucket_num', 'bucket_features'])

    all_df = pd.concat(all_df)
    all_df.reset_index(drop=True, inplace=True)
    return all_df

In [None]:
# Build the per-(stock_id, time_id) bucket sequences for the test book data
# and replace any remaining missing values with 0.0.
features_df = preprocess_book_data(
    '../input/optiver-realized-volatility-prediction/book_test.parquet'
).fillna(0.0)
features_df.head()

# Dataset

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Dataset over a frame that has `bucket_num` and `bucket_features` columns.

    Each item is a float32 tensor of shape
    (config.num_buckets, config.num_features), scaled column-wise by its
    own maximum.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        bucket_ids = record.bucket_num
        per_bucket = record.bucket_features

        # Buckets without data stay as all-zero rows.
        X = torch.zeros((config.num_buckets, config.num_features), dtype=torch.float32)
        for position, bucket_id in enumerate(bucket_ids):
            X[bucket_id] = torch.tensor(per_bucket[position], dtype=torch.float32)

        # Per-column maximum over the bucket axis, shaped (1, num_features).
        col_max = X.max(dim=0).values.view(1, -1)
        # Avoid dividing by zero for columns whose maximum is exactly 0.
        col_max[col_max == 0] = 1
        return X * (1 / col_max)

# Model

In [None]:
class FFN(nn.Module):
    """Block applying BatchNorm1d -> SiLU -> Dropout(0.1) -> Linear(sz, sz)."""

    def __init__(self, sz):
        super().__init__()
        # Attribute names and creation order are kept stable because they
        # determine the module's state_dict keys.
        self.linear = nn.Linear(in_features=sz, out_features=sz)
        self.bn = nn.BatchNorm1d(num_features=sz)
        self.silu = nn.SiLU()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """Run `x` through normalisation, activation, dropout and the linear layer."""
        activated = self.silu(self.bn(x))
        return self.linear(self.dropout(activated))
    
class OptiverModel(nn.Module):
    """Two-layer GRU over the bucket sequence followed by a linear output head.

    NOTE(review): the notebook obtains its model object directly from
    ``torch.load``, so this definition presumably has to mirror the class used
    at training time. Confirm against the training notebook before changing
    any layer or the forward pass.
    """
    def __init__(self):
        super().__init__()
        hsize=150  # hidden size shared by the GRU, both FFN blocks and the head
        # NOTE(review): BatchNorm1d's signature is (num_features, eps, ...), so the
        # second positional argument below is received as eps, i.e.
        # eps=config.num_features. This looks unintended, but the checkpoint was
        # presumably trained with it - confirm before "fixing".
        self.pre_bn=nn.BatchNorm1d(config.num_buckets , config.num_features)
        # Third positional argument = 2 stacked GRU layers.
        self.gru=nn.GRU(config.num_features, hsize, 2, batch_first=True, dropout=0.1)
        
        self.ffn1=FFN(hsize)
        self.ffn2=FFN(hsize)
        self.out=nn.Linear(hsize, 1)
        
    def forward(self, x):
        """Return ``(y, yout)``: the ffn2 activations and the flattened head output.

        assumes x is (batch, config.num_buckets, config.num_features) - TODO confirm
        """
        x=self.pre_bn(x)
        # nn.GRU returns (output, h_n); only the final hidden states are used.
        _, h=self.gru(x)
        # Index 1 selects the second of the two stacked GRU layers.
        h=h[1]
        
        # NOTE(review): the result of ffn1 is overwritten on the next line and
        # ffn2 is fed h rather than ffn1's output, so ffn1 does not influence
        # the returned values. Chaining them (ffn2(y)) would change the
        # predictions of an already trained checkpoint - confirm against the
        # training notebook before altering this.
        y=self.ffn1(h)
        y=self.ffn2(h)
        
        yout=self.out(y)
        # Flatten the (batch, 1) head output to (batch,).
        yout=yout.view(-1)
        return y, yout

# Inference

In [None]:
def inference(model, test_dataloader):
    """Run `model` over every batch of `test_dataloader` and collect predictions.

    The model is switched to eval mode. It is expected to return a pair whose
    second element holds the predictions; these are flattened, clamped to the
    range [0, 1] and returned as one flat Python list in dataloader order.
    """
    model.eval()
    ypred = []
    with torch.no_grad():
        for X in test_dataloader:
            _, batch_out = model(X.to(device))
            clipped = torch.clamp(batch_out.view(-1).detach(), 0, 1.0)
            ypred.extend(clipped.tolist())
    return ypred

In [None]:
# Wrap the preprocessed features. shuffle=False and drop_last=False keep the
# predictions aligned one-to-one with the rows of features_df.
test_dataset=OptiverDataset(features_df)
test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=1024,
                                            shuffle=False,
                                            drop_last=False)

# The object returned by torch.load is used directly as the model, so no fresh
# OptiverModel() is built here (it would be discarded immediately).
# Presumably the file holds a fully pickled module, which needs the FFN and
# OptiverModel classes defined above to be importable when unpickling - confirm.
# NOTE(security): torch.load unpickles arbitrary Python objects; only load
# checkpoints from a trusted source.
model=torch.load('../input/baseline-gru/best_rmspe.pt', map_location=device)
model=model.to(device)

In [None]:
test_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Predictions are returned in features_df row order (the loader does not shuffle).
features_df['target'] = inference(model, test_dataloader)

# Left merge on the columns the two frames share, so every row of test.csv is
# kept; rows without a matching prediction end up with target 0.
test_df = test_df.merge(features_df, how='left').fillna(0)
test_df.head()

In [None]:
# Write only the two submission columns, without the index.
test_df.to_csv('submission.csv', columns=['row_id', 'target'], index=False)