In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pickle

import torch
import torch.nn as nn

import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu')
print(device)

In [None]:
def get_volume_features(df):
    """Add size-based columns to ``df`` (mutated in place) and return it.

    Reads ``ask_size1``, ``ask_size2``, ``bid_size1`` and ``bid_size2`` and
    writes, in this order: ``ask_vol``, ``bid_vol``, ``total_volume1``,
    ``total_volume2``, ``total_volume``, ``volume_spread1``, ``volume_spread2``.
    """
    # Tiny offset added to numerator and denominator so 0/0 yields 1 instead of NaN.
    eps=1e-12

    ask1, ask2=df['ask_size1'], df['ask_size2']
    bid1, bid2=df['bid_size1'], df['bid_size2']

    # Per-side totals across the two size columns.
    df['ask_vol']=ask1+ask2
    df['bid_vol']=bid1+bid2

    # Per-index totals (ask + bid), then the grand total.
    df['total_volume1']=ask1+bid1
    df['total_volume2']=ask2+bid2
    df['total_volume']=df['ask_vol']+df['bid_vol']

    # Ask-to-bid size ratios.
    df['volume_spread1']=(ask1+eps)/(bid1+eps)
    df['volume_spread2']=(ask2+eps)/(bid2+eps)

    return df

def get_price_features(df):
    """Add price-difference columns to ``df`` (mutated in place) and return it.

    Writes ``ask_price_spread`` (ask2 - ask1), ``bid_price_spread``
    (bid1 - bid2), ``price_spread1`` (ask1 - bid1) and ``price_spread2``
    (ask2 - bid2), in that order.
    """
    ask1, ask2=df['ask_price1'], df['ask_price2']
    bid1, bid2=df['bid_price1'], df['bid_price2']

    # Differences within one side of the book.
    df['ask_price_spread']=ask2-ask1
    df['bid_price_spread']=bid1-bid2

    # Ask minus bid for each price index.
    df['price_spread1']=ask1-bid1
    df['price_spread2']=ask2-bid2
    return df

def calculate_wap1(df):
    """Return the size-weighted price built from the ``*1`` price/size columns.

    Each price is weighted by the size on the opposite side:
    ``(ask_price1*bid_size1 + bid_price1*ask_size1) / (ask_size1 + bid_size1)``.
    """
    cross_weighted=df['ask_price1']*df['bid_size1'] + df['bid_price1']*df['ask_size1']
    combined_size=df['ask_size1']+df['bid_size1']
    return cross_weighted/combined_size

def calculate_wap2(df):
    """Return the size-weighted price built from the ``*2`` price/size columns.

    Each price is weighted by the size on the opposite side:
    ``(ask_price2*bid_size2 + bid_price2*ask_size2) / (ask_size2 + bid_size2)``.
    """
    cross_weighted=df['ask_price2']*df['bid_size2'] + df['bid_price2']*df['ask_size2']
    combined_size=df['ask_size2']+df['bid_size2']
    return cross_weighted/combined_size


def calculate_wap3(wap1, wap2):
    """Return the element-wise average of ``wap1`` and ``wap2``."""
    combined=wap1+wap2
    return combined/2

def calculate_log_return(s):
    """Return the successive differences of ``log(s)``, with a leading 0.

    The leading 0 keeps the output the same length as the input.
    """
    log_steps=np.diff(np.log(s))
    return np.append(0, log_steps)

def calculate_realized_volatitlity(s):
    """Return the running ``sqrt(cumsum(s**2))`` of ``s``, one value per element."""
    running_sum_of_squares=(s**2).cumsum()
    return np.sqrt(running_sum_of_squares)

def get_log_return(df, wap_colname):
    """Compute log returns of ``df[wap_colname]`` separately for each ``time_id``.

    Values are collected per ``time_id`` group, passed through
    ``calculate_log_return`` and concatenated in group order into one flat array.
    """
    # One Python list of values per time_id, in groupby order.
    values_per_time_id=df.groupby('time_id')[[wap_colname]].agg(list)[wap_colname]
    per_group_returns=[
        calculate_log_return(np.array(values))
        for values in values_per_time_id
    ]
    return np.concatenate(per_group_returns).ravel()

def get_realized_volatility(df, colname):
    """Apply ``calculate_realized_volatitlity`` to ``df[colname]`` per ``time_id``.

    Values are collected per ``time_id`` group, transformed group by group and
    concatenated in group order into one flat array.
    """
    # One Python list of values per time_id, in groupby order.
    values_per_time_id=df.groupby('time_id')[[colname]].agg(list)[colname]
    per_group_rv=[
        calculate_realized_volatitlity(np.array(values))
        for values in values_per_time_id
    ]
    return np.concatenate(per_group_rv).ravel()

def get_features(df):
    """Return a sorted copy of ``df`` with price, volume, wap, log-return and rv columns.

    The frame is first sorted by ``time_id`` then ``seconds_in_bucket``. The
    per-``time_id`` helpers return flat arrays that are assigned positionally,
    so this ordering is what lines their values up with the rows.
    """
    df=df.sort_values(['time_id', 'seconds_in_bucket'])

    df=get_volume_features(get_price_features(df))

    df['wap1']=calculate_wap1(df)
    df['wap2']=calculate_wap2(df)
    df['wap3']=calculate_wap3(df['wap1'], df['wap2'])

    # Log returns of each wap column, computed within each time_id.
    for return_col, wap_col in (('log_return1', 'wap1'),
                                ('log_return2', 'wap2'),
                                ('log_return3', 'wap3')):
        df[return_col]=get_log_return(df, wap_col)

    # Running sqrt of cumulative squared log returns, again per time_id.
    for rv_col, return_col in (('rv1', 'log_return1'),
                               ('rv2', 'log_return2'),
                               ('rv3', 'log_return3')):
        df[rv_col]=get_realized_volatility(df, return_col)

    return df


def get_dummy_df(df):
    """Build a dense (time_id, seconds_in_bucket) grid for the time_ids in ``df``.

    Every unique ``time_id`` is paired with each integer second 0..599, giving
    ``600 * n_unique_time_ids`` rows with columns ``time_id`` and
    ``seconds_in_bucket``.
    """
    unique_time_ids=df.time_id.unique()
    seconds=np.arange(600)

    grid={
        # Each time_id repeated once per second ...
        'time_id': np.repeat(unique_time_ids, len(seconds)),
        # ... alongside the full 0..599 range cycled once per time_id.
        'seconds_in_bucket': np.tile(seconds, len(unique_time_ids)),
    }
    return pd.DataFrame(grid)

In [None]:
# Columns averaged within each (stock_id, time_id, bucket) group.
# The order of this list followed by max_columns is the feature order of each row.
mean_columns=[
    # raw prices
    'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
    # raw sizes
    'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
    # price differences
    'ask_price_spread', 'bid_price_spread', 'price_spread1', 'price_spread2',
    # size sums
    'ask_vol', 'bid_vol', 'total_volume1', 'total_volume2', 'total_volume',
    # size ratios
    'volume_spread1', 'volume_spread2',
    # weighted prices
    'wap1', 'wap2', 'wap3',
    # log returns
    'log_return1', 'log_return2', 'log_return3',
]

# Columns reduced with max() within each (stock_id, time_id, bucket) group.
max_columns=['rv1', 'rv2', 'rv3']

# Aggregated columns that are replaced by log(1 + value).
log_transform_features=[
    'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
    'ask_vol', 'bid_vol',
    'total_volume1', 'total_volume2', 'total_volume',
    'volume_spread1', 'volume_spread2',
]


def aggregate_buckets(df):
    """Collapse ``df`` to one row per (stock_id, time_id, bucket).

    ``mean_columns`` are averaged and ``max_columns`` are reduced with max;
    the two results are joined on the columns they share.
    """
    group_keys=['stock_id', 'time_id', 'bucket']
    grouped=df.groupby(group_keys)

    averaged=grouped[mean_columns].mean().reset_index()
    peaks=grouped[max_columns].max().reset_index()
    return averaged.merge(peaks)

In [None]:
def preprocess_book_data(book_folder):
    """Build one row of bucketed features per (stock_id, time_id).

    Every entry of ``book_folder`` is read with ``pd.read_parquet``; its
    stock id is the integer after the last ``'='`` in the entry name.

    Parameters
    ----------
    book_folder : str
        Directory whose entries are named like ``<anything>=<int>``.

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id``, ``bucket`` (array of bucket
        indices) and ``features`` (2-D array with one row per bucket and
        ``len(mean_columns) + len(max_columns)`` columns).

    Raises
    ------
    ValueError
        If an entry name does not end in ``=<int>``, or if the folder is
        empty (``pd.concat`` of an empty list).
    """
    all_df=[]
    for filepath in os.listdir(book_folder):
        path=os.path.join(book_folder, filepath)
        stock_id=int(filepath.split('=')[-1])

        df=pd.read_parquet(path)
        df['stock_id']=stock_id

        # Left-join onto the dense 0..599 second grid so missing seconds become NaN rows.
        dummy_df=get_dummy_df(df)
        df=dummy_df.merge(df, how='left')
        # Ten consecutive seconds share one bucket index.
        df['bucket']=df['seconds_in_bucket']//10

        df=get_features(df)
        # Forward-fill gaps. `DataFrame.ffill()` replaces the deprecated
        # `fillna(method='ffill')` spelling and produces the same values.
        # NOTE(review): the fill runs after feature computation and is not
        # grouped by time_id; kept as-is so features match what the saved model
        # presumably saw in training - confirm before changing.
        df=df.ffill()

        stock_df=aggregate_buckets(df)
        for colname in log_transform_features:
            stock_df[colname]=np.log( 1+stock_df[colname] )

        # Pack the per-bucket feature values into one vector per row ...
        stock_df['features']=stock_df[mean_columns+max_columns].values.tolist()
        stock_df['features']=stock_df['features'].apply(np.array)
        # ... then gather all buckets of a (stock_id, time_id) pair into lists.
        stock_df=stock_df.groupby(['stock_id', 'time_id'])[['bucket', 'features']].agg(list).reset_index()

        stock_df['bucket']=stock_df['bucket'].apply(np.array)
        stock_df['features']=stock_df['features'].apply(np.array)

        all_df.append(stock_df)

    all_df=pd.concat(all_df)
    all_df.reset_index(drop=True, inplace=True)
    return all_df

In [None]:
# Build the per-(stock_id, time_id) feature table from the test order-book folder
# and preview the first rows.
features_df=preprocess_book_data('../input/optiver-realized-volatility-prediction/book_test.parquet')
features_df.head()

In [None]:
class config:
    """Constants shared by the dataset and the model definitions."""
    # Number of rows in each sample matrix (see OptiverDataset.__getitem__).
    num_buckets= 60
    # Number of columns in each sample matrix; equals
    # len(mean_columns) + len(max_columns) = 25 + 3.
    num_features= 28
    # Not referenced by the code visible in this notebook.
    batch_size=256
    # Not referenced by the code visible in this notebook.
    epochs=20

# Dataset

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Dataset that turns each row of a feature table into a dense matrix.

    Each row of ``df`` must expose ``bucket`` (sequence of bucket indices)
    and ``features`` (one feature vector per listed bucket).
    """

    def __init__(self, df):
        self.df=df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a float32 tensor of shape (config.num_buckets, config.num_features).

        Rows for buckets that are not listed in the sample stay all-zero.
        """
        sample=self.df.iloc[idx]
        bucket_ids=sample.bucket
        feature_rows=sample.features

        X=torch.zeros((config.num_buckets, config.num_features), dtype=torch.float32)
        for position in range(len(bucket_ids)):
            target_row=bucket_ids[position]
            X[target_row]=torch.tensor(feature_rows[position], dtype=torch.float32)
        return X

# Model

In [None]:
def get_activation_fn(activation):
    """Return a new activation module for the given name.

    Parameters
    ----------
    activation : str
        ``'gelu'`` or ``'relu'``.

    Returns
    -------
    torch.nn.Module
        A fresh ``nn.GELU()`` or ``nn.ReLU()`` instance.

    Raises
    ------
    ValueError
        If ``activation`` is not a supported name. The function previously
        fell through and returned ``None``, which only failed later when the
        result was called.
    """
    if activation=='gelu':
        return nn.GELU()
    elif activation=='relu':
        return nn.ReLU()
    raise ValueError(
        "Unsupported activation {!r}; expected 'gelu' or 'relu'".format(activation)
    )
    
def attention(query, key, value, dropout=None):
    """Scaled dot-product attention with a lower-triangular mask.

    The matmul, mask and softmax all act on the last two axes of the inputs.

    Parameters
    ----------
    query, key, value : torch.Tensor
        Tensors whose last axis has the same size.
    dropout : callable, optional
        Applied to the attended values (not to the weights) when truthy.

    Returns
    -------
    (p_attn, x_attn)
        The softmax weights and the weighted combination of ``value``.
    """
    scale=np.sqrt(query.size(-1))
    raw_scores=torch.matmul(query, key.transpose(-1, -2))/scale

    # Zero the upper triangle, then push every zero entry to -1e9 before softmax.
    # Entries inside the lower triangle that are exactly 0 are masked as well.
    lower=torch.tril(raw_scores)
    masked=lower.masked_fill(lower == 0, -1e9)
    p_attn=torch.softmax(masked, dim=-1)

    x_attn=torch.matmul(p_attn, value)
    if dropout:
        x_attn=dropout(x_attn)
    return p_attn, x_attn

class MultiHeadAttention(nn.Module):
    """Self-attention sub-layer with a residual connection and a shared norm.

    ``norm`` and ``dropout`` are module instances supplied by the caller;
    ``activation`` is stored but not used in ``forward``.

    NOTE(review): the projections are viewed as (batch, -1, nhead, d_k) and
    handed to ``attention`` without swapping the middle axes, so its matmul
    and mask act on the trailing (nhead, d_k) axes. Kept unchanged because
    saved weights depend on this behavior - confirm before altering.
    """

    def __init__(self, dmodel, nhead,activation,norm,dropout):
        super().__init__()
        self.dmodel=dmodel
        self.nhead=nhead
        # Width of each head's slice of the model dimension.
        self.d_k=dmodel//nhead

        self.activation=activation
        self.norm=norm
        self.dropout=dropout

        # Query / key / value projections followed by the output projection.
        self.Q=nn.Linear(dmodel, dmodel)
        self.K=nn.Linear(dmodel, dmodel)
        self.V=nn.Linear(dmodel, dmodel)
        self.W=nn.Linear(dmodel, dmodel)

    def forward(self, x):
        """Return ``norm(x + W(attention(Q(x), K(x), V(x))))``."""
        batch=x.size(0)
        head_shape=(batch, -1, self.nhead, self.d_k)

        query=self.Q(x).view(*head_shape)
        key=self.K(x).view(*head_shape)
        value=self.V(x).view(*head_shape)

        _, attended=attention(query, key, value, self.dropout)
        # Fold the head axis back into the model dimension.
        merged=attended.view(batch, -1, self.nhead*self.d_k)
        return self.norm(x+self.W(merged))

class TimeSeriesAttentionLayer(nn.Module):
    """Attention sub-layer followed by a two-layer feed-forward sub-layer.

    A single ``LayerNorm`` instance and a single ``Dropout`` instance are
    created here and shared with the inner ``MultiHeadAttention``; the same
    norm is therefore applied after both residual additions.
    """

    def __init__(self,
                 dmodel=128,
                 nhead=4,
                 dim_feed_forward=512,
                 activation='gelu',
                 dropout=0.1):
        super().__init__()
        self.dmodel=dmodel
        self.nhead=nhead
        self.dim_feed_forward=dim_feed_forward

        self.activation=get_activation_fn(activation)
        self.norm=nn.LayerNorm(dmodel)
        self.dropout=nn.Dropout(dropout)

        self.multihead_attn=MultiHeadAttention(
            dmodel, nhead, self.activation, self.norm, self.dropout
        )

        # Position-wise feed-forward: dmodel -> dim_feed_forward -> dmodel.
        self.linear1=nn.Linear(dmodel, dim_feed_forward)
        self.linear2=nn.Linear(dim_feed_forward, dmodel)

    def forward(self, x):
        """Return ``norm(a + FFN(a))`` where ``a`` is the attention sub-layer output."""
        attended=self.multihead_attn(x)
        hidden=self.activation(self.linear1(attended))
        ffn_out=self.linear2(self.dropout(hidden))
        return self.norm(attended+ffn_out)

class FeatureExtractor(nn.Module):
    """Two Linear -> GELU -> BatchNorm -> Dropout stages after an input BatchNorm.

    Maps the last axis from ``config.num_features`` to ``2*sz`` and then ``sz``.

    NOTE(review): ``nn.BatchNorm1d``'s second positional parameter is ``eps``,
    so the feature sizes passed below are used as eps values and the layers
    normalise over ``config.num_buckets`` channels. Kept unchanged for
    compatibility with saved weights - confirm whether this was intended.
    """

    def __init__(self, sz):
        super().__init__()
        self.pre_bn=nn.BatchNorm1d(config.num_buckets , config.num_features)

        self.linear1=nn.Linear(config.num_features, 2*sz)
        self.bn1=nn.BatchNorm1d(config.num_buckets, 2*sz)

        self.linear2=nn.Linear(2*sz, sz)
        self.bn2=nn.BatchNorm1d(config.num_buckets, sz)

        self.dropout=nn.Dropout(0.2)
        self.activation=nn.GELU()

    def forward(self, x):
        hidden=self.pre_bn(x)
        stages=((self.linear1, self.bn1), (self.linear2, self.bn2))
        for linear, batch_norm in stages:
            hidden=self.dropout(batch_norm(self.activation(linear(hidden))))
        return hidden
    
    
class FeatureExtractorWith1DConv(nn.Module):
    """Input BatchNorm followed by one Conv1d -> BatchNorm -> ReLU stage.

    ``conv2`` and ``bn2`` are constructed (so they remain part of the module's
    parameters) but ``forward`` does not apply them.

    NOTE(review): ``nn.BatchNorm1d``'s second positional parameter is ``eps``,
    so the sizes passed below are used as eps values and the layers normalise
    over ``config.num_buckets`` channels. Kept unchanged for compatibility
    with saved weights - confirm whether this was intended.
    """

    def __init__(self, sz):
        super().__init__()
        self.pre_bn=nn.BatchNorm1d(config.num_buckets , config.num_features)

        # Kernel size 3 with padding 1 keeps the length of the convolved axis.
        self.conv1=nn.Conv1d(config.num_features, sz, 3, padding=1)
        self.bn1=nn.BatchNorm1d(config.num_buckets, sz)

        self.conv2=nn.Conv1d(sz, sz, 3, padding=1)
        self.bn2=nn.BatchNorm1d(config.num_buckets, sz)

        self.activation=nn.ReLU()

    def forward(self, x):
        normed=self.pre_bn(x)
        # Conv1d convolves over the last axis, so swap axes 1 and 2 around the call.
        convolved=self.conv1(normed.transpose(1, 2)).transpose(1, 2)
        return self.activation(self.bn1(convolved))

In [None]:
class Auxilary_FFN(nn.Module):
    """BatchNorm -> GELU -> Dropout -> Linear block that keeps the last axis at ``sz``.

    NOTE(review): ``nn.BatchNorm1d``'s second positional parameter is ``eps``,
    so ``sz`` is used as the eps value and the layer normalises over
    ``config.num_buckets`` channels. Kept unchanged for compatibility with
    saved weights - confirm whether this was intended.
    """

    def __init__(self, sz):
        super().__init__()
        self.linear=nn.Linear(sz, sz)
        self.bn=nn.BatchNorm1d(config.num_buckets , sz)
        self.activation=nn.GELU()
        self.dropout=nn.Dropout(0.2)

    def forward(self, x):
        activated=self.activation(self.bn(x))
        return self.linear(self.dropout(activated))


class FFN(nn.Module):
    """BatchNorm -> GELU -> Dropout(0.2) -> Linear block of width ``sz``."""

    def __init__(self, sz):
        super().__init__()
        self.linear=nn.Linear(sz, sz)
        self.bn=nn.BatchNorm1d(sz)
        self.activation=nn.GELU()
        self.dropout=nn.Dropout(0.2)

    def forward(self, x):
        activated=self.activation(self.bn(x))
        return self.linear(self.dropout(activated))


class AttentionHead(nn.Module):
    """Pool a sequence into one vector using learned softmax weights.

    ``dropout`` is a caller-supplied callable (or a falsy value to skip it)
    applied to the attention weights.

    NOTE(review): assuming ``x`` is (batch, seq, dmodel), the weights are 2-D
    (batch, seq). ``torch.matmul`` then broadcasts the whole weight matrix
    against every sample, and ``sum(dim=1)`` adds the contributions of all its
    rows, so each pooled vector appears to depend on the other samples in the
    batch. Kept unchanged because saved weights were trained with it - confirm.
    """

    def __init__(self, dmodel, dropout):
        super().__init__()
        self.dropout=dropout
        # One scalar score per position.
        self.W=nn.Linear(dmodel, 1)

    def forward(self, x):
        """Return ``(p_attn, x_attn)``: the softmax weights and the pooled tensor."""
        weights=torch.softmax(self.W(x).squeeze(-1), dim=-1)
        if self.dropout:
            weights=self.dropout(weights)

        pooled=torch.matmul(weights, x).sum(dim=1)
        return weights, pooled

In [None]:
class PrimaryHead(nn.Module):
    """Attention pooling, a chain of FFN blocks, then a 1-unit output layer."""

    def __init__(self, hsize):
        super().__init__()
        self.hsize=hsize
        self.attn_dropout=nn.Dropout(0.1)
        self.attn_head=AttentionHead(hsize, self.attn_dropout)
        self.ffn=nn.ModuleList(
            [FFN(hsize) for _ in range(3)]
        )
        self.primary_out=nn.Linear(hsize, 1)

    def forward(self, x):
        """Return ``(p_attn, y)``: the pooling weights and the scalar output per row."""
        p_attn, hidden=self.attn_head(x)

        # Step k (k >= 1) feeds hidden + ffn[k-1](hidden) through ffn[k], so every
        # block except the first and the last is evaluated in two consecutive steps.
        for step in range(1, len(self.ffn)):
            earlier, current=self.ffn[step-1], self.ffn[step]
            hidden=current(hidden + earlier(hidden))

        return p_attn, self.primary_out(hidden)

class AuxilaryHead(nn.Module):
    """Chain of Auxilary_FFN blocks followed by a 1-unit output layer.

    No pooling is applied, so the output keeps every leading axis of the input.
    """

    def __init__(self, hsize):
        super().__init__()
        self.hsize=hsize
        self.ffn=nn.ModuleList(
            [Auxilary_FFN(hsize) for _ in range(3)]
        )
        self.aux_out=nn.Linear(hsize, 1)

    def forward(self, x):
        hidden=x
        # Step k (k >= 1) feeds hidden + ffn[k-1](hidden) through ffn[k], so every
        # block except the first and the last is evaluated in two consecutive steps.
        for step in range(1, len(self.ffn)):
            earlier, current=self.ffn[step-1], self.ffn[step]
            hidden=current(hidden + earlier(hidden))
        return self.aux_out(hidden)

In [None]:
class OptiverModel(nn.Module):
    """Convolutional feature extractor, attention stack and four output heads.

    Parameters
    ----------
    model_size : int, default 128
        Width of the hidden representation. It is now forwarded to every
        ``TimeSeriesAttentionLayer``; previously those layers were always
        built with their own default of 128, so any other ``model_size``
        produced mismatched layer widths. With the default value the
        constructed modules are unchanged. ``model_size`` must be divisible
        by the attention layers' ``nhead`` (4 by default) for the per-head
        reshape to work.
    """

    def __init__(self, model_size=128):
        super().__init__()
        self.model_size=model_size
        self.feature_extractor=FeatureExtractorWith1DConv(model_size)
        self.pos_embeddings=nn.Embedding(config.num_buckets, model_size, max_norm=1)
        # Plain tensor attribute (not a registered buffer), created on the global
        # `device`; it is therefore absent from state_dict and is not moved by .to().
        self.positions=torch.arange(config.num_buckets, dtype=torch.long).to(device)
        self.attn_layers=nn.ModuleList(
            [TimeSeriesAttentionLayer(dmodel=model_size) for _ in range(5)]
        )

        self.primary_model=PrimaryHead(model_size)
        self.auxilary_ask1=AuxilaryHead(model_size)
        self.auxilary_bid1=AuxilaryHead(model_size)
        self.auxilary_wap1=AuxilaryHead(model_size)

    def forward(self, x):
        """Run the network and return a dict of outputs.

        Keys: ``'p_attn'`` and ``'yprimary'`` from the primary head, and
        ``'yask1'``, ``'ybid1'``, ``'ywap1'`` from the three auxiliary heads.
        """
        x=self.feature_extractor(x)
        # Add scaled position embeddings, broadcast over the batch axis.
        x=x+(self.pos_embeddings(self.positions).unsqueeze(0)/np.sqrt(self.model_size))
        for attn_layer in self.attn_layers:
            x=attn_layer(x)

        p_attn, yprimary=self.primary_model(x)
        yaux_ask1=self.auxilary_ask1(x)
        yaux_bid1=self.auxilary_bid1(x)
        yaux_wap1=self.auxilary_wap1(x)

        return {
            'p_attn': p_attn,
            'yprimary': yprimary,
            'yask1': yaux_ask1,
            'ybid1': yaux_bid1,
            'ywap1': yaux_wap1
        }

# Inference

In [None]:
def inference(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and return the primary predictions.

    The model is put in eval mode, each batch is moved to the global
    ``device``, and the ``'yprimary'`` output is flattened and clamped to
    [0, 1.0]. Returns a flat Python list of floats in dataloader order.
    """
    model.eval()
    predictions=[]
    with torch.no_grad():
        for batch in test_dataloader:
            primary=model(batch.to(device))['yprimary']
            clipped=torch.clamp(primary.view(-1).detach(), 0, 1.0)
            predictions.extend(clipped.tolist())
    return predictions

In [None]:
# Serve samples in their original order (no shuffling, no dropped tail batch) so the
# flat prediction list lines up row-for-row with features_df.
# NOTE(review): batch_size here (1024) differs from config.batch_size (256). If any
# layer's output depends on which samples share a batch, predictions can vary with
# this value - confirm against the training setup.
test_dataset=OptiverDataset(features_df)
test_dataloader=torch.utils.data.DataLoader(test_dataset,
                                            batch_size=1024,
                                            shuffle=False,
                                            drop_last=False)

# The object returned by torch.load is used directly as the model (.to(), .eval(),
# call), so no fresh OptiverModel() is built first: the earlier instantiation was
# overwritten immediately and only cost time.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints from a trusted source.
model=torch.load('../input/optiver-attention-multitask-models/best_rmspe.pt', map_location=device)
model=model.to(device)

In [None]:
# Load the submission skeleton.
test_df=pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# One prediction per row of features_df (the dataloader preserves row order).
features_df['target']=inference(model, test_dataloader)

# Left-join on the shared columns and put 0 wherever no feature row matched.
test_df=test_df.merge(features_df, how='left').fillna(0)
test_df.head()

In [None]:
# Write only the row_id and target columns to submission.csv, without the index.
test_df[['row_id', 'target']].to_csv('submission.csv', index=False)