In [None]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pickle

import torch
import torch.nn as nn

import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Show which device was selected.
print(device)

In [None]:
def calculate_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each price is weighted by the size on the opposite side of the book:
    ``(ask_price1 * bid_size1 + bid_price1 * ask_size1) / (ask_size1 + bid_size1)``.
    """
    cross_weighted = df['ask_price1'] * df['bid_size1'] + df['bid_price1'] * df['ask_size1']
    total_size = df['ask_size1'] + df['bid_size1']
    return cross_weighted / total_size

def calculate_wap2(df):
    """Return the level-2 weighted average price for every row of ``df``.

    Same formula as the level-1 variant, applied to the ``*2`` columns:
    ``(ask_price2 * bid_size2 + bid_price2 * ask_size2) / (ask_size2 + bid_size2)``.
    """
    cross_weighted = df['ask_price2'] * df['bid_size2'] + df['bid_price2'] * df['ask_size2']
    total_size = df['ask_size2'] + df['bid_size2']
    return cross_weighted / total_size


def calculate_wap3(wap1, wap2):
    """Return the element-wise midpoint of ``wap1`` and ``wap2``."""
    return (wap1 + wap2) / 2

def get_features(df):
    """Clamp the book sizes to at least 1 and add ``wap1``/``wap2``/``wap3`` columns.

    ``df`` is modified in place and the same object is returned.

    Args:
        df: frame with the ``ask_size*``, ``bid_size*``, ``ask_price*`` and
            ``bid_price*`` columns for levels 1 and 2.

    Returns:
        The input frame with the clamped sizes and the three WAP columns.
    """
    # A floor of 1 keeps the WAP denominators (a sum of two sizes) non-zero.
    for size_col in ('ask_size1', 'ask_size2', 'bid_size1', 'bid_size2'):
        # Vectorised replacement for the former per-row ``max(1, x)``.
        # ``fillna(1)`` keeps the old outcome for NaN, where ``max(1, nan)`` is 1.
        df[size_col] = df[size_col].clip(lower=1).fillna(1)

    df['wap1'] = calculate_wap1(df)
    df['wap2'] = calculate_wap2(df)
    df['wap3'] = calculate_wap3(df['wap1'], df['wap2'])

    return df

def get_dummy_df(df, stock_id):
    """Build a dense (time_id, second) grid for one stock.

    Every unique ``time_id`` in ``df`` is paired with each second 0..599, and a
    constant ``stock_id`` column is attached.

    Returns:
        DataFrame with columns ``time_id``, ``seconds_in_bucket``, ``stock_id``.
    """
    unique_time_ids = df.time_id.unique()
    seconds = np.arange(600)

    grid = pd.DataFrame({
        # Each time_id is repeated for a full block of seconds ...
        'time_id': np.repeat(unique_time_ids, len(seconds)),
        # ... and the 0..599 range restarts for each time_id.
        'seconds_in_bucket': np.tile(seconds, len(unique_time_ids)),
    })
    grid['stock_id'] = stock_id
    return grid

In [None]:
# Columns averaged per (stock_id, time_id, bucket) by aggregate_buckets().
# Their order fixes the column order of the per-bucket feature arrays:
# index 8 is 'wap1', which OptiverDataset reads by position.
mean_columns=[
    'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2', 
    'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
    'wap1', 'wap2', 'wap3'
]

# Size columns that preprocess_book_data() transforms with log(1 + x) after aggregation.
log_transform_features=['bid_size1', 'ask_size1', 'bid_size2', 'ask_size2']
def aggregate_buckets(df):
    """Average every ``mean_columns`` column per (stock_id, time_id, bucket).

    Returns:
        A flat frame with the three group keys restored as ordinary columns.
    """
    group_keys = ['stock_id', 'time_id', 'bucket']
    bucket_means = df.groupby(group_keys)[mean_columns].mean()
    return bucket_means.reset_index()

In [None]:
class config:
    """Static settings shared across the notebook."""
    # Default ``max_len`` of PositionalEncoding.
    num_buckets = 200
    # NOTE(review): equals len(mean_columns); not read anywhere in the visible code.
    num_features = 11
    batch_size = 256
    epochs = 20

# Dataset

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Dataset over a frame whose ``features`` column holds one 2-D array per row.

    Each item is a tuple ``(X, s, rvs)``:
      * ``X``   - the feature array as a float32 tensor,
      * ``s``   - first differences of ``log(features[:, 8])`` with a leading 0,
                  shaped ``(rows, 1)``,
      * ``rvs`` - square root of the running sum of ``s**2``, shaped ``(rows, 1)``.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        feature_matrix = self.df.iloc[idx].features
        X = torch.tensor(feature_matrix, dtype=torch.float32)

        # Column 8 is 'wap1' when the array follows the mean_columns order
        # -- assumption, confirm against the caller.
        log_price = np.log(feature_matrix[:, 8])
        # log(0) yields -inf; those entries are reset to 0.
        log_price[np.isneginf(log_price)] = 0.0

        # The leading 0 keeps the differenced series as long as the input.
        log_returns = np.concatenate(([0], np.diff(log_price)))
        s = torch.tensor(log_returns, dtype=torch.float32)
        rvs = torch.cumsum(s ** 2, dim=0).sqrt()
        return (X, s.unsqueeze(dim=-1), rvs.unsqueeze(dim=-1))

# Model

In [None]:
#https://github.com/KrisKorrel/sparsemax-pytorch

class Sparsemax(nn.Module):
    """Sparsemax function."""

    def __init__(self, dim=None):
        """Initialize sparsemax activation

        Args:
            dim (int, optional): The dimension over which to apply the sparsemax function.
                Defaults to the last dimension.
        """
        super(Sparsemax, self).__init__()

        self.dim = -1 if dim is None else dim

    def forward(self, input):
        """Forward function.

        Args:
            input (torch.Tensor): Input tensor. First dimension should be the batch size
        Returns:
            torch.Tensor: Output tensor with the same shape as ``input``
        """
        # Sparsemax currently only handles 2-dim tensors,
        # so we reshape to a convenient shape and reshape back after sparsemax
        input = input.transpose(0, self.dim)
        original_size = input.size()
        input = input.reshape(input.size(0), -1)
        input = input.transpose(0, 1)
        dim = 1

        number_of_logits = input.size(dim)

        # Translate input by max for numerical stability
        input = input - torch.max(input, dim=dim, keepdim=True)[0].expand_as(input)

        # Sort input in descending order.
        # (NOTE: Can be replaced with linear time selection method described here:
        # http://stanford.edu/~jduchi/projects/DuchiShSiCh08.html)
        zs = torch.sort(input=input, dim=dim, descending=True)[0]
        # Build the 1..K ranks on the input's own device, so the module does not
        # depend on a module-level `device` and works wherever the input lives.
        ranks = torch.arange(start=1, end=number_of_logits + 1, step=1,
                             device=input.device, dtype=input.dtype).view(1, -1)
        ranks = ranks.expand_as(zs)

        # Determine sparsity of projection
        bound = 1 + ranks * zs
        cumulative_sum_zs = torch.cumsum(zs, dim)
        is_gt = torch.gt(bound, cumulative_sum_zs).to(input.dtype)
        k = torch.max(is_gt * ranks, dim, keepdim=True)[0]

        # Compute threshold function
        zs_sparse = is_gt * zs

        # Compute taus
        taus = (torch.sum(zs_sparse, dim, keepdim=True) - 1) / k
        taus = taus.expand_as(input)

        # Sparsemax (kept on the instance because backward() reads it)
        self.output = torch.max(torch.zeros_like(input), input - taus)

        # Reshape back to original shape
        output = self.output
        output = output.transpose(0, 1)
        output = output.reshape(original_size)
        output = output.transpose(0, self.dim)

        return output

    def backward(self, grad_output):
        """Backward function.

        Expects ``grad_output`` in the 2-D layout of ``self.output`` saved by the
        last ``forward`` call. NOTE(review): nothing in this file calls this
        method; confirm whether it is still needed.
        """
        dim = 1

        nonzeros = torch.ne(self.output, 0)
        # keepdim=True keeps a trailing axis of size 1 so the per-row mean can be
        # expanded to grad_output's shape; without it expand_as fails unless the
        # tensor happens to be square.
        support_mean = (torch.sum(grad_output * nonzeros, dim=dim, keepdim=True)
                        / torch.sum(nonzeros, dim=dim, keepdim=True))
        self.grad_input = nonzeros * (grad_output - support_mean.expand_as(grad_output))

        return self.grad_input

In [None]:
# Shared Sparsemax over the last dimension; in the visible code it is only
# referenced from commented-out lines (alternative to softmax).
sparsemax = Sparsemax(dim=-1)

def get_activation_fn(activation):
    """Return a new activation module for the given name.

    Args:
        activation: ``'gelu'`` or ``'relu'``.

    Returns:
        ``nn.GELU()`` or ``nn.ReLU()``.

    Raises:
        ValueError: for any other name. Previously this fell through and
            returned ``None``, which only failed later when the result was called.
    """
    if activation == 'gelu':
        return nn.GELU()
    if activation == 'relu':
        return nn.ReLU()
    raise ValueError(f"Unsupported activation {activation!r}; expected 'gelu' or 'relu'")
    
def attention(query, key, value, dropout=None):
    """Scaled dot-product attention over the last two axes.

    Args:
        query, key, value: tensors whose last axis is the per-head width.
        dropout: optional callable applied to the attended values (not to the
            attention weights).

    Returns:
        ``(p_attn, x_attn)``: the softmax weights and the weighted values.
    """
    scale = np.sqrt(query.size(-1))
    raw_scores = torch.matmul(query, key.transpose(-1, -2)) / scale
    # Scores that are exactly 0 are pushed to -1e9 before the softmax.
    # NOTE(review): this looks like a leftover from a disabled tril mask -- confirm.
    masked_scores = raw_scores.masked_fill(raw_scores == 0, -1e9)

    p_attn = torch.softmax(masked_scores, dim=-1)
    x_attn = torch.matmul(p_attn, value)
    if dropout:
        x_attn = dropout(x_attn)
    return p_attn, x_attn


class MultiHeadAttention(nn.Module):
    """Pre-normalised attention block with a residual connection.

    Inputs are projected from ``num_features`` to ``dmodel``, split into
    ``nhead`` chunks of width ``dmodel // nhead``, passed through ``attention``
    and projected back to ``num_features``.

    NOTE(review): query/key/value are viewed as ``(batch, -1, nhead, d_k)`` and
    handed to ``attention`` without swapping the sequence and head axes, so the
    score matrices are ``nhead x nhead`` per position -- confirm this is intended.
    ``activation`` is stored but not used in ``forward``.
    """

    def __init__(self, num_features, dmodel, nhead,activation,norm,dropout):
        super().__init__()
        self.dmodel = dmodel
        self.nhead = nhead
        self.d_k = dmodel // nhead  # width of one head

        self.activation = activation
        self.norm = norm
        self.dropout = dropout

        self.Q = nn.Linear(num_features, dmodel)
        self.K = nn.Linear(num_features, dmodel)
        self.V = nn.Linear(num_features, dmodel)

        # Output projection back to the input width.
        self.W = nn.Linear(dmodel, num_features)

        # All four weight matrices share the same symmetric uniform init range.
        bound = 1 / np.sqrt(2 * num_features)
        for projection in (self.Q, self.K, self.V, self.W):
            nn.init.uniform_(projection.weight, -bound, bound)

    def forward(self, x):
        bsize = x.size(0)
        x = self.norm(x)

        head_shape = (bsize, -1, self.nhead, self.d_k)
        query = self.Q(x).view(*head_shape)
        key = self.K(x).view(*head_shape)
        value = self.V(x).view(*head_shape)

        _, x_attn = attention(query, key, value, self.dropout)
        merged = x_attn.view(bsize, -1, self.nhead * self.d_k)

        # The residual is taken from the normalised input.
        return x + self.W(merged)

class TimeSeriesAttentionLayer(nn.Module):
    """Attention block followed by a residual two-layer feed-forward network.

    One ``LayerNorm`` and one ``Dropout`` instance are created here and shared
    with the inner ``MultiHeadAttention``.
    """

    def __init__(self,
                 num_features=32,
                 dmodel=128,
                 nhead=4,
                 dim_feed_forward=512,
                 activation='relu',
                 dropout=0.1):
        super().__init__()
        self.num_features = num_features
        self.dmodel = dmodel
        self.nhead = nhead
        self.dim_feed_forward = dim_feed_forward
        self.activation = get_activation_fn(activation)
        self.norm = nn.LayerNorm(num_features)
        self.dropout = nn.Dropout(dropout)

        self.multihead_attn = MultiHeadAttention(
            num_features,
            dmodel,
            nhead,
            self.activation,
            self.norm,
            self.dropout,
        )

        self.linear1 = nn.Linear(num_features, dim_feed_forward)
        self.linear2 = nn.Linear(dim_feed_forward, num_features)

    def forward(self, x):
        # Attention output is normalised again with the shared LayerNorm.
        x = self.norm(self.multihead_attn(x))
        hidden = self.activation(self.linear1(x))
        return x + self.linear2(self.dropout(hidden))


class FeatureExtractorWith1DConv(nn.Module):
    """Two-layer per-position MLP with batch norm around each linear layer.

    Despite the name, the active code uses ``nn.Linear`` layers and no
    convolution. Maps the last axis from ``input_size`` to ``output_size``.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.pre_bn = nn.BatchNorm1d(input_size)

        self.linear1 = nn.Linear(input_size, 2 * output_size)
        self.bn1 = nn.BatchNorm1d(2 * output_size)

        self.linear2 = nn.Linear(2 * output_size, output_size)
        self.bn2 = nn.BatchNorm1d(output_size)

        self.activation = nn.GELU()
        self.dropout = nn.Dropout(0.2)

    @staticmethod
    def _bn_last_axis(bn, x):
        """Apply ``bn`` to the last axis of a 3-D tensor by swapping axes 1 and 2."""
        return bn(x.transpose(2, 1)).transpose(2, 1)

    def forward(self, x):
        x = self._bn_last_axis(self.pre_bn, x)
        x = self._bn_last_axis(self.bn1, self.linear1(x))
        x = self.dropout(self.activation(x))
        x = self._bn_last_axis(self.bn2, self.linear2(x))
        return self.activation(x)
    
class Auxilary_FFN(nn.Module):
    """Per-position feed-forward block for 3-D input: ``sz`` -> ``2*sz`` -> ``2*sz``.

    Each linear layer is preceded by batch norm over the last axis, GELU and
    dropout.
    """

    def __init__(self, sz):
        super().__init__()
        self.bn1 = nn.BatchNorm1d(sz)
        self.linear1 = nn.Linear(sz, 2 * sz)

        self.bn2 = nn.BatchNorm1d(2 * sz)
        self.linear2 = nn.Linear(2 * sz, 2 * sz)

        self.activation = nn.GELU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # BatchNorm1d normalises axis 1, so the last axis is swapped in and out.
        normed = self.bn1(x.transpose(2, 1)).transpose(2, 1)
        hidden = self.linear1(self.dropout(self.activation(normed)))

        normed = self.bn2(hidden.transpose(2, 1)).transpose(2, 1)
        return self.linear2(self.dropout(self.activation(normed)))


class FFN(nn.Module):
    """Feed-forward block: ``sz`` -> ``2*sz`` -> ``2*sz``.

    Each linear layer is preceded by batch norm, GELU and dropout. Unlike
    ``Auxilary_FFN`` no axes are swapped before the batch norm.
    """

    def __init__(self, sz):
        super().__init__()
        self.bn1 = nn.BatchNorm1d(sz)
        self.linear1 = nn.Linear(sz, 2 * sz)

        self.bn2 = nn.BatchNorm1d(2 * sz)
        self.linear2 = nn.Linear(2 * sz, 2 * sz)

        self.activation = nn.GELU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = self.dropout(self.activation(self.bn1(x)))
        hidden = self.linear1(hidden)
        hidden = self.dropout(self.activation(self.bn2(hidden)))
        return self.linear2(hidden)


class AttentionHead(nn.Module):
    """Attention pooling: collapse the sequence axis into one weighted sum.

    A linear layer scores every position, the scores are softmax-normalised
    along the sequence, and the positions are summed with those weights.

    Args:
        dmodel: size of the last axis of the input.
        dropout: optional module applied to the attention weights; falsy to skip.
    """

    def __init__(self, dmodel, dropout):
        super().__init__()
        self.dropout = dropout
        self.W = nn.Linear(dmodel, 1)

    def forward(self, x):
        """Pool ``x`` along its sequence axis.

        Args:
            x: assumed to be ``(batch, seq, dmodel)`` -- TODO confirm with caller.

        Returns:
            ``(p_attn, x_attn)``: weights of shape ``(batch, seq)`` and the
            pooled tensor of shape ``(batch, dmodel)``.
        """
        scores = self.W(x).squeeze(-1)
        p_attn = torch.softmax(scores, dim=-1)
        if self.dropout:
            p_attn = self.dropout(p_attn)
        # Batched (1 x seq) @ (seq x dmodel) per sample. The previous
        # matmul(p_attn, x) broadcast the 2-D weights over the batch, so every
        # sample was pooled with the weights of all samples in the batch.
        x_attn = torch.matmul(p_attn.unsqueeze(1), x).squeeze(1)
        return p_attn, x_attn
    
class PrimaryHead(nn.Module):
    """Prediction head fed by the last sequence position.

    ``forward`` takes ``x[:, -1, :]``, runs it through the ``FFN`` stack and a
    final linear layer with one output. The attention head is constructed but
    its call is disabled, so the returned ``p_attn`` is always ``None``.

    NOTE(review): every ``FFN(hsize)`` maps ``hsize`` to ``2*hsize``, so
    ``num_layers > 1`` would feed ``2*hsize`` features into a layer built for
    ``hsize`` -- confirm only ``num_layers=1`` is used.
    """

    def __init__(self, hsize, num_layers):
        super().__init__()
        self.hsize = hsize
        self.attn_dropout = nn.Dropout(0.1)
        self.attn_head = AttentionHead(hsize, self.attn_dropout)
        self.ffn = nn.ModuleList([FFN(hsize) for _ in range(num_layers)])
        self.primary_out = nn.Linear(2 * hsize, 1)

    def forward(self, x):
        p_attn = None  # attention pooling is disabled
        last_step = x[:, -1, :]
        for layer in self.ffn:
            last_step = layer(last_step)
        return p_attn, self.primary_out(last_step)

class AuxilaryHead(nn.Module):
    """Per-position head: an ``Auxilary_FFN`` stack plus a linear layer with one output.

    NOTE(review): each ``Auxilary_FFN(hsize)`` outputs ``2*hsize`` features, so
    more than one layer would not chain -- confirm ``num_layers=1``.
    """

    def __init__(self, hsize, num_layers):
        super().__init__()
        self.hsize = hsize
        self.ffn = nn.ModuleList([Auxilary_FFN(hsize) for _ in range(num_layers)])
        self.aux_out = nn.Linear(2 * hsize, 1)

    def forward(self, x):
        for layer in self.ffn:
            x = layer(x)
        return self.aux_out(x)
    
from torch.autograd import Variable
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input.

    Args:
        d_model: size of the last axis of the input.
        dropout: dropout probability; the ``Dropout`` module is created but
            ``forward`` does not apply it.
        max_len: number of positions precomputed (defaults to
            ``config.num_buckets``).
    """

    def __init__(self, d_model, dropout, max_len=config.num_buckets):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(np.log(10000.0) / d_model))
        # Even columns take the sine terms, odd columns the cosine terms.
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        # A buffer follows the module across devices and is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        # The buffer is a plain tensor that does not require grad, so the
        # deprecated torch.autograd.Variable wrapper is not needed.
        return x + self.pe[:, :x.size(1)]
    
    
class OptiverModel(nn.Module):
    """Feature extractor, positional encoding, attention stack and primary head.

    ``params`` must provide ``dmodel``, ``in_features``, ``out_features``,
    ``num_ffn_layers``, ``nhead``, ``dim_feed_forward`` and
    ``num_attention_layers``.

    ``forward`` returns a dict with ``p_attn`` (from the primary head),
    ``yalpha`` (flattened primary output) and ``yalpha_aux`` (always ``None``;
    the auxiliary head is built but its call is disabled).
    """

    def __init__(self, params):
        super().__init__()
        self.dmodel = params['dmodel']
        self.in_features = params['in_features']
        self.out_features = params['out_features']
        self.num_ffn_layers = params['num_ffn_layers']

        self.dropout = nn.Dropout(0.1)
        self.feature_extractor = FeatureExtractorWith1DConv(self.in_features, self.out_features)
        self.positions = PositionalEncoding(self.out_features, 0.1)

        attention_stack = [
            TimeSeriesAttentionLayer(
                num_features=self.out_features,
                dmodel=self.dmodel,
                nhead=params['nhead'],
                dim_feed_forward=params['dim_feed_forward'],
            )
            for _ in range(params['num_attention_layers'])
        ]
        self.attn_layers = nn.ModuleList(attention_stack)

        self.primary_model = PrimaryHead(self.out_features, self.num_ffn_layers)
        self.aux_model = AuxilaryHead(self.out_features, self.num_ffn_layers)

    def forward(self, x):
        hidden = self.positions(self.feature_extractor(x))
        for attn_layer in self.attn_layers:
            hidden = attn_layer(hidden)
        p_attn, yalpha = self.primary_model(hidden)

        return {
            'p_attn': p_attn,
            'yalpha': yalpha.view(-1),
            'yalpha_aux': None,  # auxiliary head output is disabled
        }

# Inference

In [None]:
def inference(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and return the predictions as a list.

    Each batch ``(X, s, rvs)`` is handled as follows: ``s`` is appended to
    ``X`` as an extra feature channel, the model's ``'yalpha'`` output is
    multiplied by the last ``rvs`` value of each sample, and the product is
    clamped to ``[0, 1]``.
    """
    model.eval()
    predictions = []
    for X, s, rvs in test_dataloader:
        model_input = torch.cat([X, s], dim=-1).to(device)
        rvs = rvs.to(device)

        with torch.no_grad():
            yhat_alpha = model(model_input)['yalpha']
            scaled = yhat_alpha * rvs[:, -1, 0]
            clipped = torch.clamp(scaled.view(-1).detach().cpu(), 0, 1.0)

        predictions.extend(clipped.tolist())
    return predictions

In [None]:
# Hyper-parameters passed to OptiverModel. 'in_features' is 12 because
# inference() appends one extra channel to the 11 mean_columns features.
params={
    'dmodel': 128,
    'nhead':8,
    'in_features': 12,
    'out_features': 32,
    'dim_feed_forward': 256,
    'num_attention_layers': 6,
    'num_ffn_layers': 1
}

# NOTE(review): this freshly built model is replaced by the checkpoint on the next line.
model=OptiverModel(params)
# The checkpoint is a whole pickled object, so it relies on the class definitions
# above being in scope. Unpickling can execute arbitrary code -- only load
# trusted files.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles -- confirm against the installed version.
model=torch.load('../input/optiver-smooting-model/best_rmspe.pt', map_location=device)
model=model.to(device)

In [None]:
def preprocess_book_data(book_folder):
    """Build features and model predictions for every stock under ``book_folder``.

    For each entry of ``book_folder`` (named ``...=<stock_id>``) the book data
    is expanded to a dense per-second grid, forward-filled, averaged into
    3-second buckets, converted to one feature array per ``time_id`` and passed
    through the global ``model``.

    Args:
        book_folder: directory whose entries are readable by ``pd.read_parquet``
            and whose names end in ``=<integer stock id>``.

    Returns:
        DataFrame with columns ``stock_id``, ``time_id`` and ``target``.

    Raises:
        ValueError: from ``int()`` if an entry name does not end in an integer,
            or from ``pd.concat`` if the folder is empty.
    """
    all_df = []
    for filepath in os.listdir(book_folder):
        path = os.path.join(book_folder, filepath)
        stock_id = int(filepath.split('=')[-1])

        df = pd.read_parquet(path)

        # Left-join onto the dense grid so missing seconds appear as NaN rows.
        dummy_df = get_dummy_df(df, stock_id)
        df = dummy_df.merge(df, how='left')
        df = df.sort_values(['time_id', 'seconds_in_bucket'])
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        # NOTE(review): the fill runs over the whole sorted frame, so values can
        # carry over from the end of one time_id into the start of the next.
        df = df.ffill()
        # Rows that precede the first observation have nothing to fill from.
        df = df.fillna(0.0)
        # Three consecutive seconds share one bucket.
        df['bucket'] = df['seconds_in_bucket'] // 3
        df = get_features(df)
        df = df.fillna(0.0)

        stock_df = aggregate_buckets(df)
        for colname in log_transform_features:
            stock_df[colname] = np.log(1 + stock_df[colname])

        # One 1-D array per bucket row, then stack the rows of each
        # (stock_id, time_id) group into a single 2-D array.
        stock_df['features'] = stock_df[mean_columns].values.tolist()
        stock_df['features'] = stock_df['features'].apply(np.array)
        stock_df = stock_df.groupby(['stock_id', 'time_id'])[['features']].agg(list).reset_index()
        stock_df['features'] = stock_df['features'].apply(np.array)

        test_dataset = OptiverDataset(stock_df)
        # shuffle=False and drop_last=False keep the predictions aligned with
        # the rows of stock_df.
        test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                      batch_size=256,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      pin_memory=True,
                                                      drop_last=False)

        stock_df['target'] = inference(model, test_dataloader)
        stock_df = stock_df.drop(columns='features')

        all_df.append(stock_df)
        gc.collect()

    return pd.concat(all_df).reset_index(drop=True)

In [None]:
# Rows that need a prediction.
test_df=pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Predict for every stock found in the book folder, then attach the predictions;
# the merge joins on whichever columns the two frames have in common.
features_df=preprocess_book_data('../input/optiver-realized-volatility-prediction/book_test.parquet')
test_df=test_df.merge(features_df, how='left')
# Rows without a matching prediction get a target of 0.0.
test_df.fillna(0.0, inplace=True)
test_df.head()

In [None]:
# Write only the row_id and target columns, without the index, to submission.csv.
test_df[['row_id', 'target']].to_csv('submission.csv', index=False)