In [None]:
import torch
import torch.nn as nn

import os
import gc
import numpy as np
import pandas as pd

In [None]:
# Directory holding the per-stock order-book parquet partitions of the test set.
test_book_folder = '../input/optiver-realized-volatility-prediction/book_test.parquet'

# Order-book columns handed to the dataset. The order matters: OptiverDataset
# indexes them positionally (prices in columns 0-3, sizes in columns 4-7).
feature_columns = [
    'bid_price1',
    'ask_price1',
    'bid_price2',
    'ask_price2',
    'bid_size1',
    'ask_size1',
    'bid_size2',
    'ask_size2',
]

# Size columns that are replaced by log(1 + size) before being turned into features.
log_transform_features = [
    'bid_size1',
    'ask_size1',
    'bid_size2',
    'ask_size2',
]

In [None]:
class config:
    """Plain namespace of notebook-wide hyperparameters (read as class attributes)."""

    # Default `max_len` of PositionalEncoding.
    num_buckets = 600
    # NOTE(review): the three values below are not referenced anywhere in the
    # visible notebook -- presumably left over from training; confirm before relying on them.
    num_features = 11
    batch_size = 128
    epochs = 10

In [None]:
# Default number of rows averaged by smoothing() / smoothing_1d().
window=4
def smoothing(x, window_size=None):
    """Apply a trailing moving average along axis 0 of a 2-D array.

    Every row with index ``i >= window_size`` is replaced by the mean of rows
    ``i - window_size + 1 .. i``; the first ``window_size`` rows keep their
    original values.

    Args:
        x: 2-D array (rows are time steps).
        window_size: number of rows to average. ``None`` (the default) uses the
            module-level ``window``.

    Returns:
        ``x`` itself when it has fewer than ``window_size`` rows, otherwise a
        smoothed copy (the input is never modified). If the smoothing step
        fails (e.g. ``x`` is not 2-D) the unsmoothed copy is returned.
    """
    if window_size is None:
        window_size = window
    if x.shape[0] < window_size:
        return x
    x = x.copy()
    try:
        cum_x = np.cumsum(x, axis=0)
        # cum[i] - cum[i - w] is the sum of the w rows ending at row i.
        x_rolling = (cum_x[window_size:, :] - cum_x[:-window_size, :]) / window_size
        x[window_size:, :] = x_rolling
    except Exception:
        # Best effort by design: fall back to the unsmoothed copy. Unlike a bare
        # `except:`, this no longer swallows KeyboardInterrupt / SystemExit.
        pass
    return x

def smoothing_1d(x, window_size=None):
    """Apply a trailing moving average to a 1-D array.

    Every element with index ``i >= window_size`` is replaced by the mean of
    elements ``i - window_size + 1 .. i``; the first ``window_size`` elements
    keep their original values.

    Args:
        x: 1-D array.
        window_size: number of elements to average. ``None`` (the default) uses
            the module-level ``window``.

    Returns:
        ``x`` itself when it is shorter than ``window_size``, otherwise a
        smoothed copy (the input is never modified). If the smoothing step
        fails the unsmoothed copy is returned.
    """
    if window_size is None:
        window_size = window
    if x.shape[0] < window_size:
        return x
    x = x.copy()
    try:
        cum_x = np.cumsum(x)
        # cum[i] - cum[i - w] is the sum of the w elements ending at index i.
        x_rolling = (cum_x[window_size:] - cum_x[:-window_size]) / window_size
        x[window_size:] = x_rolling
    except Exception:
        # Best effort by design: fall back to the unsmoothed copy. Unlike a bare
        # `except:`, this no longer swallows KeyboardInterrupt / SystemExit.
        pass
    return x

# dataset

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Turns per-bucket order-book snapshots into fixed-length, left-padded tensors.

    Each element of ``features`` is a ``(seq_len, 8)`` array whose columns are
    ``bid_price1, ask_price1, bid_price2, ask_price2, bid_size1, ask_size1,
    bid_size2, ask_size2`` (the sizes in ``log(1 + size)`` form, which
    ``calculate_wap`` inverts).

    Args:
        features: indexable collection of per-bucket arrays.
        num_buckets: length every sequence is padded to (default 600).
    """

    # Class-level fallback so instances created without __init__ still work.
    num_buckets = 600

    def __init__(self, features, num_buckets=600):
        self.features = features
        self.num_buckets = num_buckets

    def __len__(self):
        return len(self.features)

    def calculate_wap(self, ask_price, bid_price, ask_size, bid_size):
        """Weighted average price; sizes are given as log(1 + size) and undone here."""
        ask_size = np.exp(ask_size) - 1
        bid_size = np.exp(bid_size) - 1
        wap = (ask_price * bid_size) + (bid_price * ask_size)
        wap = wap / (ask_size + bid_size)
        return wap

    def get_log_returns(self, wap):
        """Log returns of ``wap`` with a leading 0 so the length matches ``wap``.

        Any non-finite return (``-inf``, ``+inf`` or NaN, e.g. caused by a zero
        or undefined WAP) is replaced by 0.0 so it cannot poison the realized
        volatility or the model input.
        """
        # log(0) / inf - inf are handled right below, so silence the warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            s = np.append(0, np.diff(np.log(wap)))
        s[~np.isfinite(s)] = 0.0
        return s

    def get_realized_volatility(self, s):
        """Square root of the sum of squared log returns."""
        rv = np.sqrt(np.sum((s ** 2)))
        return rv

    def get_price_differences(self, feat, seq_len):
        """Four price spreads, right-aligned in a ``(num_buckets, 4)`` float32 tensor.

        Columns: bid1 - ask1, bid2 - ask2, ask1 - ask2, bid1 - bid2.
        Rows before the last ``seq_len`` are zero padding.
        """
        bid_price1 = feat[:, 0]
        ask_price1 = feat[:, 1]
        bid_price2 = feat[:, 2]
        ask_price2 = feat[:, 3]

        price_diff1 = bid_price1 - ask_price1
        price_diff2 = bid_price2 - ask_price2

        ask_diff = ask_price1 - ask_price2
        bid_diff = bid_price1 - bid_price2

        price_diff = np.zeros((4, self.num_buckets))
        price_diff[0, -seq_len:] = price_diff1
        price_diff[1, -seq_len:] = price_diff2
        price_diff[2, -seq_len:] = ask_diff
        price_diff[3, -seq_len:] = bid_diff

        price_diff = torch.tensor(price_diff, dtype=torch.float32).transpose(1, 0)
        return price_diff

    def get_features(self, feat):
        """Compute WAPs, log returns and realized volatilities for both book levels.

        Returns:
            ``(wap1, wap2, s1, s2, rv1, rv2)``. If exactly one realized
            volatility is 0 it is replaced by the other one.
        """
        bid_price1 = feat[:, 0]
        ask_price1 = feat[:, 1]
        bid_price2 = feat[:, 2]
        ask_price2 = feat[:, 3]
        bid_size1 = feat[:, 4]
        ask_size1 = feat[:, 5]
        bid_size2 = feat[:, 6]
        ask_size2 = feat[:, 7]

        wap1 = self.calculate_wap(ask_price1, bid_price1, ask_size1, bid_size1)
        wap2 = self.calculate_wap(ask_price2, bid_price2, ask_size2, bid_size2)

        s1 = self.get_log_returns(wap1)
        s2 = self.get_log_returns(wap2)

        rv1 = self.get_realized_volatility(s1)
        rv2 = self.get_realized_volatility(s2)

        if rv1 == 0:
            rv1 = rv2
        elif rv2 == 0:
            rv2 = rv1

        return (wap1, wap2, s1, s2, rv1, rv2)

    def get_binary_target_features(self, y_target, rv1, rv2):
        """Flags for ``y_target > rv`` and absolute-error weights (plus 1e-10) per level."""
        y_binary1 = (y_target > rv1)
        y_binary2 = (y_target > rv2)

        weights1 = np.abs(y_target - rv1) + 1e-10
        weights2 = np.abs(y_target - rv2) + 1e-10

        return (y_binary1, y_binary2, weights1, weights2)

    def __getitem__(self, i):
        """Build the padded sample for bucket ``i``.

        Returns:
            ``(X, price_diff, mask, wap1, wap2, s1, s2, rv1, rv2)`` where
            ``X`` is ``(num_buckets, num_features)``, ``price_diff`` is
            ``(num_buckets, 4)``, ``mask`` (long) marks the real rows with 1,
            ``wap*`` / ``s*`` are ``(num_buckets,)`` and ``rv*`` are scalars.
            Real data occupies the last ``seq_len`` rows.

        Raises:
            ValueError: if the sequence is longer than ``num_buckets``.
        """
        feat = np.array(self.features[i])
        (seq_len, num_features) = (feat.shape[0], feat.shape[1])
        if seq_len > self.num_buckets:
            raise ValueError(
                'sequence %d has %d rows but num_buckets is %d'
                % (i, seq_len, self.num_buckets)
            )

        # Returns and volatilities are computed from the raw (unsmoothed) WAPs.
        (wap1_arr, wap2_arr, s1_arr, s2_arr, rv1, rv2) = self.get_features(feat)
        price_diff = self.get_price_differences(feat, seq_len)

        feat = smoothing(feat)
        wap1_arr = smoothing_1d(wap1_arr)
        wap2_arr = smoothing_1d(wap2_arr)

        X = torch.zeros(self.num_buckets, num_features)

        mask = np.zeros(self.num_buckets)
        mask[-seq_len:] = 1

        wap1 = torch.zeros(self.num_buckets)
        wap2 = torch.zeros(self.num_buckets)
        s1 = torch.zeros(self.num_buckets)
        s2 = torch.zeros(self.num_buckets)

        mask = torch.tensor(mask, dtype=torch.long)
        X[-seq_len:] = torch.tensor(feat, dtype=torch.float32)
        wap1[-seq_len:] = torch.tensor(wap1_arr, dtype=torch.float32)
        wap2[-seq_len:] = torch.tensor(wap2_arr, dtype=torch.float32)

        s1[-seq_len:] = torch.tensor(s1_arr, dtype=torch.float32)
        s2[-seq_len:] = torch.tensor(s2_arr, dtype=torch.float32)

        rv1 = torch.tensor(rv1, dtype=torch.float32)
        rv2 = torch.tensor(rv2, dtype=torch.float32)

        return (X, price_diff, mask, wap1, wap2, s1, s2, rv1, rv2)

# model

In [None]:
#https://github.com/KrisKorrel/sparsemax-pytorch

class Sparsemax(nn.Module):
    """Sparsemax function."""

    def __init__(self, dim=None):
        """Initialize sparsemax activation

        Args:
            dim (int, optional): The dimension over which to apply the sparsemax function.
                Defaults to the last dimension.
        """
        super(Sparsemax, self).__init__()

        self.dim = -1 if dim is None else dim

    def forward(self, input):
        """Forward function.
        Args:
            input (torch.Tensor): Input tensor of any rank.
        Returns:
            torch.Tensor: tensor of the same shape as ``input`` whose values along
            ``self.dim`` are non-negative and sum to 1.
        """
        # Sparsemax currently only handles 2-dim tensors,
        # so we reshape to a convenient shape and reshape back after sparsemax
        input = input.transpose(0, self.dim)
        original_size = input.size()
        input = input.reshape(input.size(0), -1)
        input = input.transpose(0, 1)
        dim = 1

        number_of_logits = input.size(dim)

        # Translate input by max for numerical stability
        input = input - torch.max(input, dim=dim, keepdim=True)[0].expand_as(input)

        # Sort input in descending order.
        # (NOTE: Can be replaced with linear time selection method described here:
        # http://stanford.edu/~jduchi/projects/DuchiShSiCh08.html)
        zs = torch.sort(input=input, dim=dim, descending=True)[0]
        # Build the 1..K ranks on the input's own device instead of relying on a
        # notebook-level `device` global that may not exist (or not match) yet.
        ranks = torch.arange(start=1, end=number_of_logits + 1, step=1,
                             device=input.device, dtype=input.dtype).view(1, -1)
        ranks = ranks.expand_as(zs)

        # Determine sparsity of projection
        bound = 1 + ranks * zs
        cumulative_sum_zs = torch.cumsum(zs, dim)
        is_gt = torch.gt(bound, cumulative_sum_zs).to(input.dtype)
        k = torch.max(is_gt * ranks, dim, keepdim=True)[0]

        # Compute threshold function
        zs_sparse = is_gt * zs

        # Compute taus
        taus = (torch.sum(zs_sparse, dim, keepdim=True) - 1) / k
        taus = taus.expand_as(input)

        # Sparsemax (kept on the module because backward() reads it)
        self.output = torch.max(torch.zeros_like(input), input - taus)

        # Reshape back to original shape
        output = self.output
        output = output.transpose(0, 1)
        output = output.reshape(original_size)
        output = output.transpose(0, self.dim)

        return output

    def backward(self, grad_output):
        """Backward function.

        Kept from the upstream implementation; nothing in this notebook calls it.
        ``grad_output`` must have the 2-D shape of ``self.output``.
        """
        dim = 1

        nonzeros = torch.ne(self.output, 0)
        # keepdim=True so the per-row mean broadcasts against grad_output.
        support_mean = (torch.sum(grad_output * nonzeros, dim=dim, keepdim=True)
                        / torch.sum(nonzeros, dim=dim, keepdim=True))
        self.grad_input = nonzeros * (grad_output - support_mean.expand_as(grad_output))

        return self.grad_input

In [None]:
# Shared Sparsemax instance (applied over the last dimension) used by attention().
sparsemax = Sparsemax(dim=-1)

def get_activation_fn(activation):
    """Return a fresh activation module for the given name.

    Args:
        activation: ``'gelu'`` or ``'relu'``.

    Returns:
        ``nn.GELU()`` or ``nn.ReLU()``.

    Raises:
        ValueError: for any other name (previously this silently returned
            ``None``, which only failed later when the "activation" was called).
    """
    if activation == 'gelu':
        return nn.GELU()
    elif activation == 'relu':
        return nn.ReLU()
    raise ValueError(
        "unsupported activation %r (expected 'gelu' or 'relu')" % (activation,)
    )
    
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention with sparsemax weights.

    Scores are ``query @ key^T`` over the last two dimensions, divided by
    ``sqrt(query.size(-1))``. Where ``mask == 0`` the score is set to -1e9
    before the module-level ``sparsemax`` is applied.

    Args:
        query, key, value: tensors compatible with ``torch.matmul``.
        mask: optional tensor broadcastable to the score shape.
        dropout: optional callable applied to the attended values
            (not to the attention weights).

    Returns:
        ``(p_attn, x_attn)``: the attention weights and the attended values.
    """
    scale = np.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-1, -2)) / scale

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = sparsemax(scores)
    x_attn = torch.matmul(p_attn, value)

    if dropout:
        x_attn = dropout(x_attn)

    return p_attn, x_attn

class MultiHeadAttention(nn.Module):
    """Conv1d-projected multi-head attention block with a residual connection.

    Args:
        num_features: channel count of the input and of the output.
        dmodel: total width of the query/key/value projections.
        nhead: number of heads; each head has ``dmodel // nhead`` channels.
        activation: stored on the module; not used by ``forward``.
        norm: normalisation module applied to the input.
        dropout: module passed to ``attention`` and applied to its output.
    """

    def __init__(self, num_features, dmodel, nhead, activation, norm, dropout):
        super().__init__()
        self.dmodel = dmodel
        self.nhead = nhead
        self.d_k = dmodel // nhead  # channels per head

        self.activation = activation
        self.norm = norm
        self.dropout = dropout

        # Projections are 5-wide convolutions over the time axis (padding keeps the length).
        self.Q = nn.Conv1d(num_features, dmodel, 5, padding=2)
        self.K = nn.Conv1d(num_features, dmodel, 5, padding=2)
        self.V = nn.Conv1d(num_features, dmodel, 5, padding=2)

        self.W = nn.Linear(dmodel, num_features)

    def forward(self, x, mask=None):
        """Attend over ``x`` and add the result to the normalised input.

        Args:
            x: ``(batch, seq_len, num_features)`` tensor.
            mask: optional ``(batch, seq_len)`` tensor; positions equal to 0 are
                masked. ``None`` disables masking.

        Returns:
            ``(batch, seq_len, num_features)`` tensor.
        """
        bsize = x.size(0)
        x = self.norm(x)
        x = x.transpose(2, 1)  # Conv1d expects (batch, channels, seq_len)
        query = self.Q(x).transpose(2, 1).view(bsize, -1, self.nhead, self.d_k)
        key = self.K(x).transpose(2, 1).view(bsize, -1, self.nhead, self.d_k)
        value = self.V(x).transpose(2, 1).view(bsize, -1, self.nhead, self.d_k)
        # NOTE(review): q/k/v stay (batch, seq_len, nhead, d_k), so the score
        # matrix built in attention() is (nhead x nhead) per time step rather
        # than (seq_len x seq_len) per head. Presumably unintended, but left
        # unchanged so the block matches the weights loaded by this notebook.
        if mask is not None:
            # (batch, seq_len) -> (batch, seq_len, 1, 1) to broadcast over the scores.
            mask = mask.unsqueeze(-1).unsqueeze(-1)

        p_attn, x_attn = attention(query, key, value, mask, self.dropout)
        x_attn = x_attn.view(bsize, -1, self.nhead * self.d_k)

        x_attn = self.W(x_attn)
        x = x.transpose(2, 1)
        # The residual uses the normalised input, not the raw one.
        x = x + x_attn
        return x

class TimeSeriesAttentionLayer(nn.Module):
    """One encoder layer: multi-head attention followed by a convolutional feed-forward.

    Args:
        num_features: channel count of the layer's input and output.
        dmodel: projection width inside the attention block.
        nhead: number of attention heads.
        dim_feed_forward: hidden channel count of the feed-forward convolutions.
        activation: accepted for API compatibility (see the note in ``__init__``).
        dropout: dropout probability.
    """

    def __init__(self, num_features=32, dmodel=128, nhead=4,
                 dim_feed_forward=512, activation='relu', dropout=0.1):
        super().__init__()

        self.num_features = num_features
        self.dmodel = dmodel
        self.nhead = nhead
        self.dim_feed_forward = dim_feed_forward

        # NOTE(review): the `activation` argument is ignored and GELU is always
        # used. Kept as is so the layer matches the weights this notebook loads.
        self.activation = get_activation_fn('gelu')
        # A single LayerNorm instance is shared by the attention block and this layer.
        self.norm = nn.LayerNorm(num_features)
        self.dropout = nn.Dropout(dropout)

        self.multihead_attn = MultiHeadAttention(
            num_features, dmodel, nhead, self.activation, self.norm, self.dropout
        )

        self.conv1 = nn.Conv1d(num_features, dim_feed_forward, 5, padding=2)
        self.conv2 = nn.Conv1d(dim_feed_forward, num_features, 5, padding=2)

    def forward(self, x, mask=None):
        """Run attention, normalise, then add the feed-forward branch.

        ``x`` is ``(batch, seq_len, num_features)``; the output has the same shape.
        """
        attended = self.norm(self.multihead_attn(x, mask))
        # Conv1d works on (batch, channels, seq_len), hence the transposes.
        hidden = self.activation(self.conv1(attended.transpose(1, 2)))
        x_ffn = self.conv2(self.dropout(hidden)).transpose(2, 1)
        return attended + x_ffn



class FeatureExtractorWith1DConv(nn.Module):
    """Two-layer 1-D convolutional feature extractor over the time axis.

    Maps ``(batch, seq_len, input_size)`` to ``(batch, seq_len, output_size)``:
    BatchNorm -> Conv(5) -> BatchNorm -> GELU -> Dropout -> Conv(5) -> BatchNorm -> GELU.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_size = 2 * output_size

        self.pre_bn = nn.BatchNorm1d(input_size)

        self.conv1 = nn.Conv1d(input_size, hidden_size, 5, padding=2)
        self.bn1 = nn.BatchNorm1d(hidden_size)

        self.conv2 = nn.Conv1d(hidden_size, output_size, 5, padding=2)
        self.bn2 = nn.BatchNorm1d(output_size)

        self.activation = nn.GELU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # Conv1d / BatchNorm1d expect channels first: (batch, channels, seq_len).
        channels_first = self.pre_bn(x.transpose(1, 2))

        hidden = self.bn1(self.conv1(channels_first))
        hidden = self.dropout(self.activation(hidden))

        out = self.activation(self.bn2(self.conv2(hidden)))

        # Back to (batch, seq_len, channels).
        return out.transpose(1, 2)

In [None]:
class FFN(nn.Module):
    """Pre-normalised two-layer MLP: ``sz -> 2*sz -> sz``.

    Each linear layer is preceded by BatchNorm, GELU and dropout (p=0.2).
    Input and output are ``(batch, sz)``.
    """

    def __init__(self, sz):
        super().__init__()
        wide = 2 * sz

        self.bn1 = nn.BatchNorm1d(sz)
        self.linear1 = nn.Linear(sz, wide)

        self.bn2 = nn.BatchNorm1d(wide)
        self.linear2 = nn.Linear(wide, sz)

        self.activation = nn.GELU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        stages = ((self.bn1, self.linear1), (self.bn2, self.linear2))
        for norm, linear in stages:
            x = linear(self.dropout(self.activation(norm(x))))
        return x


class ConvHead(nn.Module):
    """Strided convolutional head that pools a sequence to a fixed-size vector.

    Maps ``(batch, seq_len, dmodel)`` to ``(batch, dmodel * pool_size)`` using
    two stride-4 convolutions followed by adaptive average pooling.
    """

    def __init__(self, dmodel, pool_size):
        super().__init__()
        wide = 2 * dmodel
        layers = [
            nn.Conv1d(dmodel, wide, 7, padding=3, stride=4),
            nn.BatchNorm1d(wide),
            nn.GELU(),

            nn.Conv1d(wide, dmodel, 5, padding=2, stride=4),
            nn.BatchNorm1d(dmodel),
            nn.GELU(),

            nn.AdaptiveAvgPool1d(pool_size),
        ]
        self.convs = nn.Sequential(*layers)

    def forward(self, x):
        n_samples = x.size(0)
        # Convolutions run on (batch, channels, seq_len).
        pooled = self.convs(x.transpose(1, 2))
        # Flatten channels and pooled positions into one feature axis.
        return pooled.view(n_samples, -1)
    
    
class MLPHead(nn.Module):
    """Regression head: dropout, a stack of FFN blocks, then a linear layer to 2 outputs.

    Input is ``(batch, sz)``; output is ``(batch, 2)``.
    """

    def __init__(self, sz, num_layers):
        super().__init__()
        self.ffn = nn.ModuleList(FFN(sz) for _ in range(num_layers))
        self.dropout = nn.Dropout(0.2)
        self.out = nn.Linear(sz, 2)

    def forward(self, x):
        hidden = self.dropout(x)
        for block in self.ffn:
            hidden = block(hidden)
        return self.out(hidden)
    
from torch.autograd import Variable
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a ``(batch, seq_len, d_model)`` tensor.

    Args:
        d_model: size of the last (feature) dimension; odd values are supported.
        dropout: probability for ``self.dropout``. The module is created but
            ``forward`` does not apply it.
        max_len: number of positions to precompute. ``None`` (the default)
            resolves to ``config.num_buckets`` when the module is constructed.
    """

    def __init__(self, d_model, dropout, max_len=None):
        super(PositionalEncoding, self).__init__()
        if max_len is None:
            # Resolved at construction time instead of at class-definition time.
            max_len = config.num_buckets
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # A buffer moves with the module across devices and is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        # Buffers never require grad, so no (deprecated) Variable wrapper is needed.
        return x + self.pe[:, :x.size(1)]
    
class OptiverEncoder(nn.Module):
    """Sequence encoder: conv feature extractor, positional encoding, attention stack.

    ``params`` is a dict with the keys ``dmodel``, ``in_features``,
    ``out_features``, ``num_ffn_layers``, ``pool_size``, ``nhead``,
    ``dim_feed_forward`` and ``num_attention_layers``.
    """

    def __init__(self, params):
        super().__init__()
        self.dmodel = params['dmodel']
        self.in_features = params['in_features']
        self.out_features = params['out_features']
        self.num_ffn_layers = params['num_ffn_layers']
        self.pool_size = params['pool_size']

        # Created here but not applied in forward().
        self.dropout = nn.Dropout(0.1)
        self.feature_extractor = FeatureExtractorWith1DConv(self.in_features, self.out_features)
        self.positions = PositionalEncoding(self.out_features, 0.1)

        layers = []
        for _ in range(params['num_attention_layers']):
            layers.append(
                TimeSeriesAttentionLayer(
                    num_features=self.out_features,
                    dmodel=self.dmodel,
                    nhead=params['nhead'],
                    dim_feed_forward=params['dim_feed_forward'],
                )
            )
        self.attn_layers = nn.ModuleList(layers)

    def forward(self, x, mask):
        """Encode ``x`` (``(batch, seq_len, in_features)``); ``mask`` is passed to every attention layer."""
        encoded = self.feature_extractor(x)
        encoded = self.positions(encoded)
        for layer in self.attn_layers:
            encoded = layer(encoded, mask)
        return encoded
    
class OptiverModel(nn.Module):
    """Full model: encoder, masked mean/max pooling over time, MLP head with 2 outputs."""

    def __init__(self, params):
        super().__init__()
        self.dmodel = params['dmodel']
        self.in_features = params['in_features']
        self.out_features = params['out_features']
        self.num_ffn_layers = params['num_ffn_layers']
        self.pool_size = params['pool_size']

        self.encoder = OptiverEncoder(params)

        # The head sees mean-pooled and max-pooled features concatenated.
        self.alpha_model = MLPHead(2 * self.out_features, self.num_ffn_layers)

    def pooling(self, x, mask):
        """Pool ``x`` (``(batch, seq_len, features)``) over the positions where ``mask`` is non-zero.

        Returns a ``(batch, 2 * features)`` tensor: masked mean followed by masked max.
        """
        mask = mask.unsqueeze(dim=-1)

        totals = (x * mask).sum(dim=1)
        counts = mask.sum(dim=1)
        mean_pool = totals / counts

        # Masked positions are pushed to -1e9 so they never win the max.
        max_pool, _ = torch.max(x.masked_fill(mask == 0, -1e9), dim=1)

        return torch.cat([mean_pool, max_pool], dim=1)

    def forward(self, x, mask):
        """Return the head's ``(batch, 2)`` output for input ``x`` and padding ``mask``."""
        encoded = self.encoder(x, mask)
        pooled = self.pooling(encoded, mask)
        return self.alpha_model(pooled)

# Loading models

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# NOTE(review): each checkpoint is unpickled into a ready-to-call model object,
# so the model classes defined above must exist under the same names, and only
# trusted checkpoint files should be loaded.
checkpoint_paths = [
    '../input/optiverpool-head-check2/best_rmspe1.pt',
]
models = [torch.load(path, map_location=device) for path in checkpoint_paths]

# inference

In [None]:
def inference(models, features):
    """Predict one target per element of ``features``, averaged over ``models``.

    For every model the first output column is multiplied by ``rv1`` (the
    level-1 realized volatility from the dataset), made non-negative with
    ``abs`` and clamped to ``[0, 1.5]``. The per-model predictions for a batch
    are then averaged, so the result always has ``len(features)`` entries
    (previously each extra model appended another full set of rows). With a
    single model the output is unchanged.

    The model's second output column is not used; the original code computed
    it and then discarded it.

    Args:
        models: non-empty sequence of models callable as ``model(X, mask)``.
        features: per-bucket feature arrays accepted by ``OptiverDataset``.

    Returns:
        list[float]: predictions in dataset order.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError('inference() needs at least one model')

    test_dataset = OptiverDataset(features)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=512,
                                                  shuffle=False,
                                                  num_workers=2,
                                                  pin_memory=True,
                                                  drop_last=False)

    # Switch to eval mode once instead of once per batch.
    for model in models:
        model.eval()

    ypred = []
    with torch.no_grad():
        for (X, price_diff, mask, wap1, wap2, s1, s2, rv1, rv2) in test_dataloader:
            # Append the derived per-step series to the raw features as extra columns.
            X = torch.cat([X, price_diff, wap1.unsqueeze(-1), wap2.unsqueeze(-1),
                           s1.unsqueeze(-1), s2.unsqueeze(-1)], dim=-1)
            X = X.to(device)
            mask = mask.to(device)
            rv1 = rv1.to(device)

            batch_preds = []
            for model in models:
                yhat_alpha = model(X, mask)
                yhat1 = torch.abs(yhat_alpha[:, 0].view(-1) * rv1)
                batch_preds.append(torch.clamp(yhat1.detach().cpu(), 0, 1.5))

            # Average across models; with one model this is the prediction itself.
            ypred += torch.stack(batch_preds).mean(dim=0).numpy().tolist()
    return ypred

In [None]:
# Predict stock by stock so only one order book is held in memory at a time.
per_stock_predictions = []
for stock_file in os.listdir(test_book_folder):
    # The stock id is whatever follows the last '=' in the entry name
    # (presumably entries look like "stock_id=<n>"; verify against the data).
    stock_id = stock_file.split('=')[-1]

    book = pd.read_parquet(os.path.join(test_book_folder, stock_file))
    book['stock_id'] = int(stock_id)

    # Rows must be in time order inside each bucket before they are grouped.
    book = book.sort_values(['time_id', 'seconds_in_bucket'])
    for colname in log_transform_features:
        book[colname] = np.log(1 + book[colname])

    # One feature vector per row ...
    book['features'] = book[feature_columns].values.tolist()
    book['features'] = book['features'].apply(np.array)

    # ... stacked into one 2-D array per (stock_id, time_id) bucket.
    buckets = book.groupby(['stock_id', 'time_id'])[['features']].agg(list).reset_index()
    buckets['features'] = buckets['features'].apply(np.array)

    buckets['target'] = inference(models, buckets.features.values)
    per_stock_predictions.append(buckets.drop(columns='features'))
    gc.collect()

all_df = pd.concat(per_stock_predictions)
all_df.head()

In [None]:
# Attach the predictions to the official test rows; any value still missing
# after the left join is replaced with 0.0.
test_df = (
    pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    .merge(all_df, how='left')
    .fillna(0.0)
)
test_df.head()

In [None]:
# Write the submission file: one row per row_id with its predicted target.
test_df[['row_id', 'target']].to_csv('submission.csv', index=False)