In [None]:

from torch.utils.data import Dataset

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler



class StockDataset(Dataset):
    """Map-style dataset that serves fixed-length, non-overlapping row windows.

    Despite the parameter name, ``df`` is indexed positionally with
    ``df[a:b]`` and ``seq[:, :-1]``, so it must support 2-D array indexing
    (``get_dataset_test`` passes the ``.values`` array of a DataFrame).

    Args:
        df: 2-D array-like whose rows are consecutive time steps; every
            ``seq_len`` rows form one sample.
        valid_features: names of the feature columns (stored, not used for
            indexing here).
        predict_len: stored for callers; not used by this class.
        row_id: optional per-sample identifiers. When given, ``__getitem__``
            returns the full window as ``x`` and ``row_id[idx]`` as ``y``.
        seq_len: number of rows per sample (defaults to the original 600).
    """

    def __init__(self, df, valid_features, predict_len=300, row_id=None, seq_len=600):
        self.df = df
        self.valid_features = valid_features
        self.predict_len = predict_len
        self.row_id = row_id
        self.seq_len = seq_len

    def __len__(self):
        # Trailing rows that do not fill a whole window are ignored.
        return self.df.shape[0] // self.seq_len

    def __getitem__(self, idx):
        start = idx * self.seq_len
        seq = self.df[start: start + self.seq_len]

        if self.row_id is None:
            # No identifiers: the last column is the label, taken from the
            # first row of the window; all other columns are the inputs.
            x = seq[:, :-1]
            y = seq[0, -1:]
        else:
            x = seq  # (seq_len, n_feature)
            y = self.row_id[idx]

        return x, y



def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def ffill(data_df):
    """Expand every ``time_id`` to seconds 0..599, forward-filling the gaps.

    The frame is re-indexed onto the full product of the observed
    ``time_id`` values and ``range(600)`` with ``method='ffill'``, then
    returned with both keys restored as ordinary columns.
    NOTE(review): presumably each ``time_id`` is expected to already have a
    row at second 0 (see ``fix_offsets``) - confirm against the caller.
    """
    keys = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(keys)
    dense_index = pd.MultiIndex.from_product(
        [indexed.index.levels[0], np.arange(0, 600)],
        names=keys,
    )
    dense = indexed.reindex(dense_index, method='ffill')
    return dense.reset_index()


def fix_offsets(data_df):
    """Shift ``seconds_in_bucket`` so every ``time_id`` starts at zero.

    Returns a new frame (the input is not modified) that also carries an
    extra ``offset`` column holding the per-``time_id`` minimum that was
    subtracted.
    """
    first_second = (
        data_df.groupby(['time_id'])['seconds_in_bucket']
        .min()
        .rename('offset')
        .to_frame()
    )
    shifted = data_df.join(first_second, on='time_id')
    shifted['seconds_in_bucket'] = shifted['seconds_in_bucket'] - shifted['offset']
    return shifted


def engineering(path):
    """Load one order-book parquet file and derive WAP / log-return features.

    Steps:
      1. read the parquet file at ``path``;
      2. re-base ``seconds_in_bucket`` per ``time_id`` and densify every
         ``time_id`` to 600 rows (``fix_offsets`` then ``ffill``);
      3. for book levels 1 and 2 add ``wap{n}`` (size-weighted price) and
         ``log_return{n}`` (per-``time_id`` log difference of ``wap{n}``,
         with the leading NaN of each group replaced by 0).

    Returns:
        The augmented DataFrame; its row count is asserted to be a multiple
        of 600.
    """
    book = pd.read_parquet(path)

    book = ffill(
        fix_offsets(book)
    )
    assert book.shape[0] % 600 == 0

    for n in range(1, 3):
        bid_price = book[f"bid_price{n}"]
        ask_price = book[f"ask_price{n}"]
        bid_size = book[f"bid_size{n}"]
        ask_size = book[f"ask_size{n}"]
        # Each price is weighted by the size on the opposite side.
        book[f"wap{n}"] = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

        # ``transform`` returns a result aligned to the original row index.
        # ``groupby(...).apply`` prepends the group key to the index on
        # pandas >= 2.0, which makes the column assignment fail.
        book[f'log_return{n}'] = (
            book.groupby('time_id')[f'wap{n}']
            .transform(log_return)
            .fillna(0.)
        )

    assert book.shape[0] % 600 == 0

    return book
    

def load_data(parquet_path, stock_id, valid_features, normalize=True):
    """Build the engineered frame for one stock and optionally standardize it.

    Args:
        parquet_path: path handed to ``engineering``.
        stock_id: value written into a new ``stock_id`` column.
        valid_features: columns to standardize when ``normalize`` is true.
        normalize: when true, a fresh ``StandardScaler`` is fitted on this
            frame's ``valid_features`` columns and the columns are replaced
            by the scaled values.

    Returns:
        The engineered DataFrame.
    """
    frame = engineering(parquet_path)
    frame["stock_id"] = stock_id

    if not normalize:
        return frame

    frame[valid_features] = StandardScaler().fit_transform(frame[valid_features])
    return frame


def get_dataset_test(data_path, stock_id, valid_features):
    """Create the inference ``StockDataset`` for one stock's book file.

    The normalized frame is reduced to the id columns plus
    ``valid_features``; the feature matrix becomes the dataset payload and
    every 600th ``(stock_id, time_id)`` pair becomes that window's row id.
    """
    frame = load_data(data_path, stock_id, valid_features, normalize=True)
    frame = frame[['stock_id', 'time_id', 'seconds_in_bucket'] + valid_features]

    feature_matrix = frame[valid_features].values
    # One identifier per 600-row window: take the first row of each window.
    window_ids = frame[['stock_id', 'time_id']].values[::600]

    return StockDataset(feature_matrix, valid_features, row_id=window_ids)



In [None]:


import torch
from torch import nn


class RNNEncoder(nn.Module):
    """Unidirectional, batch-first LSTM encoder with zero initial states.

    Args:
        rnn_num_layers: number of stacked LSTM layers.
        input_feature_len: size of each input time step.
        hidden_size: LSTM hidden size.
        device: kept as ``self.device`` for backward compatibility; the
            forward pass no longer depends on it.
        rnn_dropout: dropout between stacked layers. ``nn.LSTM`` only applies
            it when there is more than one layer, so it is passed as 0 for a
            single layer to avoid the warning PyTorch emits otherwise.
    """

    def __init__(self, rnn_num_layers=1, input_feature_len=1, hidden_size=100, device='cpu', rnn_dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.input_feature_len = input_feature_len
        self.num_layers = rnn_num_layers
        self.lstm = nn.LSTM(
            num_layers=rnn_num_layers,
            input_size=input_feature_len,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=False,
            dropout=rnn_dropout if rnn_num_layers > 1 else 0.0,
        )
        self.device = device

    def forward(self, input_seq):
        """Encode ``input_seq`` of shape (batch, seq_len, input_feature_len).

        Returns:
            ``(outputs, (h_n, c_n))`` exactly as produced by ``nn.LSTM``.
        """
        # Allocate the initial states on the input's own device and dtype, so
        # the module keeps working after ``.to(...)`` or on a host where the
        # device given at construction time is unavailable.
        ht = input_seq.new_zeros(self.num_layers, input_seq.size(0), self.hidden_size)
        ct = torch.zeros_like(ht)

        gru_out, hidden = self.lstm(input_seq, (ht, ct))

        return gru_out, hidden


class Many2One(nn.Module):
    """Sequence-to-scalar head: encoder, then linear layer, then ``exp``.

    ``encoder`` must return ``(outputs, state)`` where ``outputs`` has shape
    (batch, seq_len, hidden_size); only the last time step is used.
    """

    def __init__(self, encoder, hidden_size):
        super().__init__()
        self.encoder = encoder
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return a strictly positive (batch, 1) prediction for ``x``."""
        step_outputs, _ = self.encoder(x)
        final_step = step_outputs[:, -1]
        # Exponentiating the linear output keeps the prediction positive.
        return torch.exp(self.out(final_step))

In [None]:

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm


import os


def rmspe(y_true, y_pred):
    """Root mean squared percentage error between two tensors.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(torch.square(relative_error))
    return torch.sqrt(mean_squared)

def load_model(model, model_path, optimizer=None, resume=False, 
               lr=None, lr_step=None):
    """Load checkpoint weights into ``model`` and optionally resume ``optimizer``.

    The checkpoint must be a dict with at least ``'epoch'`` and
    ``'state_dict'``. A leading ``'module.'`` prefix (but not
    ``'module_list'``) is stripped from parameter names. Parameters whose
    shape differs from the model's, or that are missing from the checkpoint,
    keep the model's current values; a message is printed for each.

    Args:
        model: module receiving the weights.
        model_path: checkpoint file path.
        optimizer: optional optimizer; its presence changes the return value.
        resume: when true and the checkpoint has an ``'optimizer'`` entry,
            the optimizer state is restored.
        lr: base learning rate used to recompute the resumed learning rate.
            When ``None`` the learning rate restored from the checkpoint is
            left untouched.
        lr_step: epochs at which the learning rate is multiplied by 0.1;
            ``None`` means no decay steps.

    Returns:
        ``model`` when ``optimizer`` is ``None``, otherwise
        ``(model, optimizer, checkpoint['epoch'], checkpoint['loss'])``.
    """
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
    print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch']))
    state_dict_ = checkpoint['state_dict']
    state_dict = {}

    # Convert DataParallel-style names ('module.xxx') to plain names.
    for k in state_dict_:
        if k.startswith('module') and not k.startswith('module_list'):
            state_dict[k[7:]] = state_dict_[k]
        else:
            state_dict[k] = state_dict_[k]

    model_state_dict = model.state_dict()

    # check loaded parameters and created model parameters
    msg = 'If you see this, your model does not fully load the '+\
            'pre-trained weight. Please make sure '+\
            'you have correctly specified --arch xxx '+\
            'or set the correct --num_classes for your own dataset.'

    for k in state_dict:
        if k in model_state_dict:
            if state_dict[k].shape != model_state_dict[k].shape:
                print('Skip loading parameter {}, required shape{}, '\
                    'loaded shape{}. {}'.format(
                k, model_state_dict[k].shape, state_dict[k].shape, msg))
                # Fall back to the model's own tensor for this parameter.
                state_dict[k] = model_state_dict[k]
        else:
            print('Drop parameter {}.'.format(k) + msg)

    for k in model_state_dict:
        if k not in state_dict:
            print('No param {}.'.format(k) + msg)
            state_dict[k] = model_state_dict[k]
    # strict=False: extra checkpoint keys reported above are ignored.
    model.load_state_dict(state_dict, strict=False)

    # resume optimizer parameters
    if optimizer is not None and resume:
        if 'optimizer' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            if lr is not None:
                # Re-derive the step-decayed learning rate for the resumed epoch.
                start_lr = lr
                for step in (lr_step or ()):
                    if start_epoch >= step:
                        start_lr *= 0.1
                for param_group in optimizer.param_groups:
                    param_group['lr'] = start_lr
                print('Resumed optimizer with start lr', start_lr)
            else:
                # Without a base lr there is nothing to recompute; keep the
                # values restored from the checkpoint instead of crashing.
                print('Resumed optimizer with lr stored in checkpoint.')
        else:
            print('No optimizer parameters in checkpoint.')

    if optimizer is not None:
        return model, optimizer, checkpoint['epoch'], checkpoint['loss']
    else:
        return model


def get_stocks(train):
    """Return the distinct ``stock_id`` values of ``train`` in ascending order."""
    stock_ids = np.unique(train['stock_id'])
    stock_ids.sort()
    return stock_ids


def get_stock_file(root_data, stock):
    """Return the path of the book file for ``stock`` under ``root_data``.

    Looks in ``<root_data>/book_test.parquet/stock_id=<stock>`` and returns
    the alphabetically first entry, so the choice is deterministic when the
    directory holds more than one file (``os.listdir`` order is arbitrary).

    Raises:
        FileNotFoundError: if the stock directory does not exist.
        IndexError: if the stock directory is empty.
    """
    stock_dir = os.path.join(root_data, "book_test.parquet", "stock_id=" + str(stock))
    file_name = sorted(os.listdir(stock_dir))[0]
    return os.path.join(stock_dir, file_name)


def main(stock_book_file, stock_id, label_df, rv, row_ids):
    """Run inference for one stock and append the results to ``rv`` / ``row_ids``.

    Args:
        stock_book_file: path of the stock's order-book parquet file.
        stock_id: stock identifier; also selects the weight file
            ``../input/myweights/best_<stock_id>.pth``.
        label_df: accepted for interface compatibility; not used here.
        rv: list that receives one prediction array per batch.
        row_ids: list that receives one row-id array per batch.

    Returns:
        The same ``rv`` and ``row_ids`` lists (they are extended in place).
    """
    print(stock_book_file, stock_id)
    valid_columns = [
        'bid_price1', 'ask_price1', 'bid_size1', 'ask_size1', 'wap1', 'log_return1',
        'bid_price2', 'ask_price2', 'bid_size2', 'ask_size2', 'wap2', 'log_return2'
    ]
    test_data = get_dataset_test(
        stock_book_file,
        stock_id,
        valid_columns
    )
    input_feature_len = len(valid_columns)

    test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False, num_workers=8)

    # Prefer the GPU but fall back to CPU so the notebook also runs on hosts
    # without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    rnn_num_layers = 3
    hidden_size = 100
    encoder = RNNEncoder(
        rnn_num_layers=rnn_num_layers,
        input_feature_len=input_feature_len,
        hidden_size=hidden_size,
        device=device,
        rnn_dropout=0.2
    )

    model = Many2One(encoder, hidden_size)
    model = model.to(device)

    model = load_model(model, f'../input/myweights/best_{stock_id}.pth')
    model.eval()

    bar = tqdm(test_dataloader, position=0, leave=True, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    # Inference only: disable autograd so no graph is built per batch.
    with torch.no_grad():
        for x, y in bar:
            x_cuda = x.float().to(device)
            y = y.numpy()

            pred = model(x_cuda)
            pred = pred.cpu().numpy()

            rv.append(pred)
            row_ids.append(y)

    return rv, row_ids
    


In [None]:
# Competition data root; per-stock book files are looked up beneath it.
root_dir = '../input/optiver-realized-volatility-prediction'

label_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
stocks = get_stocks(label_df)

# main() appends one array per batch to each list and hands the lists back.
rv, row_ids = [], []
for stock_id in stocks:
    stock_book_file = get_stock_file(root_dir, stock_id)
    rv, row_ids = main(stock_book_file, stock_id, label_df, rv, row_ids)

# Collapse the per-batch arrays into one array each.
rv = np.concatenate(rv)
row_ids = np.concatenate(row_ids)

# Side-by-side layout: the row-id columns first, the prediction column last.
submit_data = pd.DataFrame(
    np.hstack((row_ids, rv)),
    columns=['stock_id', 'time_id', 'target']
)

# The ids became floats in the combined array; cast back before formatting.
stock_part = submit_data['stock_id'].astype(int).astype(str)
time_part = submit_data['time_id'].astype(int).astype(str)
submit_data['row_id'] = stock_part + '-' + time_part

submit_data = submit_data[['row_id', 'target']]
submit_data.to_csv('submission.csv', index=False)

In [None]:
# submit_data