In [None]:
# Install pytorch-tabnet with the %pip magic so the package lands in the environment of the running kernel.
%pip install -q pytorch-tabnet

In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from glob import glob
from joblib import Parallel, delayed
from sklearn.model_selection import KFold, train_test_split
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

In [None]:
# Notebook-wide configuration.
ROOT_DIR = '../input/optiver-realized-volatility-prediction'  # base directory of the input files
SEED = 42      # handed to TabNet as its `seed`
MAX_EPOCH = 1  # handed to `clf.fit` as `max_epochs`

In [None]:
# Read the book_test partition for stock_id=0, deriving the path from ROOT_DIR
# like the other readers in this notebook instead of repeating the data root.
book_testparquet = pd.read_parquet(os.path.join(ROOT_DIR, 'book_test.parquet/stock_id=0'))

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must still expose ``.diff()`` after ``np.log`` has been applied
    (for example a pandas Series). The first element of the result is NaN
    because it has no predecessor to be differenced against.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build per-``time_id`` summary statistics from one stock's order book.

    Reads ``book_{dataType}.parquet/stock_id={stock_id}/`` below ``ROOT_DIR``
    and reduces it to one row per ``time_id``.

    Parameters
    ----------
    stock_id : int
        Stock whose book partition is read; also written to the ``stock_id``
        column of the result.
    dataType : str, default 'train'
        Selects which ``book_{dataType}.parquet`` dataset is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_id``, ``log_return`` (``realized_volatility`` of the
        WAP log returns inside the bucket), ``bas`` (bucket mean of
        ``min(ask_price1, ask_price2) / max(bid_price1, bid_price2) - 1``)
        and ``stock_id``.
    """
    book_path = os.path.join(ROOT_DIR, f'book_{dataType}.parquet/stock_id={stock_id}/')

    # sort_values returns a new frame, so the result must be kept; the log
    # returns below are only meaningful when rows are in time order per bucket.
    book = (pd.read_parquet(book_path)
            .sort_values(by=['time_id', 'seconds_in_bucket'])
            .reset_index(drop=True))

    lowest_ask = book[['ask_price1', 'ask_price2']].min(axis=1)
    highest_bid = book[['bid_price1', 'bid_price2']].max(axis=1)
    book['bas'] = lowest_ask / highest_bid - 1

    # Weighted average price: each side's price weighted by the opposite side's size.
    book['wap'] = (
        (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1'])
        / (book['bid_size1'] + book['ask_size1'])
    )

    # transform returns a result aligned to the frame's own index, so every
    # log return is attached to the row it was computed from rather than being
    # re-attached by position. The first row of each bucket has no predecessor
    # and is set to 0.
    book['log_return'] = (book.groupby('time_id')['wap']
                          .transform(log_return)
                          .fillna(0))

    stock_stat = book.groupby('time_id', as_index=False).agg(
        log_return=('log_return', realized_volatility),
        bas=('bas', 'mean'),
    )
    stock_stat['stock_id'] = stock_id

    return stock_stat

def get_dataset(stock_ids : list, dataType = 'train'):
    """Collect the ``get_stock_stat`` output of several stocks into one frame.

    ``get_stock_stat(stock_id, dataType)`` is dispatched for every entry of
    ``stock_ids`` through joblib with ``n_jobs=-1``; the per-stock frames are
    then concatenated with a fresh index.
    """
    jobs = (delayed(get_stock_stat)(one_id, dataType) for one_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index=True)

In [None]:
# Load the train/test tables found under ROOT_DIR.
train = pd.read_csv(os.path.join(ROOT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(ROOT_DIR, 'test.csv'))

# Compute the book statistics for every stock present in train, then attach
# them to the training rows on (stock_id, time_id).
train_stock_stat_df = get_dataset(train['stock_id'].unique(), 'train')
train_dataset = train.merge(train_stock_stat_df, how='left', on=['stock_id', 'time_id'])

In [None]:
# Target as a single-column 2-D array; the identifier columns and the target
# itself are dropped from the feature frame.
y = train_dataset['target'].values.reshape(-1, 1)
X = train_dataset.drop(columns=['stock_id', 'time_id', 'target'])

# Seed the split from the notebook-wide SEED rather than a second hard-coded 42,
# so changing the config cell changes the split as well.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=SEED)

# The model below is fitted on plain NumPy arrays.
X_train = X_train.values
X_valid = X_valid.values

In [None]:
def rmspe(y_true, y_pred):
    '''
    Root Mean Square Percentage Error of ``y_pred`` relative to ``y_true``.

    The mean is taken along axis 0 and the result is converted to a Python
    scalar with ``.item()``.

    Raises ``ValueError`` when any element of ``y_true`` equals zero, since
    the relative error is undefined there.
    '''
    has_zero_target = (y_true == 0).any()
    if has_zero_target:
        raise ValueError("Root Mean Square Percentage Error cannot be used when "
                         "targets contain zero values.")

    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error), axis=0)

    return np.sqrt(mean_squared).item()

class RMSPE(Metric):
    """Metric subclass that scores predictions with :func:`rmspe`."""

    def __init__(self):
        # NOTE(review): presumably read by pytorch-tabnet to label the metric
        # and to decide that smaller values are better - confirm against Metric.
        self._maximize = False
        self._name = "rmspe"

    def __call__(self, y_true, y_score):
        """Return ``rmspe(y_true, y_score)``."""
        score = rmspe(y_true, y_score)
        return score

In [None]:
# Keyword arguments forwarded to the TabNetRegressor constructor.
tabnet_params = {
    "n_d": 32,
    "n_a": 32,
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": "entmax",
    "scheduler_params": {"mode": "min", "patience": 5, "min_lr": 1e-5, "factor": 0.9},
    "scheduler_fn": ReduceLROnPlateau,
    "seed": SEED,
    "verbose": 10,
}

clf = TabNetRegressor(**tabnet_params)

# Fit on the training split; the validation split is registered under the name
# "val" and evaluated with the custom RMSPE metric.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["val"],
    eval_metric=[RMSPE],
    max_epochs=MAX_EPOCH,
    patience=20,
    batch_size=1024,
    virtual_batch_size=32,
    num_workers=4,
    drop_last=False,
)

# Persist the fitted model under the 'optiver_tabnet_model' path.
clf.save_model('optiver_tabnet_model')

In [None]:
# Quote the exclude pattern so the shell passes it to zip verbatim instead of
# expanding it against the notebooks in the current directory.
# NOTE(review): clf.save_model('optiver_tabnet_model') presumably already writes
# optiver_tabnet_model.zip; if so this command adds to that archive - confirm
# that is intended.
!zip -r optiver_tabnet_model.zip /kaggle/working/ --exclude '*.ipynb'