In [None]:
# Install pytorch-tabnet 3.1.1 from the wheel file stored under ../input
# (a local file path is given, not a package name).
!pip install -q '../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl'

In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from glob import glob
from joblib import Parallel, delayed
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
# Notebook-wide configuration.
# NOTE(review): SEED and MAX_EPOCH are not referenced by the cells visible
# here -- presumably left over from the training notebook; confirm before removing.
SEED = 42
MAX_EPOCH = 100

# Directory holding the competition files (test.csv, book_*.parquet).
ROOT_DIR = '../input/optiver-realized-volatility-prediction'

In [None]:
# Load the stock_id=0 partition of the test order book.
# The location is derived from ROOT_DIR (like every other read in this
# notebook) instead of repeating the dataset path as a second literal.
# NOTE(review): this frame is not referenced by the later cells visible here.
book_testparquet = pd.read_parquet(os.path.join(ROOT_DIR, 'book_test.parquet/stock_id=0'))

In [None]:
def log_return(list_stock_prices):
    """Return the element-wise difference of the natural log of the prices.

    The argument must yield an object with a ``.diff()`` method after
    ``np.log`` (e.g. a pandas Series); the first element of the result has
    no predecessor and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

In [None]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Compute per-``time_id`` order-book statistics for one stock.

    Reads ``book_{dataType}.parquet/stock_id={stock_id}/`` under ``ROOT_DIR``
    and aggregates, for every ``time_id``:

    * ``log_return`` -- realized volatility (square root of the sum of
      squared log returns) of the level-1 weighted average price (WAP);
    * ``bas`` -- mean of
      ``min(ask_price1, ask_price2) / max(bid_price1, bid_price2) - 1``.

    Parameters
    ----------
    stock_id : int
        Stock whose order-book partition is read.
    dataType : str, default 'train'
        Which book file to read (used to build ``book_{dataType}.parquet``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns ``time_id``, ``log_return``,
        ``bas`` and ``stock_id``, in that order.
    """
    book_train_subset = pd.read_parquet(os.path.join(ROOT_DIR, f'book_{dataType}.parquet/stock_id={stock_id}/'))

    # sort_values returns a new frame, so the result has to be kept; otherwise
    # log returns are computed in file order rather than time order. The
    # index is rebuilt so that row labels match the sorted row positions.
    book_train_subset = (
        book_train_subset
        .sort_values(by=['time_id', 'seconds_in_bucket'])
        .reset_index(drop=True)
    )

    # Relative gap between the lowest ask and the highest bid over both levels.
    book_train_subset['bas'] = (book_train_subset[['ask_price1', 'ask_price2']].min(axis = 1)
                                / book_train_subset[['bid_price1', 'bid_price2']].max(axis = 1)
                                - 1)

    # Level-1 WAP: each price is weighted by the size on the opposite side.
    book_train_subset['wap'] = (book_train_subset['bid_price1'] * book_train_subset['ask_size1'] +
                            book_train_subset['ask_price1'] * book_train_subset['bid_size1']) / (
                            book_train_subset['bid_size1']+ book_train_subset['ask_size1'])

    # transform() hands back a Series aligned to the frame's own index, so the
    # values land on the rows they were computed from regardless of how a
    # particular pandas version indexes the result of groupby.apply().
    # The first row of each time_id has no previous price -> NaN -> 0.
    book_train_subset['log_return'] = (book_train_subset.groupby(by = ['time_id'])['wap'].
                                       transform(log_return).
                                       fillna(0)
                                      )

    stock_stat = pd.merge(
        book_train_subset.groupby(by = ['time_id'])['log_return'].agg(realized_volatility).reset_index(),
        book_train_subset.groupby(by = ['time_id'], as_index = False)['bas'].mean(),
        on = ['time_id'],
        how = 'left'
    )

    stock_stat['stock_id'] = stock_id

    return stock_stat

def get_dataset(stock_ids : list, dataType = 'train'):
    """Compute ``get_stock_stat`` for every stock id and stack the results.

    The per-stock calls are dispatched through joblib ``Parallel`` with
    ``n_jobs=-1``; the resulting frames are concatenated into a single
    DataFrame with a fresh 0..n-1 index.
    """
    pending_jobs = (
        delayed(get_stock_stat)(stock_id, dataType)
        for stock_id in stock_ids
    )
    per_stock_frames = Parallel(n_jobs=-1)(pending_jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Rows to predict, as listed in test.csv.
test = pd.read_csv(os.path.join(ROOT_DIR, 'test.csv'))

# Order-book statistics for every stock that appears in the test set.
test_stock_stat_df = get_dataset(stock_ids = test['stock_id'].unique(), dataType = 'test')

# Left join keeps every test row even when no statistics were computed for it;
# the join keys are dropped afterwards.
test_dataset = (
    test
    .merge(test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
    .drop(columns = ['stock_id', 'time_id'])
)

In [None]:
# One-column frame of row ids; the prediction column is attached to it later.
y_pred = test_dataset[['row_id']]

# Every remaining column is passed to the model; missing values become 0.
X_test = test_dataset.drop(columns = ['row_id']).fillna(0)

In [None]:
# Copy model_params.json and network.pt from the model dataset into the
# current directory, pack them into optiver-tabnet-model.zip, then delete the
# loose copies so only the archive remains.
# NOTE(review): presumably needed because TabNet's load_model expects a zip
# archive while the dataset stores the two files unpacked -- confirm.
!cp ../input/optiver-tabnet-model/{model_params.json,network.pt} ./
!zip optiver-tabnet-model.zip ./model_params.json ./network.pt
!rm ./model_params.json ./network.pt

In [None]:
# Restore the trained TabNet regressor from the archive that the preceding
# cell writes to the current working directory. The same relative location is
# used here, so the load does not depend on the cwd being /kaggle/working.
clf = TabNetRegressor()
clf.load_model('./optiver-tabnet-model.zip')

In [None]:
# Attach the model output for the test features as the `target` column.
y_pred = y_pred.assign(
    target = clf.predict(X_test.values),
)

# Write the submission file (row_id, target) without the index column,
# then display the frame as the cell output.
y_pred.to_csv('submission.csv', index = False)
y_pred

In [None]:
# Delete the temporary model archive created earlier so it is not left in the
# working directory next to submission.csv.
!rm ./optiver-tabnet-model.zip