In [None]:
# Offline install of pytorch-tabnet: --no-index stops pip from contacting the
# package index and --find-links points it at the wheel file shipped as an input.
!pip install --no-index --find-links ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl pytorch-tabnet

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
import math
import os
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder 
import glob
from sklearn.model_selection import train_test_split, KFold
from tqdm import tqdm
import gc
from shutil import copyfile,make_archive
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer

In [None]:
# Input locations. The two "*_path" entries are prefixes: the reader appends
# "<split>.parquet/stock_id=<id>/*" to them before globbing.
config = dict(
    input_trade_path="../input/optiver-realized-volatility-prediction/trade_",
    input_book_path="../input/optiver-realized-volatility-prediction/book_",
    train_path="../input/optiver-realized-volatility-prediction/train.csv",
    test_path="../input/optiver-realized-volatility-prediction/test.csv",
)

# Only the test index is read here; the train.csv read stays commented out.
# train_df = pd.read_csv(config['train_path'])
test_df = pd.read_csv(config["test_path"])

def read_trade_and_book_data(stock_id, inp_type, data_type):
    """Load the parquet file of a single stock from the book or trade dataset.

    Parameters
    ----------
    stock_id
        Identifier used to build the ``stock_id=<stock_id>`` folder name.
    inp_type : str
        Key into the module-level ``config`` dict that selects the path prefix
        (the call sites pass ``'input_book_path'`` or ``'input_trade_path'``).
    data_type : str
        Split name inserted in front of ``.parquet`` (e.g. ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Contents of the first matching file, in sorted path order.

    Raises
    ------
    FileNotFoundError
        If no file matches the partition pattern. Previously this surfaced as
        a bare ``IndexError`` from ``glob(...)[0]`` with no hint of which
        stock or path was missing.
    """
    pattern = config[inp_type] + f'{data_type}.parquet/stock_id={stock_id}/*'

    # glob returns matches in arbitrary order; sorting makes the pick
    # deterministic should a partition ever hold more than one file.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f"No parquet file found for stock_id={stock_id} (pattern: {pattern})"
        )

    return pd.read_parquet(matches[0])

def get_consolidated_final_trade_book_df(df, data_type):
    """Load and stack the book and trade data of every stock listed in ``df``.

    For each distinct value of ``df['stock_id']`` (in order of first
    appearance) the book and trade frames are read through
    ``read_trade_and_book_data``, tagged with a ``stock_id`` column, and
    finally concatenated into one frame per dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``stock_id`` column; only its unique values are used.
    data_type : str
        Forwarded unchanged to ``read_trade_and_book_data``.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(book_final_df, trade_final_df)``, each with a fresh ``0..n-1``
        index. Both are empty DataFrames when ``df`` lists no stocks.
    """
    stock_ids = df['stock_id'].unique().tolist()

    # Collect the per-stock frames and concatenate once at the end. Growing a
    # DataFrame with pd.concat inside the loop re-copies everything gathered
    # so far on every iteration (quadratic time and memory churn).
    book_frames = []
    trade_frames = []
    for stock_id in tqdm(stock_ids):
        # Get book data
        book_stock_df = read_trade_and_book_data(stock_id=stock_id,
                                                 inp_type='input_book_path',
                                                 data_type=data_type)
        book_stock_df['stock_id'] = stock_id
        book_frames.append(book_stock_df)

        # Get trade data
        trade_stock_df = read_trade_and_book_data(stock_id=stock_id,
                                                  inp_type='input_trade_path',
                                                  data_type=data_type)
        trade_stock_df['stock_id'] = stock_id
        trade_frames.append(trade_stock_df)

    # pd.concat raises on an empty list, so fall back to an empty frame to keep
    # the no-stocks result identical to the original implementation.
    book_final_df = pd.concat(book_frames, ignore_index=True) if book_frames else pd.DataFrame()
    trade_final_df = pd.concat(trade_frames, ignore_index=True) if trade_frames else pd.DataFrame()

    return book_final_df, trade_final_df

# Run a garbage-collection pass before the parquet load.
gc.collect()

# Only the test split is consolidated; the equivalent call for the train split
# (on a 10-row sample of train_df) is disabled in this notebook.
test_book_final_df, test_trade_final_df = get_consolidated_final_trade_book_df(
    df=test_df,
    data_type='test',
)

# Shapes of the two consolidated frames. This is not the last expression of
# its cell, so the notebook does not render it.
(test_book_final_df.shape, test_trade_final_df.shape)

def get_trade_agg_info(df):
    """Summarise trade rows per ``(stock_id, time_id)`` pair.

    For each of the columns ``seconds_in_bucket``, ``price``, ``size`` and
    ``order_count`` the mean, max, min and median are computed. Output columns
    are named ``<stat>_<suffix>`` where the suffix is ``sec_in_bucket_trade``,
    ``price``, ``size`` or ``order`` respectively, grouped by statistic in the
    order mean, max, min, median.

    Returns a DataFrame with ``stock_id`` and ``time_id`` as ordinary columns
    followed by the 16 aggregate columns.
    """
    # source column -> suffix used in the output column name
    suffix_by_column = {
        'seconds_in_bucket': 'sec_in_bucket_trade',
        'price': 'price',
        'size': 'size',
        'order_count': 'order',
    }

    # Build the named-aggregation spec; insertion order fixes the column order.
    named_aggs = {}
    for stat in ('mean', 'max', 'min', 'median'):
        for source_col, suffix in suffix_by_column.items():
            named_aggs[f'{stat}_{suffix}'] = (source_col, stat)

    grouped = df.groupby(['stock_id', 'time_id'])
    return grouped.agg(**named_aggs).reset_index()

def get_book_agg_info(df):
    """Summarise order-book rows per ``(stock_id, time_id)`` pair.

    The mean, max, min and median are computed for ``seconds_in_bucket`` and
    for the eight bid/ask price and size columns of levels 1 and 2. Output
    columns are named ``<stat>_<column>``, except ``seconds_in_bucket`` which
    becomes ``<stat>_sec_in_bucket_book``. Columns are grouped by statistic in
    the order mean, max, min, median.

    Returns a DataFrame with ``stock_id`` and ``time_id`` as ordinary columns
    followed by the 36 aggregate columns.
    """
    # (source column, name used in the output column) in output order
    column_plan = [
        ('seconds_in_bucket', 'sec_in_bucket_book'),
        ('bid_price1', 'bid_price1'),
        ('ask_price1', 'ask_price1'),
        ('bid_price2', 'bid_price2'),
        ('ask_price2', 'ask_price2'),
        ('bid_size1', 'bid_size1'),
        ('ask_size1', 'ask_size1'),
        ('bid_size2', 'bid_size2'),
        ('ask_size2', 'ask_size2'),
    ]

    # Named-aggregation spec; dict insertion order determines column order.
    named_aggs = {
        f'{stat}_{out_name}': (source_col, stat)
        for stat in ('mean', 'max', 'min', 'median')
        for source_col, out_name in column_plan
    }

    grouped = df.groupby(['stock_id', 'time_id'])
    return grouped.agg(**named_aggs).reset_index()

# Per-(stock_id, time_id) summaries of the test trade and book data. The
# corresponding train-side steps are disabled in this notebook.
test_trade_agg = get_trade_agg_info(df=test_trade_final_df)
test_book_agg = get_book_agg_info(df=test_book_final_df)

# Book aggregates are the left side, so a (stock_id, time_id) pair without any
# trade rows keeps its book features and gets NaN trade features.
merge_keys = ['stock_id', 'time_id']
test_agg = test_book_agg.merge(test_trade_agg, on=merge_keys, how='left')

# Left-join onto test_df so every submission row (and its row_id) is kept,
# then replace the NaNs left by unmatched joins with 0.
test_final_df = test_df.merge(test_agg, on=merge_keys, how='left').fillna(0)

In [None]:
# Stage the two saved model files in a local folder, zip that folder, and hand
# the archive to TabNetRegressor.load_model.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir` shell call so that
# re-running this cell does not emit a "File exists" error and the step does
# not depend on a shell being available.
os.makedirs('./model', exist_ok=True)
copyfile('../input/modelfile/model/model_params.json', "./model/model_params.json")
copyfile('../input/modelfile/model/network.pt', "./model/network.pt")
make_archive('./model', 'zip', './model')  # writes ./model.zip
regressor = TabNetRegressor()
regressor.load_model("./model.zip")

In [None]:
# Rebuild the label encoder from its saved class array instead of refitting.
le = LabelEncoder()
le.classes_ = np.load('../input/classex/classes.npy')

# Keep row_id for the submission and feed every other column to the model.
# The drop is non-mutating: the previous `test = test_final_df` followed by
# `drop(..., inplace=True)` removed row_id from test_final_df itself, so running
# this cell a second time failed with KeyError: 'row_id'.
col = test_final_df['row_id']
test = test_final_df.drop(columns=['row_id']).to_numpy()

# Predictions are truncated to integers and mapped back through the encoder.
# NOTE(review): inverse_transform raises if a truncated prediction is not a
# valid index into le.classes_ — confirm the model's output range.
ypred = regressor.predict(test).astype(int)
sub = pd.DataFrame({'row_id': col, 'target': list(le.inverse_transform(ypred.flatten()))})

In [None]:
# Write the predictions to submission.csv; index=False leaves the DataFrame
# index out of the file so only the row_id and target columns are written.
sub.to_csv('submission.csv',index=False)

In [None]:
# Show the first rows of the submission frame as a quick visual check.
sub.head()