TabNet training:
https://www.kaggle.com/hghghghgh1234/tabnet-regressor-training

LGB training:
https://www.kaggle.com/hghghghgh1234/optiver-realized-volatility-lgbm-baseline

LGB models made from "LGB training":
https://www.kaggle.com/hghghghgh1234/lgb-models

TabNet models made from "TabNet training":
https://www.kaggle.com/hghghghgh1234/tabnet-models

Some ideas come from https://www.kaggle.com/chumajin/optiver-realized-ensemble-tabnet-and-lgbm

In [None]:
!pip install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

In [None]:
!lscpu |grep 'Model name'

In [None]:
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder 
import pickle
import scipy as sc
import random

In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
# Use the fully-qualified option key: the abbreviated 'max_columns' pattern is
# ambiguous on pandas >= 1.4 (it also matches 'styler.render.max_columns') and
# raises OptionError there.
pd.set_option('display.max_columns', 300)

In [None]:
# import module we'll need to import our custom module
from shutil import copyfile

# copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "../input/gauss-rank-scaler/gauss_rank_scaler.py", dst = "../working/gauss_rank_scaler.py")

# import all our functions
from gauss_rank_scaler import GaussRankScaler

In [None]:
# For each of the 5 folds: copy that fold's saved TabNet files into the working
# directory, then zip them as supervised_fold_<fold>.zip (the archive name later
# passed to load_model()).
# NOTE(review): presumably every fold directory holds model_params.json and
# network.pt, so each iteration overwrites the previous fold's copies before
# zipping — confirm against the dataset layout.
for fold in range(5):
    !cp -r ../input/tabnet-models/supervised_fold_{str(fold)}/* . 
    !zip supervised_fold_{str(fold)}.zip model_params.json network.pt

In [None]:
import zipfile

In [None]:
# Load the precomputed training frame and discard the "Unnamed: 0" column
# that comes with the CSV, then display it.
train = pd.read_csv("../input/optiver-dfs/train.csv").drop(columns=["Unnamed: 0"])
train

In [None]:
## data directory
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each side's price is weighted by the size quoted on the opposite side and
    the sum is divided by the total level-1 size.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 weighted average price for every row of ``df``.

    Each side's price is weighted by the size quoted on the opposite side and
    the sum is divided by the total level-2 size.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element has no predecessor, so its value is NaN.
    """
    log_prices = np.log(series)
    return log_prices.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    squared = series ** 2
    return np.sqrt(np.sum(squared))

# Function to count unique elements of a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.shape[0]

# Function to read our base test set
def read_train_test():
    """Load the competition test CSV and give it a ``row_id`` merge key.

    Despite the name, only the test file is read. ``row_id`` is built as
    ``"<stock_id>-<time_id>"``. The number of rows is printed before the
    frame is returned.
    """
    test_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    # Key used to merge with the book and trade features
    stock_part = test_df['stock_id'].astype(str)
    time_part = test_df['time_id'].astype(str)
    test_df['row_id'] = stock_part + '-' + time_part
    print(f'Our testing set has {test_df.shape[0]} rows')
    return test_df

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Parameters
    ----------
    file_path : str
        Path of one stock's book partition. The stock id is the text after
        the last ``'='`` (e.g. ``.../book_test.parquet/stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` holding the aggregated statistics over the
        whole bucket (unsuffixed columns) and over the rows whose
        ``seconds_in_bucket`` is at least 450, 300 and 150 (``_450``,
        ``_300``, ``_150`` suffixes). A ``time_id`` with no rows in a window
        gets NaN for that window. ``row_id`` (``"<stock_id>-<time_id>"``) is
        the merge key.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns within each time_id so the diff never crosses a
    # bucket boundary. transform() always returns a result indexed like df,
    # whereas apply() may prepend the group key to the index (pandas >= 2.0),
    # which breaks the column assignment.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations (column -> list of aggregation functions)
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std]
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the two-level column index: ('wap1', 'sum') -> 'wap1_sum', ('time_id', '') -> 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats first, then left-merge each shorter window in the
    # order 450, 300, 150 (this fixes the output column order).
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    window_keys = []
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        window_keys.append(window_key)
    # Drop the duplicated time_id keys brought in by the merges
    df_feature.drop(window_keys, axis = 1, inplace = True)

    # Create row_id so we can merge; rsplit keeps this correct even if an
    # earlier part of the path contains '='.
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Parameters
    ----------
    file_path : str
        Path of one stock's trade partition. The stock id is the text after
        the last ``'='`` (e.g. ``.../trade_test.parquet/stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with ``trade_``-prefixed statistics over the
        whole bucket (no window suffix) and over the rows whose
        ``seconds_in_bucket`` is at least 450, 300 and 150 (``_450``,
        ``_300``, ``_150`` suffixes). A ``time_id`` with no rows in a window
        gets NaN for that window. ``row_id`` (``"<stock_id>-<time_id>"``) is
        the merge key.
    """
    df = pd.read_parquet(file_path)
    # Log returns of the trade price within each time_id. transform() always
    # returns a result indexed like df, whereas apply() may prepend the group
    # key to the index (pandas >= 2.0), which breaks the column assignment.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations (column -> list of aggregation functions)
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the two-level column index: ('size', 'sum') -> 'size_sum', ('time_id', '') -> 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats first, then left-merge each shorter window in the
    # order 450, 300, 150 (this fixes the output column order).
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    window_keys = []
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        window_keys.append(window_key)
    # Drop the duplicated time_id keys brought in by the merges
    df_feature.drop(window_keys, axis = 1, inplace = True)

    # Prefix every column so trade features cannot collide with book features
    df_feature = df_feature.add_prefix('trade_')
    # rsplit keeps the stock id correct even if an earlier part of the path contains '='
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time_id statistics of the volatility columns.

    For each realized-volatility column the mean, std, max and min are
    computed once grouped by ``stock_id`` (columns suffixed ``_stock``) and
    once grouped by ``time_id`` (columns suffixed ``_time``), then left-merged
    back onto ``df``. A new frame is returned; ``df`` itself is not modified.
    """
    # Realized volatility columns to summarise
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_450',
        'log_return2_realized_volatility_450',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_150',
        'log_return2_realized_volatility_150',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_450',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_150',
    ]

    def _group_stats(frame, key, tag):
        # Aggregate per key, flatten ('col', 'stat') -> 'col_stat', then tag every column
        stats = frame.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min']).reset_index()
        stats.columns = ['_'.join(pair) for pair in stats.columns]
        return stats.add_suffix('_' + tag)

    # Group by the stock id
    per_stock = _group_stats(df, 'stock_id', 'stock')
    # Group by the time id
    per_time = _group_stats(df, 'time_id', 'time')

    # Merge with original dataframe
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    # The suffixed keys only duplicate stock_id / time_id
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    
# Function to run the preprocessing in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id, in parallel.

    For each stock the book features are left-merged with the trade features
    on ``row_id``. The per-stock frames are then concatenated with a fresh
    integer index. ``is_train`` selects the train or test parquet partitions
    under the module-level ``data_dir``.
    """
    # Pick the partition prefixes once instead of branching inside every task
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    # Work done for one stock id inside the parallel loop
    def features_for_stock(stock_id):
        book_features = book_preprocessor(data_dir + book_prefix + str(stock_id))
        trade_features = trade_preprocessor(data_dir + trade_prefix + str(stock_id))
        # Keep every book row; rows without trade features get NaN
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan the stock ids out over all available cores
    tasks = (delayed(features_for_stock)(stock_id) for stock_id in list_stock_ids)
    frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    # Stack the per-stock frames into one
    return pd.concat(frames, ignore_index = True)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Evaluate ``y_pred`` against the labels of ``lgb_train`` with RMSPE.

    Returns the tuple ``('RMSPE', score, False)``.
    NOTE(review): the trailing False is presumably LightGBM's
    "higher is better" flag — confirm against the feval contract.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

def test_output(train, train_l, test, test_l, n_folds=5, lgb_weight=0.5):
    """Score the saved LightGBM and TabNet fold models and blend their test predictions.

    The function reads these module-level names, which must be defined before
    it is called: ``feature_cols``, ``scales``, ``tabnet_params`` and ``rmspe``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames fed to TabNet. Both need the ``feature_cols`` columns, and
        ``train`` also needs ``target``.
    train_l, test_l : pandas.DataFrame
        Frames fed to LightGBM. Every column except ``row_id`` and ``time_id``
        (and ``target`` for ``train_l``) is passed to the boosters.
    n_folds : int, default 5
        Number of saved models per family (``model_<fold>.txt`` and
        ``supervised_fold_<fold>.zip``) to load and average.
    lgb_weight : float, default 0.5
        Weight of the LightGBM average in the blend. TabNet gets
        ``1 - lgb_weight``.

    Returns
    -------
    numpy.ndarray
        Blended predictions, one per row of ``test``.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1 or ``lgb_weight`` is outside [0, 1].

    Notes
    -----
    Every fold model predicts on all training rows, so the RMSPE values
    printed here come from fold-averaged predictions on the training data,
    not from held-out rows.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")
    if not 0.0 <= lgb_weight <= 1.0:
        raise ValueError("lgb_weight must be between 0 and 1")

    def _tabnet_input(frame):
        # Scaled numeric features first, stock_id stacked as the last column
        # (tabnet_params declares the categorical feature at index -1).
        return np.hstack((frame[scales].values, frame[["stock_id"]].values))

    # Split features and target; .copy() so the dtype casts below never write
    # into a slice of the caller's frames.
    x = train[feature_cols].copy()
    x_l = train_l.drop(['row_id', 'target', 'time_id'], axis = 1)

    y = train['target']

    x_test = test[feature_cols].copy()
    x_test_l = test_l.drop(['row_id', 'time_id'], axis = 1)

    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)

    x_l['stock_id'] = x_l['stock_id'].astype(int)
    x_test_l['stock_id'] = x_test_l['stock_id'].astype(int)

    # Accumulators for the fold-averaged predictions on the training rows
    lgb_oof_predictions = np.zeros(y.shape)
    tabnet_oof_predictions = np.zeros(y.shape)
    print(tabnet_oof_predictions.shape)
    # Accumulators for the fold-averaged predictions on the test rows
    lgb_test_predictions = np.zeros(x_test.shape[0])
    tabnet_test_predictions = np.zeros(x_test.shape[0])

    # Iterate through each booster model
    for fold in range(n_folds):
        print(f'lgb fold {fold}')

        model = lgb.Booster(model_file='../input/lgb-models/model_'+str(fold)+'.txt')

        print(x.shape)
        # Each model contributes 1/n_folds of the average
        lgb_oof_predictions += model.predict(x_l) / n_folds
        lgb_test_predictions += model.predict(x_test_l) / n_folds

    print(rmspe(y.values, lgb_oof_predictions))

    print("lgb_oof_predictions: ", lgb_oof_predictions)

    # The TabNet inputs do not depend on the fold, so build them once
    tabnet_train_input = _tabnet_input(x)
    tabnet_test_input = _tabnet_input(x_test)

    # Iterate through each tabnet model
    for fold in range(n_folds):
        print(f'tabnet fold {fold}')

        loaded_clf = TabNetRegressor(**tabnet_params)
        loaded_clf.load_model('./supervised_fold_'+str(fold)+'.zip')

        # predict() output is flattened to 1-D before averaging
        tabnet_oof_predictions += loaded_clf.predict(tabnet_train_input).reshape(-1) / n_folds

        print('tabnet_oof_predictions: ', tabnet_oof_predictions)

        # Predict the test set
        tabnet_test_predictions += loaded_clf.predict(tabnet_test_input).reshape(-1) / n_folds

        print('tabnet_test_predictions: ', tabnet_test_predictions)

    print(rmspe(y.values, tabnet_oof_predictions))

    # Weighted blend of the two model families
    tabnet_weight = 1.0 - lgb_weight
    oof_predictions = lgb_oof_predictions * lgb_weight
    oof_predictions += tabnet_oof_predictions * tabnet_weight

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')

    test_predictions = lgb_test_predictions * lgb_weight
    test_predictions += tabnet_test_predictions * tabnet_weight
    # Return test predictions
    return test_predictions

# Read train and test
test = read_train_test()

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
test = get_time_stock(test)


In [None]:
# Keep independent copies of both frames as they are at this point:
# test_output() feeds these to the LightGBM models, while train/test
# themselves are modified by the cells that follow.
train_l = train.copy()
test_l = test.copy()

In [None]:
test

In [None]:
train

In [None]:
# Model inputs: every training column except the identifiers, the target and
# the fold marker.
feature_cols = [
    name for name in train.columns
    if name not in ('row_id', 'target', 'time_id', 'kfold')
]
target_cols = ['target']

In [None]:
train.shape[0]

In [None]:
# Tag every row with the frame it came from so the two can be separated
# again after they are stacked.
train["tt"] = "train"
test["tt"] = "test"

In [None]:
# Stack train on top of test into one frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. Like append, it keeps the original row labels and returns
# a new frame, so no extra .copy() is needed.
total = pd.concat([train, test])

In [None]:
total[total["tt"] == "test"]

In [None]:
# Impute missing feature values with the column mean; the mean is taken over
# `total`, i.e. over the train and test rows together.
for col in feature_cols:
    total[col] = total[col].fillna(total[col].mean())

In [None]:
# Split the combined frame back into train and test using the origin tag,
# dropping the tag from the results (`total` itself keeps it).
train = total[total["tt"]=="train"].drop("tt", axis = 1)
test = total[total["tt"]=="test"].drop("tt", axis = 1)

In [None]:
test["wap_balance_sum"]

In [None]:
# Columns to be scaled: everything in train except the identifiers, the
# target and stock_id. drop() raises KeyError if any of the four is missing.
scales = train.drop(['row_id', 'target', 'time_id',"stock_id"], axis = 1).columns.to_list()

In [None]:
# Fit the scaler on the combined frame, so train and test rows share one fit.
scaler = GaussRankScaler()
scaler.fit(total[scales])

In [None]:
# Replace the selected columns of the combined frame with their scaled values.
total[scales] = scaler.transform(total[scales])

In [None]:
# Copy the scaled columns back into train and test, selecting the rows of the
# combined frame by their origin tag.
train[scales] = total[scales][total["tt"]=="train"]
test[scales] = total[scales][total["tt"]=="test"]

In [None]:
total["wap1_sum"].hist(bins=100)

In [None]:
test

In [None]:
# Print the name of every scaled column that still contains missing values.
for col in scales:
    if total[col].isnull().values.any():
        print(col)

In [None]:
total["wap_balance_sum"]

In [None]:
from sklearn.preprocessing import LabelEncoder 
import pickle

# Encode stock_id as integer labels: the encoder is fitted on the training
# ids and then applied to both frames.
le = LabelEncoder()
train["stock_id"] = le.fit_transform(train["stock_id"])
test["stock_id"] = le.transform(test["stock_id"])
# Persist the fitted encoder; the file is a binary pickle despite its .txt name.
with open('stock_id_encoder.txt', 'wb') as f:
    pickle.dump(le, f)

In [None]:
train

In [None]:
test

In [None]:
def rmspe(y_true, y_pred):
    """Return the Root Mean Square Percentage Error between two arrays.

    The mean is taken along axis 0 and the single resulting value is
    returned as a Python scalar.

    Raises
    ------
    ValueError
        If any element of ``y_true`` is zero.
    """
    has_zero_target = (y_true == 0).any()
    if has_zero_target:
        raise ValueError("Root Mean Square Percentage Error cannot be used when "
                         "targets contain zero values.")

    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared).item()
class RMSPE(Metric):
    """Callable metric object that scores predictions with ``rmspe``."""
    def __init__(self):
        # NOTE(review): presumably read by pytorch_tabnet as the metric's
        # display name and its optimisation direction (False = lower is
        # better) — confirm against the Metric base class.
        self._name = "rmspe"
        self._maximize = False

    def __call__(self, y_true, y_score):
        """Return ``rmspe(y_true, y_score)``."""
        return rmspe(y_true, y_score)

In [None]:
# Keyword arguments used to construct every TabNetRegressor in this notebook.
tabnet_params = {
    "n_d": 32,
    "n_a": 32,
    "n_steps": 3,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 1e-2, "weight_decay": 1e-5},
    "mask_type": "entmax",
    "scheduler_params": {
        "mode": "min",
        "patience": 5,
        "min_lr": 1e-5,
        "factor": 0.9,
    },
    "scheduler_fn": ReduceLROnPlateau,
    "seed": 42,
    # Single categorical feature: one entry per class of the fitted label
    # encoder, declared at the last input column (index -1).
    "cat_dims": [len(le.classes_)],
    "cat_emb_dim": [10],
    "cat_idxs": [-1],
}

In [None]:
# Load the fold-1 archive into a fresh regressor on its own, before the full
# prediction run loads every fold.
loaded_clf = TabNetRegressor(**tabnet_params)
loaded_clf.load_model('./supervised_fold_'+str(1)+'.zip')

In [None]:
loaded_clf.predict(np.hstack((test[scales].values, test[["stock_id"]].values)))

In [None]:
# Score the saved models on the training rows and predict the test set
# (test_output() loads pre-trained models; nothing is trained here)
test_predictions = test_output(train, train_l, test, test_l)
# Save test predictions as the submission file (row_id and target only)
test['target'] = test_predictions
test[['row_id', 'target']].to_csv('submission.csv',index = False)

[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

In [None]:
#pick ratio