# Setup

## Google Drive


In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
# cd "/content/gdrive/My Drive/Optiver Kaggle/Data"

In [None]:
# output_file = r'/content/gdrive/My Drive/Optiver Kaggle/\working/submission.csv'
# train_file = r'/content/gdrive/My Drive/Optiver Kaggle/working/train.csv'

## Kaggle

In [None]:
# Switch the working directory to the competition input folder so that the
# relative paths used later in this notebook ('train.csv', 'test.csv',
# 'book_train.parquet/stock_id=...') resolve. Written without '%', so this
# relies on IPython treating the bare `cd` as a magic command.
cd "/kaggle/input/optiver-realized-volatility-prediction/"

In [None]:
# Kaggle locations used by the rest of the notebook:
#   output_file     - submission csv written in the last cell
#   train_file      - cached, already pre-processed training features
#   save_model_path - directory for model checkpoints
output_file = "/kaggle/working/submission.csv"
train_file = "/kaggle/input/pre-processed-data/train.csv"
save_model_path = "/kaggle/working"

## PC

In [None]:
# cd D:\kaggle\input\optiver-realized-volatility-prediction

In [None]:
# output_file = r'D:\kaggle\working\submission.csv'
# train_file = r'D:\kaggle\working\train.csv'
# # train_file = r'D:\kaggle\working\train_little.csv'

## Hardware info

In [None]:
# Run `nvidia-smi` through the IPython shell escape; the captured output is a
# list of lines, joined below into one string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in that output is treated as "no GPU attached";
# otherwise the full nvidia-smi report is printed.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Libraries

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib

# Notebook-wide settings.
seed0, target_name = 2021, 'target'
scores_folds = dict()

# Lift pandas' display truncation for both rows and columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Data Preprocess

## Read Target Data 

In [None]:
def read_train_test():
    """Load ``train.csv`` and ``test.csv`` from the working directory and add a ``row_id`` key.

    ``row_id`` is the string ``"<stock_id>-<time_id>"``; the book/trade feature
    frames built later carry the same key so they can be merged on it.

    Returns:
        tuple: ``(train, test)`` DataFrames, each with the extra ``row_id`` column.
    """
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    for frame in (train, test):
        stock_part = frame['stock_id'].astype(str)
        time_part = frame['time_id'].astype(str)
        # Key used to merge with the book and trade features
        frame['row_id'] = stock_part + '-' + time_part
    n_train_rows = train.shape[0]
    print(f'Our training set has {n_train_rows} rows')
    return train, test

In [None]:
# train,test = read_train_test()

In [None]:
# train.head(5)

## Book and Trade Data Preprocess



### Pre functions

In [None]:
# calculate weighted average price
def wap1(df):
    """Level-1 weighted average price: each side's price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def wap2(df):
    """Level-2 weighted average price: each side's price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def wap3(df):
    """Level-1 average price with each side's price weighted by its own size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    own_weighted = bid_px * bid_sz + ask_px * ask_sz
    return own_weighted / (bid_sz + ask_sz)

def wap4(df):
    """Level-2 average price with each side's price weighted by its own size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    own_weighted = bid_px * bid_sz + ask_px * ask_sz
    return own_weighted / (bid_sz + ask_sz)

# calculate log return
def log_return(series):
    """Return the first difference of the natural log of ``series`` (first element is NaN)."""
    log_values = np.log(series)
    return log_values.diff()

# calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    sum_of_squares = np.sum(series ** 2)
    return np.sqrt(sum_of_squares)

# count unique elements of a series
def count_unique(series):
    """Return the number of distinct values in ``series``."""
    distinct_values = np.unique(series)
    return len(distinct_values)

# calculate order flow using tick method
def order_flow_tick(df):
    """Add a signed ``order_flow_tick`` column to ``df`` using the tick rule.

    Each trade's ``size`` is signed by the direction of the price change from
    the previous row:

    * price went up   -> ``+size``
    * price went down -> ``-size``
    * price unchanged -> ``size`` multiplied by the sign of the previous row's
      ``order_flow_tick``
    * no previous price (first row, or a NaN difference) -> ``+size``

    The computation is positional, so it does not depend on the frame having a
    contiguous integer index (the previous ``df.loc[index-1]`` lookup raised a
    ``KeyError`` otherwise).

    Args:
        df: trade rows with ``price`` and ``size`` columns, in time order.

    Returns:
        The same ``df`` object, modified in place, with ``order_flow_tick`` added.
    """
    size = df['size'].to_numpy()
    price_difference = df['price'].diff().to_numpy()

    # Start from +size; only down-ticks and unchanged prices need adjusting.
    flow = size.copy()
    down_tick = price_difference < 0
    flow[down_tick] = -size[down_tick]

    # Unchanged prices inherit the sign of the previous (already final) value.
    # Position 0 can never appear here because its price difference is NaN.
    for pos in np.flatnonzero(price_difference == 0):
        flow[pos] = np.sign(flow[pos - 1]) * size[pos]

    df['order_flow_tick'] = flow
    return df

def order_flow_quote(df,df_ob):
    """Add a signed ``order_flow_quote`` column to ``df`` using the quote rule.

    Each trade is compared with the level-1 mid price of the order book at the
    same ``seconds_in_bucket``:

    * price below mid -> ``-size``
    * price above mid -> ``+size``
    * price equal mid -> ``0``
    * no book snapshot for that second -> falls back to ``order_flow_tick``

    Args:
        df: trades of a single ``time_id`` (the first row's ``time_id`` is used
            to select the matching book rows); must already contain
            ``order_flow_tick``.
        df_ob: order book rows with ``time_id``, ``seconds_in_bucket``,
            ``ask_price1`` and ``bid_price1``. It is not modified.

    Returns:
        A new DataFrame (the result of a merge, so the index is reset) with the
        ``order_flow_quote`` column added.
    """
    # select the time id of orderbook to be consistent with trade
    time_id = df['time_id'].iloc[0]
    book = df_ob.loc[df_ob['time_id'] == time_id, ['seconds_in_bucket', 'ask_price1', 'bid_price1']]

    # Build the mid-price table as a fresh frame instead of assigning a column
    # on a slice of df_ob, which raised SettingWithCopyWarning on every call.
    mid_quotes = pd.DataFrame({
        'seconds_in_bucket': book['seconds_in_bucket'],
        'mid_price': (book['ask_price1'] + book['bid_price1']) / 2,
    })
    df = df.merge(mid_quotes, on='seconds_in_bucket', how='left')

    # if there is no mid price for a trade, let the quote flow equal the tick flow
    df['order_flow_quote'] = df['order_flow_tick']
    df.loc[df['price']<df['mid_price'],'order_flow_quote'] = -df['size']
    df.loc[df['price']>df['mid_price'],'order_flow_quote'] = df['size']
    df.loc[df['price']==df['mid_price'],'order_flow_quote'] = 0

    df.drop('mid_price',inplace=True,axis=1)

    return df

def tendency_factors(df):
    """Compute per-``time_id`` summary statistics of trade prices and sizes.

    Args:
        df: trades with ``time_id``, ``price`` and ``size`` columns.

    Returns:
        DataFrame with one row per ``time_id`` (in order of first appearance) and
        the columns ``time_id, tendency, f_max, f_min, df_max, df_min, abs_diff,
        energy, iqr_p, abs_diff_v, energy_v, iqr_p_v``:

        * ``tendency``  - sum of percentage price changes weighted by trade size
        * ``f_max/f_min`` - number of prices above / below the mean price
        * ``df_max/df_min`` - number of up / down price moves
        * ``abs_diff`` / ``abs_diff_v`` - median absolute deviation from the mean
          (price / size)
        * ``energy`` - mean of squared prices; ``energy_v`` - sum of squared sizes
        * ``iqr_p`` / ``iqr_p_v`` - inter-quartile range (price / size)
    """
    def tendency(price, vol):
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return power

    lis = []
    # One groupby pass instead of re-filtering the whole frame for every
    # time_id; sort=False keeps the order of first appearance.
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        price = df_id['price'].values
        size = df_id['size'].values
        price_mean = np.mean(price)
        size_mean = np.mean(size)
        price_steps = np.diff(price)

        lis.append({
            'time_id': n_time_id,
            'tendency': tendency(price, size),
            'f_max': np.sum(price > price_mean),
            'f_min': np.sum(price < price_mean),
            'df_max': np.sum(price_steps > 0),
            'df_min': np.sum(price_steps < 0),
            # price dispersion
            'abs_diff': np.median(np.abs(price - price_mean)),
            'energy': np.mean(price**2),
            'iqr_p': np.percentile(price, 75) - np.percentile(price, 25),
            # size (volume) dispersion
            'abs_diff_v': np.median(np.abs(size - size_mean)),
            'energy_v': np.sum(size**2),
            'iqr_p_v': np.percentile(size, 75) - np.percentile(size, 25),
        })

    df_lr = pd.DataFrame(lis)

    return df_lr

### Main functions

#### Book

In [None]:
# preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for one stock.

    Reads the book parquet partition at ``file_path``, derives row-level
    quantities (weighted average prices, log returns, spreads, volume
    imbalances), aggregates them per ``time_id`` over the whole bucket and over
    the trailing windows ``seconds_in_bucket >= 500/400/300/200/100``, and
    returns one row per ``time_id``.

    Args:
        file_path: path of the form ``".../stock_id=<id>"``; the text after the
            first ``=`` is used as the stock id in ``row_id``.

    Returns:
        DataFrame of aggregated features plus a ``row_id`` column
        (``"<stock_id>-<time_id>"``). Window features carry a ``_<start>``
        suffix.

    Note:
        The numpy callables in the aggregation dictionaries are kept on purpose:
        the resulting column names are derived from them by pandas, and the rest
        of the notebook refers to features by those names.
    """
    df = pd.read_parquet(file_path)

    # calculate weighted average price
    df['wap1'] = wap1(df)
    df['wap2'] = wap2(df)
    df['wap3'] = wap3(df)
    df['wap4'] = wap4(df)
    # calculate log returns within each time_id.
    # transform() returns a result aligned with df's own index on every pandas
    # version; groupby.apply() prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    for level in (1, 2, 3, 4):
        df[f'log_return{level}'] = df.groupby(['time_id'])[f'wap{level}'].transform(log_return)
    # calculate wap balance
    df['wap_gap1'] = abs(df['wap1'] - df['wap2'])
    df['wap_gap2'] = abs(df['wap3'] - df['wap4'])

    # calculate spread
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']))
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']))
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price2'] - df['ask_price1']
    df['bid_ask_spread1'] = df['ask_price1'] - df['bid_price1']
    df['bid_ask_spread2'] = df['ask_price2'] - df['bid_price2']
    df["bid_ask_spread_spread"] = abs(df['bid_spread'] - df['ask_spread'])

    # calculate imbalance
    df['volume1'] = (df['ask_size1']) + (df['bid_size1'])
    df['volume_imbalance1'] = (df['ask_size1']) - (df['bid_size1'])
    df['imbalance_percentage1'] = df['volume_imbalance1'] / df['volume1']
    df['volume2'] = (df['ask_size2']) + (df['bid_size2'])
    df['volume_imbalance2'] = (df['ask_size2']) - (df['bid_size2'])
    df['imbalance_percentage2'] = df['volume_imbalance2'] / df['volume2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['total_volume_imbalance'] = (df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2'])
    df['total_imbalance_percentage'] = df['total_volume_imbalance'] / df['total_volume']

    # create dict of functions for each feature for aggregations
    create_feature_dict = {
      'wap1': [np.sum, np.std],
      'wap2': [np.sum, np.std],
      'wap3': [np.sum, np.std],
      'wap4': [np.sum, np.std],
      'log_return1': [realized_volatility],
      'log_return2': [realized_volatility],
      'log_return3': [realized_volatility],
      'log_return4': [realized_volatility],
      'wap_gap1': [np.sum, np.max],
      'wap_gap2': [np.sum, np.max],
      'price_spread1':[np.sum, np.max],
      'price_spread2':[np.sum, np.max],
      'bid_spread':[np.sum, np.max],
      'ask_spread':[np.sum, np.max],
      'bid_ask_spread1':[np.sum, np.max],
      'bid_ask_spread2':[np.sum, np.max],
      'bid_ask_spread_spread':[np.sum, np.max],
      'volume1':[np.sum, np.max],
      'volume_imbalance1':[np.sum, np.max],
      "imbalance_percentage1":[np.sum,  np.max],
      "volume2":[np.sum,  np.max],
      "volume_imbalance2":[np.sum,  np.max],
      "imbalance_percentage2":[np.sum,  np.max],
      "total_volume":[np.sum,  np.max],
      "total_volume_imbalance":[np.sum,  np.max],
      "total_imbalance_percentage":[np.sum,  np.max],
    }
    # the trailing windows only carry the realized volatilities
    create_feature_dict_time = {
      'log_return1': [realized_volatility],
      'log_return2': [realized_volatility],
      'log_return3': [realized_volatility],
      'log_return4': [realized_volatility],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (feature, aggregation) column MultiIndex, e.g. ('wap1', 'sum') -> 'wap1_sum'
        # (the time_id column becomes 'time_id_')
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats first, then left-join each trailing window in turn and
    # drop that window's duplicate time_id key straight away.
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        df_feature.drop([window_key], axis = 1, inplace = True)

    # Create row_id so we can merge later
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)

    return df_feature

In [None]:
# df = book_preprocessor('Data/book_train.parquet/stock_id=0')
# df.describe()

In [None]:
# df[df['row_id']=='0-5']

#### Trade

In [None]:
# df = pd.read_parquet('Data/trade_train.parquet/stock_id=1')
# df_ob = pd.read_parquet('Data/book_train.parquet/stock_id=1')
# df = df.groupby('time_id').apply(order_flow_tick)
# df = df.groupby('time_id').apply(order_flow_quote,df_ob=df_ob)
# df = df.droplevel(0).reset_index()
# df = df[df['time_id']==5]
# df_ob = df_ob[df_ob['time_id']==5]

# df_ob['mid_price'] = (df_ob['ask_price1'] + df_ob['bid_price1']) / 2
# overlap = df_ob['seconds_in_bucket'].isin(df['seconds_in_bucket'])
# df['mid_price'] = df_ob.loc[overlap, ['mid_price']].values
# df['order_flow'] = df['size']
# df.loc[df['price']>df['mid_price'],'order_flow'] = -df['order_flow']
# df.loc[df['price']==df['mid_price'],'order_flow'] = 0

In [None]:
# df_ob

In [None]:
# volume_per_bar = 50

In [None]:
# preprocess trade data (for each stock id)
def trade_preprocessor(file_path, file_path_book):
    """Build per-``time_id`` trade features for one stock.

    Reads the trade partition at ``file_path`` and the matching book partition
    at ``file_path_book``, derives row-level quantities (log return, traded
    amount, average size per order, tick- and quote-rule order flow),
    aggregates them per ``time_id`` over the whole bucket and over the trailing
    windows ``seconds_in_bucket >= 500/400/300/200/100``, and adds the
    statistics from ``tendency_factors``.

    Args:
        file_path: trade parquet path of the form ``".../stock_id=<id>"``; the
            text after the first ``=`` is used as the stock id in ``row_id``.
        file_path_book: book parquet path for the same stock (used for the mid
            price in the quote-rule order flow).

    Returns:
        DataFrame with one row per ``time_id``; every feature column is prefixed
        with ``trade_`` and a ``row_id`` column (``"<stock_id>-<time_id>"``) is
        added.

    Note:
        The numpy callables in the aggregation dictionaries are kept on purpose:
        the resulting column names are derived from them by pandas, and the rest
        of the notebook refers to features by those names.
    """
    df = pd.read_parquet(file_path)
    df_ob = pd.read_parquet(file_path_book)

    # calculate log return within each time_id.
    # transform() keeps df's own index on every pandas version, whereas
    # groupby.apply() prepends the group key to the index on pandas >= 2.0.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # calculate trade amount
    df['amount']=df['price']*df['size']

    # calculate average volume of each order
    df['average_volume_per_trade'] = df['size'] / df['order_count']

    # calculate trading order flow, one time_id at a time.
    # Iterating the groups explicitly (instead of two chained groupby.apply
    # calls followed by droplevel/reset_index) gives the same rows in the same
    # order without depending on how a given pandas version indexes apply()
    # results. Each group is copied because order_flow_tick writes into it.
    df = pd.concat(
        [order_flow_quote(order_flow_tick(group.copy()), df_ob=df_ob)
         for _, group in df.groupby('time_id')],
        ignore_index=True,
    )

    # create dict of functions for each feature for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max,np.min],
        'average_volume_per_trade':[np.sum,np.max,np.min],
        'order_flow_tick':[np.sum,np.max,np.min],
        'order_flow_quote':[np.sum,np.max,np.min]
    }
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
        'amount':[np.sum,np.max,np.min],
        'average_volume_per_trade':[np.sum,np.max,np.min],
        'order_flow_tick':[np.sum,np.max,np.min],
        'order_flow_quote':[np.sum,np.max,np.min]
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (feature, aggregation) column MultiIndex
        # (the time_id column becomes 'time_id_')
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)

    # calculate tendency factors and attach them right after the full-bucket stats
    df_lr = tendency_factors(df)
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    df_feature.drop(['time_id'], axis = 1, inplace = True)

    # Left-join each trailing window in turn and drop its duplicate time_id key
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window_start, add_suffix = True)
        window_key = 'time_id__' + str(window_start)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        df_feature.drop([window_key], axis = 1, inplace = True)

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

In [None]:
# origin_trade = pd.read_parquet('trade_train.parquet/stock_id=0')

In [None]:
# df = trade_preprocessor('trade_train.parquet/stock_id=0','book_train.parquet/stock_id=0')

In [None]:
# df[df['row_id']=='0-11']

#### Merge

In [None]:
# make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Build book + trade features for every stock id, one joblib task per stock.

    Args:
        list_stock_ids: iterable of stock ids to process.
        is_train: read the ``*_train.parquet`` partitions when truthy, the
            ``*_test.parquet`` partitions otherwise.

    Returns:
        A single DataFrame with the per-stock feature frames concatenated and
        re-indexed from 0.
    """
    # Pick the partition prefixes once; the stock id is appended per task
    if is_train:
        book_prefix, trade_prefix = "book_train.parquet/stock_id=", "trade_train.parquet/stock_id="
    else:
        book_prefix, trade_prefix = "book_test.parquet/stock_id=", "trade_test.parquet/stock_id="

    def for_joblib(stock_id):
        file_path_book = book_prefix + str(stock_id)
        file_path_trade = trade_prefix + str(stock_id)
        # Book features are the left side, so every book time_id is kept even
        # when the stock has no trades in that bucket
        book_features = book_preprocessor(file_path_book)
        trade_features = trade_preprocessor(file_path_trade, file_path_book)
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Run on all available cores and stitch the per-stock results together
    tasks = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    return pd.concat(per_stock_frames, ignore_index = True)

# get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time_id statistics of the realized-volatility features.

    For each volatility column the mean, std, max and min are computed once
    grouped by ``stock_id`` (columns suffixed ``_<stat>_stock``) and once grouped
    by ``time_id`` (columns suffixed ``_<stat>_time``); both sets are left-joined
    back onto ``df``.

    Args:
        df: feature frame containing ``stock_id``, ``time_id`` and the
            volatility columns listed below.

    Returns:
        A new DataFrame: ``df`` plus the aggregated columns.
    """
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_400',
        'log_return2_realized_volatility_400',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_200',
        'log_return2_realized_volatility_200',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_400',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_200',
    ]

    def _group_stats(key, tag):
        # Aggregate by `key`, flatten the (column, stat) MultiIndex and tag every
        # column; the key column itself ends up as '<key>__<tag>'.
        grouped = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        grouped.columns = ['_'.join(parts) for parts in grouped.columns]
        return grouped.add_suffix('_' + tag)

    per_stock = _group_stats('stock_id', 'stock')
    per_time = _group_stats('time_id', 'time')

    # Join both aggregates onto the original rows, then remove the helper keys
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    merged.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return merged

### Main

In [None]:
# Read train and test
# (train is replaced further down by the cached features loaded from train_file;
# only test goes through the feature pipeline in this cell)
train, test = read_train_test()

# Training-set feature generation is disabled here; see the read_csv below.
# # Get unique stock ids 
# train_stock_ids = train['stock_id'].unique()
# # Preprocess them using Parallel and our single stock id functions
# train_ = preprocessor(train_stock_ids, is_train = True)
# train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
# train = get_time_stock(train)
test = get_time_stock(test)

# Load the pre-computed training features instead of rebuilding them.
# NOTE(review): 'Unnamed: 0' is presumably the index column written by the
# commented-out `train.to_csv(train_file)` call - confirm; the drop raises a
# KeyError if the file has no such column.
train = pd.read_csv(train_file)
train.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
# Sanity check: columns of the rebuilt test features that the cached train features lack
[column for column in test.columns if column not in train.columns]

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.preprocessing import QuantileTransformer
# scaler = QuantileTransformer(output_distribution='normal', random_state=seed0)
# target = train['target'].values
# target = scaler.fit_transform(target.reshape(-1,1))
# plt.hist(target,bins=1000)

In [None]:
# analysis the features
# train.isna().sum()

In [None]:
# save the train
# train.to_csv(train_file)

In [None]:
# train2 = pd.read_csv(train_file)

In [None]:
# train2.loc[train2['trade_order_flow_tick_amax'].isna(),['trade_order_flow_tick_sum','trade_order_flow_tick_amax']]

In [None]:
# train2.groupby('time_id')['seconds_in_bucket'].apply(lambda col: (col.values>=500).any())

In [None]:
# Tau features, added identically to train and test.
# (The 450 s and 150 s windows that used to sit here were already disabled and
# are not computed.)
for frame in (train, test):
    # tau: inverse square root of the number of distinct seconds with trades,
    # for the full bucket and for the 400 / 300 / 200 trailing windows
    frame['size_tau'] = np.sqrt( 1/ frame['trade_seconds_in_bucket_count_unique'] )
    frame['size_tau_400'] = np.sqrt( 1/ frame['trade_seconds_in_bucket_count_unique_400'] )
    frame['size_tau_300'] = np.sqrt( 1/ frame['trade_seconds_in_bucket_count_unique_300'] )
    frame['size_tau_200'] = np.sqrt( 1/ frame['trade_seconds_in_bucket_count_unique_200'] )

    # tau2: replace by order sum, with a fixed factor per window
    frame['size_tau2'] = np.sqrt( 1/ frame['trade_order_count_sum'] )
    frame['size_tau2_400'] = np.sqrt( 0.33/ frame['trade_order_count_sum'] )
    frame['size_tau2_300'] = np.sqrt( 0.5/ frame['trade_order_count_sum'] )
    frame['size_tau2_200'] = np.sqrt( 0.66/ frame['trade_order_count_sum'] )

    # delta tau
    frame['size_tau2_d'] = frame['size_tau2_400'] - frame['size_tau2']

In [None]:
# pd.set_option('display.max_rows',None)
# train.isna().sum()

In [None]:
# Feature columns: every train column except the identifiers and the target
colNames = [
    name for name in train.columns
    if name not in ("stock_id", "time_id", "target", "row_id")
]
len(colNames)

## KMeans Stock Aggregation

In [None]:
from sklearn.cluster import KMeans

# making agg features for each stock cluster
# Pivot the raw targets into a time_id x stock_id matrix and correlate the
# stocks' target series with each other.
train_p = pd.read_csv('train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

# Row labels of the correlation matrix, i.e. the stock ids
ids = corr.index

# Cluster the stocks into 7 groups using their rows of the correlation matrix
kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)
print(kmeans.labels_)

# l[n] = list of stock ids assigned to cluster n.
# (ids+1) * mask is 0 for non-members and id+1 for members; the +1 / -1 shift
# keeps stock id 0 from being discarded by the `x > 0` filter.
l = []
for n in range(7):
    l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )


mat = []
matTest = []

# for each cluster within each time id, get agg features for all stocks in the cluster
# (NaN-ignoring mean per time_id; the cluster is then labelled '<n>c1' in stock_id)
n = 0
for ind in l:
    print(ind)
    newDf = train.loc[train['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean) 
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    mat.append ( newDf )

    newDf = test.loc[test['stock_id'].isin(ind) ]    
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    matTest.append ( newDf )

    n+=1

# Stack the per-cluster train aggregates; the averaged target is not a feature
mat1 = pd.concat(mat).reset_index()
mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()

# Append the train cluster rows for time_id 5 to the test aggregates.
# NOTE(review): presumably so that every cluster label becomes a column in the
# test pivot even when the test set covers few stocks - confirm.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])
# Go wide: one row per time_id, one column per (feature, cluster label),
# flattened to '<feature>_<cluster label>'
mat1 = mat1.pivot(index='time_id', columns='stock_id')
mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
mat1.reset_index(inplace=True)

mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
mat2.reset_index(inplace=True)

In [None]:
# Cluster-level columns to attach to every row: each listed feature for the
# clusters labelled 0c1, 1c1, 3c1, 4c1 and 6c1 (feature-major order), keyed by time_id.
nnn = ['time_id'] + [
    f'{feature}_{cluster}c1'
    for feature in (
        'log_return1_realized_volatility',
        'total_volume_sum',
        'volume_imbalance1_sum',
        'price_spread1_sum',
        'bid_spread_sum',
        'ask_spread_sum',
        'bid_ask_spread1_sum',
        'bid_ask_spread_spread_sum',
        'size_tau2',
        'trade_size_sum',
        'trade_order_count_sum',
        'trade_average_volume_per_trade_sum',
        'trade_order_flow_quote_sum',
    )
    for cluster in (0, 1, 3, 4, 6)
]
train = pd.merge(train, mat1[nnn], how='left', on='time_id')
test = pd.merge(test, mat2[nnn], how='left', on='time_id')

# Feature Selection

In [None]:
# train['total_volume_sum_0c1/trade_size_sum_0c1'] = train['total_volume_sum_0c1'] / train['trade_size_sum_0c1']
# test['total_volume_sum_0c1/trade_size_sum_0c1'] = test['total_volume_sum_0c1'] / test['trade_size_sum_0c1']

# train['total_volume_sum_3c1/size_tau2_3c1'] = train['total_volume_sum_3c1'] / train['size_tau2_3c1']
# test['total_volume_sum_3c1/size_tau2_3c1'] = test['total_volume_sum_3c1'] / test['size_tau2_3c1']

# train['total_volume_sum_1c1/size_tau2_1c1'] = train['total_volume_sum_1c1'] / train['size_tau2_1c1']
# test['total_volume_sum_1c1/size_tau2_1c1'] = test['total_volume_sum_1c1'] / test['size_tau2_1c1']

# train['stock_id/price_spread_sum'] = train['stock_id'] / train['price_spread1_sum']
# test['stock_id/price_spread_sum'] = test['stock_id'] / test['price_spread1_sum']

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Model Fitting

## Functions

In [None]:
# pip install pytorch_tabnet
!pip install /kaggle/input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

### Tabnet

In [None]:
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
class RMSPE(Metric):
    """Root mean squared percentage error as a TabNet evaluation metric (not maximized)."""

    def __init__(self) -> None:
        self._maximize = False
        self._name = 'rmspe'

    def __call__(self, y_true, y_pred):
        relative_error = (y_true - y_pred) / y_true
        return np.sqrt(np.mean(np.square(relative_error)))

def rmspeLoss(y_pred, y_true):
    """Root mean squared percentage error as a torch loss (note the ``y_pred, y_true`` order)."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(relative_error ** 2)
    return torch.sqrt(mean_squared).clone()

## Main

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from tqdm import tqdm

scaler_opt = "standard" # alternative is "quantile"

# Infinite feature values become NaN (they are mean-imputed before training)
train_ = train.replace([np.inf, -np.inf], np.nan)
# Indicator column for stock 31. Built directly from stock_id: the previous
# chained assignment (`df['stock31'].loc[...] = 1`) relied on writing through an
# intermediate Series, and the test-set version compared the freshly zeroed
# `stock31` column with 31, so the flag was never set on test rows.
train_['stock31'] = (train_['stock_id'] == 31).astype(int)
test_ = test.replace([np.inf, -np.inf], np.nan)
test_['stock31'] = (test_['stock_id'] == 31).astype(int)
# train_.dropna(inplace=True)
X = train_.drop(['row_id', 'target', 'time_id'], axis=1)
y = train_['target']
X_test = test_.copy()
X_test.drop(['time_id', 'row_id'], axis=1, inplace=True)


N_unique = X.nunique()
types = X.dtypes

c_col = [] # record the name of categorical columns
c_dim = {} # record the dimension of categorical columns

# Categorical columns are label-encoded, every other column is scaled.
# Encoders/scalers are fitted on train only and then applied to test.
for col in tqdm(X.columns):
    if col in ['stock_id', 'stock31']:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col].values)
        X_test[col] = encoder.transform(X_test[col].values)
        c_col.append(col)
        c_dim[col] = len(encoder.classes_)
    else:
        if scaler_opt == 'quantile':
            scaler = QuantileTransformer(random_state=21,n_quantiles=2000, output_distribution='normal')
        else:
            scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[[col]])
        X_test[col] = scaler.transform(X_test[[col]])

# time_id is appended last (unscaled); it is only used to group the CV folds
X['time_id'] = train_['time_id']

# Positions and cardinalities of the categorical columns, in X's column order
cat_idx = [i for i,col in enumerate(X.columns.to_list()) if col in c_col]
cat_dims = [c_dim[col] for col in X.columns.to_list() if col in c_col]

In [None]:
import gc

# The raw and intermediate frames are no longer needed once X, y and X_test
# exist; drop the names and run a collection pass.
del train, train_, test_
gc.collect()

In [None]:
# according to tabnet baseline

# Keyword arguments passed to TabNetRegressor for every fold
tabnet_params = {
    # categorical columns: positions, cardinalities, embedding width
    'cat_idxs': cat_idx,
    'cat_dims': cat_dims,
    'cat_emb_dim': 1,
    # network size
    'n_d': 24,
    'n_a': 24,
    'n_steps': 3,
    'gamma': 1.2,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 0,
    # optimisation
    'optimizer_fn': Adam,
    'optimizer_params': {'lr': 2e-2, 'weight_decay': 1e-5},
    'mask_type': "entmax",
    'scheduler_params': {'T_0': 200, 'T_mult': 1, 'eta_min': 1e-4, 'last_epoch': -1, 'verbose': False},
    'scheduler_fn': CosineAnnealingWarmRestarts,
    # run settings
    'seed': 42,
    'verbose': 10,
    'device_name': 'cuda',
}

In [None]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    squared_pct_error = np.square((y_true - y_pred) / y_true)
    return np.sqrt(np.mean(squared_pct_error))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Evaluation callback returning ``('RMSPE', score, False)``.

    The true labels are read from ``lgb_train.get_label()``; the trailing
    ``False`` is the third element of the returned tuple.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [None]:
from sklearn.model_selection import GroupKFold

# Folds are grouped by time_id so that one time bucket never appears in both
# the training and the validation part of a fold.
groups = X['time_id']
oof_predictions = np.zeros((X.shape[0],1))
test_predictions = np.zeros(X_test.shape[0])
feature_importance = pd.DataFrame()
feature_importance['features'] = X.columns.to_list()
stats = pd.DataFrame()
explain_matrix = []
masks = []

num_splits = 5

kfold = GroupKFold(n_splits=num_splits)
cv_scores = []

# Mean-impute missing values; test uses the train means and is re-ordered to
# the train column order (without time_id).
X = X.fillna(X.mean())
y = y.fillna(y.mean())
colNames = list(X)
colNames.remove('time_id')
X_test = X_test[colNames].fillna(X[colNames].mean())

# The model never sees time_id. This frame is identical for every fold, so it
# is built once here instead of inside the loop.
X_ = X.drop('time_id', axis=1)

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X,y,groups)):
    print(f'training fold {fold + 1}')
    X_train, X_val = X_.iloc[trn_ind].values, X_.iloc[val_ind].values
    # targets are passed to TabNet as 2-D column vectors
    y_train, y_val = y.iloc[trn_ind].values.reshape((-1,1)), y.iloc[val_ind].values.reshape((-1,1))


    clf = TabNetRegressor(**tabnet_params)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        max_epochs=500,
        patience=50,
        batch_size=1024*24,
        virtual_batch_size=128*24,
        num_workers=0,
        drop_last=False,
        eval_metric=[RMSPE],
        loss_fn=rmspeLoss
    )


    oof_predictions[val_ind] = clf.predict(X_val)
    cv_scores.append(rmspe(y_val, oof_predictions[val_ind]))

    # test prediction is the average over the folds
    test_predictions += clf.predict(X_test.values).flatten()/num_splits

    # checkpoint goes to the configured model directory rather than a
    # hard-coded copy of that path
    clf.save_model(os.path.join(save_model_path, 'tabnet_f%d' % fold))

# Reported score is the mean of the per-fold validation RMSPE values
rmspe_score = round(np.mean(cv_scores),6)
print(f'Our out of folds RMSPE is {rmspe_score}')

# Submission

In [None]:
# Attach the fold-averaged predictions to the test frame and write the
# two-column (row_id, target) submission file.
target_name = "target"
test[target_name] = test_predictions
test.loc[:, ['row_id', target_name]].to_csv(output_file, index=False)