### References
* https://www.kaggle.com/realtimshady/2lgbm-2nn
* https://www.kaggle.com/munumbutt/feature-engineering-tuned-xgboost-lgbm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from IPython.core.display import display, HTML
import gc
import plotly.graph_objects as go
from joblib import Parallel, delayed
from sklearn import preprocessing, model_selection
import seaborn as sns
from tqdm import tqdm
from scipy.stats import probplot

# Show up to 400 rows / columns when a DataFrame is displayed.
# The fully qualified option names are used on purpose: the short pattern
# 'max_rows' is ambiguous (and raises OptionError) on pandas versions that
# register more than one option whose name contains it.
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 400)

In [None]:
# Root directory of the competition input; the per-stock book/trade parquet paths are built from it.
data_dir = '../input/optiver-realized-volatility-prediction/'

## EDA

In [None]:
# Load the sample submission file and display it.
sample = pd.read_csv("../input/optiver-realized-volatility-prediction/sample_submission.csv")
sample

In [None]:
# Load the base test table and display it.
test = pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
test

In [None]:
# Load the base training table and display it.
train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
train

In [None]:
# Number of training rows per stock_id.
train['stock_id'].value_counts()

In [None]:
# Distinct stock_id values present in the training table.
train['stock_id'].unique()

In [None]:
# Load the order-book partition of stock_id=10 as an example and display it.
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=10')
book_train

In [None]:
# Load the trade partition of stock_id=1 as an example and display it.
trade_example = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=1")
trade_example

## FUNC

In [None]:
def convert_to_32bit(df):
    """Downcast every 64-bit numeric column of ``df`` to its 32-bit counterpart.

    ``int64`` columns become ``int32`` and ``float64`` columns become
    ``float32``; all other columns are left untouched. The frame is modified
    in place and also returned.
    """
    narrower = {'int64': 'int32', 'float64': 'float32'}
    for column in df.columns:
        target_dtype = narrower.get(str(df[column].dtype))
        if target_dtype is not None:
            df[column] = df[column].astype(target_dtype)
    return df

In [None]:
def _size_weighted_price(df, price_a, size_a, price_b, size_b):
    """Average of two price columns, each weighted by its own size column."""
    weighted_sum = df[price_a] * df[size_a] + df[price_b] * df[size_b]
    total_size = df[size_a] + df[size_b]
    return weighted_sum / total_size


def wap_1(df):
    """Size-weighted average of the level-1 bid and ask prices."""
    return _size_weighted_price(df, 'bid_price1', 'bid_size1', 'ask_price1', 'ask_size1')


def wap_2(df):
    """Size-weighted average of the level-2 bid and ask prices."""
    return _size_weighted_price(df, 'bid_price2', 'bid_size2', 'ask_price2', 'ask_size2')


def wap_bid(df):
    """Size-weighted average of the level-1 and level-2 bid prices."""
    return _size_weighted_price(df, 'bid_price1', 'bid_size1', 'bid_price2', 'bid_size2')


def wap_ask(df):
    """Size-weighted average of the level-1 and level-2 ask prices."""
    return _size_weighted_price(df, 'ask_price1', 'ask_size1', 'ask_price2', 'ask_size2')

In [None]:
def log_return(list_stock_prices):
    """Difference between consecutive natural logs of the prices (first entry is NaN)."""
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def realized_volatility(series):
    """Square root of the sum of squared values in ``series``."""
    squared = np.square(series)
    return np.sqrt(np.sum(squared))


def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct = np.unique(series)
    return len(distinct)

In [None]:
# Load the base train and test tables
def read_train_test():
    """Read ``train.csv`` and ``test.csv`` and add the ``row_id`` merge key.

    ``row_id`` is ``"<stock_id>-<time_id>"`` and is the key on which the
    book and trade features are merged. Prints the number of training rows.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    def _add_row_id(frame):
        # Key shared with the book and trade feature tables
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        return frame

    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    train = _add_row_id(train)
    test = _add_row_id(test)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [None]:
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Reads the parquet data at ``file_path``, derives weighted average prices,
    their log returns, spreads and volume measures for every book row, and
    aggregates them per ``time_id``: once over all rows and once for each
    trailing window (``seconds_in_bucket`` >= 500, 400, 300, 200, 100).

    Args:
        file_path: Path containing ``stock_id=<id>``; the part after the ``=``
            is used as the stock id when building ``row_id``.

    Returns:
        DataFrame with one row per ``time_id`` holding the aggregated features
        and a ``row_id`` column of the form ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    df = convert_to_32bit(df)

    # Weighted average prices and their log returns within each time_id.
    # ``transform`` (not ``apply``) is used so the result always carries the
    # row index of ``df`` and can be assigned as a column, whatever the
    # pandas default for ``group_keys`` is.
    df['wap1'] = wap_1(df)
    df['log_return1'] = df.groupby('time_id')['wap1'].transform(log_return)

    df['wap2'] = wap_2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    df['wap_bid'] = wap_bid(df)
    df['wap_ask'] = wap_ask(df)

    df['log_return_bid'] = df.groupby('time_id')['wap_bid'].transform(log_return)
    df['log_return_ask'] = df.groupby('time_id')['wap_ask'].transform(log_return)

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])

    # Spread and volume measures
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1'])/2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['bid_volume'] = df['bid_size1'] + df['bid_size2']
    df['ask_volume'] = df['ask_size1'] + df['ask_size2']
    df['bid_ask_volume'] = abs(df['bid_volume'] - df['ask_volume'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations applied over all rows of a time_id
    basic_stats = [np.sum, np.mean, np.min, np.max, np.std]
    return_stats = [np.sum, realized_volatility, np.mean, np.min, np.max, np.std]
    create_feature_dict = {
        'wap1': basic_stats,
        'wap2': basic_stats,
        'log_return1': return_stats,
        'log_return2': return_stats,
        'wap_bid': basic_stats,
        'wap_ask': basic_stats,
        'log_return_bid': return_stats,
        'log_return_ask': return_stats,
        'wap_balance': basic_stats,
        'price_spread': basic_stats,
        'bid_spread': basic_stats,
        'ask_spread': basic_stats,
        'bid_ask_spread': basic_stats,
        'bid_volume': basic_stats,
        'ask_volume': basic_stats,
        'bid_ask_volume': basic_stats,
        'total_volume': basic_stats,
        'volume_imbalance': basic_stats,
    }

    # Aggregations applied to each trailing window. The last key used to be a
    # second 'log_return_bid', which silently dropped the ask-side feature.
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return_bid': [realized_volatility],
        'log_return_ask': [realized_volatility],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict, seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, statistic) pairs into single names; time_id becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Whole-bucket statistics, then one left merge per trailing window
    df_feature = get_stats_window(create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is no longer needed after the merge
        df_feature = df_feature.drop(columns = window_key)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

In [None]:
%%time

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Reads the parquet data at ``file_path`` and aggregates, per ``time_id``,
    the realized volatility of the trade-price log returns, trade counts,
    sizes, order counts and traded amounts -- over all rows and for each
    trailing window (``seconds_in_bucket`` >= 500, 400, 300, 200, 100) --
    plus a set of price/size shape statistics computed per ``time_id``.

    Args:
        file_path: Path containing ``stock_id=<id>``; the part after the ``=``
            is used as the stock id when building ``row_id``.

    Returns:
        DataFrame with one row per ``time_id``; every feature column is
        prefixed with ``trade_`` and ``row_id`` is ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    df = convert_to_32bit(df)

    # ``transform`` keeps the row index of ``df`` so the result can be
    # assigned as a column regardless of the pandas ``group_keys`` default.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount'] = df['price'] * df['size']
    # Dict for aggregations
    create_feature_dict = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum, np.max, np.min],
        'order_count': [np.sum, np.max],
        'amount': [np.sum, np.max, np.min],
    }
    create_feature_dict_time = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum],
        'order_count': [np.sum],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict, seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, statistic) pairs into single names; time_id becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    def tendency(price, vol):
        # Sum of percentage price changes, each weighted by the size of the later trade
        df_diff = np.diff(price)
        val = (df_diff / price[1:]) * 100
        return np.sum(val * vol[1:])

    # Per-time_id shape statistics. A single groupby pass replaces one
    # boolean-mask scan of the whole frame per time_id.
    extra_columns = ['time_id', 'tendency', 'f_max', 'f_min', 'df_max', 'df_min',
                     'abs_diff', 'energy', 'iqr_p', 'abs_diff_v', 'energy_v', 'iqr_p_v']
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort = False):
        price = df_id['price'].values
        # 'size' is int32 after convert_to_32bit; widen it so that squaring
        # (energy_v) cannot wrap around.
        size = df_id['size'].values.astype(np.int64)
        mean_price = np.mean(price)
        mean_size = np.mean(size)
        price_moves = np.diff(price)

        lis.append({
            'time_id': n_time_id,
            'tendency': tendency(price, size),
            # number of trades above / below the mean price
            'f_max': np.sum(price > mean_price),
            'f_min': np.sum(price < mean_price),
            # number of up-moves / down-moves between consecutive trades
            'df_max': np.sum(price_moves > 0),
            'df_min': np.sum(price_moves < 0),
            # price statistics
            'abs_diff': np.median(np.abs(price - mean_price)),
            'energy': np.mean(price**2),
            'iqr_p': np.percentile(price, 75) - np.percentile(price, 25),
            # size statistics
            'abs_diff_v': np.median(np.abs(size - mean_size)),
            'energy_v': np.sum(size**2),
            'iqr_p_v': np.percentile(size, 75) - np.percentile(size, 25),
        })

    # Explicit columns keep the 'time_id' merge key present even when there are no trades
    df_lr = pd.DataFrame(lis, columns = extra_columns)

    # Whole-bucket statistics joined with the shape statistics
    df_feature = get_stats_window(create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    df_feature = df_feature.drop(columns = 'time_id')

    # One left merge per trailing window
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is no longer needed after the merge
        df_feature = df_feature.drop(columns = window_key)

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

In [None]:
%%time

def get_time_stock(df):
    """Append per-stock and per-time_id statistics of the volatility features.

    For each realized-volatility column (book level 1 and 2, and trade; whole
    bucket and the 400/300/200 windows) the mean, std, max and min are
    computed once per ``stock_id`` (suffix ``_stock``) and once per
    ``time_id`` (suffix ``_time``) and left-merged back onto ``df``.

    Returns:
        A new DataFrame: the columns of ``df`` followed by the ``_stock`` and
        then the ``_time`` statistics.
    """
    windows = ('', '_400', '_300', '_200')
    vol_cols = [f'log_return{level}_realized_volatility{window}'
                for window in windows for level in (1, 2)]
    vol_cols += [f'trade_log_return_realized_volatility{window}' for window in windows]
    stats = ['mean', 'std', 'max', 'min']

    def _grouped_stats(key, tag):
        # Aggregate per `key`, flatten the two-level column names and tag them
        grouped = df.groupby([key])[vol_cols].agg(stats).reset_index()
        grouped.columns = ['_'.join(parts) for parts in grouped.columns]
        return grouped.add_suffix('_' + tag)

    per_stock = _grouped_stats('stock_id', 'stock')
    per_time = _grouped_stats('time_id', 'time')

    # Merge with original dataframe, then discard the duplicated key columns
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)

In [None]:
# Run the per-stock preprocessing functions in parallel (one job per stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id and stack them.

    Args:
        list_stock_ids: Iterable of stock ids to process.
        is_train: Read the ``*_train.parquet`` data when true, otherwise the
            ``*_test.parquet`` data.

    Returns:
        One DataFrame with the features of all stocks, re-indexed from 0.
    """
    split = 'train' if is_train else 'test'

    # Work done for a single stock id inside the parallel loop
    def for_joblib(stock_id):
        book_path = f'{data_dir}book_{split}.parquet/stock_id={stock_id}'
        trade_path = f'{data_dir}trade_{split}.parquet/stock_id={stock_id}'

        # Book features are the left side, so every book time_id is kept
        book_features = book_preprocessor(book_path)
        trade_features = trade_preprocessor(trade_path)
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan the stock ids out over all available cores
    jobs = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    frames = Parallel(n_jobs = -1, verbose = 1)(jobs)
    # Concatenate the per-stock frames returned by Parallel
    return pd.concat(frames, ignore_index = True)

In [None]:
# Root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
# Custom evaluation function reporting the root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Score ``y_pred`` against the labels of ``lgb_train`` with RMSPE.

    Returns:
        Tuple ``('RMSPE', score, False)``; the trailing ``False`` is the third
        element of the returned triple exactly as in the original code.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False


In [None]:
# Load the base train/test tables, each with the row_id merge key added.
train, test = read_train_test()

In [None]:
# Get unique stock ids of the training set
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
# Left-join the engineered features onto the base training rows
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids of the test set
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
# Left-join the engineered features onto the base test rows
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id (computed separately for train and test)
train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
# tau features: sqrt(1 / number of distinct seconds with a trade),
# for the whole bucket and for the 400/300/200 windows
for _frame in (train, test):
    for _suffix in ('', '_400', '_300', '_200'):
        _seconds_with_trades = _frame['trade_seconds_in_bucket_count_unique' + _suffix]
        _frame['size_tau' + _suffix] = np.sqrt( 1/ _seconds_with_trades )

In [None]:
# tau2 features: sqrt(scale / total order count); the scale differs per window
_tau2_scale = {'': 1, '_400': 0.33, '_300': 0.5, '_200': 0.66}
for _frame in (train, test):
    for _suffix, _scale in _tau2_scale.items():
        _frame['size_tau2' + _suffix] = np.sqrt( _scale/ _frame['trade_order_count_sum'] )

    # delta tau
    _frame['size_tau2_d'] = _frame['size_tau2_400'] - _frame['size_tau2']

In [None]:
# Keep the full feature tables under new names; `train` and `test` are rebound
# further down to the train/validation split.
df = train
test_data_set = test

In [None]:
# Cast stock_id to int in the real test set, then preview the first rows.
test_data_set['stock_id'] = test_data_set['stock_id'].astype(int)
test_data_set.head()

In [None]:
# Features (identifier and target columns removed) and target over the full training table.
X = df.drop(['row_id', 'target', 'time_id'], axis = 1)
y = df['target']
X.shape, y.shape

In [None]:
# Rows per stock assigned to the training split: 90% of the average number of rows per stock.
thresh = int(len(df) * 0.9 / df['stock_id'].nunique())
print (thresh)

In [None]:
# True for the first `thresh` rows of each stock (in table order), False for the rest.
mask  = df.groupby('stock_id')['stock_id'].cumcount() < thresh

In [None]:
# NOTE: `train` / `test` are rebound here to the train / validation split of `df`;
# the real test set remains available as `test_data_set`.
train = df[mask]
test = df[~mask]

In [None]:
# Training features and target.
X_train = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y_train = train['target']
X_train.shape, y_train.shape

In [None]:
# Validation features and target (`test` is the validation split at this point).
X_valid = test.drop(['row_id', 'target', 'time_id'], axis = 1)
y_valid = test['target']
X_valid.shape, y_valid.shape

In [None]:
# Cast stock_id to int in both splits.
X_train['stock_id'] = X_train['stock_id'].astype(int)
X_valid['stock_id'] = X_valid['stock_id'].astype(int)

## XGBOOST

In [None]:
import optuna
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Hyperparameters for the XGBoost regressor.
# NOTE(review): the dict sets 'lambda' and 'reg_lambda', 'alpha' and 'reg_alpha',
# and 'learning_rate' and 'eta' to different values. These pairs look like
# aliases of the same XGBoost settings -- confirm which value actually takes
# effect and drop the other.
params_xgb = {
        'lambda': 0.0014832052084105417, 
        'alpha': 2.6885464964958112, 
        'max_depth': 17, 
        'learning_rate': 0.02, 
        'random_state': 24, 
        'n_estimators': 1540, 
        'eta': 0.12558915915760901, 
        'subsample': 0.6000000000000001, 
        'colsample_bytree': 0.3, 
        'min_child_weight': 77, 
        'reg_lambda': 0.001217091110648466, 
        'reg_alpha': 0.0019723477880301235
        }

# The model is configured for GPU histogram tree building.
xgb_model = xgb.XGBRegressor(**params_xgb, tree_method='gpu_hist')

In [None]:
%%time
# Fit on the training split, using the validation split for early stopping (patience 150).
xgb_model.fit(X_train ,y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False)

# Score the validation predictions with RMSPE, rounded to 5 decimals.
preds = xgb_model.predict(X_valid)
RMSPE = round(rmspe(y_true = y_valid, y_pred = preds), 5)
print(f'Performance of the Tuned XGB prediction: RMSPE: {RMSPE}')

## LGBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Hyperparameters for the LightGBM regressor.
# Note that the training metric is RMSE while the reported validation score is RMSPE.
params_lgbm = {
        "metric": "rmse",
        "verbosity": -1,
        'learning_rate': 0.04412162462604988, 
        'max_depth': 300, 
        'lambda_l1': 0.12309589568066824, 
        'lambda_l2': 3.1044658548129586e-06, 
        'num_leaves': 246, 
        'n_estimators': 2350, 
        'feature_fraction': 0.531654883966269, 
        'bagging_fraction': 0.8553165643797457, 
        'bagging_freq': 8, 
        'min_child_samples': 42
        }

In [None]:
# LightGBM regressor with the tuned parameters, configured to train on the GPU.
lgbm_model = LGBMRegressor(**params_lgbm, device='gpu')

In [None]:
%%time
# Fit on the training split, using the validation split for early stopping (patience 150).
lgbm_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False, early_stopping_rounds=150)

# Score the validation predictions with RMSPE, rounded to 5 decimals.
preds = lgbm_model.predict(X_valid)
RMSPE = round(rmspe(y_true = y_valid, y_pred = preds), 5)
print(f'Performance of the Tuned LIGHTGBM prediction: RMSPE: {RMSPE}')

## CatBoostRegressor

In [None]:
import catboost as cat
from catboost import CatBoostRegressor

In [None]:
# Hyperparameters for the CatBoost regressor; stock_id is declared as a categorical feature.
# NOTE(review): 'depth' of 91 is unusually large, and both 'bagging_temperature'
# and 'subsample' are set -- confirm the installed CatBoost accepts this combination.
params_cb = {
        'colsample_bylevel': 0.029576065862676762,
        'depth': 91,
        'learning_rate': 0.022293479743970765,
        'iterations': 7000,
        'max_bin': 120,
        'min_data_in_leaf': 66,
        'l2_leaf_reg': 0.0009704826955054485,
        'bagging_temperature': 0.7432417203968587,
        'subsample': 0.7022796507235656,
        'grow_policy': 'Lossguide', 
        'leaf_estimation_method': 'Newton',
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'cat_features': ['stock_id']
        }

In [None]:
# CatBoost regressor with the tuned parameters.
cb_model = CatBoostRegressor(**params_cb)

In [None]:
%%time
# Fit on the training split, using the validation split for early stopping (patience 150).
cb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False, early_stopping_rounds=150)

# Score the validation predictions with RMSPE, rounded to 5 decimals.
preds = cb_model.predict(X_valid)
RMSPE = round(rmspe(y_true = y_valid, y_pred = preds), 5)
print(f'Performance of the Tuned CATBOOST prediction: RMSPE: {RMSPE}')

## Stacking

In [None]:
from sklearn.ensemble import StackingRegressor

In [None]:
# Base learners for the stack. These are fresh models with library-default
# hyperparameters: the tuned params_xgb / params_lgbm / params_cb are not passed here.
mod_xgb = xgb.XGBRegressor(tree_method='gpu_hist', n_jobs= - 1)
mod_lgbm = LGBMRegressor(device='gpu')
mod_cb = CatBoostRegressor()

In [None]:
# Named base learners fed to the stacking regressor.
estimators = [('mod_xgb', mod_xgb),
              ('mod_lgbm', mod_lgbm),
              ('mod_cb', mod_cb)]

# No final_estimator is given, so StackingRegressor uses its default one.
clf = StackingRegressor(estimators=estimators, verbose=1)

In [None]:
%%time
# Fit the stacked model on the training split.
clf.fit(X_train, y_train)

In [None]:
# Score the stacked model's validation predictions with RMSPE, rounded to 5 decimals.
preds = clf.predict(X_valid)
RMSPE = round(rmspe(y_true = y_valid, y_pred = preds), 5)
print(f'Performance of the STACK prediction: RMSPE: {RMSPE}')

## Submission

In [None]:
# Submission frame starts with the row ids of the real test set; the test
# features are everything except the identifier columns.
y_pred = test_data_set[['row_id']]
X_test = test_data_set.drop(['time_id', 'row_id'], axis = 1)

In [None]:
# Predict the test targets with the stacked model. The test columns are
# selected in the order of the training features. (The former zero-filled
# initialisation of `target` was dead code: it was overwritten right away.)
pred = clf.predict(X_test[X_train.columns])
target = pred

In [None]:
# Attach the predictions as the `target` column and display the submission frame.
y_pred = y_pred.assign(target = target)
y_pred

In [None]:
# Write the submission file without the index column.
y_pred.to_csv('submission.csv',index = False)