This work is built on https://www.kaggle.com/austinzhao/reproduction-explanation-lgbm-baseline. I used a CatBoost model instead of LGBM and tuned its hyperparameters with hyperopt.

# Import


In [None]:
from catboost import Pool, CatBoostRegressor
# Import order: data manipulating -> machine/deep learning -> utilities/helpers/improvement -> configuration
import pandas as pd
import numpy as np
import scipy as sc

from sklearn.model_selection import KFold
import lightgbm as lgb

from joblib import Parallel, delayed

import warnings
# Silence library warnings to keep the notebook output readable
warnings.filterwarnings('ignore')
# Use the fully-qualified option key: the bare 'max_columns' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_columns' and raises OptionError there
pd.set_option('display.max_columns', 300)

from hpsklearn import HyperoptEstimator
from hpsklearn import any_regressor
from hpsklearn import any_preprocessing
from hyperopt import tpe
from  hyperopt import hp

# Funcs

In [None]:
# Define data_directory and data_read func
data_dir = '../input/optiver-realized-volatility-prediction/'
def read_train_test(data_directory=None):
    """Load train.csv and test.csv and add the `row_id` merge key to both.

    Parameters
    ----------
    data_directory : str, optional
        Directory containing `train.csv` and `test.csv`. Defaults to the
        module-level `data_dir`, so existing zero-argument calls are unchanged.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        `(train, test)`, each with a `row_id` column of the form
        '<stock_id>-<time_id>' used to merge with book and trade features.
    """
    import os  # local import: keeps this block self-contained

    if data_directory is None:
        data_directory = data_dir

    train = pd.read_csv(os.path.join(data_directory, 'train.csv'))
    test = pd.read_csv(os.path.join(data_directory, 'test.csv'))
    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [None]:
# Calculate 1st WAP
def calc_wap1(df):
    """Return the level-1 weighted average price for every row of `df`.

    Each price is weighted by the size on the opposite side of the book:
    (ask_price1 * bid_size1 + bid_price1 * ask_size1) / (bid_size1 + ask_size1).
    """
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = df['ask_price1'] * bid_size + df['bid_price1'] * ask_size
    return cross_weighted / (bid_size + ask_size)
# Calculate 2nd WAP
def calc_wap2(df):
    """Return the level-2 weighted average price for every row of `df`.

    Each price is weighted by the size on the opposite side of the book:
    (ask_price2 * bid_size2 + bid_price2 * ask_size2) / (bid_size2 + ask_size2).
    """
    bid_size, ask_size = df['bid_size2'], df['ask_size2']
    cross_weighted = df['ask_price2'] * bid_size + df['bid_price2'] * ask_size
    return cross_weighted / (bid_size + ask_size)

# Calculate Log Return
def log_return(series):
    """Return the element-to-element log return of `series`.

    Uses log(x / y) = log(x) - log(y): take logs first, then the first
    difference. The first element of the result has no predecessor and is NaN.
    """
    log_values = np.log(series)
    return log_values.diff()

# Realized Volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values of `series`."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

# Count Unique Elements of Series
def count_unique(series):
    """Return how many distinct values `series` contains."""
    distinct = np.unique(series)
    return distinct.size

# Calculate features as specified by feature_dict
def calc_features(df, feature_dict, windows=(0, 150, 300, 450)):
    """Aggregate `df` per `time_id` over several trailing time windows.

    For each window start `w`, rows with `seconds_in_bucket >= w` are grouped
    by `time_id` and aggregated with `feature_dict`. Window 0 (the whole
    bucket) gives the unsuffixed base columns; every other window is
    left-joined onto it with a `_<w>` suffix on its columns.

    Parameters
    ----------
    df : DataFrame
        Must contain `time_id`, `seconds_in_bucket` and every key of
        `feature_dict`.
    feature_dict : dict
        Column name -> list of aggregations, as accepted by
        `DataFrameGroupBy.agg`.
    windows : iterable of int, optional
        Window start offsets in seconds. The base window 0 is always computed,
        whether or not it is listed. Defaults to the original fixed set.

    Returns
    -------
    DataFrame
        One row per `time_id` of the base window. The key column is named
        `time_id_`; feature columns are named `<column>_<agg>[_<window>]`.
    """
    # Calculate STATs for one time-window (seconds in bucket)
    def calc_certain_window(window, add_suffix=False):
        # Filter by time-window, group by time_id, then apply feature_dict
        df_window = df[df['seconds_in_bucket'] >= window].groupby(['time_id']).agg(feature_dict).reset_index()
        # Flatten the (column, aggregation) MultiIndex into single names
        df_window.columns = ['_'.join(col) for col in df_window.columns]
        # Add a suffix to tell the time-windows apart
        if add_suffix:
            df_window = df_window.add_suffix('_' + str(window))
        return df_window

    # The full bucket is the frame every other window is joined onto
    df_feature = calc_certain_window(window=0, add_suffix=False)

    for window in windows:
        if window == 0:
            continue
        right_key = 'time_id__' + str(window)
        df_feature_tmp = calc_certain_window(window=window, add_suffix=True)
        df_feature = df_feature.merge(df_feature_tmp, how='left', left_on='time_id_', right_on=right_key)
        # The suffixed key only duplicates time_id_; derive its name from the window
        # instead of keeping a separate hard-coded list in sync with `windows`
        df_feature = df_feature.drop(columns=[right_key])

    return df_feature


# Preprocess book-data (applied for each stock_id)
def preprocess_book(file_path):
    """Build per-`time_id` order-book features for one stock.

    Parameters
    ----------
    file_path : str
        Path to the stock's book parquet partition. It must contain
        'stock_id=<id>'; the text after the last '=' is used as the stock id.

    Returns
    -------
    DataFrame
        One row per `time_id`, with aggregated features for each time window
        and a `row_id` column ('<stock_id>-<time_id>') for later merging.
    """
    df = pd.read_parquet(file_path)

    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate Log-Return within each time_id.
    # transform() returns a result aligned to df's index; groupby.apply() on pandas >= 2.0
    # prepends the group key to the index, which cannot be assigned back as a column.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)
    # Calculate Wap-Balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate Various-Spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Feature (Generating) Dict for aggregated operations.
    # String names give the same column names ('sum', 'mean', 'std') and the same
    # pandas group reductions that np.sum/np.mean/np.std were dispatched to,
    # without relying on the deprecated numpy-callable dispatch.
    basic_stats = ['sum', 'mean', 'std']
    return_stats = ['sum', realized_volatility, 'mean', 'std']
    feature_dict = {
        'wap1': basic_stats,
        'wap2': basic_stats,
        'log_return1': return_stats,
        'log_return2': return_stats,
        'wap_balance': basic_stats,
        'price_spread': basic_stats,
        'bid_spread': basic_stats,
        'ask_spread': basic_stats,
        'total_volume': basic_stats,
        'volume_imbalance': basic_stats,
    }

    df_feature = calc_features(df, feature_dict=feature_dict)

    # Generate row_id (for later merge); split on the last '=' so that an '='
    # earlier in the directory path cannot corrupt the stock id
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    # Drop the left time_id_ (after using for generating row_id)
    df_feature.drop(['time_id_'], axis=1, inplace=True)

    return df_feature

# Preprocess trade-Data (applied for each stock_id)
def preprocess_trade(file_path):
    """Build per-`time_id` trade features for one stock.

    Parameters
    ----------
    file_path : str
        Path to the stock's trade parquet partition. It must contain
        'stock_id=<id>'; the text after the last '=' is used as the stock id.

    Returns
    -------
    DataFrame
        One row per `time_id`, with `trade_`-prefixed aggregated features for
        each time window and a `row_id` column ('<stock_id>-<time_id>').
    """
    df = pd.read_parquet(file_path)

    # Calculate Log-Return within each time_id.
    # transform() returns a result aligned to df's index; groupby.apply() on pandas >= 2.0
    # prepends the group key to the index, which cannot be assigned back as a column.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Feature (Generating) Dict for aggregated operations.
    # 'sum'/'mean' give the same column names and results that np.sum/np.mean
    # were dispatched to, without the deprecated numpy-callable dispatch.
    feature_dict = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    df_feature = calc_features(df, feature_dict=feature_dict)

    df_feature = df_feature.add_prefix('trade_')
    # Split on the last '=' so an '=' earlier in the path cannot corrupt the stock id
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    # The time id is now encoded in row_id
    df_feature.drop(['trade_time_id_'], axis=1, inplace=True)

    return df_feature


# Preprocess/Feature-Engineering in parallel (applied for each stock_id)
def preprocess(list_stock_ids, is_train=True):
    """Build book and trade features for every stock id and stack them.

    Each stock is processed in a joblib worker: its book features are
    left-joined with its trade features on `row_id`. `is_train` selects the
    train or the test parquet partitions under `data_dir`.

    Returns a single DataFrame with a fresh integer index.
    """
    # Pick the partition prefixes once; only the stock id differs between jobs
    if is_train:
        book_prefix, trade_prefix = "book_train.parquet/stock_id=", "trade_train.parquet/stock_id="
    else:
        book_prefix, trade_prefix = "book_test.parquet/stock_id=", "trade_test.parquet/stock_id="

    def preprocess_for_stock_id(stock_id):
        book_path = data_dir + book_prefix + str(stock_id)
        trade_path = data_dir + trade_prefix + str(stock_id)
        # Book rows are the left side, so time_ids without trades are kept
        book_features = preprocess_book(book_path)
        trade_features = preprocess_trade(trade_path)
        return pd.merge(book_features, trade_features, on='row_id', how='left')

    # One delayed job per stock id, spread over all available cores
    jobs = (delayed(preprocess_for_stock_id)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(jobs)

    # Stack the per-stock frames
    return pd.concat(per_stock_frames, ignore_index=True)

# Calculate STATs (mean, std, max, min) for realized volatility, grouped by stock_id and by time_id
def get_time_stock(df_feature):
    """Append per-stock and per-time summaries of the realized-volatility columns.

    The mean, std, max and min of every realized-volatility column are
    computed once per `stock_id` (columns suffixed `_stock`) and once per
    `time_id` (columns suffixed `_time`), then left-joined back onto
    `df_feature`. Returns the widened DataFrame.
    """
    # Realized-volatility columns to summarise
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_450',
        'log_return2_realized_volatility_450',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_150',
        'log_return2_realized_volatility_150',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_450',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_150',
    ]
    stats = ['mean', 'std', 'max', 'min']

    def summarise(key, tag):
        # Aggregate per `key`, flatten the column MultiIndex, then tag every column
        summary = df_feature.groupby([key])[vol_cols].agg(stats).reset_index()
        summary.columns = ['_'.join(parts) for parts in summary.columns]
        return summary.add_suffix('_' + tag)

    # Both summaries are computed from the input frame before anything is merged
    per_stock = summarise('stock_id', 'stock')
    per_time = summarise('time_id', 'time')

    # Merge with original dataframe
    merged = df_feature.merge(per_stock, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
    merged = merged.merge(per_time, how='left', left_on=['time_id'], right_on=['time_id__time'])
    # The suffixed key columns only duplicate stock_id / time_id
    merged.drop(['stock_id__stock', 'time_id__time'], axis=1, inplace=True)

    return merged

# Calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback returning (name, value, is_higher_better).

    Reads the labels via `lgb_train.get_label()` and scores `y_pred` with
    `rmspe`. The final `False` marks the metric as lower-is-better.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

# Finding best CatBoost model using hyperopt

In [None]:
# CatBoost parameters
# Search space and fit settings, bundled in the structure HPOpt.ctb_reg / HPOpt.train_reg read
ctb_para = {
    # Constructor arguments for CatBoostRegressor; each hp.choice is a discrete grid
    'reg_params': {
        'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 1, 0.05)),
        'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
        'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
        'n_estimators':      hp.choice('n_estimators', np.arange(100, 2000, 100)),
        'eval_metric':       hp.choice('eval_metric', ['RMSE', 'MAPE', 'MAE']),
        'od_wait': 5,  # overfitting detector
    },
    # Keyword arguments forwarded to reg.fit()
    'fit_params': {
        'early_stopping_rounds': 10,
        'verbose': True,
    },
    # RMSE on the hold-out set is the loss hyperopt minimises;
    # mean_squared_error is looked up only when the lambda is called
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}
# Keep the individual dicts available under their original names
ctb_reg_params = ctb_para['reg_params']
ctb_fit_params = ctb_para['fit_params']

In [None]:
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
import catboost as ctb
from sklearn.metrics import mean_squared_error
class HPOpt(object):
    """Hyperopt driver that fits a regressor on a fixed train/hold-out split.

    The four data sets are stored on the instance. `process` runs the search
    with one of the objective methods of this class (looked up by name).
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run `fmin` with the objective method named `fn_name`.

        Returns `(best, trials)` on success. If `fmin` raises, returns a dict
        with 'status' (STATUS_FAIL) and 'exception' (the message) instead.
        NOTE(review): the two return shapes differ, so callers that index the
        result as a tuple will fail on the error path.
        """
        # Resolved outside the try: an unknown fn_name raises AttributeError
        objective = getattr(self, fn_name)
        try:
            best = fmin(fn=objective, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        else:
            return best, trials


    def ctb_reg(self, para):
        """Objective: build a CatBoostRegressor from `para['reg_params']` and train it."""
        regressor = ctb.CatBoostRegressor(**para['reg_params'])
        return self.train_reg(regressor, para)

    def train_reg(self, reg, para):
        """Fit `reg`, score it on the hold-out set and return the hyperopt result dict.

        The fitted model is returned under 'trained_model' so the best one can
        be recovered from the trials afterwards.
        """
        # Both the training and the hold-out data are passed as evaluation sets
        eval_sets = [(self.x_train, self.y_train), (self.x_test, self.y_test)]
        reg.fit(self.x_train, self.y_train,
                eval_set=eval_sets,
                **para['fit_params'])
        holdout_pred = reg.predict(self.x_test)
        holdout_loss = para['loss_func'](self.y_test, holdout_pred)
        return {'loss': holdout_loss, 'status': STATUS_OK, 'trained_model': reg}

In [None]:
def get_best_model(trials):
    """Return the trained model of the successful trial with the lowest loss.

    Parameters
    ----------
    trials : iterable of dict
        Hyperopt trial records; each must have `trial['result']` with
        'status', 'loss' and 'trained_model' entries.

    Returns
    -------
    The object stored under 'trained_model' for the best trial.

    Raises
    ------
    ValueError
        If no trial finished with STATUS_OK.
    """
    valid_trial_list = [trial for trial in trials
                        if trial['result']['status'] == STATUS_OK]
    if not valid_trial_list:
        # Fail with a clear message instead of numpy's empty-argmin error
        raise ValueError('No successful hyperopt trials found; cannot select a best model')
    losses = [float(trial['result']['loss']) for trial in valid_trial_list]
    index_having_minimum_loss = int(np.argmin(losses))
    best_trial_obj = valid_trial_list[index_having_minimum_loss]
    return best_trial_obj['result']['trained_model']

In [None]:
def train_and_evaluate(train, test, best_model):
    """Cross-validate `best_model` and return fold-averaged test predictions.

    Parameters
    ----------
    train : DataFrame
        Training data with `row_id`, `time_id`, `stock_id`, `target` and the
        feature columns.
    test : DataFrame
        Test data with the same columns except `target`.
    best_model : CatBoost model
        Model that is re-fitted on every fold (the same instance is reused).

    Returns
    -------
    numpy.ndarray
        Test predictions averaged over the folds. The out-of-fold RMSPE is
        printed as a side effect, and 'model.cbm' is written after every fold.
    """
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis=1)
    y = train['target']
    x_test = test.drop(['row_id', 'time_id'], axis=1)

    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)

    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create test array to store predictions
    test_predictions = np.zeros(x_test.shape[0])
    # One fold count drives both the splitter and the test-prediction average
    n_splits = 5
    kfold = KFold(n_splits=n_splits, random_state=66, shuffle=True)
    model = best_model
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = Pool(x_train,
                             y_train,
                             weight=train_weights)
        val_dataset = Pool(x_val,
                           y_val,
                           weight=val_weights)

        if fold == 0:
            model.fit(train_dataset, eval_set=val_dataset)
        else:
            # Continue from the model saved after the previous fold.
            # NOTE(review): that model was trained on rows that are in this fold's
            # validation set, so the out-of-fold score is presumably optimistic — confirm.
            model.fit(train_dataset,
                      eval_set=val_dataset,
                      init_model='model.cbm')
        # Incremental learning is not needed here, but I leave it for educational purposes
        model.save_model('model.cbm')

        oof_predictions[val_ind] = model.predict(Pool(x_val))
        # Predict the test set; each fold contributes an equal share of the average
        test_predictions += model.predict(Pool(x_test)) / n_splits

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return test_predictions

# Main

In [None]:
# Read train and test
train, test = read_train_test()

# Training side: one feature job per unique stock_id, joined back on row_id
train_stock_ids = train['stock_id'].unique()
train_feature = preprocess(train_stock_ids, is_train=True)
train = pd.merge(train, train_feature, on=['row_id'], how='left')

# Test side: same steps, reading the test partitions
test_stock_ids = test['stock_id'].unique()
test_feature = preprocess(test_stock_ids, is_train=False)
test = pd.merge(test, test_feature, on=['row_id'], how='left')

# Add the per-stock and per-time summaries of realized volatility
train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
from sklearn.model_selection import train_test_split

# Target first, then the feature matrix without identifier and target columns
y = train['target']
x = train.drop(columns=['row_id', 'target', 'time_id'])

# stock_id is used as a numeric feature
x['stock_id'] = x['stock_id'].astype(int)

# NOTE(review): test_size=0.8 leaves only 20% of the rows for fitting during the
# search and validates on the other 80% — confirm this is intended and not 0.2.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=42, test_size=0.8)

In [None]:
# Find best model
n_epochs_to_evaluate = 50
obj = HPOpt(x_train, x_val, y_train, y_val)
ctb_obj = obj.process(
    fn_name='ctb_reg',
    space=ctb_para,
    trials=Trials(),
    algo=tpe.suggest,
    max_evals=n_epochs_to_evaluate,
)
# On success process() returns (best, trials); element 1 holds the trials
best_model = get_best_model(ctb_obj[1])
best_model.save_model('best_cv_model.cbm')

In [None]:
# Train and evaluate
test_predictions = train_and_evaluate(train, test, best_model)

# Attach the predictions and write only the two submission columns
test['target'] = test_predictions
test.to_csv('submission.csv', columns=['row_id', 'target'], index=False)