In [None]:
import os
import glob
from joblib import Parallel, delayed
import joblib
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt 
import pickle

# Silence library warnings so cell output stays readable.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern is a
# regex match that becomes ambiguous on newer pandas versions (several options
# end in "max_columns") and then raises OptionError.
pd.set_option('display.max_columns', 300)

In [None]:
# Root directory of the competition data. Input paths in this notebook are
# built by plain string concatenation with this prefix, so the trailing slash
# is required.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
def calculate_wap(df):
    """Weighted average price pooled over both order-book levels.

    Each level contributes ``bid_price * ask_size + ask_price * bid_size``;
    the sum of both contributions is divided by the total size of all four
    quotes. ``df`` must provide the ``bid_/ask_`` ``price``/``size`` columns
    for levels 1 and 2. Returns one value per row of ``df``.
    """
    level1_cross = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level2_cross = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    numerator = level1_cross + level2_cross
    return numerator / total_size

def calculate_wap1(df):
    """Mean of the level-1 and level-2 weighted average prices.

    Unlike ``calculate_wap``, each level is normalised by its own total size
    first and the two per-level prices are then averaged with equal weight.
    Returns one value per row of ``df``.
    """
    level1_wap = (
        df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    ) / (df['bid_size1'] + df['ask_size1'])
    level2_wap = (
        df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    ) / (df['bid_size2'] + df['ask_size2'])
    return (level1_wap + level2_wap) / 2

# Log return between consecutive observations:
# log(p_t / p_{t-1}) = log(p_t) - log(p_{t-1})
def log_return(series):
    """Return the first difference of ``log(series)``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(series)
    return log_prices.diff()

# Realized volatility: square root of the sum of squared values
def realized_volatility(series):
    """Return ``sqrt(sum(series ** 2))`` for the given values."""
    squared = series**2
    return np.sqrt(np.sum(squared))

# Number of distinct values in a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

In [None]:
# Load the base train and test tables
def read_train_test():
    """Read ``train.csv`` and ``test.csv`` from ``data_dir``.

    A ``row_id`` column (``<stock_id>-<time_id>``) is added to both frames so
    they can be merged with the per-stock book and trade features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    loaded = []
    for csv_name in ('train.csv', 'test.csv'):
        frame = pd.read_csv(data_dir + csv_name)
        # Key used to merge with book and trade data
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        loaded.append(frame)
    train, test = loaded
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Per-time_id aggregate statistics over a tail window of the bucket
def get_stats_window(df, create_feature_dict, seconds_in_bucket, add_suffix = False):
    """Aggregate ``df`` per ``time_id`` using rows at or after a start second.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_id`` and ``seconds_in_bucket`` columns.
    create_feature_dict : dict
        Mapping of column name to a list of aggregations, passed to ``agg``.
    seconds_in_bucket : int
        Only rows with ``seconds_in_bucket`` >= this value are aggregated.
    add_suffix : bool
        When true, ``_<seconds_in_bucket>`` is appended to every column name
        (including the ``time_id_`` key column).

    Returns
    -------
    pandas.DataFrame
        Flat columns named ``<column>_<stat>``; the group key becomes
        ``time_id_``.
    """
    in_window = df['seconds_in_bucket'] >= seconds_in_bucket
    stats = df[in_window].groupby(['time_id']).agg(create_feature_dict).reset_index()
    # Flatten the (column, stat) MultiIndex into single-level names
    stats.columns = ['_'.join(pair) for pair in stats.columns]
    if not add_suffix:
        return stats
    # Suffix distinguishes the same statistic computed over different windows
    return stats.add_suffix('_' + str(seconds_in_bucket))
    
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for one stock.

    Reads the book parquet at ``file_path``, derives price/size features per
    row, then aggregates them per ``time_id`` over the whole bucket and over
    several tail windows (rows with ``seconds_in_bucket`` >= the window start).

    Parameters
    ----------
    file_path : str
        Path to one stock's book data. The text following the first ``=``
        (i.e. after ``stock_id=``) is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` found in the file. Full-bucket columns are
        named ``<feature>_<stat>``, windowed ones ``<feature>_<stat>_<start>``,
        plus a ``row_id`` column formatted ``<stock_id>-<time_id>``.
    """
    # Tail-window start seconds; the order fixes the output column order
    window_starts = (550, 500, 450, 400, 300, 200, 150, 100, 50, 25, 10)

    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    df['wap2'] = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])

    # Calculate log returns within each time_id. transform() always returns a
    # result aligned to df's own index, whereas groupby().apply() prepends the
    # group key to the index on pandas >= 2.0 and can no longer be assigned back.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)

    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = (df['bid_price1'] - df['bid_price2']) / (df['bid_price1'] + df['bid_price2'])
    df['ask_spread'] = (df['ask_price2'] - df['ask_price1']) / (df['ask_price1'] + df['ask_price2'])

    df['total_volume_1'] = df['ask_size1'] + df['bid_size1']
    df['total_volume_2'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])

    df['volume_imbalance_1'] = abs((df['ask_size1'] - df['bid_size1']))
    df['volume_imbalance_2'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations (insertion order determines column order)
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, realized_volatility, np.mean, np.std],
        'price_spread':[np.sum, realized_volatility, np.mean, np.std],
        'bid_spread':[np.sum, realized_volatility, np.mean, np.std],
        'ask_spread':[np.sum, realized_volatility, np.mean, np.std],
        'total_volume_1':[np.sum, realized_volatility, np.mean, np.std],
        'total_volume_2':[np.sum, realized_volatility, np.mean, np.std],
        'volume_imbalance_1':[np.sum, realized_volatility, np.mean, np.std],
        'volume_imbalance_2':[np.sum, realized_volatility, np.mean, np.std]
    }

    # Full-bucket stats first, then left-merge every tail window onto them.
    # A time_id with no rows inside a window gets NaN for that window's columns.
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    for start in window_starts:
        df_window = get_stats_window(df, create_feature_dict, seconds_in_bucket = start, add_suffix = True)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{start}')

    # Drop the suffixed time_id key columns brought in by the merges
    unused_time_id = df_feature.filter(like='time_id__').columns.tolist()
    df_feature = df_feature.drop(columns = unused_time_id)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['time_id_'])

    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for one stock.

    Reads the trade parquet at ``file_path``, computes log returns of the
    trade price within each ``time_id``, then aggregates per ``time_id`` over
    the whole bucket and over several tail windows.

    Parameters
    ----------
    file_path : str
        Path to one stock's trade data. The text following the first ``=``
        (i.e. after ``stock_id=``) is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` found in the file. Every feature column is
        prefixed with ``trade_``; ``row_id`` is ``<stock_id>-<time_id>``.
    """
    # Tail-window start seconds; the order fixes the output column order
    window_starts = (550, 500, 300, 50, 25, 10)

    df = pd.read_parquet(file_path)
    # transform() keeps the result aligned to df's own index; groupby().apply()
    # prepends the group key on pandas >= 2.0 and can no longer be assigned back.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations (insertion order determines column order)
    create_feature_dict = {
        'log_return':[realized_volatility, np.mean, np.std],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.mean, np.std , realized_volatility],
        'order_count':[np.sum, np.mean, np.std, realized_volatility],
    }

    # Full-bucket stats first, then left-merge every tail window onto them.
    # A time_id with no trades inside a window gets NaN for that window's columns.
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    for start in window_starts:
        df_window = get_stats_window(df, create_feature_dict, seconds_in_bucket = start, add_suffix = True)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{start}')

    # Drop the suffixed time_id key columns brought in by the merges
    unused_time_id = df_feature.filter(like='time_id__').columns.tolist()
    df_feature = df_feature.drop(columns = unused_time_id)

    # Prefix keeps trade features distinct from book features after merging
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['trade_time_id_'])

    return df_feature

# Add per-time_id aggregates of the realized-volatility features
def get_time_stock(df):
    """Append cross-sectional statistics of volatility features per ``time_id``.

    Every column whose name contains ``realized_volatility`` is aggregated
    over all rows sharing a ``time_id`` (mean, std, max, min). The results are
    merged back onto ``df`` as ``<column>_<stat>_time``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the aggregate columns appended.
    """
    vol_cols = [name for name in df.columns if 'realized_volatility' in name]

    # Aggregate across rows that share a time_id
    per_time = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min']).reset_index()
    # Flatten (column, stat) pairs, then tag everything with a '_time' suffix
    per_time.columns = ['_'.join(parts) for parts in per_time.columns]
    per_time = per_time.add_suffix('_' + 'time')

    # Left-merge back and discard the suffixed key column
    merged = df.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(columns = ['time_id__time'])
    
# Function to run the per-stock preprocessing in parallel
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id and stack them.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process; each is turned into a ``stock_id=<id>`` path
        under ``data_dir``.
    is_train : bool
        Selects the ``*_train.parquet`` (True) or ``*_test.parquet`` (False)
        inputs.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-stock feature frames with a fresh index.
    """
    from tqdm import tqdm

    # Choose the input partitions once; only the stock id varies per task
    if is_train:
        book_prefix = data_dir + "book_train.parquet/stock_id="
        trade_prefix = data_dir + "trade_train.parquet/stock_id="
    else:
        book_prefix = data_dir + "book_test.parquet/stock_id="
        trade_prefix = data_dir + "trade_test.parquet/stock_id="

    def for_joblib(stock_id):
        """Build the merged book + trade feature frame for one stock."""
        book_features = book_preprocessor(book_prefix + str(stock_id))
        trade_features = trade_preprocessor(trade_prefix + str(stock_id))
        # Left merge keeps every book row even when no trade features exist
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Dispatch one task per stock id across all available cores
    tasks = (delayed(for_joblib)(stock_id) for stock_id in tqdm(list_stock_ids))
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    return pd.concat(per_stock_frames, ignore_index = True)

# Root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Custom evaluation function so LightGBM reports RMSPE during training
def feval_rmspe(y_pred, lgb_train):
    """Evaluate predictions against the labels stored in ``lgb_train``.

    Returns a ``(name, value, is_higher_better)`` tuple; the last element is
    ``False`` because a lower RMSPE is better.
    """
    labels = lgb_train.get_label()
    metric_value = rmspe(labels, y_pred)
    return 'RMSPE', metric_value, False

In [None]:
# Directory holding artifacts from an earlier run (here: the feature list).
MODEL_DIR = '../input/lgbmv111'

# Alternative kept for reference: derive the feature list from the test frame.
# features = test.drop(columns=['time_id', 'row_id']).columns

# Load the names of the feature columns used by the models below.
# NOTE(review): despite the .txt extension the file is read as a pickle.
# pickle.load can execute arbitrary code, so only use it on a trusted file.
with open(f"{MODEL_DIR}/lgbm_features.txt", "rb") as fp:   # Unpickling
    features = pickle.load(fp)

In [None]:
# Read the base train and test tables (each gets a row_id merge key)
train, test = read_train_test()

In [None]:
# Stock ids present in the training table
train_stock_ids = train['stock_id'].unique()

# Build book + trade features per stock in parallel, then attach them to the
# base table; the left merge keeps every base row even without features
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Stock ids present in the test table
test_stock_ids = test['stock_id'].unique()

# Same feature pipeline, reading the test parquet files
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

In [None]:
# Distribution of one trade feature before the winsorizing step that follows
train['trade_size_mean_10'].plot.hist(bins=100)

In [None]:
# Clip the upper 1% of every feature column (limits=[0, 0.01] leaves the low
# end untouched). The column list comes from test, so only columns present in
# test are processed; the ids and row key are excluded.
# NOTE(review): train and test are clipped against their own distributions, so
# the same raw value can end up clipped differently in each — confirm intended.
# NOTE(review): columns may contain NaN after the left merges; verify how
# winsorize treats NaN when picking the cutoff.
for col in test.drop(columns=['time_id', 'row_id', 'stock_id']).columns:
    
    train[col] = winsorize(train[col], limits=[0, 0.01])
    test[col] = winsorize(test[col], limits=[0, 0.01])

In [None]:
# Same histogram after winsorizing, for comparison with the earlier plot
train['trade_size_mean_10'].plot.hist(bins=100)

In [None]:
# Append per-time_id mean/std/max/min of the realized-volatility features.
# Each frame is aggregated on its own rows.
train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
# Per-stock mean of the training target; the resulting column is 'mean_stock'.
# NOTE(review): the mean uses all training rows, so targets of rows that later
# land in a validation fold contribute to this feature — confirm acceptable.
target_encode = train.groupby('stock_id')['target'].agg(['mean'])
# target_encode.columns = ['_'.join(col) for col in target_encode.columns]
target_encode = target_encode.add_suffix('_' + 'stock')
target_encode = target_encode.reset_index()

In [None]:
# Attach the per-stock target mean to both frames; test reuses the statistic
# computed from train. A stock_id absent from train gets NaN in test.
train = train.merge(target_encode, how='left', on='stock_id')
test = test.merge(target_encode, how='left', on='stock_id')

In [None]:
# Register the per-stock target mean ('mean_stock') as a model feature.
# Guarded so that re-running this cell does not append the name a second time.
if 'mean_stock' not in features:
    features += ['mean_stock']

In [None]:
# Display the final feature list for inspection
list(features)

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    K-fold splitter that keeps all rows of a group on the same side of a split.

    The unique values of a grouping column are split with sklearn's ``KFold``
    (optionally shuffled) and each group-level split is mapped back to
    positional row indices.

    Note: the ``split`` signature differs from sklearn's own ``GroupKFold``:
    ``group`` is the *name of a column* in ``X``, not an array of labels.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether the unique group values are shuffled before splitting.
    random_state : int or None
        Seed for the shuffle; ignored when ``shuffle`` is False.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; arguments exist for API compatibility."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, val_idx)`` positional index arrays per fold.

        ``X`` is a DataFrame containing the column named ``group``; ``y`` is
        accepted for API compatibility and not used.
        """
        # KFold raises ValueError when a seed is given together with
        # shuffle=False, so only forward the seed when shuffling is enabled.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)
        group_values = X[group]
        unique_ids = group_values.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # Map the split of unique group values back to row positions
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(group_values.isin(tr_group))[0]
            val_idx = np.where(group_values.isin(va_group))[0]
            yield train_idx, val_idx

In [None]:
# Single seed reused for every LightGBM random-number parameter below
seed = 42
# LightGBM training parameters shared by both training loops in this notebook.
# NOTE(review): with max_depth=3 a tree can have at most 2**3 = 8 leaves, so
# num_leaves=769 is presumably never reached — confirm the intended pairing.
# NOTE(review): LightGBM documents early stopping as unavailable in 'dart'
# mode; the early_stopping_rounds passed to lgb.train may therefore have no
# effect — confirm against the installed version.
params = {
    'learning_rate': 0.13572437900113307,        
    'lambda_l1': 2.154360665259325,
    'lambda_l2': 6.711089761523827,
    'num_leaves': 769,
    'min_sum_hessian_in_leaf': 20.44437160769411,
    'bagging_fraction': 0.9726755660563261,
    'bagging_freq': 42,
    'min_data_in_leaf': 690,
    'max_depth': 3,
    'seed': seed,
    'feature_fraction_seed': seed,
    'bagging_seed': seed,
    'drop_seed': seed,
    'data_random_seed': seed,
    'objective': 'rmse',
    'boosting': 'dart',
    'verbosity': -1,
    'n_jobs': -1,
}   

# Boosting mode name, embedded in the file names of the saved fold models
boosting_type = params['boosting']

In [None]:
# Ratio of the target to the full-bucket realized volatility of log_return1;
# used below both as an outlier filter and as an alternative training target.
# NOTE(review): a zero realized volatility yields inf/NaN here — confirm how
# often that occurs.
train['vol_ratio'] = (train['target'] / train['log_return1_realized_volatility'])

In [None]:
# Distribution of the ratio over all training rows
train['vol_ratio'].plot.hist(bins=100, figsize=(20, 10))

In [None]:
# Same distribution restricted to ratios below 3 (the cutoff applied next)
train[train['vol_ratio'] < 3]['vol_ratio'].plot.hist(bins=100, figsize=(20, 10))

In [None]:
# Keep only rows with ratio < 3; rows with a NaN ratio fail the comparison and
# are dropped as well. The index is reset so positional and label indices agree.
train = train[train['vol_ratio'] < 3].reset_index(drop=True)

In [None]:
n_fold = 10

# Folds are grouped by time_id so all rows of a time bucket stay on one side
cv = GroupKFold(n_splits=n_fold)
kf = cv.split(train, train['target'], 'time_id')

# Split features and target
x = train[features].fillna(0)
y = train['target']
# Impute the test features exactly like the training features. fillna also
# returns a new frame, so the stock_id assignment below no longer writes into
# a slice of `test` (which triggered SettingWithCopyWarning).
x_test = test[features].fillna(0)
# Transform stock id to a numeric value
x['stock_id'] = x['stock_id'].astype(int)
x_test['stock_id'] = x_test['stock_id'].astype(int)

# Create out of folds array
oof_df = train[['row_id', 'time_id', 'stock_id', 'log_return1_realized_volatility', 'target']].copy()
oof_df['pred'] = np.nan
# Column position of 'pred', used for single-step positional writes below
pred_col = oof_df.columns.get_loc('pred')

# Create test array to store predictions
test_predictions = np.zeros(x_test.shape[0])

# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kf):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]

    # Clip the top 2% of the training targets only; validation targets stay raw
    y_train = winsorize(y_train, limits=[0, 0.02])

    if fold == 0:
        features_importance = np.zeros(len(x_train.columns))

    # Inverse squared-target weights turn the weighted squared error into a
    # squared percentage error
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)

    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])

    model = lgb.train(params = params,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      num_boost_round = 2000,
                      early_stopping_rounds = 67,
                      verbose_eval = 200,
                      feval = feval_rmspe)

    lgb.plot_importance(model, importance_type='gain', max_num_features=50, figsize=(10, 50))
    plt.title("Feature importance")
    plt.show()

    joblib.dump(model, f'./lgbm_fold_{fold}_{boosting_type}_kfold.pkl') # save model

    y_pred = model.predict(x_val)

    score = round(rmspe(y_true = y_val, y_pred = y_pred),5)
    print('Fold {} : {}'.format((fold+1), score))

    # Accumulate the fold-averaged gain importance (the array was previously
    # initialised but never filled in this loop)
    features_importance += model.feature_importance(importance_type='gain') / n_fold

    # Add predictions to the out of folds array. A single .iloc call replaces
    # the chained oof_df['pred'].iloc[...] assignment, which may write to a
    # temporary copy and silently leave oof_df unchanged.
    oof_df.iloc[val_ind, pred_col] = y_pred
    # Predict the test set (average over folds)
    test_predictions += model.predict(x_test) / n_fold

In [None]:
# Score the out-of-fold predictions against the unclipped training target
rmspe_score = rmspe(y, oof_df['pred'])
print(f'Our out of folds RMSPE is {rmspe_score}')

In [None]:
from sklearn.metrics import r2_score

# rmspe comes from the metric helpers defined earlier in the notebook; it is
# not redefined here to avoid two copies of the same function.

# Drop rows with a missing value in any column (e.g. rows without a prediction)
oof_df.dropna(inplace=True)
y_true = oof_df['target'].values
y_pred = oof_df['pred'].values

# Overlay the target and prediction distributions on the same axes
oof_df['target'].hist(bins=100)
oof_df['pred'].hist(bins=100)

R2 = round(r2_score(y_true, y_pred), 5)
RMSPE = round(rmspe(y_true, y_pred), 5)
RMSPE

In [None]:
# Postprocessing: attach the fold-averaged test predictions as the target

test['target'] = test_predictions

# Submission: write row_id and target to submission.csv without the index
test[['row_id', 'target']].to_csv('submission.csv',index = False)

In [None]:
# Second experiment: predict the ratio target / realized volatility instead of
# the target itself, using the same features and fold scheme.
x = train[features].fillna(0)
y = train['vol_ratio']
w = train['target']

x['stock_id'] = x['stock_id'].astype(int)

# Create out of folds array
oof_df = train[['row_id', 'time_id', 'stock_id', 'log_return1_realized_volatility', 'target', 'vol_ratio']].copy()
oof_df['pred'] = np.nan
# Column position of 'pred', used for single-step positional writes below
pred_col = oof_df.columns.get_loc('pred')

cv = GroupKFold(n_splits=n_fold)
kf = cv.split(train, train['target'], 'time_id')

# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kf):
    print(f'Training fold {fold + 1}')

    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Raw targets per fold; kept available but not used inside this loop
    w_train, w_val = w.iloc[trn_ind], w.iloc[val_ind]

    if fold == 0:
        features_importance = np.zeros(len(x_train.columns))

    # Root mean squared percentage error weights (relative to the ratio)
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)

    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])

    model = lgb.train(params = params,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      num_boost_round = 2000,
                      early_stopping_rounds = 67,
                      verbose_eval = 200,
                      feval = feval_rmspe)

    lgb.plot_importance(model, importance_type='gain', max_num_features=50, figsize=(10, 50))
    plt.title("Feature importance")
    plt.show()

    y_pred = model.predict(x_val)

    # Fold score measured on the ratio, not on the original target
    print(rmspe(y_val, y_pred))

    features_importance += model.feature_importance(importance_type='gain') / n_fold

    # A single .iloc call replaces the chained oof_df['pred'].iloc[...]
    # assignment, which may write to a temporary copy and leave oof_df unchanged.
    oof_df.iloc[val_ind, pred_col] = y_pred

In [None]:
# Convert predicted ratios back to the target scale by multiplying with the
# same realized-volatility feature that defined vol_ratio
oof_df['pred_trans'] = oof_df['pred'] * oof_df['log_return1_realized_volatility']

In [None]:
# Score the back-transformed out-of-fold predictions against the target
rmspe_score = rmspe(oof_df['target'], oof_df['pred_trans'])
print(f'Our out of folds RMSPE is {rmspe_score}')

In [None]:
from sklearn.metrics import r2_score

# rmspe comes from the metric helpers defined earlier in the notebook; it is
# not redefined here to avoid two copies of the same function.

# Drop rows with a missing value in any column (e.g. rows without a prediction)
oof_df.dropna(inplace=True)
y_true = oof_df['target'].values
y_pred = oof_df['pred_trans'].values

# Overlay the target and back-transformed prediction distributions
oof_df['target'].hist(bins=100)
oof_df['pred_trans'].hist(bins=100)

R2 = round(r2_score(y_true, y_pred), 5)
RMSPE = round(rmspe(y_true, y_pred), 5)
RMSPE