In [None]:
import os
import copy
import glob
import numpy as np
import pandas as pd
from sklearn import model_selection
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import shap
from tsfresh.feature_extraction import feature_calculators
import gc

This notebook shows how to construct features using the `tsfresh` package.

**All features are calculated with respect to the `seconds_in_bucket` column on the grid 0 to 599 (each series is reindexed to `range(600)`); missing values are first forward-filled and then backward-filled, as is common practice with time series.**

Using `tsfresh` directly to extract features automatically is hardly feasible here because of the size of the data, so I've defined some feature functions manually. Other features can be extracted analogously using the calculators listed at https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.abs_energy

Only a few raw columns are used for feature extraction here, just to present the example.

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def abs_sum_of_change(x):
    """tsfresh ``absolute_sum_of_changes`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled before the feature is computed.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.absolute_sum_of_changes(filled)

def agg_autocorr_mean_120(x):
    """Mean autocorrelation of ``x`` up to lag 120, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled. The result is the value of the single
    ``(name, value)`` pair returned by tsfresh ``agg_autocorrelation``.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    # maxlag matches the lag advertised in the function (and feature column) name.
    params = [{'f_agg': 'mean', 'maxlag': 120}]
    return feature_calculators.agg_autocorrelation(filled, params)[0][1]

def app_entropy_30(x):
    """tsfresh ``approximate_entropy`` of ``x`` (arguments 30 and 0.1) on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled before the feature is computed.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    # Pass the filled series itself so the feature actually depends on the input
    # rather than on a fixed sample list.
    return feature_calculators.approximate_entropy(filled, 30, 0.1)

def autocorrelation_60(x):
    """tsfresh ``autocorrelation`` of ``x`` at lag 60, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.autocorrelation(filled, 60)

def autocorrelation_120(x):
    """tsfresh ``autocorrelation`` of ``x`` at lag 120, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.autocorrelation(filled, 120)

def autocorrelation_300(x):
    """tsfresh ``autocorrelation`` of ``x`` at lag 300, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.autocorrelation(filled, 300)

def c3_60(x):
    """tsfresh ``c3`` statistic of ``x`` with lag 60, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.c3(filled, 60)

def cid_ce(x):
    """tsfresh ``cid_ce`` of ``x`` (second argument ``True``), on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.cid_ce(filled, True)

def count_above_mean(x):
    """tsfresh ``count_above_mean`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.count_above_mean(filled)

def cwt_coeff20_w30_widths_30_60_120_240(x):
    """Single tsfresh ``cwt_coefficients`` value of ``x`` on the 0..599 grid.

    Uses widths ``[30, 60, 120, 240]``, ``coeff`` 20 and ``w`` 30. ``x`` is
    reindexed to ``range(600)`` and gaps are forward- then backward-filled
    first. Returns the value of the first ``(name, value)`` pair produced.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    params = [{'widths':[30, 60, 120, 240], 'coeff':20, 'w':30}]
    pairs = list(feature_calculators.cwt_coefficients(filled, params))
    return pairs[0][1]

def fft_coef1_angle(x):
    """``angle`` attribute of FFT coefficient 1 of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first. Returns the value of the first ``(name, value)``
    pair produced by tsfresh ``fft_coefficient``.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    params = [{'coeff': 1, 'attr': "angle"}]
    pairs = list(feature_calculators.fft_coefficient(filled, params))
    return pairs[0][1]

def fft_coef1_abs(x):
    """``abs`` attribute of FFT coefficient 1 of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first. Returns the value of the first ``(name, value)``
    pair produced by tsfresh ``fft_coefficient``.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    params = [{'coeff': 1, 'attr': "abs"}]
    pairs = list(feature_calculators.fft_coefficient(filled, params))
    return pairs[0][1]

def first_location_of_maximum(x):
    """tsfresh ``first_location_of_maximum`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.first_location_of_maximum(filled)

def first_location_of_minimum(x):
    """tsfresh ``first_location_of_minimum`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.first_location_of_minimum(filled)

def linear_trend_slope(x):
    """``slope`` attribute of tsfresh ``linear_trend`` for ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first. Returns the value of the first ``(name, value)``
    pair produced.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    trend = feature_calculators.linear_trend(filled, [{'attr': 'slope'}])
    return trend[0][1]

def longest_strike_above_mean(x):
    """tsfresh ``longest_strike_above_mean`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.longest_strike_above_mean(filled)

def longest_strike_below_mean(x):
    """tsfresh ``longest_strike_below_mean`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.longest_strike_below_mean(filled)

def mean_abs_change(x):
    """tsfresh ``mean_abs_change`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.mean_abs_change(filled)

def mean_change(x):
    """tsfresh ``mean_change`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.mean_change(filled)

def mean_second_derivative_central(x):
    """tsfresh ``mean_second_derivative_central`` of ``x`` on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.mean_second_derivative_central(filled)

def number_cwt_peaks_w60(x):
    """tsfresh ``number_cwt_peaks`` of ``x`` with argument 60, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.number_cwt_peaks(filled, 60)

def partial_autocorrelation_l60(x):
    """tsfresh ``partial_autocorrelation`` of ``x`` at lag 60, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first. Returns the value of the first ``(name, value)``
    pair produced.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    pairs = feature_calculators.partial_autocorrelation(filled, [{'lag': 60}])
    return pairs[0][1]

def ratio_beyond_r_sigma2(x):
    """tsfresh ``ratio_beyond_r_sigma`` of ``x`` with r = 2, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.ratio_beyond_r_sigma(filled, 2)

def time_reversal_asymmetry_statistic60(x):
    """tsfresh ``time_reversal_asymmetry_statistic`` of ``x`` with lag 60, on the 0..599 grid.

    ``x`` is reindexed to ``range(600)`` and gaps are forward- then
    backward-filled first.
    """
    filled = x.reindex(range(600)).ffill().bfill()
    return feature_calculators.time_reversal_asymmetry_statistic(filled, 60)

def prepare_book_features(file_path, raw_features, agg_f):
    """Build per-``time_id`` order-book features for one stock.

    Reads the parquet data at ``file_path``, adds a ``wap`` column (level-1
    bid/ask prices cross-weighted by the opposite side's size) and its
    per-``time_id`` log return, then aggregates every column in
    ``raw_features`` with every function in ``agg_f``.

    Parameters
    ----------
    file_path : str
        Path of the book data for one stock. The text between the first and
        second ``=`` of the path is used as the stock id in ``row_id``.
    raw_features : list of str
        Columns of the book frame (including ``wap`` / ``log_return``) to aggregate.
    agg_f : list
        Aggregation functions; each receives a Series indexed by
        ``seconds_in_bucket`` for one ``time_id``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with ``time_id``, ``realized_volatility``,
        one ``<column>_<function>`` column per aggregation, and ``row_id``
        formatted as ``<stock_id>-<time_id>``.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # transform() returns a result aligned with the original row index, so the
    # assignment is safe regardless of how groupby.apply labels its output
    # (apply prepends the group key on recent pandas versions).
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # The first observation of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_stat = df_book_data.set_index('seconds_in_bucket').groupby('time_id').agg(
        {ckey: agg_f for ckey in raw_features})
    # Flatten the (column, function) MultiIndex into '<column>_<function>' names.
    df_stat.columns = df_stat.columns.map('_'.join)
    df_stat = df_stat.reset_index()

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .rename('realized_volatility')
        .reset_index()
    )

    df_realized_vol_per_stock = df_realized_vol_per_stock.merge(df_stat, on='time_id', how='left')
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(
        lambda x: f'{stock_id}-{x}')

    return df_realized_vol_per_stock

def prepare_trade_features(file_path, raw_features, agg_f):
    """Build per-``time_id`` trade features for one stock.

    Reads the parquet data at ``file_path``, indexes it by
    ``seconds_in_bucket`` and aggregates every column in ``raw_features``
    with ``agg_f`` within each ``time_id``. Result columns are named
    ``<column>_<function>``; a ``row_id`` column ``<stock_id>-<time_id>`` is
    added, where the stock id is the text between the first and second ``=``
    of ``file_path``.
    """
    trades = pd.read_parquet(file_path)

    agg_spec = {column: agg_f for column in raw_features}
    per_time_id = trades.set_index('seconds_in_bucket').groupby('time_id').agg(agg_spec)
    # Flatten the (column, function) MultiIndex into single string names.
    per_time_id.columns = per_time_id.columns.map('_'.join)
    per_time_id = per_time_id.reset_index()

    stock_id = file_path.split('=')[1]
    per_time_id['row_id'] = per_time_id['time_id'].map(lambda time_id: f'{stock_id}-{time_id}')

    return per_time_id

def process_book_files(files_dir, book_raw_features, agg_f):
    """Compute book features for every path matching the glob pattern ``files_dir``.

    Each matching path is passed to ``prepare_book_features`` and the
    per-stock frames are concatenated (original per-frame indices are kept).
    Returns an empty DataFrame when the pattern matches nothing.
    """
    # Sort so the row order of the result does not depend on glob's arbitrary ordering.
    list_file = sorted(glob.glob(files_dir))
    if not list_file:
        return pd.DataFrame()

    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copies the accumulated frame on every iteration.
    frames = [prepare_book_features(file, book_raw_features, agg_f)
              for file in tqdm(list_file)]

    return pd.concat(frames)

def process_trade_files(files_dir, trade_raw_features, agg_f):
    """Compute trade features for every path matching the glob pattern ``files_dir``.

    Each matching path is passed to ``prepare_trade_features`` and the
    per-stock frames are concatenated (original per-frame indices are kept).
    Returns an empty DataFrame when the pattern matches nothing.
    """
    # Sort so the row order of the result does not depend on glob's arbitrary ordering.
    list_file = sorted(glob.glob(files_dir))
    if not list_file:
        return pd.DataFrame()

    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copies the accumulated frame on every iteration.
    frames = [prepare_trade_features(file, trade_raw_features, agg_f)
              for file in tqdm(list_file)]

    return pd.concat(frames)


In [None]:
# Aggregations applied to every selected raw column (a small subset for this example).
# The full set of tsfresh wrappers defined above would be:
# agg_f = [abs_sum_of_change, agg_autocorr_mean_120, app_entropy_30, autocorrelation_60,
#          autocorrelation_120, autocorrelation_300, c3_60, cid_ce,
#          count_above_mean, cwt_coeff20_w30_widths_30_60_120_240, fft_coef1_angle, fft_coef1_abs,
#          first_location_of_maximum, first_location_of_minimum, linear_trend_slope,
#          longest_strike_above_mean, longest_strike_below_mean, mean_abs_change, mean_change,
#          mean_second_derivative_central, number_cwt_peaks_w60, partial_autocorrelation_l60,
#          ratio_beyond_r_sigma2, time_reversal_asymmetry_statistic60]
agg_f = [
    abs_sum_of_change,
    first_location_of_minimum,
    mean_change,
    ratio_beyond_r_sigma2,
]

# Book columns to aggregate. All candidate book columns:
# book_raw_features = ['seconds_in_bucket', 'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
#               'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2', 'wap', 'log_return']
book_raw_features = ['wap', 'log_return']

# Trade columns to aggregate. All candidate trade columns:
# trade_raw_features = ['seconds_in_bucket_trade', 'price', 'size', 'order_count']
trade_raw_features = ['price', 'order_count']

data_dir = '/kaggle/input/optiver-realized-volatility-prediction'

# One glob pattern per data set; every match is processed as one stock.
book_train_pattern = f'{data_dir}/book_train.parquet/*'
trade_train_pattern = f'{data_dir}/trade_train.parquet/*'

book_features = process_book_files(book_train_pattern, book_raw_features, agg_f)
trade_features = process_trade_files(trade_train_pattern, trade_raw_features, agg_f)

In [None]:
# Preview the first rows of the training book feature table.
book_features.head()

In [None]:
# Preview the first rows of the training trade feature table.
trade_features.head()

In [None]:
# row_id already encodes time_id, so drop it from both feature tables before joining.
# errors='ignore' keeps this cell re-runnable after the column has been removed.
book_features = book_features.drop(columns='time_id', errors='ignore')
trade_features = trade_features.drop(columns='time_id', errors='ignore')
df_features = pd.merge(book_features, trade_features, on='row_id', how='left')
# Rows with book data but no matching trade features get 0 for the trade columns.
df_features = df_features.fillna(0)

df_train = pd.read_csv(f'{data_dir}/train.csv')
# Build '<stock_id>-<time_id>' with vectorized string concatenation; this avoids a
# row-wise apply and positional row[0] / row[1] access on a labelled Series.
df_train['row_id'] = df_train['stock_id'].astype(str) + '-' + df_train['time_id'].astype(str)
df_train = df_train.drop(columns=['stock_id', 'time_id'])

df_train = pd.merge(df_train, df_features, on='row_id', how='left')
df_train.head()

In [None]:
# Assign every training row to one of 5 cross-validation folds (column 'kfold').
df_train.loc[:, 'kfold'] = -1
y = df_train['target'].values

# Shuffling is done by KFold itself (shuffle=True); a bare df_train.sample(frac=1)
# whose result is not assigned has no effect. A fixed random_state makes the fold
# assignment reproducible across kernel restarts and cell re-runs.
skf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# split() yields positional indices; they are used as labels in .loc, which is
# valid because df_train carries the default 0..n-1 index produced by pd.merge.
for f, (t_, v_) in enumerate(skf.split(X=df_train, y=y)):
    df_train.loc[v_, 'kfold'] = f

df_train.head()

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Feature columns fed to the model. To use every engineered column instead, start
# from list(df_train.columns) and remove 'target', 'row_id' and 'kfold'.
all_features = [
    'realized_volatility',
    'wap_abs_sum_of_change',
    'order_count_abs_sum_of_change',
    'wap_first_location_of_minimum',
    'wap_mean_change',
    'log_return_abs_sum_of_change',
    'log_return_first_location_of_minimum',
    'log_return_ratio_beyond_r_sigma2',
]

# 5-fold cross-validation: train on four folds, report RMSPE on the held-out fold.
# `model` and `df_val` from the last iteration stay in scope after the loop.
for fold in range(5):

    in_validation = df_train.kfold == fold
    df_tr = df_train[~in_validation].reset_index(drop=True)
    df_val = df_train[in_validation].reset_index(drop=True)

    x_tr, y_tr = df_tr[all_features].values, df_tr['target'].values
    x_val, y_val = df_val[all_features].values, df_val['target'].values

    model = xgb.XGBRegressor(n_estimators=50)
    model.fit(x_tr, y_tr)
    pred = model.predict(x_val)
    r = rmspe(y_val, pred)

    print(f'Fold {fold}, RMSPE:{r}')

In [None]:
# SHAP summary for the `model` and `df_val` currently in scope; when the notebook is
# run top to bottom these come from the last cross-validation fold.
val_features = df_val[all_features]
explainer = shap.Explainer(model, val_features)
shap_values = explainer.shap_values(val_features)
shap.summary_plot(shap_values, val_features, title='SHAP XGB summary plot', show=False)

In [None]:
# Build the same book/trade features for the test set.
book_features_test = process_book_files(f'{data_dir}/book_test.parquet/*', book_raw_features, agg_f)
trade_features_test = process_trade_files(f'{data_dir}/trade_test.parquet/*', trade_raw_features, agg_f)

# Drop time_id from both sides before joining, as is done for the training features;
# otherwise the merge produces redundant time_id_x / time_id_y columns.
df_features_test = pd.merge(
    book_features_test.drop(columns='time_id', errors='ignore'),
    trade_features_test.drop(columns='time_id', errors='ignore'),
    on='row_id',
    how='left',
)
df_features_test = df_features_test.fillna(0)

# Refit on the full training set with the same settings used in cross-validation.
model = xgb.XGBRegressor(n_estimators=50)
model.fit(df_train[all_features].values, df_train['target'].values)

pred = model.predict(df_features_test[all_features].values)

df_features_test['target'] = pred

df_test = pd.read_csv(f'{data_dir}/test.csv')
df_test = pd.merge(df_test, df_features_test, on='row_id', how='left')
# Rows of test.csv without matching features have no prediction; fillna(0) sets
# their target (and feature columns) to 0.
df_test = df_test.fillna(0)

df_test[['row_id', 'target']].to_csv('submission.csv', index=False)