# Base Feature engineering notebook

This notebook gathers most of the feature engineering on book / trade data and tests the resulting features against the target and the percentage error for stock 0. 

The features that do not appear in currently available notebooks mostly come from publicly available code written for a previous order book challenge (the XTX challenge). (See for example https://github.com/alexbotsula/XTX_Challenge/blob/master/Research/Order_book_vars.py)

# Other Feature Engineering Notebooks: 

This notebook is part of a series of basic Feature Engineering / visual variable selection notebooks:

1) Base Features: https://www.kaggle.com/lucasmorin/feature-engineering-1-base-features

2) Aggregation Functions: https://www.kaggle.com/lucasmorin/feature-engineering-2-aggregation-functions

3) RV aggregation: https://www.kaggle.com/lucasmorin/feature-engineering-3-rv-aggregation/

In [None]:
import random
import seaborn as sns
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

import glob
import os
import gc
from joblib import Parallel, delayed

In [None]:
# Notebook-wide settings. None of them is read by the cells visible in this
# section; presumably they are consumed by later modelling code - TODO confirm.
path_submissions = '/'
# Name of the label column in train.csv.
target_name = 'target'
# Empty container, presumably filled with per-fold scores elsewhere - TODO confirm.
scores_folds = {}

# Tools

In [None]:
def calc_wap(df):
    """Return the level-1 weighted average price (WAP) of an order-book frame.

    Each side's best price is weighted by the size quoted on the opposite
    side: (bid_price1 * ask_size1 + ask_price1 * bid_size1) divided by
    (bid_size1 + ask_size1).

    Reads the columns 'bid_price1', 'ask_price1', 'bid_size1' and 'ask_size1'
    and does not modify `df`.
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    return cross_weighted / (bid_size + ask_size)

def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    `list_stock_prices` must expose `.diff()` once passed through `np.log`
    (e.g. a pandas Series). The first element of the result is NaN because
    it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# Load the order book and the trades of stock 0.
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_example =  pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')

stock_id = '0'
# Every time_id present in the book is kept; narrow this array to work on a subset.
time_id = book_example.time_id.unique()

# .copy() makes the filtered frames independent of the loaded ones, so the
# column assignments below never write into a slice of another frame.
book_example = book_example[book_example['time_id'].isin(time_id)].copy()
book_example['stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id'].isin(time_id)].copy()
trade_example['stock_id'] = stock_id

book_example['wap'] = calc_wap(book_example)

# Log returns are computed inside each time_id. Differencing the whole column
# would give the first row of every time_id a return measured against the last
# row of the previous time_id, and that spurious return would then be included
# in the realized volatility of the bucket.
book_example['log_return'] = book_example.groupby('time_id')['wap'].transform(log_return)
# The first row of each time_id has no previous price, hence no return.
book_example = book_example[~book_example['log_return'].isnull()]

# Attach the trade columns to the matching book rows; columns that exist on
# both sides keep the book version (the trade duplicates get the '_y' suffix
# and are dropped).
book_example = book_example.merge(trade_example, on=['seconds_in_bucket','time_id'],how='left', suffixes=('', '_y'))
book_example = book_example.loc[:, ~book_example.columns.str.endswith('_y')]

# Book rows without a matching trade get 0 in the trade columns.
book_example = book_example.fillna(0)

# Baseline: realized volatility of the observed log returns, one row per time_id.
rv = pd.DataFrame(book_example[['log_return','time_id']].groupby(['time_id']).agg(realized_volatility)).reset_index()
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv', dtype = {'stock_id': np.int32, 'time_id': np.int32, 'target': np.float64})

# Compare the target with the baseline; after the merge the 'log_return'
# column holds the realized volatility computed above.
train_0 = train[train['stock_id']==0]
df_rv_train = train_0.merge(rv, on = ['time_id'], how = 'right')
df_rv_train['error'] = (df_rv_train['target'] - df_rv_train['log_return'])
df_rv_train['percentage_error'] = (df_rv_train['target'] - df_rv_train['log_return'])/df_rv_train['target']

# Base Features

In [None]:
# bid ask price difference aka spread
# statistical feature engineering

# Alias of the example book frame; not used by the helpers defined below.
df = book_example


# Defaults shared by the feature helpers below (used as default arguments).
# Book levels to process: 1 and 2, plus level 3 which is created by calc3.
default_indices = [1,2,3]
# Pairs of levels whose values are subtracted by calc_difference.
default_indices_diff = [[1,2],[1,3],[2,3]]
# Per-side quantities; together with a side and a level they form column
# names such as 'bid_size1' or 'ask_volume3'.
default_col = ['size','price','volume']

def calc3(df):
    """Add a synthetic level 3 that combines book levels 1 and 2, in place.

    For each side ('ask' then 'bid') two columns are written:
      * '<side>_price3': the level-1 and level-2 prices, each weighted by the
        size of the other level and divided by the total size;
      * '<side>_size3': the sum of the level-1 and level-2 sizes.

    Returns the same `df` object.
    """
    for side in ('ask', 'bid'):
        price1, price2 = df[side + '_price1'], df[side + '_price2']
        size1, size2 = df[side + '_size1'], df[side + '_size2']
        df[side + '_price3'] = (price1 * size2 + price2 * size1) / (size1 + size2)
        df[side + '_size3'] = size1 + size2
    return df

def calc_mid_prices(df, indices = default_indices):
    """Add 'mid_price<i>' = (ask_price<i> + bid_price<i>) / 2 for each level.

    The columns are written into `df` in place; the same object is returned.
    """
    for level in indices:
        suffix = str(level)
        ask_price = df['ask_price' + suffix]
        bid_price = df['bid_price' + suffix]
        df['mid_price' + suffix] = (ask_price + bid_price) / 2
    return df

def calc_market_depth(df, indices = default_indices):
    """Add 'market_depth<i>' = bid_size<i> + ask_size<i> for each level.

    The columns are written into `df` in place; the same object is returned.
    """
    for level in indices:
        suffix = str(level)
        bid_size = df['bid_size' + suffix]
        ask_size = df['ask_size' + suffix]
        df['market_depth' + suffix] = bid_size + ask_size
    return df

def calc_price_impact(df, indices = default_indices):
    """Add 'price_impact<i>' = 1 / (bid_size<i> + ask_size<i>) for each level.

    This is the reciprocal of the quantity stored by calc_market_depth.
    The columns are written into `df` in place; the same object is returned.
    """
    for level in indices:
        suffix = str(level)
        total_size = df['bid_size' + suffix] + df['ask_size' + suffix]
        df['price_impact' + suffix] = 1 / total_size
    return df

def calc_waps(df, indices = default_indices):
    """Add the weighted average price 'wap<i>' for each level.

    Each side's price is weighted by the size of the opposite side:
    (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size).
    The columns are written into `df` in place; the same object is returned.
    """
    for level in indices:
        suffix = str(level)
        bid_price, ask_price = df['bid_price' + suffix], df['ask_price' + suffix]
        bid_size, ask_size = df['bid_size' + suffix], df['ask_size' + suffix]
        cross_weighted = bid_price * ask_size + ask_price * bid_size
        df['wap' + suffix] = cross_weighted / (bid_size + ask_size)
    return df

def calc_wrong_waps(df, indices = default_indices):
    """Add a size-weighted price where each side is weighted by its OWN size.

    Unlike calc_waps, the bid price is weighted by the bid size and the ask
    price by the ask size. The result for level i is stored as 'wap<i+3>'
    (e.g. level 1 -> 'wap4') so it does not overwrite the columns written
    by calc_waps for levels 1 to 3.
    The columns are written into `df` in place; the same object is returned.
    """
    for level in indices:
        suffix = str(level)
        bid_price, ask_price = df['bid_price' + suffix], df['ask_price' + suffix]
        bid_size, ask_size = df['bid_size' + suffix], df['ask_size' + suffix]
        own_weighted = bid_price * bid_size + ask_price * ask_size
        df['wap' + str(level + 3)] = own_weighted / (bid_size + ask_size)
    return df

def calc_volumes(df, sides =['bid','ask'] ,indices = default_indices):
    """Add '<side>_volume<i>' = <side>_price<i> * <side>_size<i>.

    One column is written per side and per level, in place; the same `df`
    object is returned.
    """
    for side in sides:
        for level in indices:
            suffix = str(level)
            price = df[side + '_price' + suffix]
            size = df[side + '_size' + suffix]
            df[side + '_volume' + suffix] = price * size
    return df

def calc_imbalance(df, col=default_col, indices = default_indices):
    """Add the ask-minus-bid difference of each quantity at each level.

    For every quantity c in `col` (price, size, volume by default) and every
    level i, writes '<c>_imbalance<i>' = ask_<c><i> - bid_<c><i>.
    The columns are written into `df` in place; the same object is returned.
    """
    for quantity in col:
        for level in indices:
            suffix = quantity + str(level)
            ask_value = df['ask_' + suffix]
            bid_value = df['bid_' + suffix]
            df[quantity + '_imbalance' + str(level)] = ask_value - bid_value
    return df

def calc_difference(df, sides=['bid','ask'], col=default_col, indices = default_indices_diff):
    """Add same-side differences of each quantity between pairs of levels.

    For every side s, quantity c and level pair [a, b] in `indices`, writes
    '<s>_<c>_difference<a><b>' = <s>_<c><a> - <s>_<c><b>.
    The columns are written into `df` in place; the same object is returned.
    """
    for side in sides:
        for quantity in col:
            prefix = side + '_' + quantity
            for pair in indices:
                first, second = str(pair[0]), str(pair[1])
                target = prefix + '_difference' + first + second
                df[target] = df[prefix + first] - df[prefix + second]
    return df


# Accumulated Features
def accumulate_base(df, sides = ['bid','ask'], col = default_col, ind = default_indices):
    """Add the running sum, within each time_id, of the base book columns.

    For every side s, quantity c and level i, writes
    'acc_<s>_<c><i>' = cumulative sum of '<s>_<c><i>' inside each time_id.
    The columns are written into `df` in place; the same object is returned.

    Raises KeyError if 'time_id' or one of the requested columns is missing.
    """
    # dict.fromkeys removes repeated names while keeping the original order.
    names = list(dict.fromkeys(
        s + '_' + c + str(i) for s in sides for c in col for i in ind
    ))
    if not names:
        return df
    # Only the requested columns are accumulated: a cumsum over the whole
    # frame is wasted work on every other feature and depends on how pandas
    # treats non-numeric columns (such as a string 'stock_id').
    running = df.groupby('time_id')[names].cumsum()
    for name in names:
        df['acc_' + name] = running[name]
    return df

def accumulate_imbalance(df,col = default_col, ind = default_indices):
    """Add the running sum, within each time_id, of the imbalance columns.

    For every quantity c and level i, writes
    'acc_<c>_imbalance<i>' = cumulative sum of '<c>_imbalance<i>' inside each
    time_id. The '<c>_imbalance<i>' columns must already exist in `df`.
    The columns are written into `df` in place; the same object is returned.

    Raises KeyError if 'time_id' or one of the requested columns is missing.
    """
    # dict.fromkeys removes repeated names while keeping the original order.
    names = list(dict.fromkeys(
        c + '_imbalance' + str(i) for c in col for i in ind
    ))
    if not names:
        return df
    # Only the requested columns are accumulated: a cumsum over the whole
    # frame is wasted work on every other feature and depends on how pandas
    # treats non-numeric columns (such as a string 'stock_id').
    running = df.groupby('time_id')[names].cumsum()
    for name in names:
        df['acc_' + name] = running[name]
    return df
        
def accumulate_difference(df, sides = ['bid','ask'], col = default_col, ind = default_indices_diff):
    """Add the running sum, within each time_id, of the level-difference columns.

    For every side s, quantity c and level pair [a, b] in `ind`, writes
    'acc_<s>_<c>_difference<a><b>' = cumulative sum of
    '<s>_<c>_difference<a><b>' inside each time_id. Those difference columns
    must already exist in `df`.
    The columns are written into `df` in place; the same object is returned.

    Raises KeyError if 'time_id' or one of the requested columns is missing.
    """
    # dict.fromkeys removes repeated names while keeping the original order.
    names = list(dict.fromkeys(
        s + '_' + c + '_difference' + str(i[0]) + str(i[1])
        for s in sides for c in col for i in ind
    ))
    if not names:
        return df
    # Only the requested columns are accumulated: a cumsum over the whole
    # frame is wasted work on every other feature and depends on how pandas
    # treats non-numeric columns (such as a string 'stock_id').
    running = df.groupby('time_id')[names].cumsum()
    for name in names:
        df['acc_' + name] = running[name]
    return df

def calc_log_returns(df, indices = default_indices):
    """Add 'log_return<i>', the per-time_id log return of 'wap<i>', for each level.

    The 'wap<i>' columns must already exist in `df`. The first row of every
    time_id gets NaN because it has no previous price inside its bucket.
    The columns are written into `df` in place; the same object is returned.
    """
    # Iterate over the `indices` argument so that callers can restrict the levels.
    for i in indices:
        # transform() returns a result indexed like `df`, so the assignment
        # aligns row by row whatever group keys pandas attaches to apply().
        df['log_return'+str(i)] = df.groupby('time_id')['wap'+str(i)].transform(log_return)
    return df

In [None]:
def Build_OB_Features(df):
    """Run the whole order-book feature pipeline on `df` and return it.

    The steps run in a fixed order because later ones read columns written
    by earlier ones: the synthetic level 3 first, then the per-level
    features, the volumes, the imbalances and level differences, and finally
    their running sums per time_id. The 'wrong' wap is computed for level 1
    only. Steps listed without keyword arguments use their own defaults.
    """
    levels = [1,2,3]
    level_pairs = [[1,2],[1,3],[2,3]]
    quantities = ['size','price','volume']

    # (step, keyword arguments) pairs, applied from top to bottom.
    pipeline = (
        (calc3, {}),
        (calc_mid_prices, {}),
        (calc_market_depth, {}),
        (calc_price_impact, {}),
        (calc_waps, {}),
        (calc_wrong_waps, {'indices': [1]}),
        (calc_volumes, {'indices': levels}),
        (calc_imbalance, {}),
        (calc_difference, {'col': quantities}),
        (accumulate_base, {'col': quantities}),
        (accumulate_imbalance, {'col': quantities, 'ind': levels}),
        (accumulate_difference, {'col': quantities, 'ind': level_pairs}),
    )
    for step, options in pipeline:
        df = step(df, **options)

    return df

In [None]:
# Add every order-book feature column to the stock 0 example frame.
book_example = Build_OB_Features(book_example)

# Features versus Target

In [None]:
# Feature columns to plot: everything except the grouping key and the stock label.
columns = [name for name in book_example.columns if name not in ['time_id','stock_id']]

# Per-time_id mean / sum / std of every plotted feature. Only those columns are
# aggregated, so the string 'stock_id' label never reaches the numeric
# aggregations. The resulting columns are named '<feature>_<stat>'.
df_train_stock_0 = book_example.groupby('time_id')[columns].agg(['mean','sum','std'])
df_train_stock_0.columns = ['_'.join(col) for col in df_train_stock_0.columns]

sns.set(rc={'figure.figsize':(24,8)})
sns.set_style(style='white')

# (aggregate suffix, panel title) for the three panels of each figure.
panels = [('mean', 'Mean'), ('sum', 'Sum'), ('std', 'Std')]

# One figure per feature: each aggregate against the target, with a
# second-order polynomial fit drawn in black.
# NOTE(review): x is indexed by time_id while y carries a positional index;
# this assumes both are in the same time_id order - confirm.
for col in columns:
    color = (random.random(), random.random(), random.random())
    
    fig, axs = plt.subplots(ncols=len(panels))
    for ax, (stat, title) in zip(axs, panels):
        sns.regplot(x=df_train_stock_0[col+'_'+stat], y=df_rv_train['target'], color=color, order = 2, line_kws={"color": 'black'}, ax=ax).set(ylim=(0, None),title= title)
    fig.suptitle(col+' v.s. target',size=30) 
    
    plt.show()

# Features v.s. baseline error

In [None]:
# One figure per feature: each per-time_id aggregate (mean, sum, std) against
# the relative error of the realized-volatility baseline, with a second-order
# polynomial fit drawn in black.
for col in columns:

    color = (random.random(), random.random(), random.random())

    fig, axs = plt.subplots(ncols=3)

    for panel, (stat, title) in zip(axs, (('_mean', 'Mean'), ('_sum', 'Sum'), ('_std', 'Std'))):
        fitted = sns.regplot(
            x=df_train_stock_0[col+stat],
            y=df_rv_train['percentage_error'],
            color=color,
            order=2,
            line_kws={"color": 'black'},
            ax=panel,
        )
        fitted.set(ylim=(-20, 2), title=title)
    fig.suptitle(col+' v.s. Baseline Relative Error', size=30)

    plt.show()