# LSTM Baseline

This is my first LSTM model. I have tried to describe what I did, and I would appreciate any helpful feedback, plus an upvote if you like it.
Please feel free to fork this notebook.

In [None]:
import glob
import numpy as np
import os
import pandas as pd
import tensorflow as tf

from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tensorflow import keras


In [None]:
# Every entry matched directly below the training order-book directory
# (one per stock_id partition, judging by how the names are parsed later).
list_order_book_file_train = glob.glob(
    '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'
)

In [None]:
# Load train.csv; later cells read its stock_id, time_id and target columns.
train = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/train.csv'
)

# Calculate additional features

Calculate the features from the Optiver examples

In [None]:
def log_return(list_stock_prices):
    """Return the element-wise difference of the natural log of the prices.

    The input must be a pandas object, because ``.diff()`` is called on the
    logged values. The first element of the result is NaN since it has no
    predecessor.
    """
    logged_prices = np.log(list_stock_prices)
    return logged_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def realized_volatility_rolling(series_log_return):
    """Return the running root-mean-square of the log returns.

    Despite the name, no fixed-size rolling window is used: element ``i`` of
    the result is the square root of the mean of the squared log returns
    from the start of the series up to and including ``i`` (an expanding
    window).
    """
    squared_returns = series_log_return ** 2
    running_mean = squared_returns.expanding().mean()
    return np.sqrt(running_mean)

def add_features_and_aggregate_data(df):
    """Add order-book features to ``df`` and aggregate them per timeslice.

    ``df`` is modified in place: the feature columns are added to it and the
    rows whose log differences are NaN are dropped.

    Returns a new frame with one row per ``(time_id, timeslice)`` holding the
    maximum ``stock_id`` plus the min / max / mean of the price-spread log
    difference and of the running realized volatility.
    """
    # Spread between ask and bid price on the first level of the order book.
    df['price_spread_l1'] = df['ask_price1'] - df['bid_price1']

    # Express the spread as a log difference within each time_id.
    spread_by_time = df.groupby('time_id')['price_spread_l1']
    df['price_spread_l1_log_diff'] = spread_by_time.transform(log_return)

    # Aggregate into buckets of 50 seconds ("timeslices") to reduce memory
    # consumption and training time. Per the original author, each
    # (stock_id, time_id) bucket spans seconds 0-599, which gives at most
    # 12 timeslices.
    df['timeslice'] = df['seconds_in_bucket'] // 50

    # Weighted average price: each side's price is weighted by the size
    # quoted on the opposite side.
    total_size = df['bid_size1'] + df['ask_size1']
    cross_weighted = (df['bid_price1'] * df['ask_size1']
                      + df['ask_price1'] * df['bid_size1'])
    df['wap'] = cross_weighted / total_size

    # Log return of the weighted average price within each time_id.
    wap_by_time = df.groupby(['time_id'])['wap']
    df['log_return'] = wap_by_time.transform(log_return)

    # diff() leaves NaN behind in the log-return columns; drop those rows
    # in place to save memory.
    df.dropna(subset=['log_return', 'price_spread_l1_log_diff'], inplace=True)

    # Running realized volatility inside every (time_id, timeslice) bucket.
    slice_keys = ['time_id', 'timeslice']
    returns_by_slice = df.groupby(slice_keys)['log_return']
    df['realized_vol'] = returns_by_slice.transform(realized_volatility_rolling)

    # Named aggregations: <stat>_<column> for each feature column and stat.
    aggregations = {'stock_id': ('stock_id', 'max')}
    for column in ('price_spread_l1_log_diff', 'realized_vol'):
        for stat in ('min', 'max', 'mean'):
            aggregations[f'{stat}_{column}'] = (column, stat)

    return df.groupby(slice_keys).agg(**aggregations).reset_index()

# LSTM

## Preprocess Input Data

In [None]:
# Columns fed to the model, in the order they are selected from the frames.
feature_columns = [
    'stock_id',
    'min_price_spread_l1_log_diff',
    'max_price_spread_l1_log_diff',
    'mean_price_spread_l1_log_diff',
    'min_realized_vol',
    'max_realized_vol',
    'mean_realized_vol',
]

In [None]:
def get_input_data(list_file):
    """Read order-book parquet files and return their aggregated features.

    Args:
        list_file: iterable of parquet paths. The stock id is parsed from the
            text that follows the first ``=`` in each path, so that text must
            be a plain integer (e.g. ``.../stock_id=0``).

    Returns:
        One DataFrame holding the output of
        ``add_features_and_aggregate_data`` for every file, with a fresh
        integer index. An empty DataFrame is returned for an empty
        ``list_file``.
    """
    aggregated_frames = []
    for file in list_file:
        # read only needed columns to save memory
        df_input_file = pd.read_parquet(file,
                                        columns=['time_id', 'seconds_in_bucket',
                                                 'bid_size1', 'bid_price1',
                                                 'ask_size1', 'ask_price1'])
        # get stock id from filename
        df_input_file['stock_id'] = int(file.split('=')[1])
        # add features and aggregate data
        aggregated_frames.append(add_features_and_aggregate_data(df_input_file))

    if not aggregated_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies
    # the accumulated frame for every file.
    return pd.concat(aggregated_frames, ignore_index=True, copy=False)

In [None]:
# Aggregated features for every training order-book file.
df_input = get_input_data(list_order_book_file_train)

In [None]:
# Build the row id ("<stock_id>-<time_id>") on the train.csv frame, then
# attach it and the remaining train.csv columns to the aggregated features.
stock_part = train['stock_id'].astype(str)
time_part = train['time_id'].astype(str)
train['row_id'] = stock_part + '-' + time_part
df_input = pd.merge(df_input, train, on=['time_id', 'stock_id'], how='left')

In [None]:
# Split into train and validation sets group-wise by row_id, so that all
# timeslices sharing a row_id end up on the same side of the split.
splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=7)
# Only the first generated split is used.
train_inds, val_inds = next(splitter.split(df_input, groups=df_input['row_id']))

train, validation = df_input.iloc[train_inds], df_input.iloc[val_inds]

In [None]:
# Fit the column transformer on the training split only.
# Every feature except stock_id is min-max scaled; stock_id is passed
# through unchanged (remainder='passthrough').
scaled_columns = [name for name in feature_columns if name != 'stock_id']
column_transformer = make_column_transformer(
    (MinMaxScaler(), scaled_columns),
    remainder='passthrough',
)
column_transformer.fit(train[feature_columns])


In [None]:
# Drop the merged frame to free memory; only the two splits are used below.
del df_input

In [None]:
# Reshape and transform the columns group-wise to get the shape needed by the
# LSTM: [batch, timesteps, feature].
# Pad to length 12, the max of seconds_in_bucket/50, to get equal sized sequences.
def _build_sequences(frame):
    """Return one padded (12, n_features) float32 array per row_id in ``frame``.

    The sequences are ordered by the sorted ``row_id`` group keys.
    """
    return np.array([
        keras.preprocessing.sequence.pad_sequences(
            # pad_sequences pads along the last axis, so each feature is
            # passed as one row and the result is transposed back.
            column_transformer.transform(group[feature_columns]).transpose(),
            maxlen=12,
            dtype='float32',
            value=0.0,
        ).transpose()
        for _, group in frame.groupby('row_id')
    ])


train_np = _build_sequences(train)
val_np = _build_sequences(validation)

# scale targets
# The targets MUST be grouped by the same key as the sequences above.
# Grouping by ['stock_id', 'time_id'] sorts numerically, while the row_id
# strings sort lexicographically ('0-1000' < '0-5'), which would pair the
# sequences with the wrong targets.
target_scaler = StandardScaler()
target_train = target_scaler.fit_transform(
    train.groupby('row_id')['target'].first().values.reshape(-1, 1)
).reshape(-1)
target_val = target_scaler.transform(
    validation.groupby('row_id')['target'].first().values.reshape(-1, 1)
).reshape(-1)

In [None]:
# The split frames are no longer needed once the arrays are built.
del train, validation

In [None]:
# some simple LSTM
# The architecture is mostly arbitrary; it has not been tuned for this problem.
learning_rate = 0.03

# Input shape is (timesteps, features), taken from the training array.
inputs_lstm = keras.layers.Input(shape=(train_np.shape[1], train_np.shape[2]))
# Mask the 0.0-padded timesteps. The shape is already fixed by the Input
# layer, so no input_shape argument is passed here.
masking = keras.layers.Masking(mask_value=0.0)(inputs_lstm)
lstm_1_out = keras.layers.LSTM(128, return_sequences=True)(masking)
lstm_2_out = keras.layers.LSTM(64, return_sequences=True)(lstm_1_out)
# The last LSTM returns only its final output, which feeds the regression head.
lstm_3_out = keras.layers.LSTM(10, activation='relu')(lstm_2_out)
outputs = keras.layers.Dense(1)(lstm_3_out)

model = keras.Model(inputs=inputs_lstm, outputs=outputs)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    # Mean squared error, taken from the losses namespace.
    loss=keras.losses.mean_squared_error,
)
model.summary()

In [None]:
# early stopping and fit function
def run_trainings_batch(dataset_train, target, val, epochs):
    """Fit the module-level ``model`` with early stopping on ``val_loss``.

    Args:
        dataset_train: training inputs passed to ``model.fit``.
        target: training targets passed to ``model.fit``.
        val: validation data passed as ``validation_data``.
        epochs: maximum number of epochs.

    Returns:
        The object returned by ``model.fit`` (the training history), so the
        caller can inspect the loss curves.
    """
    # Stop once val_loss has not improved for 5 consecutive epochs.
    es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5)

    history = model.fit(
        dataset_train,
        target,
        epochs=epochs,
        batch_size=1000,
        validation_data=val,
        callbacks=[es_callback],
    )
    return history

In [None]:
# Train for at most 100 epochs, validating on the held-out split.
run_trainings_batch(
    dataset_train=train_np,
    target=target_train,
    val=(val_np, target_val),
    epochs=100,
)

In [None]:
# Free the training and validation arrays before building the submission.
del train_np, target_train, val_np, target_val

# Submission

In [None]:
# Every entry matched directly below the test order-book directory.
list_order_book_file_test = glob.glob(
    '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'
)
# get prediction for every file
def get_predictions(list_file):
    """Predict the target for every row_id found in the given files.

    Relies on the module-level ``model``, ``column_transformer``,
    ``target_scaler`` and ``feature_columns``.

    Args:
        list_file: iterable of order-book parquet paths, processed one at a
            time.

    Returns:
        DataFrame with the columns ``row_id`` and ``target`` (on the original
        target scale). It is empty, with both columns present, when
        ``list_file`` is empty or no file yields any rows.
    """
    predictions = []
    for file in list_file:
        df_input_test = get_input_data(list_file=[file])
        df_input_test['row_id'] = df_input_test['stock_id'].astype(str) + '-' + df_input_test['time_id'].astype(str)

        # Collect the row ids in the same loop as the sequences so both are
        # in the identical (sorted group key) order. Taking the ids from
        # .unique() would use order of appearance and mislabel predictions.
        row_ids = []
        sequences = []
        for row_id, group in df_input_test.groupby('row_id'):
            row_ids.append(row_id)
            sequences.append(keras.preprocessing.sequence.pad_sequences(
                column_transformer.transform(group[feature_columns]).transpose(),
                maxlen=12,
                dtype='float32',
                value=0.0).transpose())
        if not sequences:
            # Nothing to predict for this file.
            continue

        df_pred_np = np.array(sequences)
        # The scaler expects a 2D (n_samples, 1) array, matching how it was fit.
        scaled_prediction = model.predict(df_pred_np).reshape(-1, 1)
        predictions.append(pd.DataFrame({
            'row_id': row_ids,
            'target': target_scaler.inverse_transform(scaled_prediction).reshape(-1),
        }))

    if not predictions:
        return pd.DataFrame(columns=['row_id', 'target'])
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(predictions, ignore_index=True)


In [None]:
# Predict on the test files and write the result without the index column.
submission = get_predictions(list_file=list_order_book_file_test)
submission.to_csv('submission.csv', index=False)