In [None]:
import os
import numpy as np 
import pandas as pd 
from scipy.signal import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.layers import Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import  Bidirectional, Layer, Concatenate, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.metrics import RootMeanSquaredError

In [None]:
# Make float64 the default Keras float type for layers and weights created below.
K.set_floatx('float64')

In [None]:
# Root directory of the competition input files.
main_dir = '../input/optiver-realized-volatility-prediction'

# Path template for the per-set CSV table; fill the placeholder with 'train' or 'test'.
info = os.path.join(main_dir, '{}.csv') #.format(train/test)

# Path template for one stock's partition of a parquet dataset.
# Placeholders, in order: data kind, set type, stock id.
data = os.path.join(main_dir, '{}_{}.parquet', 'stock_id={}') #.format(book/trade, train/test, stock_id)

In [None]:
# Training table; its stock_id, time_id and target columns are used below.
train = pd.read_csv(info.format('train'))

In [None]:
# Distinct stock and time-bucket identifiers present in the training table.
# len(stock_ids) also sizes the stock embedding in lstm_model.
stock_ids = train['stock_id'].unique()
time_ids = train['time_id'].unique()

In [None]:
# Fixed length every per-(stock, time) series is resampled to in get_series;
# also the sequence length of the model's feature input.
max_feature_series_len = 128

In [None]:
def log_returns(stock_prices):
    """Return the first difference of the natural log of ``stock_prices``.

    ``stock_prices`` must be an object whose log supports ``.diff()``
    (e.g. a pandas Series); the first element of the result is NaN.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

def row_id(stock_id, time_id):
    """Build the submission key ``'<stock_id>-<time_id>'`` with both parts cast to int."""
    return '-'.join(str(int(part)) for part in (stock_id, time_id))

def get_series(groupby_obj, n_points=None):
    """Convert one group's WAP values into a fixed-length squared-log-return series.

    Parameters
    ----------
    groupby_obj : iterable of float
        WAP values of a single group, in row order (this is what
        ``groupby(...)['wap'].apply`` passes in).
    n_points : int, optional
        Length of the returned series. Defaults to the notebook-level
        ``max_feature_series_len``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_points`` holding the squared log returns
        resampled with ``scipy.signal.resample``. All zeros when the group has
        fewer than two values, because no return can be computed from it.
    """
    if n_points is None:
        n_points = max_feature_series_len

    wap_series = np.asarray(list(groupby_obj), dtype=float)

    # With 0 or 1 prices the diff is empty and resample() raises on empty
    # input, so return a flat zero series instead.
    if wap_series.size < 2:
        return np.zeros(n_points)

    log_squared_series = np.diff(np.log(wap_series)) ** 2
    return resample(log_squared_series, n_points)
    
def get_stock_wise_dict(set_type, stock_id):
    """Load one stock's book data and build its feature series per ``time_id``.

    Reads the parquet partition addressed by the ``data`` path template for
    the 'book' kind, computes the level-1 weighted average price (WAP) for
    every row, and applies ``get_series`` to the WAP values of each
    ``time_id`` group.

    Returns a dict mapping ``time_id`` to the array returned by ``get_series``.
    """
    book_path = data.format('book', set_type, stock_id)
    book = pd.read_parquet(book_path)

    # WAP: each side's price is weighted by the size quoted on the opposite side.
    weighted_price = book['bid_price1']*book['ask_size1'] + book['ask_price1']*book['bid_size1']
    total_size = book['ask_size1'] + book['bid_size1']
    book['wap'] = weighted_price / total_size

    per_time_series = book.groupby('time_id')['wap'].apply(get_series)
    return per_time_series.to_dict()

def get_feature_dict(set_type, stock_ids):
    """Map every id in ``stock_ids`` to its ``get_stock_wise_dict`` result.

    The returned dict is keyed by stock id; each value is the per-``time_id``
    dict produced for that stock and ``set_type``.
    """
    return {
        stock_id: get_stock_wise_dict(set_type, stock_id)
        for stock_id in stock_ids
    }

In [None]:
# Nested lookup {stock_id: {time_id: series}} built from the training book data.
train_feature_dict = get_feature_dict('train', stock_ids)

In [None]:
def get_X_array(df, feature_dict, series_len=None):
    """Stack the per-row feature series of ``df`` into one model input array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id`` and ``time_id`` columns.
    feature_dict : dict
        Nested mapping ``{stock_id: {time_id: 1-D series}}``.
    series_len : int, optional
        Length of every series. Defaults to the notebook-level
        ``max_feature_series_len``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), series_len, 1)``, rows in the order of ``df``.

    Raises
    ------
    KeyError
        If a (stock_id, time_id) pair of ``df`` is missing from ``feature_dict``.
    """
    if series_len is None:
        series_len = max_feature_series_len

    n_samples = len(df)
    X = np.empty((n_samples, series_len))

    # Extract the key columns once; df.iloc[i][col] would build a whole row
    # Series for every single lookup.
    stock_col = df['stock_id'].to_numpy()
    time_col = df['time_id'].to_numpy()

    for i, (stock_id, time_id) in enumerate(zip(stock_col, time_col)):
        X[i] = feature_dict[stock_id][time_id]

    # Add the trailing channel axis.
    return np.expand_dims(X, axis=-1)

In [None]:
# Feature array for all training rows, in the row order of `train`.
X = get_X_array(train, train_feature_dict)

In [None]:
# Targets as a column vector of shape (n_samples, 1).
y = train['target'].values.reshape(-1,1)

In [None]:
# Encode raw stock ids as ordinal codes for the Embedding layer.
# The fitted encoder is reused on the test set later.
enc = OrdinalEncoder()
train_stock_ids = enc.fit_transform(train['stock_id'].values.reshape(-1,1))

In [None]:
# Hold out a fraction of the rows for validation. Features, targets and stock
# codes are split together so their rows stay aligned; the seed is fixed.
test_size = 0.2
X_train, X_valid, y_train, y_valid, X_train_stock, X_valid_stock = train_test_split(X, y, train_stock_ids, test_size=test_size, random_state=35465)

In [None]:
# Multipliers applied to the inputs and targets passed to fit();
# predictions are divided by y_scaling again at inference time.
x_scaling = 1e9
y_scaling = 1e3

In [None]:
class attention(Layer):
    """Additive attention pooling over axis 1 of a 3-D input.

    Every step along axis 1 is scored through a tanh projection (``W``, ``b``)
    followed by a projection to a single value (``U``). The scores are
    normalised with a softmax across axis 1 and used to weight the input
    before summing over that axis, so ``(batch, steps, features)`` becomes
    ``(batch, features)``.

    Parameters
    ----------
    units : int, optional
        Width of the hidden scoring projection. Defaults to the notebook-level
        ``attention_size`` at construction time; the value is stored in the
        layer config so a saved model reloads with the same width.
    """

    def __init__(self, units=None, **kwargs):
        super(attention, self).__init__(**kwargs)
        self.units = attention_size if units is None else units

    def build(self, input_shape):
        self.W = self.add_weight(name="W", shape=(input_shape[-1], self.units))
        # shape must be a 1-tuple; a bare "(n)" is just the integer n.
        self.b = self.add_weight(name="b", shape=(self.units,), initializer="zeros")
        self.U = self.add_weight(name="U", shape=(self.units, 1))
        super(attention, self).build(input_shape)

    def call(self, x):
        v = K.tanh(K.dot(x, self.W) + self.b)
        # The scores have shape (batch, steps, 1). Softmax must run across the
        # steps (axis=1); on the default last axis, which has size 1, every
        # weight would be exactly 1.0 and the layer would be a plain sum.
        z = K.softmax(K.dot(v, self.U), axis=1)
        return K.sum(x * z, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = super(attention, self).get_config()
        config["units"] = self.units
        return config

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error, computed with Keras backend ops.

    The error of each prediction is taken relative to ``y_true``, squared,
    averaged over all elements, and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    return K.sqrt(K.mean(K.square(relative_error)))

In [None]:
# Model hyperparameters.
# Units per direction of the bidirectional LSTM.
LSTM_units = 96
# Width of the hidden projection inside the attention layer.
attention_size= 4
# Dense layer applied to the attention output.
hidden_units = 128
# Output dimension of the stock-id embedding.
stock_embedding_size = 32

# Widths of the dense layers applied after the two branches are concatenated.
hidden_units2 = [128, 64]

# Training hyperparameters.
initial_learning_rate = 1e-3
batch_size = 4096
epochs = 100

In [None]:
# Optimizer instance handed to model.compile() inside lstm_model.
optimizer = Adam(learning_rate=initial_learning_rate)

In [None]:
# Multiply the learning rate by 0.2 when val_loss has not improved for 7 epochs.
plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, verbose=0, mode='min')

In [None]:
def lstm_model():
    """Build and compile the two-input model.

    Inputs, in order:
      1. the stock code, shape ``(1,)``, fed to an embedding;
      2. the feature series, shape ``(max_feature_series_len, 1)``, fed to a
         bidirectional LSTM followed by attention pooling.

    Both branches are concatenated, batch-normalised and passed through the
    dense stack given by ``hidden_units2`` down to a single swish-activated
    output. The model is compiled with the notebook-level ``optimizer`` and
    uses ``rmspe`` as both loss and metric.

    Returns
    -------
    tensorflow.keras.Model
        The compiled model; its summary is printed as a side effect.
    """
    stock_inp = Input(shape=(1,))
    feature_inp = Input(shape=(max_feature_series_len, 1))

    # Sequence branch: BiLSTM -> attention pooling -> dense projection.
    x = Bidirectional(LSTM(LSTM_units, return_sequences=True))(feature_inp)
    x = attention()(x)
    x = Dense(hidden_units)(x)
    x = Activation('swish')(x)

    # Stock branch: embedding lookup flattened to a vector.
    e = Embedding(len(stock_ids), stock_embedding_size)(stock_inp)
    e = Flatten()(e)

    x = Concatenate()([e, x])
    x = BatchNormalization()(x)

    for h in hidden_units2:
        x = Dense(h)(x)
        x = Activation('swish')(x)

    x = Dense(1)(x)
    x = Activation('swish')(x)

    model = Model([stock_inp, feature_inp], x)

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line.
    model.summary()

    model.compile(optimizer=optimizer, metrics=[rmspe], loss=rmspe)

    return model

In [None]:
# Build and compile the network (also prints its summary).
model = lstm_model()

In [None]:
# Save the full model to model.hdf5 whenever val_loss reaches a new minimum.
checkpoint = ModelCheckpoint('model.hdf5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='min')

In [None]:
# Train on the scaled inputs/targets; the input order [stock code, feature series]
# matches the Model definition. `r` holds the object returned by fit().
r = model.fit(
    [X_train_stock, X_train*x_scaling], y_train*y_scaling, epochs=epochs, callbacks=[checkpoint, plateau], 
    batch_size=batch_size, validation_data=([X_valid_stock, X_valid*x_scaling], y_valid*y_scaling), verbose=1)

In [None]:
# Replace the in-memory model with the checkpoint saved at the best val_loss.
# The custom layer must be passed explicitly; compile=False skips restoring
# the optimizer/loss (presumably because only inference follows).
model = load_model('model.hdf5', custom_objects={'attention': attention}, compile=False)

In [None]:
# Drop the references to the training feature arrays, which are not used again below.
del X
del X_train
del X_valid

In [None]:
# Test table, read with the same path template as the training table.
test = pd.read_csv(info.format('test'))

In [None]:
# A three-row test table is reduced to its first row.
# NOTE(review): presumably this targets the public sample test.csv, where only
# the first row has matching book data — confirm.
if len(test) == 3:
    # iloc[:1] keeps a one-row DataFrame with the original column dtypes;
    # pd.DataFrame(test.iloc[0]).T would cast every column to object.
    # copy() makes the later column assignments act on an independent frame.
    test = test.iloc[:1].copy()

In [None]:
# Distinct stock ids that need book features for the test rows.
test_stock_ids = test['stock_id'].unique()

In [None]:
# Nested lookup {stock_id: {time_id: series}} built from the test book data.
test_wap_dict = get_feature_dict('test', test_stock_ids)

In [None]:
# Feature array for the test rows, in the row order of `test`.
X_test = get_X_array(test, test_wap_dict)

In [None]:
# Set row_id to '<stock_id>-<time_id>' for every test row.
test['row_id'] = test.apply(lambda x: row_id(x.stock_id, x.time_id), axis=1)

In [None]:
# Encode test stock ids with the encoder fitted on the training ids.
X_test_stock = enc.transform(test['stock_id'].values.reshape(-1,1))

In [None]:
# Predict on inputs scaled as in training, then divide by y_scaling to undo the target scaling.
test['target'] = model.predict([X_test_stock, X_test*x_scaling])/y_scaling

In [None]:
# Keep only the two columns written to the submission file.
submission = test[['row_id', 'target']]

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv('submission.csv', index=False)