# In [None]:

import os
import tensorflow as tf
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings('ignore')

# Files and directories
# All paths are relative and point into the
# 'optiver-realized-volatility-prediction' input dataset.
trainFile        = '../input/optiver-realized-volatility-prediction/train.csv'
# Book folders: buildData lists their entries with os.listdir and reads each
# entry as one parquet dataset (entry names are split on '=' to get the stock id).
bookTrainFolder  = '../input/optiver-realized-volatility-prediction/book_train.parquet/'
bookTestFolder   = '../input/optiver-realized-volatility-prediction/book_test.parquet/'
# Trade folders: selected by buildData alongside the book folders, but no
# trade data is read anywhere in the code shown here.
tradeTrainFolder = '../input/optiver-realized-volatility-prediction/trade_train.parquet/'
tradeTestFolder  = '../input/optiver-realized-volatility-prediction/trade_test.parquet/'

# Load train.csv -- contains the target values
# (the 'stock_id', 'time_id' and 'target' columns are used further down)
train = pd.read_csv(trainFile)

def buildData(trainFolders = True):
    """Build the 10-second order-book feature table for every stock folder.

    Each entry of the selected book folder is read as one parquet dataset,
    expanded so that every time_id has a row for each second 0-599, and
    aggregated into 10-second windows. The per-stock results are computed in
    parallel with joblib and concatenated into a single frame.

    Parameters
    ----------
    trainFolders : bool, default True
        True reads from the train folders, False from the test folders.

    Returns
    -------
    pandas.DataFrame
        One row per (stock folder, time_id) with a 'row_id' column of the
        form '<stock_id>-<time_id>' and one
        '<feature>_<window index>_10s_wide' column per feature and window.
        Remaining missing values are replaced by 0.
    """

    if trainFolders:
        bookFolder  = bookTrainFolder
        tradeFolder = tradeTrainFolder
    else:
        bookFolder  = bookTestFolder
        tradeFolder = tradeTestFolder

    def buildDataForSingleStockId(stock_id_folder, stock_id_bookFolder, stock_id_tradeFolder):
        """Return the wide feature frame for one stock folder.

        stock_id_folder is the folder name (the text after '=' is the stock
        id), stock_id_bookFolder the directory containing it.
        stock_id_tradeFolder is accepted for symmetry but is not read.
        """
        secondsPerBucket = 600
        keyCols          = ['time_id', 'seconds_in_bucket']

        # Read in book data for curr stock_id
        bookData = pd.read_parquet(os.path.join(stock_id_bookFolder, stock_id_folder))

        # Full grid: every time_id present in bookData x every second 0-599,
        # in the same (time_id-major) order a nested loop would produce.
        uniqueTimeIds = bookData['time_id'].unique()
        allTimes = pd.DataFrame({
            'time_id'           : np.repeat(uniqueTimeIds, secondsPerBucket),
            'seconds_in_bucket' : np.tile(np.arange(secondsPerBucket), len(uniqueTimeIds)),
        })

        # make sure that all the seconds are accounted for
        bookData = bookData.merge(allTimes, on=keyCols, how='outer').sort_values(by=keyCols)

        # Forward fill, then back fill the leading gap, both strictly inside
        # each time_id so that no value is ever borrowed from another bucket.
        # GroupBy.ffill/bfill replace the deprecated fillna(method=...) form.
        valueCols = [col for col in bookData.columns if col not in keyCols]
        bookData[valueCols] = bookData.groupby('time_id', sort=False)[valueCols].ffill()
        bookData[valueCols] = bookData.groupby('time_id', sort=False)[valueCols].bfill()
        bookData = bookData.reset_index(drop=True)

        # calculate the weighted average price
        bookData['wap1'] = ((bookData['bid_price1'] * bookData['ask_size1'] + bookData['ask_price1'] * bookData['bid_size1'])
                            / (bookData['ask_size1'] + bookData['bid_size1']))
        bookData['wap2'] = ((bookData['bid_price2'] * bookData['ask_size2'] + bookData['ask_price2'] * bookData['bid_size2'])
                            / (bookData['ask_size2'] + bookData['bid_size2']))

        # Bid-Ask Spreads
        bookData['ask1_bid1_spread'] = bookData['ask_price1'] / bookData['bid_price1'] - 1

        def aggregateBookData(interval):
            """Aggregate bookData over windows of `interval` seconds.

            For each (time_id, window) this computes log(high/low) of wap1
            and wap2 and the mean level-1 spread, then pivots the windows
            into columns (one row per time_id).
            """
            windowIdx = (bookData['seconds_in_bucket'] // interval).rename('interval')
            grouped   = bookData.groupby(['time_id', windowIdx])

            # Vectorised max/min/mean instead of one Python lambda call per group.
            df_agg = pd.DataFrame({
                'wap1_log_high_low'    : np.log(grouped['wap1'].max() / grouped['wap1'].min()),
                'wap2_log_high_low'    : np.log(grouped['wap2'].max() / grouped['wap2'].min()),
                'ask1_bid1_spread_avg' : grouped['ask1_bid1_spread'].mean(),
            }).reset_index()

            df_wide = pd.pivot_table(df_agg, values=['wap1_log_high_low', 'wap2_log_high_low', 'ask1_bid1_spread_avg'],
                                     index='time_id', columns='interval').reset_index().fillna(0)
            # Flatten the (feature, window) column MultiIndex into 'feature_window'.
            df_wide.columns = ['_'.join(str(e) for e in col) for col in df_wide.columns]
            # The flattened time_id column is 'time_id_'; restore its name after suffixing.
            df_wide = df_wide.add_suffix(f'_{interval}s_wide').rename(columns={f'time_id__{interval}s_wide' : 'time_id'})

            return df_wide

        finalBookData = aggregateBookData(10)

        # add row_id
        finalBookData['row_id'] = stock_id_folder.split('=')[1] + '-' + finalBookData['time_id'].astype(str)

        return finalBookData.drop(columns='time_id').fillna(0)

    result = Parallel(n_jobs=-1, verbose=10)(
        delayed(buildDataForSingleStockId)(curr_stock_id_folder, bookFolder, tradeFolder)
        for curr_stock_id_folder in os.listdir(bookFolder)
    )

    # Columns missing for some stocks become NaN in the concat; zero them.
    return pd.concat(result, ignore_index=True).fillna(0)

# Aggregated book features for the train and the test folders
trainData_aggregations = buildData(trainFolders = True)
testData_aggregations  = buildData(trainFolders = False)

# row_id has the form '<stock_id>-<time_id>': recover stock_id ...
trainData_aggregations['stock_id'] = trainData_aggregations['row_id'].str.split('-').str[0].astype(int)
testData_aggregations['stock_id']  = testData_aggregations['row_id'].str.split('-').str[0].astype(int)

# ... and time_id as integer columns
trainData_aggregations['time_id'] = trainData_aggregations['row_id'].str.split('-').str[1].astype(int)
testData_aggregations['time_id']  = testData_aggregations['row_id'].str.split('-').str[1].astype(int)

# Attach the targets from train.csv to the aggregated train features via row_id
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
trainData       = train[['row_id', 'target']].merge(trainData_aggregations, on='row_id', how='inner').fillna(0)


# The network is trained on the wide (per-window) versions of
#    'wap1_log_high_low'
#    'wap2_log_high_low'
#    'ask1_bid1_spread_avg'
# i.e. every column whose name contains 's_wide'.

s_wide_vars         = [col for col in trainData.columns if 's_wide' in col]
trainData_np_s_wide = trainData.loc[:, s_wide_vars].to_numpy()
testData_np_s_wide  = testData_aggregations.loc[:, s_wide_vars].to_numpy()

# stock ids feed the embedding input of the model
trainData_np_stock_id = trainData['stock_id'].to_numpy()
testData_np_stock_id  = testData_aggregations['stock_id'].to_numpy()


trainTarget_np = trainData['target'].to_numpy()
def rmspe_tf(y_true, y_pred):
    """Root mean squared percentage error, used as Keras loss and metric.

    Computes sqrt(mean(((y_true - y_pred) / y_true) ** 2)) with TensorFlow
    ops. The error is taken relative to y_true, so targets equal to zero
    produce a division by zero.
    """
    relativeError    = (y_true - y_pred) / y_true
    meanSquaredError = tf.reduce_mean(tf.square(relativeError))
    return tf.sqrt(meanSquaredError)

def buildModel(input_shape):
    """Return a compiled feed-forward network on the s_wide features only.

    Structure: input of shape `input_shape`, one hidden ReLU layer with
    input_shape[0] // 3 units, and a single ReLU output unit named
    's_wide_features'. Compiled with the rmspe_tf loss/metric and Adam
    at a learning rate of 1e-5.
    """
    hiddenUnits = input_shape[0] // 3

    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(input_shape),
        tf.keras.layers.Dense(hiddenUnits, activation = 'relu'),
        tf.keras.layers.Dense(1, activation = 'relu', name='s_wide_features'),
    ])

    adam = tf.keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(loss = rmspe_tf, optimizer = adam, metrics = [rmspe_tf])

    return model

# embedding from https://www.kaggle.com/tatudoug/stock-embedding-ffnn-features-of-the-best-lgbm/notebook
# but there are several other notebooks that do the same thing
# Network structure

# Largest stock_id in the training data. buildModelWithStockEmdedding sizes
# its embedding table as maxStockId + 1 so every training stock_id is a valid
# index; a test stock_id above this value would fall outside the table.
maxStockId = trainData['stock_id'].max()

def buildModelWithStockEmdedding(input_shape):
    """Return a compiled two-input network: stock_id embedding + s_wide features.

    Inputs (in this order): 'stock_id' of shape (1,) and 's_wide_input' of
    shape `input_shape`. The stock id goes through a 2-dimensional embedding
    with maxStockId + 1 rows; the features go through a ReLU layer with
    input_shape[0] // 3 units. Both branches are concatenated, passed through
    a 10-unit ReLU layer and a single ReLU output unit named
    's_wide_features'. Compiled with the rmspe_tf loss/metric and Adam at a
    learning rate of 1e-5.
    """
    layers = tf.keras.layers

    # stock_id branch: integer id -> 2-d embedding -> flat vector
    idInput      = layers.Input(shape=(1,), name='stock_id')
    idEmbedded   = layers.Embedding(input_dim = maxStockId + 1, output_dim=2, input_length=1, name='stock_embedding')(idInput)
    idBranch     = layers.Flatten()(idEmbedded)

    # s_wide branch: one hidden layer a third as wide as the input
    featureInput  = layers.Input(shape=input_shape, name='s_wide_input')
    featureBranch = layers.Dense(input_shape[0] // 3, activation = 'relu')(featureInput)

    # join both branches and regress to a single value
    joined = layers.Concatenate()([idBranch, featureBranch])
    hidden = layers.Dense(10, activation = 'relu')(joined)
    output = layers.Dense(1, activation = 'relu', name='s_wide_features')(hidden)

    model = tf.keras.Model(inputs = [idInput, featureInput], outputs = output)

    adam = tf.keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(loss = rmspe_tf, optimizer = adam, metrics = [rmspe_tf])

    return model

# k-fold cross-validation: one model is trained per fold and the test
# prediction is the average of the k fold models.
num_epochs      = 200
batchSize       = 1024
k               = 5
num_val_samples = len(trainData_np_s_wide) // k

# Fixed permutation of the training rows, cut into k validation slices
np.random.seed(9999)
shuffledIndx = np.random.permutation(len(trainData_np_s_wide))

testDataRowId   = testData_aggregations['row_id']
finalPrediction = np.zeros(len(testDataRowId))

# Re-seed numpy and seed TensorFlow before any model is built
np.random.seed(9999)
tf.random.set_seed(9999)

for i in range(k):
    print(f'Processing fold #{i}')

    # Row indices of this fold's validation slice and of everything else
    # (rows before the slice followed by rows after it).
    foldStart = i * num_val_samples
    foldEnd   = foldStart + num_val_samples
    valIndx   = shuffledIndx[foldStart:foldEnd]
    trainIndx = np.concatenate([shuffledIndx[:foldStart], shuffledIndx[foldEnd:]], axis=0)

    # Model inputs are ordered [stock_id, s_wide features]
    val_data              = [trainData_np_stock_id[valIndx], trainData_np_s_wide[valIndx]]
    val_targets           = trainTarget_np[valIndx]
    partial_train_data    = [trainData_np_stock_id[trainIndx], trainData_np_s_wide[trainIndx]]
    partial_train_targets = trainTarget_np[trainIndx]

    model = buildModelWithStockEmdedding(trainData_np_s_wide.shape[1:])
    history = model.fit(x               = partial_train_data,
                        y               = partial_train_targets,
                        validation_data = (val_data, val_targets),
                        epochs          = num_epochs,
                        batch_size      = batchSize,
                        verbose         = 1)

    # Each fold contributes 1/k of the final test prediction
    foldPrediction   = model.predict([testData_np_stock_id, testData_np_s_wide]).reshape(-1)
    finalPrediction += foldPrediction / k

# write submission.csv
submission = pd.DataFrame({'row_id': testDataRowId, 'target': finalPrediction})
submission.to_csv('submission.csv', index=False)
