In [None]:
import os
import glob

import pickle as pk
from sklearn.decomposition import IncrementalPCA

import numpy as np 
import pandas as pd 

import tensorflow as tf
import keras.backend as K

from multiprocessing import cpu_count, Pool

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm.auto import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Width of the compressed representation; used as the last dimension of the
# arrays allocated by process_books_ds / process_trades_ds.
PCA_FEATURES = 50

# Number of most recent records kept per (stock_id, time_id) window; this is
# the default ``n_last_rec`` of the processing functions.
PROCESS_NREC = 600
MAX_SERIES_LEN = PROCESS_NREC

In [None]:
def get_path_dict(f, v):
    """Map every id in ``v`` to one parquet file under ``<f>/stock_id=<id>``.

    Args:
        f: Root directory that contains the ``stock_id=<id>`` sub-directories.
        v: Iterable of ids to look up.

    Returns:
        dict ``{id: path}`` holding the first ``*.parquet`` match returned by
        ``glob`` for each id. Ids without any matching file are omitted.
    """
    found = {}
    for stock_id in tqdm(v):
        pattern = os.path.join(f'{f}/stock_id={stock_id}', '*.parquet')
        matches = glob.glob(pattern)
        if matches:
            found[stock_id] = matches[0]
    return found

In [None]:
# Load the train and test index tables of the competition.
train_ds = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test_ds = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Shapes are printed before 'row_id' is added, so the train shape shown here
# does not include that extra column.
print(f'Train ds shape: {train_ds.shape}')
print(f'Test ds shape: {test_ds.shape}')

# Build a '<stock_id>-<time_id>' key for train. test_ds['row_id'] is read
# later when the submission is written, so test.csv is expected to already
# contain that column.
train_ds['row_id'] = train_ds['stock_id'].astype(str) + '-' + train_ds['time_id'].astype(str)

In [None]:
# Preview the first three training rows.
train_ds.head(3)

In [None]:
# Locate the per-stock parquet file of every stock that appears in the test set.
book_test_dict = get_path_dict(
    f='../input/optiver-realized-volatility-prediction/book_test.parquet',
    v=test_ds['stock_id'].unique(),
)
trade_test_dict = get_path_dict(
    f='../input/optiver-realized-volatility-prediction/trade_test.parquet',
    v=test_ds['stock_id'].unique(),
)

In [None]:
# Load the standardisation vectors (mean / std) and the fitted PCA transformers
# that process_books_ds / process_trades_ds apply to every window.
# NOTE(review): the mean/std vectors are read from 'optiver-train-pca-v1' while
# the PCA objects (and the train tensors loaded later) come from
# 'optiver-train-pca-v2' - confirm that the two versions are compatible.
np_book_mean_v = np.load('../input/optiver-train-pca-v1/np_train_books_mean_v.npy')
np_book_std_v = np.load('../input/optiver-train-pca-v1/np_train_books_std_v.npy')

np_trade_mean_v = np.load('../input/optiver-train-pca-v1/np_train_trades_mean_v.npy')
np_trade_std_v = np.load('../input/optiver-train-pca-v1/np_train_trades_std_v.npy')

# SECURITY: pickle.load can execute arbitrary code; only load files from a
# trusted source. The files are opened with a context manager so the handles
# are closed right after loading instead of being leaked.
with open("../input/optiver-train-pca-v2/pca_book_tranformer.pkl", 'rb') as pca_file:
    pca_book = pk.load(pca_file)
with open("../input/optiver-train-pca-v2/pca_trade_tranformer.pkl", 'rb') as pca_file:
    pca_trade = pk.load(pca_file)

In [None]:
# Thank you --> https://www.kaggle.com/alexioslyon/stock-embedding-ffnn-my-features
#
def calc_wap1(df):
    """Return the weighted average price of the first order-book level.

    Each price is weighted by the size quoted on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(df):
    """Return the weighted average price of the second order-book level.

    Each price is weighted by the size quoted on the opposite side:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    squared = series ** 2
    return np.sqrt(np.sum(squared))

def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.size

def book_ds_fe(df):
    """Add engineered order-book features to ``df`` in place and return it.

    Args:
        df: Order-book frame with a ``time_id`` column and the level-1 / level-2
            ``bid_price*``, ``ask_price*``, ``bid_size*`` and ``ask_size*`` columns.

    Returns:
        The same ``df`` object with twelve extra columns appended. The order in
        which the columns are created is kept stable on purpose:
        process_books_ds derives its feature order from ``df.columns``.
    """
    # Weighted average prices of both book levels
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)

    # Log returns computed separately inside every time_id bucket.
    # transform() always returns a result aligned with df's own index, whereas
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)

    # Absolute gap between the two weighted average prices
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])

    # Relative bid/ask spread of each level (spread divided by the mid price)
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)

    # Price distance between level 1 and level 2 on each side of the book
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])

    # Quoted volume over both levels and its absolute ask/bid imbalance
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    return df

def trade_ds_fe(df):
    """Add the per-``time_id`` log return of ``price`` to ``df`` in place.

    Args:
        df: Trade frame with ``time_id`` and ``price`` columns.

    Returns:
        The same ``df`` object with a new ``log_return1`` column.
    """
    # transform() always returns a result aligned with df's own index, whereas
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    df['log_return1'] = df.groupby(['time_id'])['price'].transform(log_return)

    return df

def process_trades_ds(ds, trade_dict, trade_skip_columns, n_last_rec=PROCESS_NREC):
    """Build the PCA-compressed trade tensor for every row of ``ds``.

    For each row the trade records of its ``stock_id`` / ``time_id`` are taken
    from the stock's parquet file (after ``trade_ds_fe``), cut to the last
    ``n_last_rec`` records and left-padded with zeros if shorter, standardised
    with ``np_trade_mean_v`` / ``np_trade_std_v`` and projected with
    ``pca_trade``.

    Args:
        ds: Frame with ``stock_id`` and ``time_id`` columns.
        trade_dict: Mapping ``stock_id -> parquet path``.
        trade_skip_columns: Column names that are not used as features.
        n_last_rec: Number of most recent records kept per window.

    Returns:
        float16 array of shape ``(len(ds), 4, PCA_FEATURES)``. Entry ``i``
        always belongs to the i-th row of ``ds``; rows whose stock has no
        trade file stay all-zero.
    """
    # Must equal the number of columns left after removing trade_skip_columns,
    # otherwise the assignment into ``rout`` below fails.
    num_features = 4

    rout = np.zeros((len(ds), num_features, PCA_FEATURES), dtype=np.float16)

    # Consecutive rows of the same stock reuse the already loaded frame.
    cache_stock_id = -1
    cache_ds = None
    trade_columns = None

    # The output is addressed by the row's position in ``ds``. A separate
    # counter that is only advanced for processed rows would shift every row
    # after a skipped one and break the alignment with ``ds``.
    for pos, (_, row) in enumerate(tqdm(ds.iterrows(), total=len(ds))):

        stock_id = int(row['stock_id'])
        time_id = row['time_id']

        if cache_stock_id == stock_id:
            trade_ds = cache_ds
        else:
            trade_ds = None
            fname = trade_dict.get(stock_id)
            if fname is not None:
                # Load the stock's trades and add the engineered features
                trade_ds = trade_ds_fe(pd.read_parquet(fname))

                cache_stock_id = stock_id
                cache_ds = trade_ds

        if trade_ds is None:
            # No trade file for this stock: keep the zero block at ``pos``.
            continue

        if trade_columns is None:
            trade_columns = [c for c in trade_ds.columns if c not in trade_skip_columns]

        sds = trade_ds[trade_ds['time_id'] == time_id]

        if len(sds) != 0:
            # (features, records) layout, most recent n_last_rec records only
            a = sds[trade_columns][-n_last_rec:].to_numpy(dtype=np.float16).transpose()
            if a.shape[1] < n_last_rec:
                # Left-pad so the latest record always sits in the last column
                a = np.pad(a, ((0, 0), (n_last_rec - a.shape[1], 0)), mode='constant')
        else:
            # No trades in this time bucket: feed an all-zero window
            a = np.zeros((len(trade_columns), n_last_rec), dtype=np.float16)

        a = np.nan_to_num(a)
        # The mean/std vectors are applied per feature row (broadcast over records)
        a = ((a - np.expand_dims(np_trade_mean_v, axis=1)) / np.expand_dims(np_trade_std_v, axis=1))
        a = pca_trade.transform(a)

        rout[pos, :] = np.nan_to_num(a)

    return rout

def process_books_ds(ds, book_dict, book_skip_columns, n_last_rec=PROCESS_NREC):
    """Build the PCA-compressed order-book tensor for every row of ``ds``.

    For each row the book records of its ``stock_id`` / ``time_id`` are taken
    from the stock's parquet file (after ``book_ds_fe``), cut to the last
    ``n_last_rec`` records and left-padded with zeros if shorter, standardised
    with ``np_book_mean_v`` / ``np_book_std_v`` and projected with
    ``pca_book``.

    Args:
        ds: Frame with ``stock_id`` and ``time_id`` columns.
        book_dict: Mapping ``stock_id -> parquet path``.
        book_skip_columns: Column names that are not used as features.
        n_last_rec: Number of most recent records kept per window.

    Returns:
        float16 array of shape ``(len(ds), 20, PCA_FEATURES)``. Entry ``i``
        always belongs to the i-th row of ``ds``; rows whose stock has no
        book file stay all-zero.
    """
    # Must equal the number of columns left after removing book_skip_columns,
    # otherwise the assignment into ``rout`` below fails.
    num_features = 20

    rout = np.zeros((len(ds), num_features, PCA_FEATURES), dtype=np.float16)

    # Consecutive rows of the same stock reuse the already loaded frame.
    cache_stock_id = -1
    cache_ds = None
    book_columns = None

    # The output is addressed by the row's position in ``ds``. A separate
    # counter that is only advanced for processed rows would shift every row
    # after a skipped one and break the alignment with ``ds``.
    for pos, (_, row) in enumerate(tqdm(ds.iterrows(), total=len(ds))):

        stock_id = int(row['stock_id'])
        time_id = row['time_id']

        if cache_stock_id == stock_id:
            book_ds = cache_ds
        else:
            book_ds = None
            fname = book_dict.get(stock_id)
            if fname is not None:
                # Load the stock's book and add the engineered features
                book_ds = book_ds_fe(pd.read_parquet(fname))

                cache_stock_id = stock_id
                cache_ds = book_ds

        if book_ds is None:
            # No book file for this stock: keep the zero block at ``pos``.
            continue

        if book_columns is None:
            book_columns = [c for c in book_ds.columns if c not in book_skip_columns]

        sds = book_ds[book_ds['time_id'] == time_id]

        if len(sds) != 0:
            # (features, records) layout, most recent n_last_rec records only
            a = sds[book_columns][-n_last_rec:].to_numpy(dtype=np.float16).transpose()
            if a.shape[1] < n_last_rec:
                # Left-pad so the latest record always sits in the last column
                a = np.pad(a, ((0, 0), (n_last_rec - a.shape[1], 0)), mode='constant')
        else:
            # No book updates in this time bucket: feed an all-zero window
            a = np.zeros((len(book_columns), n_last_rec), dtype=np.float16)

        a = np.nan_to_num(a)
        # The mean/std vectors are applied per feature row (broadcast over records)
        a = ((a - np.expand_dims(np_book_mean_v, axis=1)) / np.expand_dims(np_book_std_v, axis=1))
        a = pca_book.transform(a)

        rout[pos, :] = np.nan_to_num(a)

    return rout

In [None]:
# TODO: make this parallel
# Both names intentionally refer to the same list object.
book_skip_columns = ['time_id', 'seconds_in_bucket']
trade_skip_columns = book_skip_columns

np_test_books = process_books_ds(
    ds=test_ds,
    book_dict=book_test_dict,
    book_skip_columns=book_skip_columns,
    n_last_rec=PROCESS_NREC,
)
np_test_trades = process_trades_ds(
    ds=test_ds,
    trade_dict=trade_test_dict,
    trade_skip_columns=trade_skip_columns,
    n_last_rec=PROCESS_NREC,
)

In [None]:
# Show the shapes of the freshly built test tensors (books first, then trades).
print(np_test_books.shape)
print(np_test_trades.shape)

In [None]:
# Load the precomputed train tensors for books and trades.
# NOTE(review): presumably produced by the same standardise + PCA pipeline as
# the test tensors, with rows in train_ds order - confirm against the dataset.
np_train_books = np.load('../input/optiver-train-pca-v2/np_train_books_pca.npy')
np_train_trades = np.load('../input/optiver-train-pca-v2/np_train_trades_pca.npy')

In [None]:
# Compare train and test tensor shapes side by side before they are reshaped
# and concatenated.
print(f'Train books shape: {np_train_books.shape}')
print(f'Train trades shape: {np_train_trades.shape}')
print(f'Test books shape: {np_test_books.shape}')
print(f'Test trades shape: {np_test_trades.shape}')

In [None]:
# Swap the last two axes of every tensor.
# NOTE: the names are rebound in place, so running this cell a second time
# swaps the axes back again.
np_train_books = np_train_books.transpose(0, 2, 1)
np_train_trades = np_train_trades.transpose(0, 2, 1)
np_test_books = np_test_books.transpose(0, 2, 1)
np_test_trades = np_test_trades.transpose(0, 2, 1)

print(np_train_books.shape)
print(np_train_trades.shape)

In [None]:
# Join book and trade tensors along the last axis (axis 2), so the first two
# axes of both inputs have to match.
np_train = np.concatenate([np_train_books, np_train_trades], axis=2)
np_test = np.concatenate([np_test_books, np_test_trades], axis=2)
#np_train = np.concatenate((np_train_books, np_train_trades), axis=1)
#np_test = np.concatenate((np_test_books, np_test_trades), axis=1)
print(np_train.shape)
print(np_test.shape)

In [None]:
# Hold out 10% of the training windows for validation (fixed seed).
# The split pairs np_train with train_ds['target'] by position.
x_train, x_valid, y_train, y_valid = train_test_split(
    np_train,
    train_ds['target'],
    random_state=42,
    test_size=0.1,
)
x_test = np_test

In [None]:
def rmspe(y_true, y_pred):
    """Loss: square root of the mean squared relative error.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))`` with the Keras
    backend. The error is divided by ``y_true``, so targets equal to zero
    produce a division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    return K.sqrt(K.mean(K.square(relative_error)))

In [None]:
def get_model_v1():
    """Build and compile the two-layer LSTM regressor.

    Architecture: LSTM(30, returning sequences) -> LSTM(30) -> Dense(64) ->
    Dense(1), compiled with the ``rmspe`` loss and the Adam optimizer. The
    model summary is printed as a side effect.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # Take (timesteps, features) from the training tensor itself. The previous
    # hard-coded (1, features) declared a single timestep, which does not match
    # tensors that carry x_train.shape[1] steps per sample.
    input_shape = tuple(x_train.shape[1:])

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.LSTM(30, input_shape=input_shape, return_sequences=True))
    # Only the first layer needs an input shape; later layers infer theirs.
    model.add(tf.keras.layers.LSTM(30, return_sequences=False))
    model.add(tf.keras.layers.Dense(64))
    model.add(tf.keras.layers.Dense(1))

    model.compile(loss=rmspe, optimizer='adam')
    model.summary()
    return model

In [None]:
# TODO: Attention
K.clear_session()

model = get_model_v1()

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint_filepath = './best_weights/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Stop once the validation loss has not improved for 20 epochs.
model_earlystopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=20,
    monitor='val_loss',
)

history = model.fit(
    x_train,
    y_train.values,
    batch_size=256,
    epochs=100,
    validation_data=(x_valid, y_valid.values),
    callbacks=[model_checkpoint_callback, model_earlystopping_callback],
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch. The second curve is the held-out
# validation split (val_loss), not the competition test set, so it is
# labelled accordingly; labels are attached to the curves themselves so they
# cannot get out of order.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Lowest validation loss recorded over all epochs that were run.
a = np.min(history.history['val_loss'])
print(f'The best val_loss is {a:.3f}')

In [None]:
# Sanity check: compare predictions with targets on a few validation rows.
rmin = 50
rmax = 55

stest = x_valid[rmin:rmax]

# Score with the best checkpointed weights rather than the last-epoch weights.
model.load_weights(checkpoint_filepath)
res = model.predict(stest, batch_size=32)

# Columns: prediction, target, prediction - target
a = np.zeros((rmax-rmin, 3), dtype=np.float16)
# res has one row per sample; take its single output column. ``res[0]`` would
# broadcast the first sample's prediction to every row.
a[:, 0] = res[:, 0]
a[:, 1] = y_valid.iloc[rmin:rmax].values
a[:, 2] = a[:, 0] - a[:, 1]
print(a)

In [None]:
# Restore the checkpointed weights (saved with save_best_only on val_loss)
# and predict on the test tensor.
model.load_weights(checkpoint_filepath)
res = model.predict(x_test, batch_size=256)
#res = np.clip(res, 0, 1)

In [None]:
# Pair every test row_id with its prediction and write the submission file.
# The model output is flattened to one value per row.
submission_ds = pd.DataFrame({
    'row_id': test_ds['row_id'],
    'target': np.ravel(res),
})
submission_ds.to_csv('submission.csv', index=False)

In [None]:
# Preview the first three submission rows.
submission_ds.head(3)