In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree. The per-file print is commented out, so this
# loop currently only traverses the directory and produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from IPython.core.magic import register_cell_magic
@register_cell_magic
def skip(line, cell=None):
    '''Cell magic ``%%skip <expr>``: run the cell body only when ``<expr>`` is falsy.

    ``line`` is the text following ``%%skip`` and is evaluated as a Python
    expression; ``cell`` is the remaining cell source. Nothing is returned.
    '''
    # NOTE(review): `eval` executes whatever expression is written after the
    # magic. That is fine for author-controlled notebooks, but never feed it
    # text that comes from outside the notebook.
    should_skip = eval(line)
    if not should_skip:
        get_ipython().run_cell(cell)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_datasets.public_api as tfds
import tensorflow as tf
import glob
import dill
from kaggle_datasets import KaggleDatasets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random

In [None]:
# Seed TensorFlow, NumPy and Python's `random` with the same fixed value so
# repeated runs start from the same random state.
tf.random.set_seed(123)
np.random.seed(123)
random.seed(123)

In [None]:
# TPU / fallback detection (the short cross-compatible form introduced with TensorFlow 2.4).
# `tpu` stays None when no TPU is found; later cells branch on its truthiness.
tpu = None
try: # try to connect to a TPU cluster
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # connect() raised ValueError: fall back to a non-TPU strategy
    # Alternative for GPU / multi-GPU machines: tf.distribute.MirroredStrategy()
    strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    # Alternative for clusters of multi-GPU machines: tf.distribute.experimental.MultiWorkerMirroredStrategy()

    
# Disabled experiment: strategy,tpu = tf.distribute.MirroredStrategy(devices=["TPU:0", "TPU:1","TPU:2"]),True

print("Number of accelerators: ", strategy.num_replicas_in_sync)


# AUTO lets tf.data pick the parallelism of `map` calls; REPLICAS mirrors the
# replica count of the selected strategy.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')


* Dataset conversion to TFRecords : https://www.kaggle.com/tchaye59/orvp-dataset
* TFRecords : https://www.kaggle.com/tchaye59/orvptfrecords

In [None]:
# Raw competition files (CSV / parquet) are read from `data_path`.
data_path = '../input/optiver-realized-volatility-prediction'
# TFRecords location: the local input directory by default, replaced by the
# dataset's GCS path when a TPU was detected (`tpu` is truthy).
GCS_PATH = '../input/orvptfrecords'
if tpu:
    GCS_PATH = KaggleDatasets().get_gcs_path('orvptfrecords')

In [None]:
class TrainDataset(tfds.core.GeneratorBasedBuilder):
    """TFDS builder that declares the schema of the training records.

    It exposes a single ``train`` split whose examples carry ``stock_id``,
    ``book``, ``trade`` and ``target`` tensors (all float64).
    ``_generate_examples`` yields nothing.
    NOTE(review): presumably the records already exist under ``data_dir`` so
    generation is never needed - confirm against the TFRecords dataset.
    """

    VERSION = tfds.core.Version('0.1.0')

    def _info(self):
        """Describe the features stored in each example."""
        feature_spec = {
            "stock_id": tfds.features.Tensor(shape=(), dtype=tf.float64),
            "book": tfds.features.Tensor(shape=(None, 9,), dtype=tf.float64),
            "trade": tfds.features.Tensor(shape=(None, 4,), dtype=tf.float64),
            "target": tfds.features.Tensor(dtype=tf.float64, shape=(1,)),
        }
        return tfds.core.DatasetInfo(
            builder=self,
            description="",
            features=tfds.features.FeaturesDict(feature_spec),
        )

    def _split_generators(self, dl_manager):
        """Declare the single ``train`` split (no generator arguments)."""
        train_split = tfds.core.SplitGenerator(name='train', gen_kwargs={})
        return [train_split]

    def _generate_examples(self, **args):
        """Intentionally empty: this builder does not produce examples itself."""
        return None

In [None]:
# Batch size seen by each replica. The TPU branch currently assigns the same
# value, so it only acts as a hook for tuning the TPU batch size separately.
BATCH_SIZE_PER_REPLICA = 256
if tpu:
    BATCH_SIZE_PER_REPLICA = 256
# Global batch = per-replica batch * number of replicas; used to derive steps per epoch.
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
# Shuffle buffer size for the training pipeline.
BUFFER_SIZE = 50000

# Number of batches to prefetch in the tf.data pipelines.
prefetch = 30
# Every book / trade sequence is zero-padded to this many rows.
MAX_SEQ = 600
# When False the training cells are skipped (via %%skip) and saved weights are used instead.
TRAIN = False

# Training length and the initial Adam learning rate.
epochs = 60
LR = 1e-3

In [None]:
# Training targets table. In the code shown here it is only referenced by a
# commented-out fallback in the submission cells.
train_df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

In [None]:
def calculate_wap(bid_price1,ask_price1,bid_size1,ask_size1,
                 bid_price2,ask_price2,bid_size2,ask_size2):
    """Average of the level-1 and level-2 weighted average prices.

    Each level's WAP is ``(bid_price * ask_size + ask_price * bid_size) /
    (bid_size + ask_size)``. The result gets a new trailing axis at position 1.
    """
    level1_wap = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)
    level2_wap = (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)
    mean_wap = (level1_wap + level2_wap) / 2
    return mean_wap[:, tf.newaxis]


def calculate_wap2(bid_price1,ask_price1,bid_size1,ask_size1,
                 bid_price2,ask_price2,bid_size2,ask_size2):
    """Pooled two-level weighted average price.

    Sums the cross-weighted prices of both levels and divides by the total
    size of all four quotes. The result gets a new axis at position 1.
    """
    weighted_level1 = bid_price1 * ask_size1 + ask_price1 * bid_size1
    weighted_level2 = bid_price2 * ask_size2 + ask_price2 * bid_size2
    total_size = bid_size1 + ask_size1 + bid_size2 + ask_size2
    pooled_wap = (weighted_level1 + weighted_level2) / total_size
    return pooled_wap[:, tf.newaxis]

def calculate_wap3(bid_price1,ask_price1,bid_size1,ask_size1,):
    """Level-1 weighted average price, returned with a new axis at position 1."""
    cross_weighted = bid_price1 * ask_size1 + ask_price1 * bid_size1
    level_size = bid_size1 + ask_size1
    return (cross_weighted / level_size)[:, tf.newaxis]

def calculate_wap4(bid_price2,ask_price2,bid_size2,ask_size2,):
    """Level-2 weighted average price, returned with a new axis at position 1."""
    cross_weighted = bid_price2 * ask_size2 + ask_price2 * bid_size2
    level_size = bid_size2 + ask_size2
    return (cross_weighted / level_size)[:, tf.newaxis]

def tf_diff(a):
    """First difference along axis 0: ``out[i] = a[i + 1] - a[i]``.

    The output has one fewer leading element than ``a``.
    """
    following = a[1:]
    preceding = a[:-1]
    return following - preceding

def calculate_log_return(wap):
    """Differences of ``log(wap)`` along axis 0, keeping the input length.

    A single float64 zero row is appended to ``log(wap)`` before differencing,
    so the output has as many rows as ``wap``.
    """
    log_wap = tf.math.log(wap)
    zero_row = tf.constant([[0.]], dtype=tf.float64)
    # NOTE(review): because the zero is appended at the END, the last output
    # row equals 0 - log(wap[-1]) rather than a return between two
    # observations. Left unchanged here because saved weights were presumably
    # trained on features built this way - confirm before altering.
    padded = tf.concat([log_wap, zero_row], axis=0)
    return tf_diff(padded)

def realized_volatility(log_return):
    """Square root of the sum of squared entries of ``log_return`` (a scalar)."""
    sum_of_squares = tf.reduce_sum(log_return**2)
    return tf.math.sqrt(sum_of_squares)

In [None]:
def _as_column(value, n_rows):
    """Repeat a scalar ``value`` ``n_rows`` times and return it with shape (n_rows, 1)."""
    return tf.repeat(value, n_rows)[:, tf.newaxis]


def features_builder(stock_id,book,trade):
    """Build the engineered per-row features for one (stock, time bucket) sample.

    Args:
        stock_id: scalar tensor, broadcast to one column of the book features.
        book: 2-D tensor whose columns are read as
            [seconds_in_bucket, bid_price1, ask_price1, bid_price2, ask_price2,
             bid_size1, ask_size1, bid_size2, ask_size2].
        trade: 2-D tensor; only its row count is used.

    Returns:
        ``(book_data, trade_data)`` where ``book_data`` has 14 columns
        (4 WAP variants, their 4 log-return series, the 4 matching realized
        volatilities repeated on every row, ``stock_id``, and the book row
        count) and ``trade_data`` has 1 column (the trade row count). Row
        counts match ``book`` and ``trade`` respectively.
    """
    # Column 0 (seconds_in_bucket) is not used by any feature below.
    bid_price1 = book[:,1]
    ask_price1 = book[:,2]
    bid_price2 = book[:,3]
    ask_price2 = book[:,4]
    bid_size1 = book[:,5]
    ask_size1 = book[:,6]
    bid_size2 = book[:,7]
    ask_size2 = book[:,8]

    n_book = tf.shape(book)[0]
    n_trade = tf.shape(trade)[0]

    # --- Book features -----------------------------------------------------
    # Four weighted-average-price variants, each shaped (n_book, 1).
    waps = [
        calculate_wap(bid_price1,ask_price1,bid_size1,ask_size1,bid_price2,ask_price2,bid_size2,ask_size2),
        calculate_wap2(bid_price1,ask_price1,bid_size1,ask_size1,bid_price2,ask_price2,bid_size2,ask_size2),
        calculate_wap3(bid_price1,ask_price1,bid_size1,ask_size1,),
        calculate_wap4(bid_price2,ask_price2,bid_size2,ask_size2,),
    ]
    log_returns = [calculate_log_return(wap) for wap in waps]
    # Each realized volatility is a scalar; repeat it so it becomes a column.
    volatilities = [_as_column(realized_volatility(lr), n_book) for lr in log_returns]

    book_columns = waps + log_returns + volatilities + [
        _as_column(stock_id, n_book),
        _as_column(tf.cast(n_book, tf.float64), n_book),
    ]
    book_data = tf.concat(book_columns, axis=-1)

    # --- Trade features ----------------------------------------------------
    # Only the number of trades is emitted. The trade price / size /
    # order_count columns (and a price log-return) were computed but never
    # used in the original, so they are no longer computed here.
    trade_columns = [
        _as_column(tf.cast(n_trade, tf.float64), n_trade),
    ]
    trade_data = tf.concat(trade_columns, axis=-1)

    return book_data,trade_data
    


In [None]:
def get_datasets():
    """Build the infinite training pipeline and the number of steps per epoch.

    Returns:
        ``(dataset, steps)`` where ``dataset`` yields ``((book, trade), target)``
        batches of ``BATCH_SIZE_PER_REPLICA`` samples (distributed through
        ``strategy`` when a TPU is in use) and ``steps`` is the number of
        examples divided by ``GLOBAL_BATCH_SIZE`` (floor).
    """
    builder = TrainDataset(data_dir=GCS_PATH)
    # Make sure the dataset is prepared under data_dir before reading it.
    builder.download_and_prepare()
    dataset = builder.as_dataset()['train']

    # Read the size now: after repeat() the dataset is infinite.
    size = len(dataset)

    def preprocess(x):
        """Append engineered features and zero-pad both sequences to MAX_SEQ rows."""
        stock_id,book,trade,target = x['stock_id'],x['book'],x['trade'],x['target']

        book_data,trade_data = features_builder(stock_id,book,trade)

        book = tf.concat([book,book_data],axis=-1)
        trade = tf.concat([trade,trade_data],axis=-1)

        # Pad only at the end of the time axis; feature columns are untouched.
        p1 = [[0,MAX_SEQ-tf.shape(book)[0]],[0,0]]
        p2 = [[0,MAX_SEQ-tf.shape(trade)[0]],[0,0]]

        book = tf.pad(book,p1, constant_values=0.)
        trade = tf.pad(trade,p2, constant_values=0.)

        return (book,trade),target

    def shape_fix(inputs,target):
        """Pin static shapes: 9 raw + 14 engineered book columns, 4 raw + 1 engineered trade columns."""
        book,trade = inputs
        book = tf.reshape(book,(MAX_SEQ,9+14))
        trade = tf.reshape(trade,(MAX_SEQ,4+1))
        return (book,trade),target

    dataset = dataset.repeat().shuffle(BUFFER_SIZE).map(preprocess,num_parallel_calls=AUTO)
    if tpu:
        # Applied on the TPU path only, so the tensors carry fully static shapes.
        dataset = dataset.map(shape_fix,num_parallel_calls=AUTO)
    batched = dataset.batch(BATCH_SIZE_PER_REPLICA).prefetch(prefetch)

    if not tpu:
        return batched,size//GLOBAL_BATCH_SIZE

    # Prefer the stable API name; fall back to the experimental alias on
    # TensorFlow versions that do not provide it yet.
    distribute_fn = getattr(strategy, 'distribute_datasets_from_function', None)
    if distribute_fn is None:
        distribute_fn = strategy.experimental_distribute_datasets_from_function
    # The closure captures `batched`, a name that is never rebound. The
    # original lambda referred to `dataset`, which was immediately reassigned
    # to the distributed dataset, so any later call of the function would have
    # returned the wrong object.
    distributed = distribute_fn(lambda input_context: batched)
    return distributed,size//GLOBAL_BATCH_SIZE

In [None]:
def build_model():
    """Build the two-branch GRU regressor (book branch + trade branch).

    Inputs are a (MAX_SEQ, 23) book sequence and a (MAX_SEQ, 5) trade sequence;
    the output is a single linear unit.

    NOTE(review): another cell restores 'weights.h5' into this architecture.
    Keras restores HDF5 weights by layer order by default, so keep the layer
    creation order below stable.
    """
    # Fixed sequence length; the commented alternative allowed variable length off-TPU.
    n_seq = MAX_SEQ#None if not tpu else MAX_SEQ
    # 9 raw + 14 engineered book columns, 4 raw + 1 engineered trade columns.
    book_input = tf.keras.layers.Input(shape=(n_seq,9+14))
    trade_input = tf.keras.layers.Input(shape=(n_seq,4+1))

    # Masking uses its default mask value; the data pipelines pad sequences with 0.
    book = tf.keras.layers.Masking()(book_input)
    trade = tf.keras.layers.Masking()(trade_input)

    book = tf.keras.layers.BatchNormalization()(book)
    trade = tf.keras.layers.BatchNormalization()(trade)

    # Each branch is summarised by the final state of a 256-unit GRU.
    book = tf.keras.layers.GRU(256)(book)
    book = tf.keras.layers.Dropout(0.1)(book)

    trade = tf.keras.layers.GRU(256)(trade)
    trade = tf.keras.layers.Dropout(0.1)(trade)
    
    # From here on `model` holds the running tensor of the dense head, not a Model.
    model = tf.keras.layers.concatenate([book,trade])
    
    
    # Ten Dense(256, swish) + Dropout(0.2) blocks.
    for _ in range(10):
        model = keras.layers.Dense(256, activation=keras.activations.swish)(model)
        model = tf.keras.layers.Dropout(0.2)(model)
    
     
    # Linear output unit (no activation).
    model = tf.keras.layers.Dense(1,activation=None)(model)

    model = tf.keras.Model([book_input,trade_input],model)
    return model

In [None]:
# Build a throwaway model instance just to print its layer summary.
build_model().summary()

In [None]:
def rmspe(y_true,y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``.

    The squared relative errors are summed and divided by the number of
    elements in ``y_pred`` (cast to float32) before taking the square root.
    """
    relative_error = (y_true - y_pred) / y_true
    squared_sum = tf.reduce_sum(relative_error ** 2)
    n_predictions = tf.cast(tf.size(y_pred), tf.float32)
    return tf.sqrt(squared_sum / n_predictions)

In [None]:
# Build the training pipeline and steps-per-epoch once, outside the skipped
# training cell. The training cell calls get_datasets() again when it runs.
train_dataset,train_step = get_datasets()

In [None]:
%%skip not TRAIN

# Model creation, compilation and fit all happen inside the distribution
# strategy scope selected earlier in the notebook.
with strategy.scope():
    
    #model
    model = build_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(LR),
        loss=rmspe,
    )
    #callbacks
    # All callbacks monitor the training loss; no validation data is passed to fit().
    # Keep only the best (lowest-loss) weights in weights.h5.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=f'weights.h5',
                                                                   save_weights_only=True,
                                                                   monitor='loss',
                                                                   mode='min',verbose=True,
                                                                   save_best_only=True)
    # Stop after 5 epochs without improvement.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss',mode='min', patience=5)
    # Multiply the learning rate by 0.2 after 2 stagnant epochs, never going below LR/100.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=LR/100)
    terminate_onNaN = tf.keras.callbacks.TerminateOnNaN()
    # dataset
    train_dataset,train_step = get_datasets()
    # Train
    # The dataset repeats indefinitely, so steps_per_epoch defines the epoch length.
    history = model.fit(train_dataset,
                        steps_per_epoch=train_step,
                        epochs=epochs,
                        callbacks=[model_checkpoint_callback,early_stopping,reduce_lr,terminate_onNaN],
                       )

In [None]:
%%skip not TRAIN
# Plot the per-epoch training loss recorded by model.fit().
pd.DataFrame(history.history).loss.plot()

In [None]:
%%skip TRAIN
! cp  ../input/realized-volatility-keras-rnn-baseline-tpu/weights.h5 ./

In [None]:
%%skip tpu
# Rebuild the architecture outside any strategy scope and restore the weights
# from ./weights.h5 (written by training or copied in by the previous cell).
model  = build_model()
model.load_weights(f'./weights.h5')

In [None]:
class TestDataset():
    """Reads the competition test files and yields one sample per test row.

    Attributes:
        test: DataFrame loaded from ``<data_path>/test.csv``; the columns
            ``stock_id``, ``time_id`` and ``row_id`` are used.
        stock_ids: unique ``stock_id`` values found in ``test``.
    """

    def __init__(self,):
        self.test = pd.read_csv(data_path+'/test.csv')
        self.stock_ids = self.test.stock_id.unique()

    def getStockData(self,stock_id):
        """Collect the samples of one stock.

        Reads the stock's book and trade parquet partitions and returns a list
        of ``(row_id, stock_id, book_values, trade_values)`` tuples, one per
        ``time_id`` that is listed for this stock in ``test`` and has book
        data. ``time_id`` is dropped from both value arrays. When a bucket has
        no trades, ``trade_values`` is a single all-zero row of width 4.
        """
        # Read data
        book_path = f'{data_path}/book_test.parquet/stock_id={stock_id}'
        trade_path = f'{data_path}/trade_test.parquet/stock_id={stock_id}'
        book_df_grp = pd.read_parquet(book_path).groupby('time_id')
        trade_df_grp = pd.read_parquet(trade_path).groupby('time_id')

        # Restrict the row_id lookup to this stock once instead of on every iteration.
        stock_rows = self.test[self.test.stock_id == stock_id]

        data = []
        for time_id in self.test.time_id.unique():
            row_id = stock_rows.loc[stock_rows.time_id == time_id, 'row_id'].values
            if len(row_id) == 0:
                # This (stock_id, time_id) pair is not requested in test.csv.
                continue

            try:
                # drop() returns a new frame, so the grouped data is not mutated.
                book_items = book_df_grp.get_group(time_id).drop(columns='time_id')
            except KeyError:
                # No book data for this bucket: nothing to predict from.
                continue

            try:
                trade_items = trade_df_grp.get_group(time_id).drop(columns='time_id').values
            except KeyError:
                # If No trade
                trade_items = np.zeros((1,4),dtype=np.float64)

            data.append((row_id[0],stock_id,book_items.values,trade_items))

        return data

    def gen(self):
        """Yield every sample of every stock, stock by stock."""
        for stock_id in self.stock_ids:
            yield from self.getStockData(stock_id)

    def __len__(self):
        """Number of rows in test.csv."""
        # The original read `self.train`, an attribute that is never set.
        return len(self.test)

In [None]:
%%skip tpu
# Load test.csv and the list of stock ids to predict.
test_data = TestDataset()

In [None]:
%%skip tpu
# Pull only the first generated sample into `x` (the loop stops immediately).
for x in test_data.gen():
    break

In [None]:
def test_preprecoss(row_id,stock_id,book,trade):
    """Prepare one test sample for the model.

    Appends the engineered feature columns from ``features_builder`` to the raw
    ``book`` and ``trade`` tensors, zero-pads both along the time axis up to
    ``MAX_SEQ`` rows, and returns ``(row_id, (book, trade))``.
    """
    extra_book, extra_trade = features_builder(stock_id, book, trade)

    def _augment_and_pad(raw, extra):
        # Concatenate on the feature axis, then pad rows at the end only.
        merged = tf.concat([raw, extra], axis=-1)
        missing_rows = MAX_SEQ - tf.shape(merged)[0]
        return tf.pad(merged, [[0, missing_rows], [0, 0]], constant_values=0.)

    padded_book = _augment_and_pad(book, extra_book)
    padded_trade = _augment_and_pad(trade, extra_trade)
    return row_id, (padded_book, padded_trade)

In [None]:
%%skip tpu
# Wrap the Python generator in a tf.data pipeline. The signature mirrors the
# tuples yielded by TestDataset.gen: (row_id, stock_id, book, trade).
test_dataset = tf.data.Dataset.from_generator(test_data.gen,
                                         output_signature=(
                                             tf.TensorSpec(shape=(), dtype=tf.string),
                                             tf.TensorSpec(shape=(),dtype=tf.float64),
                                             tf.TensorSpec(shape=(None,9,),dtype=tf.float64),
                                             tf.TensorSpec(shape=(None,4,),dtype=tf.float64),
                                         )
                                        )

# Feature engineering + padding per sample, then fixed-size batches of 64.
test_dataset = test_dataset.map(test_preprecoss,num_parallel_calls=AUTO)
test_dataset = test_dataset.batch(64).prefetch(prefetch)

In [None]:
%%skip tpu
def predict_fn(r,X):
    """Run the model in inference mode on one batch and keep the row ids alongside."""
    return r, model(X,training=False)
# After this map the dataset yields (row_ids, predictions) batches.
test_dataset = test_dataset.map(predict_fn).prefetch(10)

In [None]:
%%skip tpu
# Iterate the prediction pipeline and flatten every batch into two parallel lists.
ids,targets= [],[]
for (row_id,y) in test_dataset:
    ids.extend(row_id.numpy().flatten())
    targets.extend(y.numpy().flatten())
    
# Row ids come back from TensorFlow as bytes; convert them to str.
ids = [s.decode('ascii') for s in ids]

In [None]:
%%skip tpu
# add missing rows
miss_idx = ~test_data.test.row_id.isin(ids)
miss = test_data.test.loc[miss_idx,'row_id'].values
ids.extend(miss)
targets.extend([0 for _ in miss])
#targets.extend([train_df.target.min() for _ in miss])

In [None]:
%%skip tpu
# Assemble the submission table, write it without the index, and preview the first rows.
df = pd.DataFrame({'row_id':ids,'target':targets})
df.to_csv('submission.csv',index=False)
df.head(10)