In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import modules
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import plotly.express as px

import os
import glob
import shap

from multiprocessing import Pool

from joblib import Parallel, delayed

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow import feature_column

import time
import matplotlib.pyplot as plt
%matplotlib inline

# Root directory of the competition data, relative to the notebook's working directory.
path_to_files = "../input/optiver-realized-volatility-prediction"

# Path templates; the `{}` placeholder is filled with a stock id via str.format.
book_train_files =  path_to_files + '/book_train.parquet/stock_id={}'
trade_train_files =  path_to_files + '/trade_train.parquet/stock_id={}'

book_test_files =  path_to_files + '/book_test.parquet/stock_id={}'
trade_test_files =  path_to_files + '/trade_test.parquet/stock_id={}'

# Small constant added to denominators in the WAP computations to avoid division by zero.
# A smaller value (1e-16) was tried previously and is no longer used.
SMALL_F = 0.00000001

# Fix the TensorFlow and NumPy global seeds.
tf.random.set_seed(111)
np.random.seed(111)

In [None]:
# Notebook-wide configuration switches, kept in one place so they are easy to tune.
cfg = {
    'isCollectDataOnly': True,
    'isStockIdUsed': False,
    'isTFModelUsed': False,
    'trainNotUsedCols': ['row_id', 'target', 'time_id', 'stock_id'],
    'predictNotUsedCols': ['row_id', 'time_id', 'stock_id'],
    'useHyperOpt': False,
    'useLabelTransformation': False,
    'volumeBarThreshold': 1000.0,
}

# Show the active configuration as the cell output.
cfg

In [None]:
def log_return(series):
    """Return the element-wise difference of the natural log of ``series``.

    ``series`` must support ``.diff()`` after ``np.log`` (a pandas Series or
    DataFrame). The first row of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))


In [None]:
def getDataFromBidAsk_numpy(df, ci):
    """Compute weighted-average-price (WAP) features from order-book columns.

    Args:
        df: 2-D NumPy array of order-book rows; columns are addressed via ``ci``.
        ci: mapping from column name (``bid_price1``, ``ask_size2``, ...) to the
            integer position of that column in ``df``.

    Returns:
        dict of arrays (one value per input row) with the keys
        ``fb_w_1``, ``fb_w_2`` (WAP of book level 1 / 2),
        ``fb_mid_point_1``, ``fb_mid_point_2`` (mid price of level 1 / 2),
        ``fb_w`` (WAP over both levels combined) and
        ``fb_w_rate`` (ratio of level-1 WAP to level-2 WAP).
    """
    total_numerator = 0
    total_volume = 0
    spread = {}
    for k in [1, 2]:
        bid_price = df[:, ci['bid_price{}'.format(k)]]
        ask_price = df[:, ci['ask_price{}'.format(k)]]
        bid_size = df[:, ci['bid_size{}'.format(k)]]
        ask_size = df[:, ci['ask_size{}'.format(k)]]

        # WAP weights each price with the size on the opposite side of the book.
        level_numerator = bid_price * ask_size + ask_price * bid_size
        level_volume = bid_size + ask_size

        # Accumulate for the combined (two-level) WAP.
        total_numerator = total_numerator + level_numerator
        total_volume = total_volume + level_volume

        # SMALL_F keeps the division defined when both sizes are zero.
        spread[f'fb_w_{k}'] = level_numerator / (level_volume + SMALL_F)
        # Mid price: the sum must be parenthesised before halving; the previous
        # `ask + bid / 2` only halved the bid.
        spread[f'fb_mid_point_{k}'] = (ask_price + bid_price) / 2

    # WAP over both book levels.
    spread['fb_w'] = total_numerator / (total_volume + SMALL_F)
    # Ratio between the level-1 and level-2 WAP.
    spread['fb_w_rate'] = spread['fb_w_1'] / (spread['fb_w_2'] + SMALL_F)

    return spread

def Fx(group, stock_id=0, n=10):
    """Compute realized-volatility features for the book rows of one time_id.

    Args:
        group: DataFrame holding the rows of a single ``time_id``; must contain
            ``time_id`` and the WAP columns ``fb_w``, ``fb_w_1``, ``fb_w_2``,
            ``fb_mid_point_1`` and ``fb_w_rate``. It is not modified.
        stock_id: stock identifier, used only to build ``row_id``.
        n: unused; kept so existing callers keep working.

    Returns:
        One-row DataFrame with a ``<col>_lr_vola`` column per WAP column plus
        ``row_id`` (``"<stock_id>-<time_id>"``) and ``time_id``.
    """
    time_id = int(group.time_id.unique()[0])

    price_cols = [
        'fb_w',
        'fb_w_1',
        'fb_w_2',
        'fb_mid_point_1',
        'fb_w_rate',
    ]
    vola_cols = [col + '_lr_vola' for col in price_cols]

    # Work on a derived frame: writing the log-return columns back into `group`
    # would modify a slice owned by the caller.
    log_returns = log_return(group[price_cols])
    # Ignore rows whose combined WAP is missing.
    log_returns = log_returns[group['fb_w'].notnull()]

    # Reduce column by column; calling np.sum on a whole DataFrame relies on an
    # implicit axis that recent pandas versions deprecate.
    volatility = log_returns.apply(realized_volatility)

    features = pd.DataFrame(volatility.to_numpy().reshape(1, -1), columns=vola_cols)
    features['row_id'] = str(stock_id) + '-' + str(time_id)
    features['time_id'] = time_id
    return features

def getFeaturesFromBookData(df, stock_id, n=10):
    """Build one feature row per ``time_id`` from the book data of one stock.

    Args:
        df: book DataFrame containing ``time_id`` and the columns ``Fx`` reads.
        stock_id: forwarded to ``Fx`` to build ``row_id``.
        n: forwarded to ``Fx``.

    Returns:
        DataFrame with one row per ``time_id`` (ascending) and a fresh
        0..N-1 index; an empty DataFrame when ``df`` has no rows.
    """
    # Iterate the groups explicitly instead of using groupby.apply: Fx reads the
    # grouping column (time_id), which newer pandas versions no longer pass to
    # the applied function.
    per_time_id = [
        Fx(group, stock_id=stock_id, n=n)
        for _, group in df.groupby('time_id')
    ]
    if not per_time_id:
        return pd.DataFrame()
    return pd.concat(per_time_id, ignore_index=True)


In [None]:
def getDataFromTrade(df):
    """Compute the realized volatility of trade prices for a single time_id.

    ``df`` holds the trades of one ``time_id`` and must contain ``price`` and
    ``time_id``. Returns a one-row DataFrame with the columns ``p_rz_vol`` and
    ``time_id``.
    """
    price_log_returns = log_return(df.price).dropna()
    features = pd.DataFrame({'p_rz_vol': [realized_volatility(price_log_returns)]})

    # All rows share the same time_id, so the first unique value identifies the group.
    features['time_id'] = df.time_id.unique()[0]
    return features

def getFeaturesFromTradeData(df):
    """Build one trade-feature row per ``time_id`` for one stock.

    Returns a DataFrame with one row per ``time_id`` (ascending) and a fresh
    0..N-1 index; an empty DataFrame when ``df`` has no rows.
    """
    # Explicit iteration keeps the time_id column inside every group, which
    # getDataFromTrade reads; groupby.apply stops passing grouping columns in
    # newer pandas versions.
    per_time_id = [getDataFromTrade(group) for _, group in df.groupby('time_id')]
    if not per_time_id:
        return pd.DataFrame()
    return pd.concat(per_time_id, ignore_index=True)

In [None]:
def constructPreprocessedDataFrame(file_path, isTrain):
    """Load the book and trade data of one stock and return its feature table.

    Args:
        file_path: path of the stock's book parquet partition; the stock id is
            the text after the first ``=`` in the path.
        isTrain: selects the train (truthy) or test (falsy) trade file.

    Returns:
        DataFrame with one row per ``time_id`` containing the book features, the
        trade features (left-joined on ``time_id``) and ``stock_id`` (as the
        string parsed from the path). Missing values are filled with 0.0.
    """
    stock_id = file_path.split('=')[1]
    df_book_data = pd.read_parquet(file_path)
    trade_template = trade_train_files if isTrain else trade_test_files
    df_trade_data = pd.read_parquet(trade_template.format(stock_id))

    print('Processing stock id:', stock_id)

    # Book features: WAP columns first, then one volatility row per time_id.
    column_index = {name: position for position, name in enumerate(df_book_data.columns.values)}
    spread = getDataFromBidAsk_numpy(df_book_data.to_numpy(), column_index)
    # The arrays are row-aligned with the book frame, so attach them by position
    # rather than through an index-aligned concat.
    df_book_data = df_book_data.assign(**spread)
    book_features = getFeaturesFromBookData(df_book_data, stock_id, 10)

    # Trade features; time_ids without trades stay in the result (left join).
    trade_features = getFeaturesFromTradeData(df_trade_data)
    features = book_features.merge(trade_features, on = ['time_id'], how = 'left')

    features['stock_id'] = stock_id
    return features.fillna(0.0)

def constructBookDataDataFrame(list_file, isTrain=True):
    """Sequentially preprocess every stock file and stack the results.

    Args:
        list_file: iterable of book parquet paths, one per stock.
        isTrain: forwarded to ``constructPreprocessedDataFrame``.

    Returns:
        The per-stock feature tables concatenated in input order, each keeping
        its own index; an empty DataFrame when ``list_file`` is empty.
    """
    # Collect first and concatenate once: growing a frame inside the loop copies
    # all previously accumulated rows on every iteration.
    frames = [constructPreprocessedDataFrame(file, isTrain=isTrain) for file in list_file]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def preprocessor(list_file, isTrain = True):
    """Preprocess every stock file in parallel and return one combined table.

    Each path in ``list_file`` is handled by ``constructPreprocessedDataFrame``
    in a joblib worker (all cores); the per-stock frames are concatenated with a
    fresh 0..N-1 index.
    """
    jobs = (
        delayed(constructPreprocessedDataFrame)(stock_file, isTrain)
        for stock_file in list_file
    )
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)


In [None]:
# Collect every entry under the training order-book directory (one per stock partition,
# judging by the `stock_id={}` path templates).
# NOTE(review): glob does not guarantee an ordering; confirm nothing downstream relies on it.
list_order_book_file_train = glob.glob(path_to_files + '/book_train.parquet/*')
# Peek at the first path as a sanity check.
list_order_book_file_train[0:1]

In [None]:
%%time
# Build the feature table for every training stock file (parallelised inside preprocessor).
ret_df = preprocessor(list_order_book_file_train)
display(ret_df.shape)
ret_df.head()

In [None]:
# Write the computed features to disk (the DataFrame index is written as the first column).
ret_df.to_csv('111.csv')

In [None]:
# Reload the saved features; the first CSV column is used as the index.
ret_df = pd.read_csv('111.csv', index_col=0)

In [None]:
# Important: rows must be sorted by stock_id, because the per-time_id model input
# is assembled positionally in stock order.
ret_df = ret_df.sort_values(by='stock_id').reset_index(drop=True)
# Identifier columns that are not model features.
cs = ['row_id', 'stock_id', 'time_id']
# Keep the DataFrame's own column order. Building this list from a set difference
# made the feature order depend on string hashing, so it could change between
# kernel restarts even with fixed random seeds.
used_cols = [col for col in ret_df.columns if col not in cs]
y_col = 'target'

# Map every stock_id to its position in the model's input/output vector.
ALL_STOCKS = {k: v for v, k in enumerate(ret_df.stock_id.unique())}
len(ALL_STOCKS)

In [None]:
# Show the feature columns selected for the model.
used_cols

In [None]:
# Target scaler shared between getTrainData (fit/transform) and the training
# code (inverse transform); only used when cfg['useLabelTransformation'] is set.
scaler_target = MinMaxScaler()
def getTrainData(ret_df, seed = 42):
    """Join the features with the training targets and split them by time_id.

    Args:
        ret_df: feature table with ``stock_id`` and ``time_id`` columns. It is
            not modified.
        seed: random state for both ``train_test_split`` calls.

    Returns:
        ``(train_df, val_df, test_df)``. 95% of the time_ids go to train; the
        remaining 5% are split evenly between test and validation. Each frame
        carries ``target`` (scaled when label transformation is enabled) and
        ``target_orig`` (always the raw target).
    """
    train = pd.read_csv(path_to_files + '/train.csv')
    # Cast stock_id to int so it matches the key dtype in train.csv. The cast is
    # applied to a derived frame so the caller's DataFrame stays untouched.
    features = ret_df.assign(stock_id=ret_df.stock_id.astype(int))
    data_df = features.merge(train, on = ['stock_id', 'time_id'], how = 'left')
    data_df['target_orig'] = data_df['target']

    if cfg['useLabelTransformation']:
        # NOTE(review): the scaler is fitted on all rows, including the ones that
        # end up in validation/test — confirm this is intended.
        scaled_input = data_df[['target']].to_numpy() * 100
        scaler_target.fit(scaled_input)
        data_df['target'] = scaler_target.transform(scaled_input).flatten()

    # Split on time_id so all stocks of one time bucket stay in the same subset.
    all_time_ids = data_df.time_id.unique()
    train_ids, holdout_ids = train_test_split(all_time_ids, test_size=0.05, random_state=seed)
    test_ids, val_ids = train_test_split(holdout_ids, test_size=0.5, random_state=seed)

    def _subset(time_ids):
        # Rows of the given time_ids, re-indexed from 0.
        return data_df.loc[data_df.time_id.isin(time_ids)].reset_index(drop=True).copy()

    return _subset(train_ids), _subset(val_ids), _subset(test_ids)

def predictFromModel(model, df, used_cols=used_cols, prediction_column_name='target'):
    """Run ``model.predict`` on the feature columns of ``df``.

    Returns a DataFrame with ``row_id`` (copied from ``df``) followed by the
    flattened predictions in ``prediction_column_name``, indexed 0..N-1.
    """
    predictions = model.predict(df.loc[:, used_cols].values).flatten()
    result = pd.DataFrame({
        prediction_column_name: predictions,
        'row_id': df['row_id'].values,
    })
    ordered = result[['row_id', prediction_column_name]]
    return ordered.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import PowerTransformer
def firScalerAndNormalizer(train_df, used_cols=used_cols):
    """Fit a MinMaxScaler and a Keras Normalization layer on the training features.

    Both are fitted on the same (unscaled) ``used_cols`` values of ``train_df``.
    Returns ``(scaler, normalizer)``.
    """
    feature_matrix = train_df.loc[:, used_cols].to_numpy()

    # Min-max scaling (PowerTransformer was tried as an alternative).
    scaler = MinMaxScaler()
    scaler.fit(feature_matrix)

    # Normalization layer adapted along the last (feature) axis.
    normalizer = preprocessing.Normalization(axis=-1)
    normalizer.adapt(np.array(feature_matrix))

    return scaler, normalizer


# Function to insert row in the dataframe
def Insert_row_(row_number, df, row_value):
    """Return a copy of ``df`` with ``row_value`` inserted at position ``row_number``.

    Args:
        row_number: positional index at which the new row(s) should appear.
        df: source DataFrame; it is not modified.
        row_value: row(s) to insert — a DataFrame, a Series, a dict, or anything
            the ``pd.DataFrame`` constructor accepts.

    Returns:
        New DataFrame with a fresh 0..N-1 index.
    """
    # DataFrame.append was removed in pandas 2.0, so normalise the new row to a
    # DataFrame and use pd.concat instead.
    if isinstance(row_value, pd.DataFrame):
        new_rows = row_value
    elif isinstance(row_value, pd.Series):
        new_rows = row_value.to_frame().T
    elif isinstance(row_value, dict):
        new_rows = pd.DataFrame([row_value])
    else:
        new_rows = pd.DataFrame(row_value)

    upper = df[0:row_number]
    lower = df[row_number:]
    return pd.concat([upper, new_rows, lower], ignore_index=True)

def get_one_input(X_df, time_id, used_cols, y_col=None):
    """Assemble the model input (and optional labels) for one time_id.

    The rows of ``time_id`` are taken in their existing order, which is expected
    to follow the positions in ``ALL_STOCKS``. For every stock in ``ALL_STOCKS``
    that has no row, an all-zero placeholder row (``stock_id`` 255) is inserted
    at that stock's position, and a 0.0 label is inserted at the same position.

    Args:
        X_df: DataFrame with ``time_id``, ``stock_id`` and ``used_cols`` columns.
        time_id: the time bucket to extract.
        used_cols: feature columns to return.
        y_col: optional label column.

    Returns:
        ``(X, y)`` where ``X`` is a 2-D array of the ``used_cols`` values and
        ``y`` is a 1-D array of labels, or ``None`` when ``y_col`` is ``None``.
    """
    rows = X_df.time_id == time_id
    y = X_df.loc[rows, y_col].to_numpy() if y_col is not None else None

    X = X_df.loc[rows].copy()
    if X.shape[0] < len(ALL_STOCKS):
        # All-zero placeholder row with the same columns and dtypes as X.
        placeholder = pd.DataFrame(np.zeros((1, X.shape[1])), columns=X.columns).astype(X.dtypes)
        placeholder['stock_id'] = 255

        missing_ids = set(ALL_STOCKS.keys()) - set(X.loc[:, 'stock_id'].unique())
        # Insert in ascending position order. Each insertion shifts all later
        # rows by one, so inserting a higher position before a lower one would
        # leave the higher placeholder one slot too far down.
        for position in sorted(ALL_STOCKS[stock] for stock in missing_ids):
            X = Insert_row_(position, X, placeholder)
            if y is not None:
                y = np.insert(y, position, .0)

    return X.loc[:, used_cols].to_numpy(), y

def tf_data_generator(df, used_col, y_col=None):
    """Endless generator yielding ``(features, labels)`` tensors for random time_ids.

    Each step draws one time_id uniformly at random (with replacement) and
    builds its input with ``get_one_input``.

    Args:
        df: DataFrame with ``time_id``, ``stock_id`` and feature columns.
        used_col: feature columns to feed to the model.
        y_col: label column. It must be given: the labels are converted to a
            tensor unconditionally, which fails for ``None``.
    """
    # The set of time_ids does not change between steps; compute it once.
    time_ids = df.time_id.unique()
    while True:
        tid = np.random.choice(time_ids)
        # Use the caller-supplied column list; this previously read the
        # notebook-level `used_cols` and ignored the parameter.
        X, y = get_one_input(df, tid, used_col, y_col)
        X = tf.convert_to_tensor(X, tf.float64, name='features')
        y = tf.convert_to_tensor(y, tf.float64, name='labels')
        yield X, y


def tf_data_generator2(df, used_col, y_col=None):
    """Endless generator that visits every time_id once per pass, in random order.

    A pool of time_ids is drawn from without replacement; when the pool is empty
    it is refilled with all time_ids of ``df``, starting a new pass.

    Args:
        df: DataFrame with ``time_id``, ``stock_id`` and feature columns.
        used_col: feature columns to feed to the model.
        y_col: label column. It must be given: the labels are converted to a
            tensor unconditionally, which fails for ``None``.
    """
    remaining = np.array([])
    while True:
        if len(remaining) == 0:
            # Start a new pass over all time_ids.
            remaining = df.time_id.unique()

        tid = np.random.choice(remaining)
        remaining = np.setdiff1d(remaining, [tid])

        # Use the caller-supplied column list; this previously read the
        # notebook-level `used_cols` and ignored the parameter.
        X, y = get_one_input(df, tid, used_col, y_col)

        X = tf.convert_to_tensor(X, tf.float64, name='features')
        y = tf.convert_to_tensor(y, tf.float64, name='labels')

        yield X, y

def getPredictionFromOneTimeId(model, X_df, time_id, used_cols):
    """Predict one time_id and return predictions only for stocks that have data.

    The model is fed the padded input from ``get_one_input`` as a batch of one;
    the outputs at the positions of stocks missing from ``X_df`` for this
    time_id are removed from the flattened prediction.
    """
    features, _ = get_one_input(X_df, time_id, used_cols, None)
    batch = np.expand_dims(features, axis=0)
    predicted = model.predict(batch).flatten()

    present = X_df.loc[X_df.time_id == time_id]
    if present.shape[0] >= len(ALL_STOCKS.keys()):
        # Every stock has a row: nothing was padded.
        return predicted

    # Drop the outputs that correspond to padded (missing) stocks.
    missing_ids = set(ALL_STOCKS.keys()) - set(present['stock_id'].unique())
    padded_positions = [ALL_STOCKS[stock] for stock in missing_ids]
    return np.delete(predicted, padded_positions)

def getPredictionFromTheModel(model, X_df, used_cols):
    """Predict every time_id of ``X_df`` and pair predictions with the raw targets.

    Args:
        model: object with a Keras-style ``predict`` method.
        X_df: DataFrame with ``time_id``, ``row_id``, ``target_orig`` and the
            feature columns.
        used_cols: feature columns fed to the model.

    Returns:
        DataFrame with the columns ``row_id``, ``predict`` and ``target``
        (taken from ``target_orig``), indexed 0..N-1; an empty DataFrame when
        ``X_df`` has no rows.
    """
    per_time_id = []

    for j in X_df.time_id.unique():
        predicted = getPredictionFromOneTimeId(model, X_df, j, used_cols)
        rows = j == X_df.time_id
        targets = X_df.loc[rows, 'target_orig'].to_numpy()
        row_ids = X_df.loc[rows, 'row_id'].to_numpy()
        if len(predicted) != len(targets):
            # Print the shapes to help diagnose the mismatch before the frame
            # construction below fails.
            print(targets.shape)
            print(predicted.shape)
            print(row_ids.shape)
        per_time_id.append(pd.DataFrame({'row_id':row_ids, 'predict':predicted, 'target':targets}))

    # Concatenate once instead of growing a DataFrame on every iteration.
    if not per_time_id:
        return pd.DataFrame()
    return pd.concat(per_time_id, ignore_index=True)

def plotHistory(history):
    """Plot the training and validation loss stored in a Keras ``History`` object."""
    curves = (
        ('loss', 'Training loss'),
        ('val_loss', 'Validation'),
    )
    for key, label in curves:
        plt.plot(history.history[key], label=label)

    plt.title('History')
    plt.ylabel('Loss value')
    plt.xlabel('No. epoch')
    plt.legend(loc="upper left")
    plt.show()

In [None]:
#split data
train_df, val_df, test_df = getTrainData(ret_df, seed = 42)
#fit normalizer and scaler
scaler, normalizer = firScalerAndNormalizer(train_df, used_cols=used_cols)

#scale data (the scaler is fitted on the training split only)
train_df.loc[:,used_cols] = scaler.transform(train_df.loc[:,used_cols])
val_df.loc[:,used_cols] = scaler.transform(val_df.loc[:,used_cols])
test_df.loc[:,used_cols] = scaler.transform(test_df.loc[:,used_cols])

display(train_df.loc[:,used_cols].head(1))
print('train_shape', train_df.loc[:,used_cols].shape)
print('train elements:',len(train_df.time_id.unique()))
print('test elements:',len(test_df.time_id.unique()))
print('val elements:',len(val_df.time_id.unique()))

#create tf datasets
n_feat = len(used_cols)

def _time_id_dataset(df):
    """Wrap tf_data_generator2 over ``df`` as a tf.data.Dataset of (features, labels)."""
    return tf.data.Dataset.from_generator(
        lambda: tf_data_generator2(df, used_cols, y_col),
        output_signature=(
            tf.TensorSpec(shape=(None,n_feat), dtype=tf.float64),
            tf.TensorSpec(shape=(None,), dtype=tf.float64)))

train_ds = _time_id_dataset(train_df)
val_ds = _time_id_dataset(val_df)
test_ds = _time_id_dataset(test_df)

#collect all data and save for faster training
# The generator never ends and visits each time_id once per pass, so taking one
# element per distinct time_id materialises exactly one full pass. The counts
# are derived from the data instead of being hard-coded.
tf.data.experimental.save(train_ds.take(train_df.time_id.nunique()), "./saved_train_ds")
tf.data.experimental.save(test_ds.take(test_df.time_id.nunique()), "./saved_test_ds")
tf.data.experimental.save(val_ds.take(val_df.time_id.nunique()), "./saved_val_ds")


In [None]:
#load datasets
# All three saved datasets share the same element layout:
# a (rows, n_feat) feature matrix and a (rows,) label vector, both float64.
saved_element_spec = (
    tf.TensorSpec(shape=(None,n_feat), dtype=tf.float64),
    tf.TensorSpec(shape=(None,), dtype=tf.float64))

train_ds = tf.data.experimental.load("./saved_train_ds", element_spec=saved_element_spec)
test_ds = tf.data.experimental.load("./saved_test_ds", element_spec=saved_element_spec)
val_ds = tf.data.experimental.load("./saved_val_ds", element_spec=saved_element_spec)

In [None]:
def printAndReturnModelDescrAndErrors(model, model_id, y_test, predict_y):
    """Print and return the R2 and RMSPE of a prediction.

    Both scores are rounded to 5 decimals. Returns a dict with the keys
    ``model``, ``R2`` and ``RMSPE``.
    """
    scores = {
        'R2': round(r2_score(y_true = y_test, y_pred = predict_y), 5),
        'RMSPE': round(rmspe(y_true = y_test, y_pred = predict_y), 5),
    }
    print(f"Model {model_id} Performance of the prediction: R2 score: {scores['R2']}, RMSPE: {scores['RMSPE']}")

    return {'model': model, **scores}

In [None]:
def train_tf_model2(train_ds, val_ds, test_ds, model_id, epochs, steps, n_feat=8, y_size=112, batch=256, patience=20):
    """Build, train and evaluate a dense network predicting all stocks of a time_id at once.

    The network flattens a ``(y_size, n_feat)`` input, applies two ReLU layers
    of 200 units and emits ``y_size`` outputs. It is trained with MSE loss and
    Adam (learning rate 0.001), with early stopping on ``val_loss``.

    Args:
        train_ds: unbatched dataset of (features, labels); repeated, shuffled and
            batched here.
        val_ds: unbatched validation dataset; the first 96 elements are used,
            one per batch.
        test_ds: unused; kept so existing callers keep working. The evaluation
            reads the notebook-level ``test_df`` and ``used_cols`` instead.
        model_id: used as the Keras model name.
        epochs: maximum number of epochs.
        steps: training steps per epoch.
        n_feat: number of features per stock.
        y_size: number of stocks (input rows and output units).
        batch: training batch size.
        patience: early-stopping patience in epochs.

    Returns:
        dict with the keys ``model``, ``R2``, ``RMSPE`` and ``history``.
    """
    cbk_es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='min', restore_best_weights=True)

    input_price = tf.keras.Input(shape=(y_size, n_feat), name='input_price')
    hidden = layers.Flatten()(input_price)
    hidden = layers.Dense(200, activation='relu')(hidden)
    hidden = layers.Dense(200, activation='relu')(hidden)
    out = layers.Dense(y_size)(hidden)

    model = tf.keras.Model(inputs=input_price, outputs=out, name=str(model_id))
    model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer=tf.keras.optimizers.Adam(0.001),
                  metrics=['mae'])

    train_ds = train_ds.repeat().shuffle(3000).batch(batch).prefetch(tf.data.AUTOTUNE)
    val_ds = val_ds.batch(1).take(96)

    # summary() prints the table itself and returns None, so it is not wrapped
    # in display().
    model.summary()
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        verbose=2,
        epochs=epochs,
        steps_per_epoch = steps,
        callbacks = [cbk_es]
    )

    # Evaluate on the held-out test frame, one time_id at a time.
    df_predicted = getPredictionFromTheModel(model, test_df, used_cols)
    if cfg['useLabelTransformation']:
        # Undo the target scaling (min-max, then the factor 100) applied in getTrainData.
        df_predicted.loc[:,'predict'] = scaler_target.inverse_transform(df_predicted.loc[:,'predict'].to_numpy().reshape(-1,1)).flatten() / 100
        print('Test')

    model_descr = printAndReturnModelDescrAndErrors(model, "Test Model", df_predicted.loc[:,'target'].to_numpy(), df_predicted.loc[:,'predict'].to_numpy())
    model_descr['history'] = history
    return model_descr

In [None]:
batch_size = 256
# One epoch should cover roughly one pass over the training time_ids; the count
# is taken from the data instead of the hard-coded 3638.
steps = (train_df.time_id.nunique() // batch_size)+1
#steps=114
model_descr = train_tf_model2(train_ds, val_ds, test_ds, "tf_model_2", 1500, steps, n_feat=len(used_cols), batch=batch_size, y_size=len(ALL_STOCKS.keys()), patience=50)

In [None]:
# Collect every entry under the test order-book directory.
# NOTE(review): glob does not guarantee an ordering; confirm nothing downstream relies on it.
list_order_book_file_test = glob.glob(path_to_files + '/book_test.parquet/*')
# Peek at the first path as a sanity check.
list_order_book_file_test[:1]

In [None]:
%%time
# Build the test feature table (sequential alternative: constructBookDataDataFrame).
# Note that this replaces the training features held in ret_df.
ret_df = preprocessor(list_order_book_file_test, isTrain=False)
ret_df = ret_df.sort_values(by='stock_id').reset_index(drop=True)
# Assign the column directly: with `.loc[:, 'stock_id'] = ...` pandas >= 2.0 writes
# the values into the existing column, which keeps its object dtype.
ret_df['stock_id'] = ret_df['stock_id'].astype(np.int16)
print(ret_df.shape)
ret_df.head()

In [None]:
def getTestPredictionFromTheModel(model, X_df, used_cols):
    """Predict every time_id of ``X_df`` and return the result in submission layout.

    Args:
        model: object with a Keras-style ``predict`` method.
        X_df: DataFrame with ``time_id``, ``row_id`` and the feature columns.
        used_cols: feature columns fed to the model.

    Returns:
        DataFrame with the columns ``row_id`` and ``target``, indexed 0..N-1;
        an empty DataFrame when ``X_df`` has no rows.
    """
    per_time_id = []

    for j in X_df.time_id.unique():
        predicted = getPredictionFromOneTimeId(model, X_df, j, used_cols)
        row_ids = X_df.loc[j == X_df.time_id, 'row_id'].to_numpy()
        per_time_id.append(pd.DataFrame({'row_id':row_ids, 'target':predicted}))

    # Concatenate once instead of growing a DataFrame on every iteration.
    if not per_time_id:
        return pd.DataFrame()
    return pd.concat(per_time_id, ignore_index=True)


In [None]:
# Local test prediction will not work if the model was not trained with stock_id 0.
# For a proper submission the model should be trained on all stocks.
df_predicted = getTestPredictionFromTheModel(model_descr['model'], ret_df, used_cols)
# The file must be named `submission.csv`; it was previously misspelled `submisson.csv`.
df_predicted.to_csv('submission.csv',index = False)
df_predicted.head()