# De-anonymising time from event

It's still unclear what exactly is happening. For a while I thought there was an event between the training data and the target, but it seems that the event occurs before the training data. Optiver even took an extra step to anonymise the time from the event by cutting a random amount of time from the beginning of the data set.

I figured this time from event could make a good feature and could be estimated from the volatilities and returns. The whole approach is explained in this notebook: https://www.kaggle.com/lucasmorin/volatility-maximum-likelihood-estimation (which in turn draws heavily on: https://www.kaggle.com/pcarta/jane-street-time-horizons-and-volatilities).

Note that this is an approximation (assuming constant volatility from observed data is not ideal for a volatility forecasting competition), but a useful one. We also assume independent movements across stocks, which is also far from ideal.

For each time id, we use the initial returns and the estimated realized volatilities of the different stocks to minimise the negative log-likelihood:

$$
\mathcal{L}(\sigma, \Delta T) = \sum_{i = 1}^n \left(\frac{(\Delta W_{i})^2}{2 \sigma_i^2 \, \Delta T} + \frac{1}{2}\log(\Delta T) + \log(\sigma_i)\right) + \text{const}
$$

It allows us to estimate the time elapsed since the price was standardised, which I assume to be the time from the event. If we are dealing with volatilities decaying after an event, this feature might be a good one, with volatilities generally getting lower as the time from event grows.

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [None]:
def calc_wap(df):
    """Return the level-1 weighted average price for every row of `df`.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).

    Args:
        df: frame with columns 'bid_price1', 'ask_price1', 'bid_size1'
            and 'ask_size1'.

    Returns:
        The element-wise weighted average price.
    """
    cross_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return cross_weighted / total_size

def log_return(list_stock_prices):
    """Return successive differences of the natural log of the prices.

    Args:
        list_stock_prices: price sequence whose logarithm exposes `.diff()`
            (e.g. a pandas Series).

    Returns:
        Log returns; the first element has no predecessor and is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    Args:
        series_log_return: log returns supporting element-wise `**`.

    Returns:
        The realized volatility as a scalar.
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def get_first_ret(x):
    """Return the natural log of the first element of `x` (via `.iloc[0]`)."""
    first_value = x.iloc[0]
    return np.log(first_value)

def preprocessor_book(file_path_book):
    """Build per-time_id features from one stock's order-book partition.

    Args:
        file_path_book: path of the parquet partition to read. The integer
            between the first and second '=' in the path (e.g.
            ".../stock_id=12") is taken as the stock id.

    Returns:
        A DataFrame with one row per time_id and the columns
        'time_id_' (the trailing underscore comes from flattening the
        aggregated column index), 'wap_get_first_ret' (log of the first
        WAP in the bucket), 'log_return_realized_volatility', and
        'stock_id' so that each row can be traced back to its stock.

    Raises:
        IndexError: if the path contains no '='.
        ValueError: if the text after '=' is not an integer.
    """
    df_book = pd.read_parquet(file_path_book)
    stock_id = int(file_path_book.split('=')[1])

    # Log returns are differenced within each time_id so that the first
    # observation of a bucket never borrows the previous bucket's price.
    df_book['wap'] = calc_wap(df_book)
    df_book['log_wap'] = np.log(df_book['wap'])
    df_book['log_return'] = df_book.groupby('time_id')['log_wap'].diff()

    create_feature_dict_time = {
        'wap': [get_first_ret],
        'log_return': [realized_volatility],
    }

    # Aggregate over the whole bucket (seconds_in_bucket >= 0).
    in_window = df_book[df_book['seconds_in_bucket'] >= 0]
    df_feature = in_window.groupby(['time_id']).agg(create_feature_dict_time).reset_index()

    # Flatten the (column, aggregation) pairs: ('wap', 'get_first_ret') ->
    # 'wap_get_first_ret'; the group key ('time_id', '') becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Previously the parsed stock id was discarded, leaving the rows
    # identifiable only by their position after concatenation.
    df_feature['stock_id'] = stock_id

    return df_feature


def preprocessor(list_stock_ids, is_train = True):
    """Compute book features for several stocks in parallel and stack them.

    Args:
        list_stock_ids: iterable of stock ids; one book partition is read
            per id.
        is_train: when true the partitions are read from
            "book_train.parquet", otherwise from "book_test.parquet".

    Returns:
        The per-stock results of `preprocessor_book`, concatenated in the
        order of `list_stock_ids` with a fresh integer index.

    Note:
        Reads the module-level `data_dir`, which must be defined before
        this function is called.
    """
    def book_features_for(stock_id):
        # Partition layout: <data_dir>book_{train,test}.parquet/stock_id=<id>
        if not is_train:
            partition_path = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
        else:
            partition_path = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
        return preprocessor_book(partition_path)

    # One delayed task per stock, dispatched on all available cores.
    tasks = (delayed(book_features_for)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)

    return pd.concat(per_stock_frames, ignore_index = True)

def read_train_test():
    """Load the competition's train and test tables and add a `row_id` key.

    `row_id` is "<stock_id>-<time_id>" and is added to both frames. The
    number of training rows is printed as a side effect.

    Returns:
        A `(train, test)` tuple of DataFrames.
    """
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

    # Same key construction for both splits.
    for frame in (train, test):
        stock_part = frame['stock_id'].astype(str)
        time_part = frame['time_id'].astype(str)
        frame['row_id'] = stock_part + '-' + time_part

    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [None]:
%%time

# Root folder of the competition data; `preprocessor` reads this global.
data_dir = '../input/optiver-realized-volatility-prediction/'

train, test = read_train_test()

# Book features for every stock present in each split.
train_stock_ids = train.stock_id.unique()
test_stock_ids = test.stock_id.unique()

train_ = preprocessor(train_stock_ids, is_train=True)
test_ = preprocessor(test_stock_ids, is_train=False)

In [None]:
%%time

def _estimate_time_delta(first_log_prices, realized_vols, initial_delta=25):
    """Maximum-likelihood estimate of the elapsed time for one time_id.

    Minimises, over a single delta_T shared by all n stocks,

        sum_i ( dW_i^2 / (2 sigma_i^2 delta_T)
                + 1/2 log(delta_T) + log(sigma_i) )

    The optimisation runs on log(delta_T) so the estimate stays positive.

    Args:
        first_log_prices: dW_i, one value per stock.
        realized_vols: sigma_i, one value per stock, same order.
        initial_delta: starting value of delta_T for the optimiser.

    Returns:
        The estimated delta_T as a float.
    """
    dW = np.asarray(first_log_prices, dtype=float)
    log_sigma = np.log(np.asarray(realized_vols, dtype=float))
    n_obs = dW.size

    # Neither of these depends on delta_T, so compute them once.
    scaled_sq_sum = np.sum(dW ** 2 / np.exp(2 * log_sigma))
    log_sigma_sum = np.sum(log_sigma)

    def neg_log_likelihood(params):
        log_delta = params[0]
        # The 1/2*log(delta_T) term appears once per stock, hence the
        # factor n_obs; counting it only once inflates the estimate by
        # the number of stocks.
        return (
            0.5 * scaled_sq_sum / np.exp(log_delta)
            + 0.5 * n_obs * log_delta
            + log_sigma_sum
        )

    result = minimize(
        neg_log_likelihood,
        [np.log(initial_delta)],
        method='nelder-mead',
        options={'xatol': 1e-8, 'disp': False},
    )
    return np.exp(result.x[0])


list_time_id = []
list_time_delta = []

# A single groupby pass replaces one boolean-mask scan of the whole frame
# per time_id; sort=False keeps the ids in order of first appearance.
for time_id, dft in train_.groupby('time_id_', sort=False):
    list_time_id.append(time_id)
    list_time_delta.append(
        _estimate_time_delta(dft.wap_get_first_ret, dft.log_return_realized_volatility)
    )

In [None]:
# Estimated elapsed time per time_id, mapped onto every (stock, time_id) row.
map_delta = pd.Series(list_time_delta, index=list_time_id)
train['delta_T'] = train['time_id'].map(map_delta)

# The realized volatilities in `train_` can only be attached by row position,
# so make sure both frames list the same time_ids in the same order first.
# Without this check a mismatch would silently pair each target with another
# row's volatility. NOTE(review): this validates the time_id sequence only;
# the stock order is assumed to follow the order of stocks in `train`.
if len(train_) != len(train) or not np.array_equal(
    train_['time_id_'].to_numpy(), train['time_id'].to_numpy()
):
    raise ValueError(
        "train_ and train are not row-aligned on time_id; "
        "cannot attach realized volatility by position"
    )

train['rv'] = train_['log_return_realized_volatility'].to_numpy()
train['percentage_error'] = (train['target'] - train['rv']) / train['target']

import random
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Second-order regression of realized volatility on the estimated delta_T.
# The scatter colour is drawn at random on every run; the fit line is black.
scatter_color = (random.random(), random.random(), random.random())
sns.regplot(
    x=train['delta_T'],
    y=train['rv'],
    color=scatter_color,
    order=2,
    line_kws={"color": 'black'},
)

As expected, we can see the volatilities decaying as delta_T grows.

In [None]:
# Second-order regression of the target's percentage error on the estimated
# delta_T. The scatter colour is drawn at random on every run; the fit line
# is black.
scatter_color = (random.random(), random.random(), random.random())
sns.regplot(
    x=train['delta_T'],
    y=train['percentage_error'],
    color=scatter_color,
    order=2,
    line_kws={"color": 'black'},
)