# Optimal variable extraction: 12X FASTER!

**August 2021**

**If you use parts of this notebook in your scripts/notebooks, giving  some kind of credit would be very much appreciated :)  You can for instance link back to this `notebook`, and `upvote it`. Thanks!**



In this notebook, I implement my own numpy version of the `realized_volatility_per_time_id` function, a crucial function for prediction, using only `numpy` arrays. My implementation performs about 12 times faster than Optiver's original pandas implementation.

**The results of the numpy implementation are shown at the end of the notebook.**

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import numpy as np
import plotly.express as px
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Load the competition's train.csv from the Kaggle input directory.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# Preview the first rows (shown as the cell output).
train.head()

In [None]:
# Paths of every entry directly under the book_train.parquet directory.
# NOTE(review): presumably one entry per stock (``stock_id=<n>``) -- confirm.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')


In [None]:
# NOTE(review): despite its name, this holds the sorted, de-duplicated book
# file *paths* returned by glob above, not time ids.
time_id_unique = np.unique(list_order_book_file_train)

# Pandas function ( original implementation of the tutorial notebook )

In [None]:
def log_return(wap):
    """Return the first difference of the natural logarithm of ``wap``.

    ``wap`` must still expose ``.diff()`` after ``np.log`` is applied
    (e.g. a pandas Series); for a Series the first element of the result
    is NaN because it has no predecessor.
    """
    log_prices = np.log(wap)
    return log_prices.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    ``series_log_return`` is squared element-wise, summed with ``np.sum``
    and the square root of that total is returned.
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def wap_1(df):
    """Compute the level-1 weighted average price and store it in ``df['wap1']``.

    Each price is weighted by the size quoted on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.

    Side effect: ``df`` gains (or has overwritten) a ``wap1`` column.
    Returns that column.
    """
    cross_weighted_price = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    df['wap1'] = cross_weighted_price / total_size
    return df['wap1']
def wap_2(df):
    """Compute the level-2 weighted average price and store it in ``df['wap2']``.

    Each price is weighted by the size quoted on the opposite side:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.

    Side effect: ``df`` gains (or has overwritten) a ``wap2`` column.
    Returns that column.
    """
    cross_weighted_price = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    df['wap2'] = cross_weighted_price / total_size
    return df['wap2']


def wap_logreturn(df,index):
    """Compute the realized volatility of ``wap{index}`` for each ``time_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and a ``wap{index}`` column.
    index : int or str
        Suffix selecting the WAP column (``1`` -> ``wap1``, ``2`` -> ``wap2``).

    Returns
    -------
    pandas.Series
        One realized-volatility value per ``time_id`` that has at least one
        non-null log return, in the group order produced by ``groupby``,
        with a fresh 0..n-1 index.

    Side effect: a ``log_return{index}`` column is added to the ``df`` that
    was passed in.
    """
    wap_col = 'wap{}'.format(index)
    log_return_col = 'log_return{}'.format(index)

    # Use ``transform`` rather than ``apply``: it always yields a result
    # aligned with df's own index. On pandas >= 2.0, ``apply`` prepends the
    # group key to the index, and the column assignment then fails.
    df[log_return_col] = df.groupby(['time_id'])[wap_col].transform(log_return)

    # The first row of every time_id has no previous price, so its log
    # return is NaN; drop those rows before aggregating.
    valid_rows = df[~df[log_return_col].isnull()]

    ## Compute the realized volatility of the stock per time id
    df_realized_vol_per_stock = pd.DataFrame(
        valid_rows.groupby(['time_id'])[log_return_col].agg(realized_volatility)
    ).reset_index()

    return df_realized_vol_per_stock[log_return_col]



def realized_volatility_per_time_id(file_path):
    """Pandas baseline: realized volatility of both WAPs for one book file.

    Reads the parquet file at ``file_path``, derives ``wap1``/``wap2``,
    and computes their realized volatility per ``time_id`` with
    ``wap_logreturn``. The stock id is the text after the first ``=`` in
    ``file_path``.

    Returns a DataFrame with columns ``row_id`` (``"<stock_id>-<time_id>"``),
    ``rv1`` and ``rv2``.
    """
    book = pd.read_parquet(file_path)

    ## Calculate WAP
    book['wap1'] = wap_1(book)
    book['wap2'] = wap_2(book)

    ## Log returns are taken inside each time_id group so the lag never
    ## crosses from one time_id into the next.
    result = pd.DataFrame()
    result['rv1'] = wap_logreturn(book, 1)
    result['rv2'] = wap_logreturn(book, 2)

    # Unique values of the frame's first column, cast to int.
    first_column = np.array(book)[:, 0]
    result['time_id'] = np.unique(first_column).flatten().astype(int)

    ## Extract the stock id from the path
    stock_id = file_path.split('=')[1]

    def _row_id(x):
        return f'{stock_id}-{x}'

    result['row_id'] = result['time_id'].apply(_row_id)
    return result[['row_id', 'rv1', 'rv2']]

In [None]:
# Time the pandas implementation on the first book file; the result ``p``
# is kept for the comparison against the numpy version at the end.
%time p = realized_volatility_per_time_id(list_order_book_file_train[0])


# Numpy Implementation 

In [None]:
## numpy_indexed provides a group-by operation on plain numpy arrays;
## it is installed here and used below as ``npi.group_by``.
!pip3 install numpy_indexed
import numpy_indexed as npi


I re-implement the computation of the log return and the realized volatility because the `groupby` aggregation and `apply` calls appear to be what slows the pandas version down.
- The function is pretty simple: `wap_logreturn_numpy` uses plain lists and numpy arrays to extract the realized volatility.
    - The `index` argument selects which WAP column is used: `0` for `wap1` and `1` for `wap2`.
    


In [None]:
def wap_logreturn_numpy(df,index):
    """Compute the realized volatility of one WAP column per time id with numpy.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Column 0 holds the time id and the last two columns hold the WAPs.
        Columns are selected by position, not by name.
    index : int
        ``0`` selects the second-to-last column, ``1`` the last column.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(n_time_ids, 2)``: column 0 is the
        sorted unique time id, column 1 its realized volatility.
    """
    # Keep full double precision. Casting to float32 would round each WAP
    # to about 7 significant digits, a visible error on small log returns.
    np_df = np.asarray(df, dtype=np.float64)
    time_ids = np_df[:, 0]
    np_df_unique_id = np.unique(time_ids).reshape(-1, 1)

    # One array of WAP values per time id, in sorted time-id order.
    wap_by_time_id = npi.group_by(time_ids).split(np_df[:, -2 + index])

    # Build a new list instead of overwriting the split result in place:
    # the log-return array is one element shorter than its group, which
    # cannot be stored back if ``split`` returned a 2-D array.
    rv_list = [
        realized_volatility(np.diff(np.log(group_wap)))
        for group_wap in wap_by_time_id
    ]

    rv_array = np.array(rv_list, dtype=np.float64).reshape(-1, 1)
    return np.concatenate((np_df_unique_id, rv_array), axis=1)

 - `numpy_realized_volatility_per_time_id` is the top-level function: it calls `wap_1` and `wap_2`, and then the previous function, `wap_logreturn_numpy`.

- The stock id is taken from the file path with `split('=')`, and the `row_id` strings are built by looping over the unique time ids.

In [None]:
def numpy_realized_volatility_per_time_id(file_path):
    """Numpy version: realized volatility of both WAPs for one book file.

    Reads the parquet file at ``file_path``, derives ``wap1``/``wap2`` and
    computes their realized volatility per time id with
    ``wap_logreturn_numpy``. The stock id is the integer after the first
    ``=`` in ``file_path``.

    Returns a DataFrame with columns ``row_id`` (``"<stock_id>-<time_id>"``),
    ``rv1`` and ``rv2``, one row per unique time id in sorted order.

    Raises ``ValueError`` if the text after ``=`` is not an integer.
    """
    df = pd.read_parquet(file_path)

    # Sorted unique time ids, read from the first column only so the whole
    # frame is not converted to an array.
    unique_time_ids = np.unique(df.iloc[:, 0].to_numpy())

    ## Calculate WAP (these become the last two columns, which is what
    ## wap_logreturn_numpy selects by position)
    df['wap1'] = wap_1(df)
    df['wap2'] = wap_2(df)

    ## Log returns are taken inside each time id group so the lag never
    ## crosses from one time id into the next.
    df_realized_vol_per_stock = pd.DataFrame()
    df_realized_vol_per_stock['rv1'] = wap_logreturn_numpy(df,0)[:,-1]
    df_realized_vol_per_stock['rv2'] = wap_logreturn_numpy(df,1)[:,-1]

    ## Extract the stock id. The builtin ``int`` replaces the ``np.int``
    ## alias, which was removed in NumPy 1.24.
    stock_id = int(file_path.split('=')[1])

    df_realized_vol_per_stock['row_id'] = [
        f'{stock_id}-{int(time_id)}' for time_id in unique_time_ids
    ]

    return df_realized_vol_per_stock[['row_id','rv1','rv2']]

# Load the first book file into a module-level ``df`` and add both WAP columns.
# NOTE(review): the timed functions read the file themselves, so this ``df``
# looks like it is only for interactive inspection -- confirm.
df = pd.read_parquet(list_order_book_file_train[0])
df['wap1'] = wap_1(df)
df['wap2'] = wap_2(df)

In [None]:
# Time the numpy implementation on the same book file; the result ``n``
# is compared with the pandas result ``p`` in the next cell.
%time n = numpy_realized_volatility_per_time_id(list_order_book_file_train[0])

In [None]:
# Element-wise equality between the numpy (``n``) and pandas (``p``) results.
# NOTE(review): ``==`` compares the floating-point rv columns exactly, so
# values that differ only by rounding are reported as False.
n[['row_id','rv1','rv2']] == p[['row_id','rv1','rv2']]