# Simple Solution with Optimal Variable Extraction: 12x Faster!

 See my notebook [here](https://www.kaggle.com/rayanaay/12x-faster-variable-extraction-using-numpy) for a minimal example where I compare execution time between Numpy and Pandas.

If you use parts of this notebook in your scripts/notebooks, giving some kind of credit would be very much appreciated :) You can for instance link back to this notebook, and upvote it. Thanks!

## Import libraries

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import numpy as np
import plotly.express as px
# Silence every warning for the rest of the notebook (this also hides
# deprecation notices coming from numpy / pandas).
warnings.filterwarnings('ignore')
# Training targets; later cells read its 'stock_id', 'time_id' and 'target' columns.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# Preview the first rows of the training table.
train.head()

In [None]:
# One path per entry found directly under the book / trade training parquet folders.
# NOTE(review): the per-file functions below parse the stock id from the text
# after '=' in each path, so every entry is expected to contain '=<stock_id>'.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
list_order_trade_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')

## Some useful functions to calculate WAP

In [None]:
def log_return(wap):
    """Return the element-to-element difference of the natural log of ``wap``.

    ``wap`` must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result has no predecessor and is NaN.
    """
    log_wap = np.log(wap)
    return log_wap.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def wap_1(df):
    """Compute the level-1 weighted average price and store it in ``df['wap1']``.

    Each price is weighted by the size on the opposite side of the book.
    The column is written into ``df`` in place and also returned.
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    df['wap1'] = cross_weighted / (bid_size + ask_size)
    return df['wap1']
def wap_2(df):
    """Compute the level-2 weighted average price and store it in ``df['wap2']``.

    Each price is weighted by the size on the opposite side of the book.
    The column is written into ``df`` in place and also returned.
    """
    bid_price, ask_price = df['bid_price2'], df['ask_price2']
    bid_size, ask_size = df['bid_size2'], df['ask_size2']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    df['wap2'] = cross_weighted / (bid_size + ask_size)
    return df['wap2']

Because internet access must be disabled for submission, I load the library from a Kaggle dataset.

In [None]:
## I use this package to exploit the groupby function under numpy
!pip install  ../input/numpy-indexed-v035/numpy_indexed-0.3.5-py2.py3-none-any.whl

import numpy_indexed as npi


# Numpy implementation for computing realized volatility on all the data

In [None]:
def wap_logreturn_numpy(df,index):
    """Compute the realized volatility of one WAP column for every group id.

    Args:
        df: Table (DataFrame or 2-D array-like) that is converted to a
            ``float32`` array. Column 0 is the grouping id; the column read is
            ``-2 + index`` (callers in this file append ``wap1`` and ``wap2``
            as the last two columns).
        index: ``0`` selects the second-to-last column, ``1`` the last one.

    Returns:
        A 2-D array with one row per unique id: column 0 is the id (ascending,
        from ``np.unique``) and column 1 the realized volatility of that group.
        The group order produced by ``npi.group_by`` is assumed to match.
    """
    np_df = np.array(df).astype(np.float32)
    np_df_unique_id = np.unique(np_df[:, 0]).reshape(-1, 1)

    wap_by_time_id = npi.group_by(np_df[:, 0]).split(np_df[:, -2 + index])

    # Build a fresh list instead of writing the log returns back into the
    # split result: a log-return series is one element shorter than its WAP
    # series, so the write-back only works when the container is a plain list
    # of arrays (it breaks or broadcasts if `split` hands back a 2-D array).
    rv_list = [
        realized_volatility(np.diff(np.log(wap)))
        for wap in wap_by_time_id
    ]

    rv_array = np.array(rv_list).reshape(-1, 1)
    return np.concatenate((np_df_unique_id, rv_array), axis=1)

- `numpy_realized_volatility_per_time_id` is the top-level function: it calls the `wap_1` and `wap_2` functions, as well as the previous one, `wap_logreturn_numpy`.

- The stock id is parsed from the file path with `split`, and the `row_id` strings are then built by looping over the time ids.

# Deal with Trade Data

In [None]:
def trade_per_time_id(file_path):
    """Aggregate one stock's trade file into per-``time_id`` mean features.

    Args:
        file_path: Path to a parquet file/partition. The stock id is the
            integer found between the first and second ``'='`` of the path
            (e.g. ``.../trade_train.parquet/stock_id=0``).

    Returns:
        A DataFrame with one row per ``time_id`` (ascending) and the columns
        ``row_id`` (``"<stock_id>-<time_id>"``), ``seconds_in_bucket``,
        ``price``, ``size`` and ``order_count``; the four feature columns hold
        the mean over all rows of that ``time_id``.
    """
    feature_cols = ['seconds_in_bucket', 'price', 'size', 'order_count']
    df_trade = pd.read_parquet(file_path)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is its replacement.
    stock_id = int(file_path.split('=')[1])

    # groupby sorts its keys, so the index is the ascending list of time ids
    # and the row ids below line up with the aggregated rows by construction.
    per_time_id = df_trade.groupby('time_id')[feature_cols].mean()
    row_ids = [f'{stock_id}-{int(time_id)}' for time_id in per_time_id.index]

    df_realized_vol_per_stock = per_time_id.reset_index(drop=True)
    df_realized_vol_per_stock.insert(0, 'row_id', row_ids)
    return df_realized_vol_per_stock

def trade_groupby(list_file):
    """Build the per-time-id trade features for every file and stack them.

    Args:
        list_file: Iterable of trade file paths, each passed to
            ``trade_per_time_id``.

    Returns:
        The row-wise concatenation of all per-file frames (each frame keeps
        its own index), or an empty DataFrame when ``list_file`` is empty.
    """
    frames = [trade_per_time_id(f1) for f1 in list_file]
    if not frames:
        # pd.concat raises on an empty sequence; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every file.
    return pd.concat(frames)


In [None]:
# Aggregate every training trade file into one table keyed by row_id.
# The bare `time` prefix was a syntax error (most likely a `%time` cell magic
# that lost its `%`); inside a notebook, prefix this line with `%time` to
# measure the run time again.
df_trade_train = trade_groupby(list_order_trade_file_train)

# Deal with Book Data

In [None]:
def numpy_realized_volatility_per_time_id(file_path):
    """Compute per-time-id realized volatilities for one stock's book file.

    Args:
        file_path: Path to a book parquet file/partition. The stock id is the
            integer found between the first and second ``'='`` of the path.

    Returns:
        A DataFrame with the columns ``row_id`` (``"<stock_id>-<time_id>"``),
        ``rv1`` and ``rv2`` (realized volatility of ``wap1`` / ``wap2``), one
        row per unique value of the file's first column, in ascending order.
    """
    df_book = pd.read_parquet(file_path)

    # The first column is the id that wap_logreturn_numpy groups on; read it
    # directly instead of converting the whole frame to an array.
    np_df_unique_id = np.unique(df_book.iloc[:, 0].to_numpy())

    ## Calculate WAP (the two columns end up last, which is what
    ## wap_logreturn_numpy's `-2 + index` column lookup relies on)
    df_book['wap1'] = wap_1(df_book)
    df_book['wap2'] = wap_2(df_book)

    ## Realized volatility per id: index 0 -> wap1, index 1 -> wap2
    df_realized_vol_per_stock = pd.DataFrame()
    df_realized_vol_per_stock['rv1'] = wap_logreturn_numpy(df_book, 0)[:, -1]
    df_realized_vol_per_stock['rv2'] = wap_logreturn_numpy(df_book, 1)[:, -1]

    ## Extract the stock id; `np.int` was removed in NumPy 1.24, so use `int`.
    stock_id = int(file_path.split('=')[1])
    df_realized_vol_per_stock['row_id'] = [
        f'{stock_id}-{int(time_id)}' for time_id in np_df_unique_id
    ]

    return df_realized_vol_per_stock[['row_id', 'rv1', 'rv2']]

def past_realized_volatility_per_book(list_file):
    """Compute the book-based realized volatilities for every file and stack them.

    Args:
        list_file: Iterable of book file paths, each passed to
            ``numpy_realized_volatility_per_time_id``.

    Returns:
        The row-wise concatenation of all per-file frames (each frame keeps
        its own index), or an empty DataFrame when ``list_file`` is empty.
    """
    frames = [numpy_realized_volatility_per_time_id(path) for path in list_file]
    if not frames:
        # pd.concat raises on an empty sequence; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every file.
    return pd.concat(frames)




In [None]:
# Realized-volatility features (row_id, rv1, rv2) for every training book file.
df_past_realized_train = past_realized_volatility_per_book(list_order_book_file_train)

Merge the two dataframes on the `row_id` column.

In [None]:
# Left join: every book row is kept; rows without a matching trade row get
# NaN in the trade feature columns.
df_data = df_past_realized_train.merge(df_trade_train, on = ['row_id'], how='left')

In [None]:
# Key the targets the same way as the feature tables: "<stock_id>-<time_id>".
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train_ = train[['row_id','target']]
# Attach the features to each target row, then drop every row that has a
# missing value in any column (e.g. no matching book/trade features).
df_joined = train_.merge(df_data[['row_id','rv1', 'rv2', 'seconds_in_bucket', 'price', 'size',
       'order_count']], on = ['row_id'], how = 'left').dropna()

# Preprocessing

In [None]:
## Add new variable Rv3
## Add new variable Rv3
# rv3 is rv1 raised to the power rv2 (element-wise `**`).
# NOTE(review): confirm that a power, rather than a product or ratio, is the
# intended interaction feature.
df_joined['rv3'] = df_joined['rv1'] ** df_joined['rv2'] 

In [None]:
## Split the data
## Split the data
# Features and target are both moved to log10 space before fitting;
# the target is kept as a one-column DataFrame.
X_train_ = np.log10(
    df_joined[['rv1','rv2','rv3','price', 'size', 'order_count']]
)
y_train = np.log10(df_joined[['target']])

# Linear Regression

In [None]:

from sklearn.linear_model import LinearRegression

# Ordinary least squares on the log10-transformed features and target.
lr = LinearRegression()
lr = lr.fit(X_train_,y_train)
# Score of the fit on the training data itself, in log10 space.
lr.score(X_train_,y_train)

In [None]:
# Predictions come out in log10 space; raise 10 to that power to undo the transform.
y_pred_train_log10 = lr.predict(X_train_)
y_pred_train = 10**(y_pred_train_log10)

In [None]:
from sklearn.metrics import r2_score

# R^2 on the original (untransformed) target scale, still on training data.
R2 = r2_score(df_joined['target'].values, y_pred_train)

print(R2)

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

# Flatten both sides to 1-D arrays so rmspe yields a single number. With the
# one-column DataFrame `y_train`, np.mean inside rmspe returns either a
# per-column Series or a scalar depending on the pandas version.
RMSPE = round(rmspe((10**y_train).to_numpy().ravel(), y_pred_train.ravel()), 3)
print(f'Performance of the naive prediction:, RMSPE: {RMSPE}')

# Submission

In [None]:
def log_return(wap):
    """Return the element-to-element difference of the natural log of ``wap``.

    ``wap`` must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result has no predecessor and is NaN.
    """
    log_wap = np.log(wap)
    return log_wap.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def wap_1(df):
    """Compute the level-1 weighted average price and store it in ``df['wap1']``.

    Each price is weighted by the size on the opposite side of the book.
    The column is written into ``df`` in place and also returned.
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    df['wap1'] = cross_weighted / (bid_size + ask_size)
    return df['wap1']
def wap_2(df):
    """Compute the level-2 weighted average price and store it in ``df['wap2']``.

    Each price is weighted by the size on the opposite side of the book.
    The column is written into ``df`` in place and also returned.
    """
    bid_price, ask_price = df['bid_price2'], df['ask_price2']
    bid_size, ask_size = df['bid_size2'], df['ask_size2']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    df['wap2'] = cross_weighted / (bid_size + ask_size)
    return df['wap2']


def wap_logreturn(df,index):
    """Compute the realized volatility of ``wap<index>`` for every ``time_id``.

    Side effect: adds a ``log_return<index>`` column to ``df`` holding the log
    return of each row relative to the previous row of the same ``time_id``
    (NaN on the first row of each ``time_id``).

    Args:
        df: DataFrame with a ``time_id`` column and a ``wap<index>`` column.
        index: Suffix of the WAP column to use (``1`` or ``2`` in this file).

    Returns:
        A Series named ``log_return<index>`` with a 0..n-1 index and one value
        per ``time_id`` in ascending ``time_id`` order: the square root of the
        sum of squared log returns. A ``time_id`` with a single row has no log
        return and yields ``0.0``, so the result always has one entry per
        unique ``time_id``.
    """
    wap_col = 'wap{}'.format(index)
    return_col = 'log_return{}'.format(index)

    # A grouped diff keeps the original row index, so the assignment aligns
    # on every pandas version (assigning the result of groupby.apply does not
    # once apply prepends the group key to the index).
    df[return_col] = np.log(df[wap_col]).groupby(df['time_id']).diff()

    # The grouped sum skips the leading NaN of each time_id, so NaN rows need
    # not be filtered out first; filtering would drop single-row time ids and
    # shift every later value against the caller's list of time ids.
    sum_of_squares = (df[return_col] ** 2).groupby(df['time_id']).sum()
    return np.sqrt(sum_of_squares).reset_index(drop=True)



def realized_volatility_per_time_id(file_path):
    """Compute per-time-id realized volatilities for one book file (pandas path).

    The stock id is the text between the first and second ``'='`` of
    ``file_path``. Returns a DataFrame with the columns ``row_id``
    (``"<stock_id>-<time_id>"``), ``rv1`` and ``rv2``.
    """
    book = pd.read_parquet(file_path)

    ## Calculate WAP
    book['wap1'] = wap_1(book)
    book['wap2'] = wap_2(book)

    ## Realized volatility of each WAP level, one value per time id
    per_time_id = pd.DataFrame()
    for level in (1, 2):
        per_time_id[f'rv{level}'] = wap_logreturn(book, level)

    ## Unique values of the first column, in ascending order; they are paired
    ## positionally with the volatilities computed above.
    first_column = np.array(book)[:, 0]
    per_time_id['time_id'] = np.unique(first_column).flatten().astype(int)

    ## Extract the stock id and build the row identifiers
    stock_id = file_path.split('=')[1]

    def build_row_id(time_id):
        return f'{stock_id}-{time_id}'

    per_time_id['row_id'] = per_time_id['time_id'].apply(build_row_id)

    return per_time_id[['row_id', 'rv1', 'rv2']]


def past_realized_volatility_per_stock(list_file):
    """Compute the book-based realized volatilities for every file and stack them.

    Args:
        list_file: Iterable of book file paths, each passed to
            ``realized_volatility_per_time_id``.

    Returns:
        The row-wise concatenation of all per-file frames (each frame keeps
        its own index), or an empty DataFrame when ``list_file`` is empty.
    """
    frames = [realized_volatility_per_time_id(path) for path in list_file]
    if not frames:
        # pd.concat raises on an empty sequence; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every file.
    return pd.concat(frames)

In [None]:
# Same listing as for training, but over the test book / trade parquet folders.
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
list_order_trade_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*')

# Book features come from the pandas implementation defined in this section;
# trade features reuse trade_groupby, keeping only the columns the model uses.
df_book_test  = past_realized_volatility_per_stock(list_file=list_order_book_file_test)
df_trade_test = trade_groupby(list_order_trade_file_test)[['row_id','price','size','order_count']]                                                      


In [None]:
# Left join keeps every book row; trade columns are NaN where no trade row matches.
df_test = df_book_test.merge(df_trade_test, on=['row_id'], how='left')
# Replace each missing feature value with the mean of its own column.
df_test = df_test.fillna(
    df_test[['rv1','rv2','price', 'size','order_count']].mean()
)


In [None]:
# Same derived feature as in training: rv1 raised to the power rv2.
df_test['rv3'] = df_test['rv1'] ** df_test['rv2']
# Column order matches the feature order used to fit the model.
df_test2 = df_test[['rv1','rv2','rv3','price', 'size','order_count']]
# (disabled) a second mean-fill over the selected feature columns:
#df_test2 = df_test2.apply(lambda x: x.fillna(x.mean()),axis=0)
# Same log10 transform as applied to the training features.
X_test = np.log10(df_test2)



In [None]:
# Predict in log10 space, flatten to 1-D, and undo the transform with 10**.
y_pred_test = 10**lr.predict(X_test).flatten()
y_pred_test

In [None]:
# Take an explicit copy so that adding the prediction column modifies an
# independent frame rather than a selection of df_test (chained assignment).
df_test_2 = df_test[['row_id']].copy()
df_test_2['target'] = y_pred_test

In [None]:
# Write the row_id / target pairs to submission.csv without the index column.
df_test_2.to_csv('submission.csv',index = False)