# A minimal benchmark
This notebook demonstrates how to quickly reproduce the benchmark solution using pandas aggregation with the Numba engine. Beyond that, I have tried to keep it as simple as possible.

Update 27/07/2021: Thanks to @max2020 for pointing out a bug I had introduced, which was caused by the difference between pandas `.diff` and NumPy `np.diff`.

In [None]:
import numpy as np
import pandas as pd

# Dataset switch read by load_book_data_by_id: a truthy value makes it read the
# 'train' book parquet, a falsy value (the setting here) makes it read 'test'.
DEBUG = 0

In [None]:
# functions
def load_book_data_by_id(stock_id):
    """Read the order-book parquet partition for one stock.

    The module-level ``DEBUG`` flag chooses the dataset: a truthy value reads
    from the ``train`` book, a falsy value reads from the ``test`` book.

    Args:
        stock_id: Value substituted into the ``stock_id=...`` part of the path.

    Returns:
        The object returned by ``pd.read_parquet`` for that partition path.
    """
    if DEBUG:
        train_test = 'train'
    else:
        train_test = 'test'
    partition_path = f'../input/optiver-realized-volatility-prediction/book_{train_test}.parquet/stock_id={stock_id}'
    return pd.read_parquet(partition_path)

def calc_wap(df):
    """Compute the weighted average price from the level-1 quote columns.

    Each side's price is weighted by the size on the opposite side, and the
    sum is divided by the combined level-1 size.

    Args:
        df: Table providing the ``bid_price1``, ``ask_price1``, ``bid_size1``
            and ``ask_size1`` columns.

    Returns:
        The element-wise weighted average price.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return cross_weighted / total_size

def calc_rv_from_wap_numba(values, index):
    """Return the realized volatility of a price series.

    Realized volatility is computed as the square root of the sum of squared
    consecutive log returns of ``values``.

    Args:
        values: Array of prices, in order.
        index: Not used in the computation; kept because the pandas
            ``engine='numba'`` aggregation calls the function with
            ``(values, index)``.

    Returns:
        The realized volatility as a scalar.
    """
    log_prices = np.log(values)
    step_returns = np.diff(log_prices)
    sum_of_squares = np.sum(np.square(step_returns))
    return np.sqrt(sum_of_squares)

In [None]:
# Calculate past realized vol for all stocks
# Collects one frame per stock that could be loaded: time_id plus realized vol.
list_df = []
# NOTE(review): 127 is presumably an upper bound on the stock ids in the
# dataset -- confirm. Ids with no book partition are skipped below.
for stock_id in range(127):
    # loading data
    try:
        df_book = load_book_data_by_id(stock_id)
    except OSError:
        # Missing or unreadable partition for this stock_id: skip it.
        # Only I/O errors are swallowed, so KeyboardInterrupt and genuine
        # programming errors still surface instead of being silently dropped.
        continue
    # make submission for one stock
    df_book['wap'] = calc_wap(df_book)
    # One realized-vol value per time_id, aggregated with the Numba engine.
    df_sub = df_book.groupby('time_id')['wap'] \
        .agg(calc_rv_from_wap_numba, engine='numba') \
        .to_frame() \
        .reset_index()
    # Build the '<stock_id>-<time_id>' key in one vectorized step.
    df_sub['time_id'] = f'{stock_id}-' + df_sub['time_id'].astype(str)
    # add result to list_df
    list_df.append(df_sub)
# Make submission
if not list_df:
    # pd.concat([]) would raise an opaque "No objects to concatenate" error.
    raise ValueError(
        'No book data could be loaded for any stock_id; '
        'check the input path and the DEBUG setting.'
    )
df_submission = pd.concat(list_df)
df_submission = df_submission.rename(columns={'time_id': 'row_id', 'wap': 'target'})
df_submission.to_csv('submission.csv', index=False)