In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm
from numba import njit, jit


def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Each error is divided by the corresponding true value before it is
    squared, so zero entries in ``y_true`` lead to a division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

def weighted_average_price(df, N=1):
    """Compute the weighted average price (WAP) at order-book level ``N``.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the total size quoted at that level.

    Parameters
    ----------
    df : object exposing ``bid_price{N}``, ``ask_price{N}``, ``bid_size{N}``
        and ``ask_size{N}`` as attributes (e.g. a pandas DataFrame with
        those columns).
    N : int, default 1
        Order-book level. Any level whose four fields are present on ``df``
        is accepted, not only 1 and 2.

    Returns
    -------
    The element-wise result of
    ``(bid_price * ask_size + bid_size * ask_price) / (bid_size + ask_size)``.

    Raises
    ------
    ValueError
        If ``N`` is not an integer value or ``df`` lacks one of the fields
        for that level. An explicit exception is raised instead of
        ``assert False`` because assertions are stripped under ``python -O``,
        which would make the function silently return ``None``.
    """
    # Accept anything that compares equal to an integer (1, 1.0, numpy ints).
    try:
        level = int(N)
    except (TypeError, ValueError, OverflowError):
        level = None
    if level is None or level != N:
        raise ValueError(f'N must be an integer order-book level, got {N!r}')

    names = {
        field: f'{field}{level}'
        for field in ('bid_price', 'ask_price', 'bid_size', 'ask_size')
    }
    missing = [name for name in names.values() if not hasattr(df, name)]
    if missing:
        raise ValueError(
            f'order-book level {level} is not available; missing fields: {missing}'
        )

    bid_price = getattr(df, names['bid_price'])
    ask_price = getattr(df, names['ask_price'])
    bid_size = getattr(df, names['bid_size'])
    ask_size = getattr(df, names['ask_size'])
    return (bid_price * ask_size + bid_size * ask_price) / (bid_size + ask_size)

def log_return(list_stock_prices):
    """Return the first difference of the natural logarithm of the prices.

    The logged input must provide ``.diff()`` (e.g. a pandas Series); the
    first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

# Aggregation per stock and time-id: explanation

- The book table must be aggregated both to calculate the `realized_volatility` and to match the aggregation level of the submission file.
- When aggregating, we lose the row level data for all columns.
- To reduce loss of information, we add a few statistical functions for each column.
- The feature dictionary below defines which statistical aggregations are applied to each column of the book table.

In [None]:
# Base statistics applied to every column inside ``groupby(...).agg(...)``.
# String aliases are used instead of ``np.mean`` / ``np.std``: pandas replaces
# those numpy callables with its own ``mean`` / ``std`` (sample std, ddof=1)
# and, from pandas 2.1, warns that this substitution will stop. The aliases
# pin that behaviour and yield the same '<column>-mean' / '<column>-std' names.
base_agg = ['mean', 'std']

# Maps each book column to the list of aggregations computed for it.
# Key order is kept as-is because it determines the feature column order.
feature_dictionary = {
    **{
        column: base_agg
        for column in (
            'bid_price1', 'bid_price2',
            'ask_price1', 'ask_price2',
            'bid_size1', 'bid_size2',
            'ask_size1', 'ask_size2',
            'wap1', 'wap2',
        )
    },
    # the log-return columns additionally get the realized volatility
    'log_return1': base_agg + [realized_volatility],
    'log_return2': base_agg + [realized_volatility],
}


def calculate_features(file_list):
    """Aggregate order-book files into one feature row per file and time_id.

    For every path in ``file_list`` the parquet file is read, ``wap1`` /
    ``wap2`` and their log returns are added, and the rows are aggregated per
    ``time_id`` according to ``feature_dictionary``.

    Log returns are computed *within* each ``time_id``. Differencing the whole
    file at once would give the first row of every ``time_id`` a return
    measured against the last row of another ``time_id``, which leaks into the
    realized volatility of that bucket.

    Parameters
    ----------
    file_list : iterable of str
        Paths readable by ``pd.read_parquet``. The text after the last ``=``
        in each path is used as the row_id prefix (presumably the stock id of
        a ``stock_id=<n>`` partition directory; verify against the data
        layout).

    Returns
    -------
    pandas.DataFrame
        One row per (file, time_id) with flattened ``'<column>-<aggregation>'``
        feature columns plus a ``row_id`` column of the form
        ``'<prefix>-<time_id>'``. The index is a fresh 0..n-1 range.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``file_list`` is empty.
    """
    lst = []
    # read the book data one file at a time
    for g in tqdm(file_list):
        df = pd.read_parquet(g)
        # calculate wap and add as a column
        df['wap1'] = weighted_average_price(df)
        df['wap2'] = weighted_average_price(df, N=2)
        # calculate log-return per time_id so no return spans two buckets
        by_time_id = df.groupby('time_id')
        df['log_return1'] = by_time_id['wap1'].transform(log_return)
        df['log_return2'] = by_time_id['wap2'].transform(log_return)
        # remove the null created by 'diff' on the first row of each time_id
        df = df[~df.log_return1.isnull()]

        # calculate aggregation; rv=realized_volatility
        rv_per_stock_timeid = df.groupby('time_id').agg(feature_dictionary)
        # groupby with multiple aggregations creates multi-index columns;
        # flatten them to '<column>-<aggregation>'
        rv_per_stock_timeid.columns = ['-'.join(x) for x in rv_per_stock_timeid.columns]
        # add row-id
        prefix = g.split("=")[-1]
        rv_per_stock_timeid['row_id'] = [f'{prefix}-{time_id}' for time_id in rv_per_stock_timeid.index]
        # drop the index, which is the time-id
        rv_per_stock_timeid = rv_per_stock_timeid.reset_index(drop=True)

        lst.append(rv_per_stock_timeid)

    # ignore_index gives the combined frame a unique index instead of
    # repeating 0..k for every file
    return pd.concat(lst, ignore_index=True)

In [None]:
# glob returns paths in arbitrary order. Sorting fixes the row order of the
# feature table, which is what the seeded train/test split further down
# depends on for reproducible results.
rv_per_stock_timeid = calculate_features(
    sorted(glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')))

# Get Training Target Values

In [None]:
%%time
# get the target values; read the ids as strings so they can be joined into row_id
target = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/train.csv',
    dtype={'stock_id':str, 'time_id':str, 'target':float}
)
# create row_id with a vectorised string concatenation
# (avoids a Python-level iterrows() loop over every training row)
target['row_id'] = target['stock_id'] + '-' + target['time_id']
# keep only the necessary columns
target = target.loc[:,['row_id','target']]

In [None]:
# xy holds features and target side by side: every feature row is kept and
# its target is looked up by row_id
xy = pd.merge(
    rv_per_stock_timeid,
    target,
    on='row_id',
    how='left',
)
print(xy.shape)

In [None]:
# preview the first two merged rows
xy.head(n=2)

# Model Building, Evaluation and Prediction

## Evaluation

In [None]:
# hold out 10% of the rows for evaluation; every column except the identifier
# and the target is used as a feature
xtrain, xtest, ytrain, ytest = train_test_split(
    xy.drop(columns=['row_id', 'target']),
    xy['target'],
    test_size=0.1,
    random_state=42,
)
print(len(xtrain), len(xtest))

# fit LightGBM with default hyper-parameters and score it on the held-out rows
mdl = LGBMRegressor(random_state=42).fit(xtrain, ytrain)
print('rmspe:', rmspe(ytest, mdl.predict(xtest)))

# Submission

In [None]:
# collect the order-book files of the test set
list_order_book_file_test = glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
print(len(list_order_book_file_test))
# build the same aggregated features as for training
sub = calculate_features(list_order_book_file_test)
# model inputs: everything except the identifier (and a target column, if any)
feats = sub.drop(columns=['row_id', 'target'], errors='ignore')
pred = mdl.predict(feats)
# pair each row_id with its prediction and write the submission file
result = sub[['row_id']].assign(target=pred)
result.to_csv('submission.csv', index=False)