In [None]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm

from numpy.testing import assert_almost_equal

## Introduction

This notebook builds on [the work done here](https://www.kaggle.com/slawekbiel/naive-but-fast-submission) to show how to use numpy to speed up data processing.

I've timed a numpy implementation against a pandas implementation that each build a dataframe of realized volatility for all `stock_id` and `time_id` values. The comparison shows that numpy is around 10x quicker than the pandas implementation given in the introductory notebook.

As you build out more features, using the numpy implementation will:

* Enable much quicker iterations (hours -> minutes) 🧠
* Speed up submissions ☕️
* Streamline your workflow 🚴

If you like this notebook, please show your appreciation with an upvote 😃

In [None]:
def compute_realized_volatility(wap_values):
    """Return the realized volatility of a sequence of weighted average prices.

    The realized volatility is the square root of the sum of squared
    consecutive log returns, i.e. sqrt(sum(diff(log(wap)) ** 2)).

    Args:
        wap_values: Array-like of weighted average prices in time order.

    Returns:
        The realized volatility as a numpy float. A sequence with fewer than
        two prices has no returns and therefore yields 0.0.
    """
    # Consecutive differences of log prices are the log returns
    squared_returns = np.diff(np.log(wap_values)) ** 2
    return np.sqrt(squared_returns.sum())

def get_time_id_data_splits(time_ids, data):
    """Returns zipped unique time_id and associated data.

    Args:
        time_ids: 1-D array-like holding the time_id of every row.
        data: Array-like with one entry (along axis 0) per row of ``time_ids``.

    Returns:
        An iterator of ``(time_id, data_split)`` pairs, one per unique
        time_id in ascending time_id order. ``data_split`` holds the rows of
        ``data`` belonging to that time_id, in their original relative order.
        Empty input yields an empty iterator.
    """
    time_ids = np.asarray(time_ids)
    data = np.asarray(data)

    # Splitting at first-occurrence indices is only correct when rows are
    # ordered by ascending time_id. If they are not, reorder with a stable
    # sort so rows keep their relative order within each time_id.
    if time_ids.size > 1 and np.any(time_ids[1:] < time_ids[:-1]):
        order = np.argsort(time_ids, kind="stable")
        time_ids = time_ids[order]
        data = data[order]

    # Get unique_time_ids and the index at which each one first appears
    unique_time_ids, indices = np.unique(time_ids, return_index=True)

    # The first boundary is always index zero, which np.split must not be given
    data_splits = np.split(data, indices[1:])

    return zip(unique_time_ids, data_splits)

def create_order_df(input_path):
    """Builds the per-stock realized-volatility table using numpy.

    Reads one stock's order-book parquet data, computes the weighted average
    price (WAP) from the level-1 bid/ask columns and reduces it to one
    realized volatility value per time_id.

    Args:
        input_path: Path to the stock's parquet data. The text after the last
            "=" must be the integer stock id (e.g. ``.../stock_id=0``).

    Returns:
        A pandas dataframe with columns:
            - time_id
            - realized_volatility
            - stock_id

    Raises:
        IndexError: if ``input_path`` contains no "=".
        ValueError: if the text after the last "=" is not an integer.
    """
    # Only these columns are used, so avoid loading the rest of the book
    columns = ['time_id', 'bid_price1', 'bid_size1', 'ask_price1', 'ask_size1']
    df = pd.read_parquet(input_path, columns=columns)

    # Compute weighted average price
    time_ids, bid_price, bid_size, ask_price, ask_size = (df[col].values for col in columns)
    wap = (bid_price * ask_size + ask_price * bid_size) / (ask_size + bid_size)

    # Compute realized vol for all time_ids
    time_id_splits = list(get_time_id_data_splits(time_ids, wap))

    # Convert data into a pandas dataframe
    output_df = pd.DataFrame({
        "time_id": [time_id for time_id, _ in time_id_splits],
        "realized_volatility": [
            compute_realized_volatility(wap_split) for _, wap_split in time_id_splits
        ],
    })

    # Add stock_id column; rsplit keeps this correct even when a parent
    # directory name also contains "="
    stock_id = int(input_path.rsplit("=", 1)[1])
    output_df["stock_id"] = stock_id

    return output_df

In [None]:
%%time
# Locate the parquet data of every stock in the training book
book_files = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Build one frame per stock with the numpy implementation, then stack them
data = [create_order_df(input_path) for input_path in tqdm(book_files)]
order_df = pd.concat(data)
print(order_df.shape)

In [None]:
def log_return(list_stock_prices):
    """Return the consecutive log returns of a pandas Series of prices.

    The result has the same index as the input; its first entry is NaN
    because the first price has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def create_order_df_pandas(input_path):
    """Builds the per-stock realized-volatility table using pandas groupby.

    Reads one stock's order-book parquet data, computes the weighted average
    price (WAP) from the level-1 bid/ask columns, takes log returns of the
    WAP within each time_id and aggregates them into a realized volatility.

    Args:
        input_path: Path to the stock's parquet data. The text after the last
            "=" must be the integer stock id (e.g. ``.../stock_id=0``).

    Returns:
        A pandas dataframe with columns:
            - time_id
            - stock_id
            - realized_volatility

    Raises:
        IndexError: if ``input_path`` contains no "=".
        ValueError: if the text after the last "=" is not an integer.
    """
    df = pd.read_parquet(input_path)

    # rsplit keeps this correct even when a parent directory name contains "="
    stock_id = int(input_path.rsplit("=", 1)[1])
    df["stock_id"] = stock_id

    df['wap'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])

    # transform (rather than apply) always returns a result aligned to df's
    # own index; on pandas >= 2.0 apply prepends the group key to the index
    # and the column assignment fails.
    df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)

    output_df = df.groupby(['time_id', 'stock_id']).agg(
        realized_volatility=("log_return", realized_volatility),
    ).reset_index()
    return output_df

In [None]:
%%time
# Locate the parquet data of every stock in the training book
book_files = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Build one frame per stock with the pandas implementation, then stack them
data = [create_order_df_pandas(input_path) for input_path in tqdm(book_files)]
order_df_pandas = pd.concat(data)
print(order_df_pandas.shape)

In [None]:
# Ensure computed values are almost equal
# Both frames were built from separate glob.glob calls, whose file order is
# unspecified, so align rows on the (stock_id, time_id) key before comparing
# instead of relying on the two frames sharing the same row order.
key_cols = ["stock_id", "time_id"]
order_df_sorted = order_df.sort_values(key_cols).reset_index(drop=True)
order_df_pandas_sorted = order_df_pandas.sort_values(key_cols).reset_index(drop=True)

assert len(order_df_sorted) == len(order_df_pandas_sorted), "row counts differ"
for col in order_df.columns:
    assert_almost_equal(
        order_df_sorted[col].to_numpy(),
        order_df_pandas_sorted[col].to_numpy(),
        decimal=6,
    )