# Import libraries and create helper functions
The helper functions below:
- Compute **Weighted-Average-Price (WAP)**
- Compute **log_return** of WAP
- Compute **volatility** of WAP_log_return
- Compute **Volatility** of a list
- Open **books and trades**
- **Merge** book and trade on given **time_id**

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import IPython
import glob
import plotly.graph_objects as go
from plotly.subplots import make_subplots
dsp = IPython.display.display


def book_path(stock_id):
    """Return the parquet partition path of the training order book for `stock_id`."""
    root = '../input/optiver-realized-volatility-prediction/book_train.parquet'
    return '{}/stock_id={}'.format(root, stock_id)

def open_book(stock_id, time_id=None):
    """Load the order book of one stock.

    The parquet data at ``book_path(stock_id)`` is read, a ``stock_id``
    column is filled with `stock_id`, and the rows are ordered by
    ``time_id`` then ``seconds_in_bucket``.

    When `time_id` is given, only the rows of that time_id are returned;
    otherwise the whole book is returned.
    """
    frame = pd.read_parquet(book_path(stock_id))
    frame.loc[:, 'stock_id'] = stock_id
    frame = frame.sort_values(['time_id', 'seconds_in_bucket'])
    if time_id is not None:
        frame = frame[frame['time_id'] == time_id]
    return frame

def trade_path(stock_id):
    """Return the parquet partition path of the training trades for `stock_id`."""
    root = '../input/optiver-realized-volatility-prediction/trade_train.parquet'
    return '{}/stock_id={}'.format(root, stock_id)

def open_trades(stock_id, time_id=None):
    """Load the trades of one stock.

    The parquet data at ``trade_path(stock_id)`` is read and a ``stock_id``
    column is filled with `stock_id`. Rows are returned in the order they
    were read (no sorting is applied here).

    When `time_id` is given, only the rows of that time_id are returned;
    otherwise every trade is returned.
    """
    frame = pd.read_parquet(trade_path(stock_id))
    frame.loc[:, 'stock_id'] = stock_id
    if time_id is not None:
        frame = frame[frame['time_id'] == time_id]
    return frame


def set_wap(df):
    """Add a ``wap`` (weighted average price) column to `df` in place.

    The level-1 bid price is weighted by the level-1 ask size and the
    level-1 ask price by the level-1 bid size; the sum is divided by the
    total level-1 size. Nothing is returned.
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    df.loc[:, 'wap'] = cross_weighted / (bid_size + ask_size)

def set_log_return(df):
    """Add a ``log_return`` column to `df` in place.

    ``log_return`` is the row-to-row difference of ``log(wap)``. If `df`
    has no ``wap`` column yet, it is computed first with ``set_wap``.

    The first row has no predecessor, so its ``log_return`` is NaN. The row
    is kept: `df` is modified in place and never shortened. Nothing is
    returned.
    """
    if 'wap' not in df.columns:
        # Call the helper directly so this works even before it is attached
        # to pd.DataFrame as a method.
        set_wap(df)
    df.loc[:, 'log_return'] = np.log(df['wap']).diff()

def log_return_volatility(df):
    """Return the volatility of the ``log_return`` column of `df`.

    When `df` has no ``log_return`` column yet, it is added first through
    the ``set_log_return`` DataFrame method (which modifies `df` in place).
    """
    has_returns = 'log_return' in df.columns
    if not has_returns:
        df.set_log_return()
    returns = df['log_return']
    return returns.volatility()

def volatility(serie):
    """Return the square root of the sum of the squared values of `serie`."""
    squared = serie ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def get_time_id(df, time_id):
    """Return the rows of `df` whose ``time_id`` column equals `time_id`."""
    in_bucket = df['time_id'] == time_id
    return df[in_bucket]

def get_third(df, n):
    """Return the rows of `df` that fall in the `n`-th third of a 600 s bucket.

    `n` is 0-based: 0 selects ``seconds_in_bucket`` in [0, 200), 1 selects
    [200, 400) and 2 selects [400, 600).

    The interval is half-open so that every second belongs to exactly one
    third; with two strict bounds the rows at 0, 200 and 400 seconds would
    be left out of every third.
    """
    third = 600 / 3
    seconds = df['seconds_in_bucket']
    return df[(seconds >= n * third) & (seconds < (n + 1) * third)]


def get_merge(book, trades, time_id):
    """Stack the trades and the order book of one `time_id` into one frame.

    Both inputs are restricted to `time_id`, copied, and sorted by
    ``stock_id``, ``time_id`` and ``seconds_in_bucket``. The book part gets
    a ``log_return`` column. The result is the trade rows followed by the
    book rows (a row-wise concatenation, not a key-based join).
    """
    sort_keys = ['stock_id', 'time_id', 'seconds_in_bucket']
    book_rows = book.get_tId(time_id).copy().sort_values(sort_keys)
    trade_rows = trades.get_tId(time_id).copy().sort_values(sort_keys)
    book_rows.set_log_return()
    return pd.concat([trade_rows, book_rows])


# Attach the helpers to pandas so they can be called as methods,
# e.g. ``book.get_tId(5).log_return_volatility()``.

# In-place column builders
pd.DataFrame.set_wap = set_wap
pd.DataFrame.set_log_return = set_log_return

# Row selectors
pd.DataFrame.get_tId = get_time_id
pd.DataFrame.get_third = get_third

# Volatility measures
pd.Series.volatility = volatility
pd.DataFrame.log_return_volatility = log_return_volatility

# Initialize variables
- Open training target
- Get train/test stock_ids

In [None]:
data_path = '../input/optiver-realized-volatility-prediction/'

# The stock id is the part after '=' in each partition directory name.
train_stock_ids = sorted(
    int(path.split('=')[1])
    for path in glob.glob(data_path + '/book_train.parquet/*'))

test_stock_ids = sorted(
    int(path.split('=')[1])
    for path in glob.glob(data_path + '/book_test.parquet/*'))

# Values to predict (future 10 minute volatility), indexed by
# (stock_id, time_id) so a single target can be looked up directly.
fut_vol = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/train.csv'
).set_index(['stock_id', 'time_id'])
dsp(fut_vol.head())

# Create interactive data visualization
The graph shows the ask/bid prices and sizes for different time_ids and stock_ids.

In [None]:
# One interactive figure per stock: a slider switches between the first
# time_ids of the stock, each showing sizes (top), prices and trades (middle)
# and log returns with volatility markers (bottom).
for stock_id in range(5,7):

    book = open_book(stock_id)
    trades = open_trades(stock_id)
    time_ids = book.time_id.unique()[:10]
    if len(time_ids) == 0:
        # Nothing to plot for this stock.
        continue

    # Create general figure
    fig = make_subplots(rows=3, cols=1,
                        row_heights=[.15, .7, .15],
                        vertical_spacing = 0.02,
                        shared_yaxes=False,
                        shared_xaxes=True,
                        specs=[[{"secondary_y": False}], [{"secondary_y": False}], [{"secondary_y": True}]] )

    # Add the traces of every time_id, all hidden at first; the same number
    # of traces is added for each time_id.
    for time_id in time_ids:
        df = get_merge(book, trades, time_id)
        fig.add_trace(  # TOP GRAPH
            go.Scatter(x=df.seconds_in_bucket, y=df['ask_size1'],
                       name='ask_size1', fill='tozeroy',
                       line=dict(color=('rgba(0, 200, 0, 0.3)')),
                       visible=False),
            row=1, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=df['ask_size1'] + df['ask_size2'],
                       name='ask_size1&2', fill='tonexty',
                       line=dict(color=('rgba(0, 200, 0, 0.3)')),
                       visible=False),
            row=1, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=-df['bid_size1'],
                       name='bid_size1', fill='tozeroy',
                       line=dict(color=('rgba(200, 0, 0, 0.3)')),
                       visible=False),
            row=1, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=-df['bid_size1'] - df['bid_size2'],
                       name='bid_size1&2', fill='tonexty',
                       line=dict(color=('rgba(200, 0, 0, 0.3)')),
                       visible=False),
            row=1, col=1)
        fig.add_trace(  # MIDDLE GRAPH
            go.Scatter(x=df.seconds_in_bucket, y=df['ask_price1'], name='ask_price1',
                       line=dict(color=('rgba(0, 100, 0, .9)')),
                       visible=False),
            row=2, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=df['ask_price2'], name='ask_price2',
                       line=dict(color=('rgba(0, 200, 0, .9)')),
                       visible=False),
            row=2, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=df['bid_price1'], name='bid_price1',
                       line=dict(color=('rgba(150, 0, 0, 0.9)')),
                       visible=False),
            row=2, col=1)
        fig.add_trace(
            go.Scatter(x=df.seconds_in_bucket, y=df['bid_price2'], name='bid_price2',
                       line=dict(color=('rgba(200, 0, 0, 0.9)')),
                       visible=False),
            row=2, col=1)
        fig.add_trace(
            # Trades: marker size grows with the log of the traded size
            # (book rows have no size; they are filled with 1 first).
            go.Scatter(x=df.seconds_in_bucket, y=df['price'],
                       name='price', mode='markers',
                       line=dict(color=('rgba(0, 0, 200, 0.4)')),
                       marker=dict(size=np.log(df.fillna(1)['size']*1e3)),
                       visible=False),
            row=2, col=1)
        fig.add_trace(  # BOTTOM GRAPH
            go.Scatter(x=df.seconds_in_bucket, y=df['log_return'],
                       name='log_return', fill='tozeroy',
                       line=dict(color=('rgba(0, 0, 20, 0.6)')),
                       visible=False),
            row=3, col=1)
        df_vol = df.log_return_volatility()
        f_vol = fut_vol.loc[stock_id, time_id].target
        c = df.seconds_in_bucket.max()
        # Diamond just before the end of the bucket: volatility of this bucket.
        fig.add_trace(
            go.Scatter(x=[c-10,c,c+10,c,c-10], y=[0,df_vol,0,-df_vol,0],
                       name='log_volatility',
                       fill="toself",
                       visible=False),
            row=3, col=1, secondary_y=True)
        # Diamond just after the end of the bucket: target (future) volatility.
        fig.add_trace(
            go.Scatter(x=[c,c+10,c+20,c+10,c], y=[0,f_vol,0,-f_vol,0],
                       name='future_volatility',
                       fill="toself",
                       visible=False),
            row=3, col=1, secondary_y=True)

    # Derive the group size from the figure instead of hard-coding it, so the
    # slider stays correct if a trace is added or removed above.
    traces_per_id = len(fig.data) // len(time_ids)

    # Show exactly the traces of the first time_id.
    for trace in fig.data[:traces_per_id]:
        trace.visible = True

    # Create and add slider: one step per time_id, showing only its traces
    steps = []
    for i, time_id in enumerate(time_ids):
        visible = [False] * len(fig.data)
        first = i * traces_per_id
        visible[first:first + traces_per_id] = [True] * traces_per_id
        steps.append(dict(
            method="update",
            label=str(time_id),
            args=[{"visible": visible},
                  {"title": "time_id: " + str(time_id)}],  # layout attribute
        ))

    fig.update_layout(
        sliders=[dict(
            active=0,  # matches the traces made visible above
            currentvalue={"prefix": "time_id: "},
            pad={"t": 50},
            steps=steps)],
        autosize=False, width=1200, height=700
    )

    fig.show()

# Extract some features...

In [None]:
def get_feat(bucket):
    """Summarise the order-book rows of one (stock_id, time_id) group.

    Returns a Series with the row count, the log-return volatility of the
    whole bucket and of each of its thirds, the mean WAP and log return,
    the `volatility` of each price level and the mean size of each level.
    """
    # Work on a time-ordered copy: log returns depend on row order, and the
    # helper columns added below must not touch the group passed in.
    bucket = bucket.sort_values('seconds_in_bucket', kind='mergesort')
    # Build 'wap' and 'log_return' up front so every feature below can rely
    # on them, independent of the order of the dictionary entries.
    bucket.set_log_return()
    return pd.Series({
        'npts': len(bucket),
        'lrv': bucket.log_return_volatility(),
        'lrv1': bucket.get_third(0).log_return_volatility(),
        'lrv2': bucket.get_third(1).log_return_volatility(),
        'lrv3': bucket.get_third(2).log_return_volatility(),
        'wapM': bucket.wap.mean(),
        'lrM': bucket.log_return.mean(),
        'ap1v': volatility(bucket.ask_price1),
        'ap2v': volatility(bucket.ask_price2),
        'bp1v': volatility(bucket.bid_price1),
        'bp2v': volatility(bucket.bid_price2),
        'as1m': bucket.ask_size1.mean(),
        'as2m': bucket.ask_size2.mean(),
        'bs1m': bucket.bid_size1.mean(),
        'bs2m': bucket.bid_size2.mean()})


# One feature frame per stock, concatenated once at the end instead of
# re-copying the accumulated frame on every iteration.
stock_frames = []
for stock_id in train_stock_ids:
    book = open_book(stock_id)
    stock_frames.append(
        book.groupby(['stock_id', 'time_id']).apply(get_feat))

# Stocks are stacked in reverse processing order (last stock first).
book_feat = pd.concat(stock_frames[::-1]) if stock_frames else pd.DataFrame()

# Add the target column, matched on the (stock_id, time_id) index.
book_feat = book_feat.join(fut_vol)

# Plot Book-features correlation

In [None]:
# Scatter matrix of every feature pair, at a fixed figure size.
fig = px.scatter_matrix(book_feat).update_layout(
    autosize=False, width=1200, height=700)
fig.show()
# Last expression of the cell: correlation of each column with the target.
book_feat.corr()['target']

In [None]:
# Persist the feature table (with the joined target) in parquet format.
# NOTE(review): the extension is spelled 'parket', not 'parquet' - confirm
# nothing reads the file under this exact name before renaming it.
book_feat.to_parquet('test_out.parket')