In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Location of the order-book data; it is listed with os.listdir further down,
# so it is treated as a directory containing one entry per stock.
book_train_path = '../input/optiver-realized-volatility-prediction/book_train.parquet'
# Training table, later merged into the per-bucket features (its `target` column is plotted).
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')


> Let's compute the realized volatility for every 1-minute window in the bucket.

In [None]:
def get_volatility(wap_val):
    """Return the realized volatility of a price series.

    The value is the square root of the sum of squared differences between
    consecutive natural logs of ``wap_val``.

    Parameters
    ----------
    wap_val : array-like of positive numbers
        Prices in time order.

    Returns
    -------
    numpy floating scalar
        ``0.0`` when ``wap_val`` has fewer than two elements, because there
        are no consecutive pairs to difference.
    """
    log_returns = np.diff(np.log(wap_val))
    return np.sqrt((log_returns ** 2).sum())

def get_volatility_per_minute(row):
    """Compute the realized volatility of each 1-minute window of a bucket.

    Parameters
    ----------
    row : object with ``seconds_in_bucket`` and ``wap`` attributes
        Both attributes are converted with ``np.array`` and must be aligned
        element by element (``wap[k]`` is the price at ``seconds_in_bucket[k]``).

    Returns
    -------
    list
        Ten values, one per window ``[0, 60], [60, 120], ..., [540, 600]``
        (both ends inclusive, so an observation exactly on a boundary is
        part of both neighbouring windows). A window without observations
        contributes ``0``.
    """
    seconds_in_bucket = np.array(row.seconds_in_bucket)
    wap = np.array(row.wap)
    rv = []

    for window_end in np.arange(60, 601, 60):
        window_start = window_end - 60
        # Select prices with a boolean mask over the full arrays. Taking
        # np.where of the *filtered timestamps* would instead yield positions
        # inside the filtered array (and skip a timestamp equal to 0), which
        # do not line up with `wap`.
        in_window = (seconds_in_bucket >= window_start) & (seconds_in_bucket <= window_end)
        wap_val = wap[in_window]
        if len(wap_val) == 0:
            rv.append(0)
            continue
        rv.append(get_volatility(wap_val))
    return rv

def get_bucket_volatility(max_stocks=10):
    """Build per-bucket realized-volatility features from the order-book files.

    Every entry found under the module-level ``book_train_path`` is read with
    ``pd.read_parquet``, a weighted average price (``wap``) is derived from
    the level-1 bid/ask prices and sizes, and the rows are grouped by
    ``(stock_id, time_id)``.

    Parameters
    ----------
    max_stocks : int or None, default 10
        Maximum number of directory entries to process, in the order
        ``os.listdir`` returns them (that order is not sorted here).
        ``None`` processes every entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id``, ``min_rv`` (list produced by
        ``get_volatility_per_minute``) and ``bucket_rv`` (value produced by
        ``get_volatility`` over the whole bucket). An empty DataFrame is
        returned when no entry was processed.
    """
    frames = []
    for i, filepath in enumerate(os.listdir(book_train_path)):
        if max_stocks is not None and i >= max_stocks:
            break
        path = os.path.join(book_train_path, filepath)
        # The stock id is whatever follows the last '=' in the entry name
        # (presumably entries look like "stock_id=<n>" -- confirm against the data).
        stock_id = int(filepath.split('=')[-1])

        df = pd.read_parquet(path)
        df['stock_id'] = stock_id
        # Each best price is weighted by the size on the opposite side of the book.
        df['wap'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])
        df['wap'] /= (df['ask_size1'] + df['bid_size1'])

        # One row per bucket, with timestamps and prices collected into lists.
        df = df.groupby(['stock_id', 'time_id'])[['seconds_in_bucket', 'wap']].agg(list).reset_index()
        df['min_rv'] = df.apply(get_volatility_per_minute, axis=1)
        df['bucket_rv'] = df['wap'].apply(get_volatility)

        frames.append(df[['stock_id', 'time_id', 'min_rv', 'bucket_rv']].copy())

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
%%time
# Compute the per-bucket volatility features and attach the training table.
# merge() is called without `on=`, so pandas joins on the columns the two
# frames have in common.
bucket_df = get_bucket_volatility().merge(train)

bucket_df.head()

In [None]:
def visualize_bucket_volatility(stock_id, time_id):
    """Plot the per-minute volatility of one bucket with its bucket RV and target.

    The row is looked up in the module-level ``bucket_df``. Three curves are
    drawn: the per-minute values followed by half of ``bucket_rv`` (green),
    the per-minute values followed by half of ``target`` (red), and the
    per-minute values alone (blue, unlabelled). The notebook does not state
    why the two trailing values are halved.

    Parameters
    ----------
    stock_id, time_id
        Values compared against the ``stock_id`` and ``time_id`` columns.

    Raises
    ------
    IndexError
        If no row of ``bucket_df`` matches, because the first matching row
        is read with ``.values[0]``.
    """
    is_match = (bucket_df.stock_id == stock_id) & (bucket_df.time_id == time_id)
    matched = bucket_df[is_match].copy()

    per_minute = matched.min_rv.values[0]
    half_bucket_rv = matched.bucket_rv.values[0] / 2
    half_target = matched.target.values[0] / 2

    plt.figure(figsize=(10, 5))
    # `per_minute + [x]` relies on min_rv being a Python list (as returned by
    # get_volatility_per_minute), so `+` appends one trailing point.
    plt.plot(per_minute + [half_bucket_rv], color='g', label='Bucket RV')
    plt.plot(per_minute + [half_target], color='r', label='Target')

    plt.plot(per_minute, color='b')
    plt.legend(loc='upper left')
    plt.title("StockId:{} - Time Id:{}".format(stock_id, time_id))
    plt.show()

In [None]:
# Signed gap between the training target and the volatility computed over the
# whole bucket (positive when the target is larger).
bucket_df['rv_diff'] = bucket_df['target'].sub(bucket_df['bucket_rv'])
bucket_df.head()

In [None]:
# Two example buckets of the same stock.
visualize_bucket_volatility(stock_id=97, time_id=5)
visualize_bucket_volatility(stock_id=97, time_id=31)


# Work In Progress...