# Introduction 
This notebook compares the target and the realized volatility for all available stocks in two chosen time buckets.

_'Target'_ (red line) is the volatility of the second 10-minute period of the time bucket.
_'Bucket volatility'_ (green line) is calculated for the first 10-minute period.

The realized volatility calculated over 1-minute periods (blue line) is rescaled to a 10-minute period.

This notebook is based on the analysis proposed by **doteeee**
[link](https://www.kaggle.com/narendra/optiver-realized-volatility-eda)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math # TeKa change 
from matplotlib.collections import LineCollection # TeKa change 

In [None]:
# Directory with the order-book data; get_bucket_volatility() lists its entries
# and reads each one with pd.read_parquet.
book_train_path='../input/optiver-realized-volatility-prediction/book_train.parquet'
# Training table; it is merged into the per-bucket volatilities below and
# supplies the `target` column used by the plots.
train=pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')


> Let's get the volatility for every 1-minute interval in the bucket.

In [None]:
def get_volatility(wap_val):
    """Return the realized volatility of a sequence of WAP values.

    The result is the square root of the sum of squared differences between
    consecutive log prices. A sequence with fewer than two values has no
    differences, so the result is 0.
    """
    log_returns = np.diff(np.log(wap_val))
    return np.sqrt(np.sum(log_returns ** 2))

def get_volatility_per_minute(row):
    """Return ten per-minute realized volatilities, rescaled by sqrt(10).

    Parameters
    ----------
    row : object exposing ``seconds_in_bucket`` and ``wap`` attributes
        Two equal-length sequences: the timestamp (in seconds) of every
        sample in the bucket and the WAP observed at that timestamp.

    Returns
    -------
    list
        One value per window [0, 60], [60, 120], ..., [540, 600] (both ends
        inclusive). Each value is ``get_volatility`` of the WAP samples in
        the window multiplied by ``sqrt(10)``; a window without samples
        contributes 0.
    """
    seconds_in_bucket = np.asarray(row.seconds_in_bucket)
    wap = np.asarray(row.wap)
    rv = []

    for e in np.arange(60, 601, 60):
        s = e - 60
        # Select with a boolean mask over the full arrays. Taking np.where()
        # of the already-filtered timestamps would yield positions relative
        # to the filtered sub-array (and skip a timestamp equal to 0), which
        # then picks the wrong WAP samples.
        in_window = (seconds_in_bucket >= s) & (seconds_in_bucket <= e)
        wap_val = wap[in_window]

        if len(wap_val) == 0:
            rv.append(0)
            continue
        # Rescale the 1-minute figure to the 10-minute period.
        rv.append(get_volatility(wap_val) * math.sqrt(10))
    return rv

def get_bucket_volatility(max_stocks=10):
    """Compute per-minute and whole-bucket volatility for each stock / time bucket.

    Lists the entries of the module-level ``book_train_path`` directory,
    reads each one as parquet, derives the WAP from the level-1 bid/ask
    prices and sizes, and aggregates per ``(stock_id, time_id)``.

    Parameters
    ----------
    max_stocks : int or None, default 10
        Maximum number of directory entries to process, in ``os.listdir``
        order. ``None`` processes every entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id``, ``min_rv`` (list produced by
        ``get_volatility_per_minute``) and ``bucket_rv`` (``get_volatility``
        over all WAP samples of the bucket). Empty when nothing was read.
    """
    entries = os.listdir(book_train_path)
    if max_stocks is not None:
        entries = entries[:max_stocks]

    frames = []
    for filepath in entries:
        path = os.path.join(book_train_path, filepath)
        # Entry names are expected to end in '=<integer stock id>'.
        stock_id = int(filepath.split('=')[-1])

        df = pd.read_parquet(path)
        df['stock_id'] = stock_id
        # Each side's price is weighted by the size on the opposite side.
        df['wap'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])
        df['wap'] /= (df['ask_size1'] + df['bid_size1'])

        # One row per (stock_id, time_id) holding the full lists of samples.
        df = df.groupby(['stock_id', 'time_id'])[['seconds_in_bucket', 'wap']].agg(list).reset_index()
        df['min_rv'] = df.apply(get_volatility_per_minute, axis=1)
        df['bucket_rv'] = df['wap'].apply(get_volatility)

        frames.append(df[['stock_id', 'time_id', 'min_rv', 'bucket_rv']])

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop, which
    # would recopy all accumulated rows on every iteration.
    return pd.concat(frames)

In [None]:
%%time
# Build the per-(stock_id, time_id) volatility table from the order-book files.
bucket_df=get_bucket_volatility()
# No `on=` is given, so pandas joins on the columns both frames share
# (presumably stock_id and time_id - confirm against train.csv); the default
# join type is inner, so rows without a match are dropped.
bucket_df=bucket_df.merge(train)

bucket_df.head(10)

In [None]:
def visualize_bucket_volatility(stock_id, time_id):
    """Plot per-minute, bucket and target volatility for one stock / time bucket.

    Reads the module-level ``bucket_df`` and draws:

    * blue  - the ten per-minute values from ``min_rv`` at x = 1..10,
    * green - ``bucket_rv`` as a horizontal segment over x in [0, 10],
    * red   - ``target`` as a horizontal segment over x in [10, 20].

    Parameters
    ----------
    stock_id, time_id
        Values matched against the ``stock_id`` and ``time_id`` columns of
        ``bucket_df``; the first matching row is plotted.

    Raises
    ------
    IndexError
        If ``bucket_df`` has no row for the requested pair.
    """
    sample_df = bucket_df[(bucket_df.stock_id == stock_id) & (bucket_df.time_id == time_id)]
    if sample_df.empty:
        raise IndexError(
            "No data for stock_id={} and time_id={}".format(stock_id, time_id)
        )

    min_rv = sample_df.min_rv.values[0]
    bucket_rv_t = sample_df.bucket_rv.values[0]
    target_t = sample_df.target.values[0]

    plt.figure(figsize=(10, 5))
    # Fixed axis limits: values above 0.01 fall outside the visible area.
    plt.xlim(0, 20)
    plt.ylim(0, 0.01)

    x = np.linspace(1, 10, 10)
    plt.plot(x, min_rv, color='b',
             label='Realized volatility per minute (rescaled to 10 minutes)')
    # Each horizontal segment is drawn separately so it gets its own legend entry.
    plt.plot([0, 10], [bucket_rv_t, bucket_rv_t], color='g',
             label='Realized volatility of the first 10 minutes')
    plt.plot([10, 20], [target_t, target_t], color='r',
             label='Target - realized volatility of the second 10 minutes')

    plt.legend()
    plt.title("Stock Id:{} - Time Id:{}".format(stock_id, time_id))
    plt.show()

In [None]:
# Difference between the target and the volatility of the bucket itself.
bucket_df['rv_diff']=bucket_df['target'] - bucket_df['bucket_rv']
# NOTE(review): a notebook cell normally echoes only its last expression, so
# this head() call likely displays nothing - confirm, or wrap it in display().
bucket_df.head()
bucket_df.tail()

In [None]:
# First of the two chosen time buckets.
time_id = 5

# All rows of bucket_df that belong to this time bucket.
sample_time_df=bucket_df[bucket_df.time_id==time_id].copy()

# One figure per stock present in the selected time bucket.
for i in sample_time_df['stock_id'].to_list():
    visualize_bucket_volatility(i, time_id)
    

In [None]:
# Second of the two chosen time buckets.
time_id = 32767

# All rows of bucket_df that belong to this time bucket.
sample_time_df=bucket_df[bucket_df.time_id==time_id].copy()

# One figure per stock present in the selected time bucket.
for i in sample_time_df['stock_id'].to_list():
    visualize_bucket_volatility(i, time_id)
    