In [None]:
import bisect
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
%%time
# Root folder of the competition data; every input path below is derived from it
# so the location only has to be changed in one place.
input_dir='../input/optiver-realized-volatility-prediction'

# Directory-style parquet dataset; also listed partition by partition in get_bucket_stat().
book_train_path=os.path.join(input_dir, 'book_train.parquet')

train=pd.read_csv(os.path.join(input_dir, 'train.csv'))
book_train=pd.read_parquet(book_train_path)
trade_train=pd.read_parquet(os.path.join(input_dir, 'trade_train.parquet'))

In [None]:
# Report (rows, columns) for each of the three loaded tables, in load order.
for loaded_frame in (train, book_train, trade_train):
    print(loaded_frame.shape)

In [None]:
def update_trace_line(fig, x, y, row, col, args=None):
    """Add one `go.Scatter` trace to a subplot cell of `fig`.

    Parameters
    ----------
    fig : figure object exposing `add_trace(trace, row=..., col=...)`.
    x, y : data forwarded unchanged to `go.Scatter`.
    row, col : subplot cell the trace is added to.
    args : dict, optional
        Optional styling. Only the keys 'mode', 'line', 'text', 'name' and
        'hoverinfo' are read; any key that is absent is passed as None.
    """
    options = args if args is not None else {}

    # Forward exactly the five supported styling keys, defaulting each to None.
    style_keys = ('mode', 'line', 'text', 'name', 'hoverinfo')
    scatter_kwargs = {key: options.get(key, None) for key in style_keys}

    trace = go.Scatter(x=x, y=y, **scatter_kwargs)
    fig.add_trace(trace, row=row, col=col)

# Visualize a single bucket info

In [None]:
def visualize_sample(stock_id, time_id):
    """Plot level-1 prices and sizes of one (stock_id, time_id) bucket on a 2x2 grid.

    Reads the module-level `book_train` and `train` frames and draws through
    `update_trace_line`:
      * (1, 1) ask_price1, bid_price1 and the weighted average price (wap)
      * (1, 2) rolling means of those three series
      * (2, 1) ask_size1 and bid_size1
      * (2, 2) rolling means of the two sizes

    The figure title shows the `target` value stored in `train` for the bucket.

    Parameters
    ----------
    stock_id, time_id : values compared against the `stock_id` / `time_id`
        columns of `book_train` and `train`.

    Raises
    ------
    IndexError
        If `train` has no row for the requested (stock_id, time_id).
    """
    window=3
    # A rolling mean over `window` rows is undefined for the first window-1 rows,
    # so the rolling panels start at that offset (tied to `window`, not hard-coded).
    first_valid=window-1

    in_bucket=(book_train.stock_id==stock_id) & (book_train.time_id==time_id)
    sample_book=book_train[in_bucket].copy()

    # Weighted average price: each side's price is weighted by the opposite side's size.
    sample_book['wap']=(sample_book['bid_price1'] * sample_book['ask_size1']) + (sample_book['ask_price1'] * sample_book['bid_size1'])
    sample_book['wap']/=(sample_book['ask_size1'] + sample_book['bid_size1'])

    # rolling column -> source column
    rolling_sources={
        'wap_rolling': 'wap',
        'ask_price_rolling1': 'ask_price1',
        'bid_price_rolling1': 'bid_price1',
        'ask_size_rolling1': 'ask_size1',
        'bid_size_rolling1': 'bid_size1',
    }
    for rolling_col, source_col in rolling_sources.items():
        sample_book[rolling_col]=sample_book[source_col].rolling(window).mean()

    sample_book=sample_book.fillna(0)

    fig=make_subplots(rows=2, cols=2,
                      subplot_titles=["Price Movements in a Bucket.", 
                                      "Rolling Avg Price Movements in Bucket",
                                      "Volume Movements in a Bucket",
                                      "Rolling Avg Movements in a Bucket"]
                     )

    # (row, col, column to plot, line width, is a rolling series?)
    trace_specs=[
        (1, 1, 'ask_price1', 2, False),
        (1, 1, 'bid_price1', 2, False),
        (1, 1, 'wap', 1, False),
        (1, 2, 'ask_price_rolling1', 2, True),
        (1, 2, 'bid_price_rolling1', 2, True),
        (1, 2, 'wap_rolling', 1, True),
        (2, 1, 'ask_size1', 2, False),
        (2, 1, 'bid_size1', 2, False),
        (2, 2, 'ask_size_rolling1', 2, True),
        (2, 2, 'bid_size_rolling1', 2, True),
    ]
    for row, col, column, width, is_rolling in trace_specs:
        start=first_valid if is_rolling else 0
        # Every trace is named after its column so legend and hover labels agree.
        update_trace_line(fig,
                          sample_book['seconds_in_bucket'].values[start:],
                          sample_book[column].values[start:],
                          row, col,
                          {
                              'mode': 'lines',
                              'line': dict(width=width),
                              'name': column,
                              'hoverinfo': 'name'
                          })

    # .values[0] raises IndexError when `train` has no row for this bucket.
    target=train[(train.stock_id==stock_id) & (train.time_id==time_id)].target.values[0]
    fig.update_layout(width=1200, height=1000, 
                      title="Stock:{}-Bucket:{} <br> Target 10-min realized Volatitlity:{}".format(stock_id, time_id, target))
    fig.show()

# Random Sample Visualization

In [None]:
# Hand-picked (stock_id, time_id) buckets, plotted one figure per pair.
for sample_stock_id, sample_time_id in [(0, 5), (56, 25312), (111, 15765), (111, 15770)]:
    visualize_sample(sample_stock_id, sample_time_id)

The volumes of bids and offers have an impact on price movements.

**Stock_id:0, Time_id:5**

**Price movement plots for stock_id:0, time_id:5 — we can observe that:**

1. The WAP is closer to the ask_price curve when there is more supply than demand in the market, and vice versa.
2. The WAP moves closer to the ask_price curve while the bid price changes over time, which can be interpreted as the market trying to catch up with supply and demand.
3. Around second 400 the difference between the bid price and the offer price is small, which could indicate market stability at that point.
4. The price movement goes from low to high and then gradually declines, favouring larger bid sizes at the end of the bucket period.

**Volume movement - stock_id:0, time_id:5**
1. Bid and ask volume fluctuations are larger in the initial stage of the market, which pushes the WAP towards either the demand or the supply curve.
2. At second 130 there is a peak in bid_size, which moves the WAP towards the demand curve.
3. Both bid and ask sizes shrink as time goes on, resulting in a lower WAP at the end of the bucket.


**Volume has an effect on prices and vice versa: prices falling from their highs lead to larger bid sizes.**

**Stock_id:0, time_id:5 shows a roughly triangular curve.**




**Stock_id:111, Time_id:15765**

1. The WAP continuously decreases from the beginning while ask sizes increase, indicating a falling market for this stock within the bucket.
2. For most of the time in between there is little price movement in the market and the volumes are much higher.
3. If we look at time_id:15770 of the same stock, demand and supply seem to be relatively balanced at the beginning, resulting in a small difference between bid_price and ask_price; but because supply exceeded demand for some time before, the effective prices were reduced.
4. As demand gradually meets supply and then tends to increase, prices start rising to much higher values than before.




1. The price may move up or down in small steps.
2. It may shoot up suddenly on a high volume of bids.
3. It drops if ask volumes are larger.
4. It may drop again after supply meets demand.
5. It can remain almost constant if supply does not change much relative to demand.


Depending on how the prices move, buckets will show different curves.

In [None]:
# Print the `target` stored in `train` for each hand-picked (stock_id, time_id) bucket.
for sample_stock_id, sample_time_id in [(0, 5), (56, 25312), (111, 15765), (111, 15770)]:
    bucket_rows=train[(train.stock_id==sample_stock_id) & (train.time_id==sample_time_id)]
    print(bucket_rows.target.values[0])

# Bucket Stats

In [None]:
def get_bucket_wap_stat(df):
    """Summarise the `wap` column per (stock_id, time_id) bucket.

    Parameters
    ----------
    df : DataFrame with columns `stock_id`, `time_id` and `wap`.

    Returns
    -------
    DataFrame with one row per bucket and the columns
    `stock_id`, `time_id`, `wap_mean`, `wap_std`, `wap_min`, `wap_max`, `wap_ratio`,
    where `wap_ratio` is `wap_max / wap_min`.
    """
    # String reducers select the pandas implementations explicitly
    # (e.g. 'std' is the sample standard deviation) and avoid the deprecation
    # warning recent pandas emits when numpy callables are passed to agg().
    wap_df=df.groupby(['stock_id', 'time_id']).agg(
        wap_mean=('wap', 'mean'),
        wap_std=('wap', 'std'),
        wap_min=('wap', 'min'),
        wap_max=('wap', 'max'),
    ).reset_index()
    wap_df['wap_ratio']=wap_df['wap_max']/wap_df['wap_min']
    return wap_df

def get_bucket_vol_stat(df):
    """Average `bid_size1`, `ask_size1` and `vol_ratio` per (stock_id, time_id) bucket.

    Returns a DataFrame with the columns `stock_id`, `time_id`,
    `mean_bid_size1`, `mean_ask_size1` and `mean_vol_ratio`.
    """
    size_columns=['bid_size1', 'ask_size1', 'vol_ratio']
    bucket_means=df.groupby(['stock_id', 'time_id'])[size_columns].mean()
    bucket_means=bucket_means.rename(columns={
        'bid_size1': 'mean_bid_size1',
        'ask_size1': 'mean_ask_size1',
        'vol_ratio': 'mean_vol_ratio',
    })
    return bucket_means.reset_index()

def get_bucket_price_stat(df):
    """Average `bid_price1`, `ask_price1` and `price_ratio` per (stock_id, time_id) bucket.

    Returns a DataFrame with the columns `stock_id`, `time_id`,
    `mean_bid_price1`, `mean_ask_price1` and `mean_price_ratio`.
    """
    price_columns=['bid_price1', 'ask_price1', 'price_ratio']
    bucket_means=df.groupby(['stock_id', 'time_id'])[price_columns].mean()
    bucket_means=bucket_means.rename(columns={
        'bid_price1': 'mean_bid_price1',
        'ask_price1': 'mean_ask_price1',
        'price_ratio': 'mean_price_ratio',
    })
    return bucket_means.reset_index()

def get_order_updates(df, bucket_seconds=600):
    """Count book rows per (stock_id, time_id) bucket, scaled by the bucket length.

    Parameters
    ----------
    df : DataFrame with columns `stock_id`, `time_id` and `seconds_in_bucket`.
    bucket_seconds : number, default 600
        Divisor applied to the row count. The default keeps the original
        behaviour (count / 600); presumably 600 is the 10-minute bucket length
        in seconds, which makes `num_updates` an updates-per-second rate.

    Returns
    -------
    DataFrame with the columns `stock_id`, `time_id` and `num_updates`, where
    `num_updates` is the number of non-null `seconds_in_bucket` rows in the
    bucket divided by `bucket_seconds`.
    """
    update_df=df.groupby(['stock_id', 'time_id'])[['seconds_in_bucket']].count().reset_index().rename(columns={
        'seconds_in_bucket': 'num_updates'
    })
    update_df['num_updates']=update_df['num_updates']/bucket_seconds
    return update_df

def get_relative_volatility(s):
    """Return the square root of the sum of squared consecutive log differences of `s`.

    `s` is any sequence accepted by `np.log`; a sequence with fewer than two
    elements has no differences and therefore yields 0.0.
    """
    log_returns=np.diff(np.log(s))
    return np.sqrt(np.square(log_returns).sum())

def get_bucket_volatility(df):
    """Compute `realized_volatility` per (stock_id, time_id) bucket from `wap`.

    The `wap` values of each bucket are collected into a list (in row order)
    and reduced with `get_relative_volatility`. Returns a DataFrame with the
    columns `stock_id`, `time_id` and `realized_volatility`.
    """
    wap_lists=df.groupby(['stock_id', 'time_id'])[['wap']].agg(list).reset_index()
    per_bucket=wap_lists['wap'].map(get_relative_volatility)
    return wap_lists.assign(realized_volatility=per_bucket).drop(columns='wap')
    
def get_bucket_stat():
    """Build per-(stock_id, time_id) statistics from every partition under `book_train_path`.

    Each directory entry is expected to be named like `<key>=<stock_id>`; the
    integer after the last '=' becomes the `stock_id` column. For every
    partition the function derives `vol_ratio`, `price_ratio` and `wap`, then
    joins the outputs of `get_bucket_vol_stat`, `get_bucket_wap_stat`,
    `get_bucket_price_stat`, `get_order_updates` and `get_bucket_volatility`
    on (stock_id, time_id).

    Returns
    -------
    DataFrame: the per-partition tables stacked row-wise (each keeps its own
    0-based index), or an empty DataFrame when the directory has no entries.
    """
    bucket_keys=['stock_id', 'time_id']
    per_stock_stats=[]

    # os.listdir returns entries in arbitrary order; sort for a reproducible row order.
    for filepath in sorted(os.listdir(book_train_path)):
        path=os.path.join(book_train_path, filepath)
        stock_id=int(filepath.split('=')[-1])

        df=pd.read_parquet(path)
        df['stock_id']=stock_id
        df['vol_ratio']=df['ask_size1']/df['bid_size1']
        df['price_ratio']=df['ask_price1']/df['bid_price1']

        # Weighted average price: each side's price is weighted by the opposite side's size.
        df['wap'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])
        df['wap'] /= (df['ask_size1']+df['bid_size1'])

        stat_df=get_bucket_vol_stat(df)
        other_stats=(
            get_bucket_wap_stat(df),
            get_bucket_price_stat(df),
            get_order_updates(df),
            get_bucket_volatility(df),
        )
        for part_df in other_stats:
            # Explicit keys: the join no longer depends on which column names happen to overlap.
            stat_df=stat_df.merge(part_df, on=bucket_keys)

        per_stock_stats.append(stat_df)

    if not per_stock_stats:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop (which re-copies all rows each time).
    return pd.concat(per_stock_stats)

In [None]:
%%time
# Build the per-(stock_id, time_id) statistics table from the partitions under book_train_path.
book_stat=get_bucket_stat()
book_stat.head()

In [None]:
# No key is given, so merge joins on the columns both frames share
# (presumably stock_id and time_id — verify against train's schema); this attaches train's `target`.
book_stat=book_stat.merge(train)

Let's check the price & volume graphs for the top-3 and bottom-3 realized volatilities.

In [None]:
# The three buckets with the largest realized_volatility.
book_stat.sort_values('realized_volatility', ascending=False).head(3)

# Top-3 Realized Volatility

In [None]:
# Hard-coded (stock_id, time_id) pairs, plotted one figure per pair.
for top_stock_id, top_time_id in [(30, 30128), (30, 3138), (42, 30128)]:
    visualize_sample(top_stock_id, top_time_id)

In [None]:
# The three buckets with the smallest realized_volatility (ascending sort is the default).
book_stat.sort_values('realized_volatility').head(3)

# Bottom-3 Volatilities

In [None]:
# Hard-coded (stock_id, time_id) pairs, plotted one figure per pair.
for low_stock_id, low_time_id in [(31, 28959), (31, 16733), (31, 18495)]:
    visualize_sample(low_stock_id, low_time_id)

1. Both the bid & ask volumes of the low-volatility buckets are high compared to the high-volatility buckets.
2. The difference between bid & ask prices is also larger in the low-volatility buckets than in the high-volatility buckets.
3. In the low-volatility buckets there is a trend towards the end for the ask sizes to approach the bid sizes (Figure 3) and for the bid sizes to approach the ask sizes (Figures 1 & 2).


In [None]:
# Preview the first rows of book_stat after the merge with train.
book_stat.head()

Let's look at the number of updates in 10 minutes.

In [None]:
# Summary statistics of num_updates (row count per bucket divided by 600, see get_order_updates).
book_stat.num_updates.describe()

In [None]:
# histnorm='percent' makes the y-axis the share of buckets per bin, matching the "% Distribution" title
# (without it the histogram shows raw counts).
px.histogram(book_stat, x='num_updates', histnorm='percent', title="% Distribution of number of Updates")


# Bucket Bid & Ask volumes & Impacts

In [None]:
# Ascending upper edges of the volume buckets; bucket i covers values up to and including bucket_vol[i].
bucket_vol=[50, 70, 90, 100, 120, 130, 140, 150, 160, 170, 180, 200,
            220, 240, 260, 280, 300, 330, 360, 390,
            410, 440, 470, 500, 550, 650, 750, 850,
            1000, 1200, 1400, 1600, 2000, 2200, 2400, 2600, 2800,
            3000, 3500, 4000, 4500, 5000]

def get_volume_buckets(x):
    """Map a volume `x` to the index of its bucket in `bucket_vol`.

    Returns the index of the first edge that is >= `x`. A value above the last
    edge gets the extra overflow index `len(bucket_vol)`. (Previously such
    values returned 1, because the overflow check fired on the first loop
    iteration.) NaN input returns None, as before.
    """
    if x != x:
        # NaN compares unequal to itself and has no meaningful bucket.
        return None
    # bisect_left finds the first position whose edge is >= x, i.e. the same
    # bucket the original `x <= v` scan selected, and len(bucket_vol) past the end.
    return bisect.bisect_left(bucket_vol, x)

In [None]:
# Bucket the per-bucket mean bid/ask sizes with get_volume_buckets; adds two columns to book_stat.
book_stat['bid_vol_bucket1']=book_stat['mean_bid_size1'].apply(get_volume_buckets)
book_stat['ask_vol_bucket1']=book_stat['mean_ask_size1'].apply(get_volume_buckets)

book_stat.head()

In [None]:
# Heatmap over (bid volume bucket, ask volume bucket); each cell is coloured by the
# average (histfunc='avg') realized_volatility of the rows falling into it.
px.density_heatmap(book_stat,
                   x='bid_vol_bucket1',
                   y='ask_vol_bucket1', 
                   z='realized_volatility',
                   histfunc='avg',
                   marginal_x='histogram',
                   marginal_y='histogram',
                   title="Densitly Plots between volumes & Relative Volatilities"
                  )

1. The diagonal and near-diagonal cells have minimal volatilities.
2. As the difference between the buckets increases, volatility also increases in most cases.
3. Upper-left block --> bid sizes < ask sizes --> more supply than demand.
4. Lower-right block --> ask sizes < bid sizes --> more demand than supply.
5. High volumes in both bid & ask sizes --> lower volatilities.
6. The lower-right block registers higher volatilities than the upper-left block.

In [None]:
# Heatmap over (bid volume bucket, ask volume bucket); each cell is coloured by the
# average (histfunc='avg') mean_price_ratio of the rows falling into it.
# The title now names the plotted metric (it was copied from the volatility heatmap).
px.density_heatmap(book_stat,
                   x='bid_vol_bucket1',
                   y='ask_vol_bucket1',
                   z='mean_price_ratio',
                   histfunc='avg',
                   marginal_x='histogram',
                   marginal_y='histogram',
                   title="Density Plot between volumes & Mean Price Ratio")

1. Mean price ratio -> Avg(ask_price/bid_price) of the bucket.
2. The mean price ratio is lower when supply (ask size) exceeds demand (bid size) in most cases --> upper-left block.
3. Price ratios are close to 1 around the diagonal.

# Let us check the relation between the target and the realized volatility

In [None]:
# Pairwise correlation matrix between the computed realized_volatility and the target.
book_stat[['realized_volatility', 'target']].corr()

In [None]:
# Row-wise difference: target minus the realized volatility computed from the book data
# (positive values mean the target is the larger of the two).
rv_diff=book_stat['target'] - book_stat['realized_volatility']
rv_diff.describe()

In [None]:
# Box plot of the target-minus-realized-volatility differences.
px.box(y=rv_diff)

In [None]:
# Histogram of the target-minus-realized-volatility differences.
px.histogram(rv_diff)

1. There is a high correlation between the volatility of the first 10-min window and that of the next 10-min interval.
2. Statistically, more than 50% of the samples have reduced volatility in the next 10 minutes.
3. Whether volatility increases or decreases depends on the trends and cycles of volumes in the current 10-min window.