In [None]:
import random
import glob
import gc
from itertools import product

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_colwidth', None)

### Train data

In [None]:
# Locate the per-stock order-book partitions. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to keep the listing (and
# anything sampled from it later) stable across runs and machines.
order_book_train_files = sorted(
    glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')
)
# Assumes every path looks like '.../<key>=<stock_id>'; the text after the
# first '=' is kept (as a string) as the stock id.
stock_ids = [stock.split('=')[1] for stock in order_book_train_files]
df_order_book_train_files = pd.DataFrame({'stock_id': stock_ids, 'stock_path': order_book_train_files})
df_order_book_train_files.head()

Each row of the dataframe holds a stock id and the path of the parquet file that contains the order-book data for that stock.

### Log Return

In [None]:
def calculate_wap(df):
    '''Weighted average price from the first bid/ask level of an order book.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    '''
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def log_return(list_stock_prices):
    '''Log return between consecutive prices: log(p_t) - log(p_{t-1}).

    ``np.log`` of the input must support ``.diff()`` (e.g. a pandas Series).
    The first element of the result is NaN because it has no predecessor.
    '''
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Volatility Clustering

### Input

We will be using **randomly sampled stocks** and a **random time span** (a single time_id) to look at the volatility clusters.

In [None]:
def abs_log_returns_topn(list_file, n_top):
    '''Load the first ``n_top`` book files and compute absolute WAP log returns.

    Parameters
    ----------
    list_file : list of str
        Paths of per-stock parquet files. Each path must contain ``=``; the
        text after the first ``=`` is parsed as the integer stock id.
    n_top : int
        Number of leading entries of ``list_file`` to load.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of the loaded files with three extra columns:
        ``wap``, ``log_return`` (absolute log return of ``wap`` computed
        within each ``time_id``) and ``stock_id``. Rows whose log return is
        NaN (e.g. the first row of each ``time_id``) are dropped.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no file is selected (empty ``list_file``
        or ``n_top`` of 0).
    '''
    df_list = []
    for stock_file in list_file[:n_top]:
        df_book_data = pd.read_parquet(stock_file)
        df_book_data['wap'] = calculate_wap(df_book_data)
        # transform() returns a result aligned with the frame's own index on
        # every pandas version; groupby.apply() prepends the group key to the
        # index in pandas >= 2.0, which breaks this column assignment.
        df_book_data['log_return'] = (
            df_book_data.groupby('time_id')['wap'].transform(log_return).abs()
        )
        # dropna + assign build a new frame rather than writing into a
        # filtered slice of the loaded one.
        df_book_data = (
            df_book_data
            .dropna(subset=['log_return'])
            .assign(stock_id=int(stock_file.split('=')[1]))
        )
        df_list.append(df_book_data)
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Seed the RNG so the sampled stocks, and the random time_id picks made by
# later cells through the same `random` module, are identical on every
# top-to-bottom run. Change SEED to explore a different sample.
SEED = 42
random.seed(SEED)

# Randomly shuffle the stocks. Sorting first makes the shuffle start from a
# known order, so re-running this cell reproduces the same sample instead of
# re-shuffling an already shuffled list.
order_book_train_files.sort()
random.shuffle(order_book_train_files)

# Get log returns for 5 stocks sample
df_log_returns = abs_log_returns_topn(order_book_train_files, 5)

selected_stocks = df_log_returns['stock_id'].unique()
print('Stock Ids:', ', '.join(str(s) for s in selected_stocks))

In [None]:
# Number of rows kept per sampled stock (rows with a NaN log return were already dropped)
df_log_returns['stock_id'].value_counts()

### ACF/PACF Plots

In [None]:
def plot_auto_correlation(series, lags=30, stock=''):
    '''Draw the ACF and PACF of ``series`` side by side in one figure.

    Parameters
    ----------
    series : array-like
        Observations passed to ``plot_acf`` and ``plot_pacf``.
    lags : int or array-like, default 30
        Lags to plot. A scalar value is capped at ``len(series) // 2 - 1``:
        recent statsmodels versions only compute partial autocorrelations for
        lags below half the sample size and raise ``ValueError`` otherwise.
        Array-like values are passed through unchanged.
    stock : optional
        Label appended to the figure title.
    '''
    if np.isscalar(lags):
        # A short series (few book updates in the chosen time_id) would
        # otherwise make plot_pacf raise instead of plotting.
        lags = min(lags, max(len(series) // 2 - 1, 1))

    # Note: this changes the default figure size for the whole session, not
    # only for the figure created below.
    plt.rcParams["figure.figsize"] = 20, 5
    fig, (ax_acf, ax_pacf) = plt.subplots(1, 2)
    plot_acf(series, lags=lags, ax=ax_acf)
    plot_pacf(series, lags=lags, ax=ax_pacf)
    fig.suptitle(f'Autocorrelation and Partial Autocorrelation - stock {stock}', fontsize=20)
    plt.show()

In [None]:
# One ACF/PACF figure per sampled stock, each on a randomly drawn time_id
for stock_id in selected_stocks:
    stock_returns = df_log_returns.loc[df_log_returns['stock_id'] == stock_id]
    # Candidates are the time_ids present for this stock, in order of appearance
    candidate_time_ids = stock_returns['time_id'].unique()
    sampled_time_id = random.choice(candidate_time_ids)
    returns_slice = stock_returns.loc[stock_returns['time_id'] == sampled_time_id, 'log_return']
    plot_auto_correlation(returns_slice, stock=stock_id)

Looking at the PACF plots for absolute log returns, the partial autocorrelation is significant only up to lag 1 or lag 2, meaning that all the higher-order autocorrelations can be explained by the lag-1 and lag-2 autocorrelations. Let's fit ARIMA models and check whether volatility clusters exist.

**Finding order sequence for ARIMA model**

In [None]:
# Grid-search ARIMA orders (p, d, q) in {0, 1, 2}^3 for every sampled stock and
# record the AIC of each successful fit.
# NOTE(review): statsmodels >= 0.13 removed statsmodels.tsa.arima_model.ARIMA
# (it raises NotImplementedError). If every fit is reported as failed below,
# that is the likely cause - confirm against the installed version.
arima_order = []
failed_fits = []
for stock_id in selected_stocks:
    # Draw ONE random time slice per stock and fit every candidate order on it:
    # AIC values are only comparable between models fitted to the same data.
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    for order_seq in product((2, 1, 0), repeat=3):
        if order_seq == (0, 0, 0):     # ARIMA(0,0,0) model is white noise
            continue
        try:
            arima_model = ARIMA(df_time_slice['log_return'], order=order_seq)
            results_ARIMA = arima_model.fit(disp=-1)
        except Exception as err:
            # Best effort: an order that cannot be fitted on this slice is
            # skipped, but remembered so failures are visible rather than
            # silent. KeyboardInterrupt is no longer swallowed.
            failed_fits.append([stock_id, order_seq, repr(err)])
            continue
        arima_order.append([stock_id, order_seq, results_ARIMA.aic])
df_arima_order = pd.DataFrame(arima_order, columns=['stock_id', 'order', 'aic'])

if failed_fits:
    print(f'{len(failed_fits)} of {len(failed_fits) + len(arima_order)} ARIMA fits failed and were skipped; '
          f'first error: {failed_fits[0][2]}')

In [None]:
# Lowest AIC reached for each stock
best_aic_per_stock = df_arima_order.groupby('stock_id')['aic'].min().reset_index()
# Join back onto the full results to recover the order that achieved it
# (orders tied on AIC yield one row each)
df_arima_order_min = best_aic_per_stock.merge(
    df_arima_order,
    how='left',
    on=['stock_id', 'aic'],
)
df_arima_order_min

Different stocks seem to follow different ARIMA orders. Let's try plugging these order numbers in the model and see if there's volatility clustering in squared residuals

### Volatility Clustering in Squared Residuals

In [None]:
def plot_resid_sqr(series, stock_id='', time_id=''):
    '''Plot squared residuals in observation order.

    Parameters
    ----------
    series : object exposing ``.values`` (e.g. a pandas Series)
        Squared residuals. Only ``series.values`` is plotted, so the x axis is
        the position in the series rather than its index.
    stock_id, time_id : optional
        Labels shown in the plot title.
    '''
    # Note: this changes the default figure size for the whole session, not
    # only for the figure created below.
    plt.rcParams["figure.figsize"] = 20, 5
    plt.plot(series.values)
    plt.title(f'Squared Residual Plot - stock {stock_id} - time_id {time_id}', fontsize=14)
    plt.ylabel('Squared Residual', fontsize=14)
    plt.show()

In [None]:
# For every stock, refit its lowest-AIC order on a freshly drawn time_id and
# plot the squared residuals of that fit.
for idx, best in df_arima_order_min.iterrows():
    stock_id = best['stock_id']
    stock_returns = df_log_returns.loc[df_log_returns['stock_id'] == stock_id]
    # Draw one time_id at random from those available for this stock
    sampled_time_id = random.choice(stock_returns['time_id'].unique())
    returns_slice = stock_returns.loc[stock_returns['time_id'] == sampled_time_id, 'log_return']
    fitted = ARIMA(returns_slice, order=best['order']).fit(disp=-1)
    plot_resid_sqr(np.power(fitted.resid, 2), stock_id, sampled_time_id)

**Looking at the squared residual plots, there definitely seems to be volatility clustering in the data, though the degree of clustering varies depending on the stock and time_id. Let's have a look at the autocorrelations of the squared residuals.**

In [None]:
# Refit each stock's lowest-AIC order on another randomly drawn time_id and
# show the ACF/PACF of the squared residuals.
for idx, row in df_arima_order_min.iterrows():
    stock_id = row['stock_id']
    is_stock = df_log_returns['stock_id'].eq(stock_id)
    df_log_returns_stock = df_log_returns.loc[is_stock]
    # Random time_id among those present for this stock
    time_ids = df_log_returns_stock['time_id'].unique()
    random_time_id = random.choice(time_ids)
    df_time_slice = df_log_returns_stock.loc[df_log_returns_stock['time_id'].eq(random_time_id)]
    arima_model = ARIMA(df_time_slice['log_return'], order=row['order'])
    results_ARIMA = arima_model.fit(disp=-1)
    sqr_resid = np.power(results_ARIMA.resid, 2)
    plot_auto_correlation(sqr_resid, stock=stock_id)

**ACF/PACF of Squared residuals also show the lag-1 & lag-2 auto-correlation. Looks like we can use both to experiment with GARCH models.**