# Table of Contents
* [Basic evaluations](#1)
* [Plot all stocks](#2)
* [Target (vola) distribution](#3)
* [Pick an example for deep-dive](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# statistics
from scipy import stats
from scipy.stats import t

# distribution fits
from fitter import Fitter, get_common_distributions, get_distributions

In [None]:
# read file
# Load train.csv into a DataFrame; the path is relative to the current
# working directory (NOTE(review): presumably the Kaggle notebook layout).
df_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# Trailing expression: the notebook renders the first rows as the cell output.
df_train.head()

<a id='1'></a>
# Basic evaluations

In [None]:
# data frame overview
# Prints the column names, dtypes, non-null counts and memory usage of df_train.
df_train.info()

In [None]:
# stock id frequencies
# number of rows per stock_id, most frequent first
stock_id_freqs = df_train['stock_id'].value_counts()
print(stock_id_freqs)

### We have 112 different stocks, each with at most 3830 observations.

In [None]:
# show all stocks
# ascending list of every stock id that occurs in the training data
stock_ids = sorted(stock_id_freqs.index.tolist())
print(stock_ids)

In [None]:
# look for stocks with incomplete history
# A stock has a complete history when it has one row per distinct time_id.
# Both reference numbers are derived from the data instead of being
# hard-coded (112 stocks, 3830 time ids), so the check stays correct if the
# input file changes.
n_stocks = len(stock_id_freqs)
n_time_ids = df_train.time_id.nunique()
incomplete_stocks = stock_id_freqs[stock_id_freqs < n_time_ids].index.tolist()
print('Incomplete history for the following stocks:', incomplete_stocks)
# gaps = cells of the full (stock x time_id) grid that have no row
print('This results in', n_stocks * n_time_ids - stock_id_freqs.sum(), 'gaps')

In [None]:
# visualize gaps
# keep only the rows of stocks that miss at least one time_id
has_gaps = df_train['stock_id'].isin(incomplete_stocks)
df_train_incomplete = df_train[has_gaps]

# rows = time_id, columns = stock_id, cell = number of observations
gap_matrix = pd.crosstab(index=df_train_incomplete['time_id'],
                         columns=df_train_incomplete['stock_id'])

# tall, narrow canvas: many time ids, few stocks
plt.figure(figsize=(3, 30))
sns.heatmap(gap_matrix, cbar=False)
plt.show()

In [None]:
# time id frequencies
# number of rows per time_id, most frequent first
time_id_freqs = df_train['time_id'].value_counts()
print(time_id_freqs)

In [None]:
# numerical stats for time_id
# Summary statistics of the time_id column; displayed as the cell output
# because it is the trailing expression of the cell.
df_train.time_id.describe()

<a id='2'></a>
# Plot all stocks

In [None]:
# plot volas for each stock
# One panel per stock in a 4-column grid. The number of rows follows the
# number of stocks (28 rows for 112 stocks) instead of being hard-coded, so
# the loop can never index past the last axis.
n_cols = 4
n_rows = max(1, int(np.ceil(len(stock_ids) / n_cols)))
# keep the original panel height (128 inches for 28 rows);
# squeeze=False guarantees a 2-D array of axes even for a single row
fig, axs = plt.subplots(n_rows, n_cols,
                        figsize=(16, n_rows * 128 / 28),
                        squeeze=False)
for current_ax, si in zip(axs.flat, stock_ids):
    df_temp = df_train[df_train.stock_id == si]
    current_ax.plot(df_temp.time_id, df_temp.target)
    current_ax.set_title('Stock Id = ' + str(si))
    current_ax.grid()
# hide trailing panels that got no stock (only when the grid is not full)
for unused_ax in axs.flat[len(stock_ids):]:
    unused_ax.set_visible(False)
plt.show()

<a id='3'></a>
# Target (vola) distribution

In [None]:
# target basic stats
# Summary statistics of the target column over all stocks; displayed as the
# cell output because it is the trailing expression of the cell.
df_train.target.describe()

In [None]:
# plot target distribution
# histogram of the raw target over all stocks (100 bins)
plt.figure(figsize=(8, 4))
df_train['target'].plot.hist(bins=100)
plt.title('Target')
plt.grid()
plt.show()

In [None]:
# boxplot
# horizontal boxplot of the raw target over all stocks
target_values = df_train['target']
plt.figure(figsize=(8, 2))
plt.boxplot(target_values, vert=False)
plt.title('Target - Boxplot')
plt.grid()
plt.show()

### Logarithmic view:

In [None]:
# logarithmic plot
# histogram of the base-10 logarithm of the target (100 bins)
log_target = np.log10(df_train['target'])
plt.figure(figsize=(8, 4))
log_target.plot.hist(bins=100)
plt.title('log10(Target)')
plt.grid()
plt.show()

In [None]:
# boxplot
# horizontal boxplot of the base-10 logarithm of the target
log_target = np.log10(df_train['target'])
plt.figure(figsize=(8, 2))
plt.boxplot(log_target, vert=False)
plt.title('log10(Target) - Boxplot')
plt.grid()
plt.show()

<a id='4'></a>
# Pick an example for deep-dive

In [None]:
# zoom in an example / scatter plot
# stock id used for all deep-dive cells below
my_stock = 0

# .copy() so that columns added later do not write into a view of df_train
df_pick = df_train[df_train.stock_id==my_stock].copy()

plt.figure(figsize=(14,6))
plt.scatter(df_pick.time_id, df_pick.target, alpha=0.5)
plt.title('Stock '+str(my_stock))
plt.grid()
# render explicitly, consistent with the other plot cells
plt.show()

In [None]:
# show distribution
# histogram of the picked stock's target (100 bins)
plt.figure(figsize=(8, 4))
df_pick['target'].plot.hist(bins=100)
plt.title('Vola - Histogram')
plt.grid()
plt.show()

In [None]:
# plot increments
# row-to-row difference of the target, stored as a new column of df_pick
df_pick['diff_target'] = df_pick.target.diff()

plt.figure(figsize=(14,6))
plt.scatter(df_pick.time_id, df_pick.diff_target, alpha=0.5)
plt.title('Stock '+ str(my_stock) + ' - Vola Differences')
plt.grid()
# render explicitly, consistent with the other plot cells
plt.show()

In [None]:
# show distribution of increments
# histogram of the row-to-row target differences (100 bins)
plt.figure(figsize=(8, 4))
df_pick['diff_target'].plot.hist(bins=100)
plt.title('Vola Differences - Histogram')
plt.grid()
plt.show()

In [None]:
# visualize vola vs previous vola
# xx = value of the previous row (nan for the first entry), yy = current value
xx = df_pick['target'].shift(1).to_numpy()
yy = df_pick['target'].to_numpy()

# leave out the first point (prev = nan) for scatter and fit
x_valid, y_valid = xx[1:], yy[1:]

plt.figure(figsize=(6, 6))
plt.scatter(x_valid, y_valid, alpha=0.2)

# add regression line (degree-1 least-squares fit)
mm, bb = np.polyfit(x_valid, y_valid, deg=1)
plt.plot(xx, mm * xx + bb, c='magenta')

plt.title('Vola n vs n-1')
plt.xlabel('Vola n-1')
plt.ylabel('Vola n')
plt.grid()
plt.show()

In [None]:
# correlation
# Pearson correlation between previous and current value; the first point
# is skipped, matching the slice used for the scatter plot and the fit.
stats.pearsonr(xx[1:], yy[1:])

In [None]:
# visualize vola-INCREMENT vs previous vola-INCREMENT
# xx = increment of the previous row, yy = current increment
xx = df_pick['diff_target'].shift(1).to_numpy()
yy = df_pick['diff_target'].to_numpy()

# leave out the first two points (nan) for scatter and fit
x_valid, y_valid = xx[2:], yy[2:]

plt.figure(figsize=(6, 6))
plt.scatter(x_valid, y_valid, alpha=0.2)

# add regression line (degree-1 least-squares fit)
mm, bb = np.polyfit(x_valid, y_valid, deg=1)
plt.plot(xx, mm * xx + bb, c='magenta')

plt.title('Incremental Vola - n vs n-1')
plt.xlabel('Diff.Vola n-1')
plt.ylabel('Diff.Vola n')
plt.grid()
plt.show()

In [None]:
# correlation
# Pearson correlation between previous and current increment; the first two
# points are skipped, matching the slice used for the scatter plot and the fit.
stats.pearsonr(xx[2:], yy[2:])

In [None]:
# check for further autocorrelations
# skip the first two rows by position, as in the lag-1 increment scatter plot
increments = df_pick['diff_target'].iloc[2:]

plt.figure(figsize=(10, 5))
plt.acorr(increments, maxlags=20)
plt.title('Autocorrelations of incremental vola')
plt.grid()
plt.show()

#### Only lag 1 shows significant autocorrelation...

### Distribution fits on vola-increments:

In [None]:
# try to fit a few distribution types
# for full list of available distributions use "get_distributions()"
# Candidates: normal, beta, Cauchy and Student's t; the first two rows of
# diff_target are skipped, as in the cells above.
dist_fits = Fitter(df_pick.diff_target[2:], distributions=['norm','beta','cauchy','t'])
dist_fits.fit()
# The figure is created before summary() so that its size applies to the plot
# NOTE(review): presumably summary() draws into the current figure and
# returns the ranking table shown as cell output - confirm in the fitter docs.
plt.figure(figsize=(12,5))
dist_fits.summary()

#### t-distribution seems to work best here...

In [None]:
# get best fit and corresponding parameters
# rank the fitted candidates by sum of squared errors and keep the best one
best_fit = dist_fits.get_best(method = 'sumsquare_error')
print(best_fit)

The parameters here are the degrees of freedom, location and scale.

In [None]:
# let's check if we can reproduce the fitted distribution
# best_fit maps the name of the winning distribution to its parameters.
# Take whatever distribution actually won instead of hard-coding the key 't',
# and accept both parameter layouts: a positional sequence (shape..., loc,
# scale) or a dict keyed by parameter name. Which layout get_best() returns
# depends on the installed fitter version.
best_name, best_params = next(iter(best_fit.items()))
# the distribution names passed to Fitter are scipy.stats names
best_dist = getattr(stats, best_name)

x = np.linspace(-0.03, 0.03, 100)
if isinstance(best_params, dict):
    pdf_values = best_dist.pdf(x, **best_params)
else:
    pdf_values = best_dist.pdf(x, *best_params)

plt.plot(x, pdf_values)
plt.title('Fitted ' + best_name + '-distribution')
plt.grid()
plt.show()

In [None]:
# export for potential further analyses
# Writes df_pick (including the added diff_target column) to example.csv in
# the current working directory; the DataFrame index is written as well.
df_pick.to_csv('example.csv')