# Introduction

We will take a quick look at the Optiver competition data.

# Analysis preparation

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import notebook

In [None]:
# Show what the competition dataset folder contains.
data_dir = "/kaggle/input/optiver-realized-volatility-prediction"
os.listdir(data_dir)

In [None]:
# Load the train and test tables from their CSV files (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/optiver-realized-volatility-prediction/train.csv",
        "/kaggle/input/optiver-realized-volatility-prediction/test.csv",
    ),
)

# Quick data exploration

## Train and test

In [None]:
# Report the (rows, columns) shape of both tables.
train_shape, test_shape = train_df.shape, test_df.shape
print(f"Train shape: {train_shape}, Test shape: {test_shape}")

In [None]:
# Count the top-level entries inside each parquet dataset folder.
parquet_dirs = {
    "Book train": '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet',
    "Trade train": '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet',
    "Book test": '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet',
    "Trade test": '/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet',
}
for label, folder in parquet_dirs.items():
    entry_count = len(os.listdir(folder))
    print(f"{label}: {entry_count}")

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the last rows of the training table.
train_df.tail()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Compare the number of distinct stock_id and time_id values in train vs. test.
for id_column in ("stock_id", "time_id"):
    n_train = train_df[id_column].nunique()
    n_test = test_df[id_column].nunique()
    print(f"Train {id_column}: {n_train}, Test {id_column}: {n_test}")

In [None]:
# Print column dtypes, non-null counts and memory usage of the training table.
train_df.info()

In [None]:
# Print column dtypes, non-null counts and memory usage of the test table.
test_df.info()

## Read parquet data

Let's start by reading a pair of parquet files (the book and trade data for one stock). We can use the **read_parquet** function from pandas for this.

In [None]:
# Load the book and trade training partitions stored under stock_id=0.
book_01_path = '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0'
trade_01_path = '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0'
book_01_df = pd.read_parquet(book_01_path)
trade_01_df = pd.read_parquet(trade_01_path)

In [None]:
# Report the (rows, columns) shape of the two frames just loaded.
book_01_shape = book_01_df.shape
trade_01_shape = trade_01_df.shape
print(f"book_01 shape: {book_01_shape}, trade_01 shape: {trade_01_shape}")

In [None]:
# Preview the first rows of the book data.
book_01_df.head()

In [None]:
# Preview the first rows of the trade data.
trade_01_df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the book data.
book_01_df.info()

In [None]:
# Print column dtypes, non-null counts and memory usage of the trade data.
trade_01_df.info()

Let's check how many distinct time_id values we have in the book and trade data.

In [None]:
# Count the distinct time_id values present in each of the two frames.
for label, frame in (("Book", book_01_df), ("Trade", trade_01_df)):
    distinct_time_ids = frame["time_id"].nunique()
    print(f"{label} time_ids: {distinct_time_ids}")

## Parse all parquet files and get some statistical values

In [None]:
# Summarize every entry of the trade training dataset: row count, number of
# distinct time_ids, and min / max / mean / variance of the trade price.
path = '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/'
list_stocks = os.listdir(path)
trade_d_list = []
# NOTE: `stock_id` is the directory entry name exactly as returned by
# os.listdir, and it is stored unchanged in the summary tuple.
for stock_id in notebook.tqdm(list_stocks):
    # Build the location from `path` rather than repeating the literal, and
    # load only the two columns the summary actually uses.
    df = pd.read_parquet(os.path.join(path, stock_id), columns=['time_id', 'price'])
    price = df['price']
    trade_d_list.append((stock_id, df.shape[0], df['time_id'].nunique(),
                         price.min(), price.max(), price.mean(), price.var()))

In [None]:
# Summarize every entry of the book training dataset: row count, number of
# distinct time_ids, and min / max / mean / variance of each of the four
# quoted price columns.
path = '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/'
list_stocks = os.listdir(path)
# Order matters: the summary tuple lists the statistics column by column in
# exactly this sequence.
book_price_columns = ['bid_price1', 'bid_price2', 'ask_price1', 'ask_price2']
book_d_list = []
# NOTE: `stock_id` is the directory entry name exactly as returned by
# os.listdir, and it is stored unchanged in the summary tuple.
for stock_id in notebook.tqdm(list_stocks):
    # Build the location from `path` rather than repeating the literal, and
    # load only the columns the summary actually uses.
    df = pd.read_parquet(os.path.join(path, stock_id),
                         columns=['time_id'] + book_price_columns)
    summary = [stock_id, df.shape[0], df['time_id'].nunique()]
    for price_column in book_price_columns:
        prices = df[price_column]
        summary.extend((prices.min(), prices.max(), prices.mean(), prices.var()))
    book_d_list.append(tuple(summary))

In [None]:
# Turn the per-entry trade summaries into a table with named columns.
trade_columns = ['stock_id', 'rows', 'time_id_count']
trade_columns += [f"{stat}_price" for stat in ('min', 'max', 'avg', 'var')]
trade_train_df = pd.DataFrame(trade_d_list)
trade_train_df.columns = trade_columns
trade_train_df.head()

In [None]:
# Turn the per-entry book summaries into a table with named columns.
# Names follow the tuple layout: four statistics per price column.
book_columns = ['stock_id', 'rows', 'time_id_count']
for price_column in ('bid_price1', 'bid_price2', 'ask_price1', 'ask_price2'):
    book_columns += [f"{stat}_{price_column}" for stat in ('min', 'max', 'avg', 'var')]
book_train_df = pd.DataFrame(book_d_list)
book_train_df.columns = book_columns
book_train_df.head()

Let's see how many rows we have in the training parquet data.

In [None]:
# Print the total row count of each dataset, expressed in millions.
for label, summary_df in (("Trade", trade_train_df), ("Book", book_train_df)):
    # Convert to a plain int so the division and formatting match Python's own.
    total_rows = int(summary_df["rows"].sum())
    print(f"{label} data: {total_rows / 1_000_000} Million rows")

Let's check how many different time_id counts appear across the stocks in each dataset.

In [None]:
# Show how many distinct per-entry time_id counts exist, and what they are.
for label, summary_df in (("Trade", trade_train_df), ("Book", book_train_df)):
    time_id_counts = summary_df["time_id_count"]
    print(f"{label} data: {time_id_counts.nunique()}: {time_id_counts.unique()}")

Apparently, there are cases where the trade data and the corresponding book data for a stock are not aligned (they do not cover the same number of time_ids).

# A few takeaways

You can easily explore the data using pandas.

The data is quite clean and does not have missing values.

Data ingestion from the parquet files is very fast.

If you aggregate the trade and book data for your preliminary models, you do not need to store all the data; you can process it on the fly.

