In [None]:
import numpy as np 
import pandas as pd 
import os
from pathlib import Path 
from tqdm import tqdm
import re
import gc
from pandas.api.types import is_integer_dtype
# import lightgbm as lgb
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns

In [None]:
# Location of the competition data, kept both as a plain string (used for
# f-string paths) and as a Path object (used for globbing).
BASE_DIR = "../input/optiver-realized-volatility-prediction"
base_path = Path(BASE_DIR)

# Functions

In [None]:
# OPTIMIZE MEMORY from Chris (https://www.kaggle.com/cdeotte/time-split-validation-malware-0-68)
def reduce_memory(df, col):
    """Downcast column ``col`` of ``df`` in place to the narrowest integer dtype that fits.

    Non-negative columns are stored as ``uint8``/``uint16``/``uint32`` (or
    ``uint64`` when the maximum does not fit in 32 bits). Columns that contain
    negative values are stored as the narrowest signed dtype instead, because
    casting a negative number to an unsigned dtype silently wraps it around.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    col : str
        Name of the column to downcast.

    Returns
    -------
    None
    """
    lowest = df[col].min()
    highest = df[col].max()
    if lowest < 0:
        # Pick the narrowest signed dtype whose range covers [lowest, highest];
        # int64 is the fallback when nothing smaller fits.
        new_dtype = 'int64'
        for candidate in ('int8', 'int16', 'int32'):
            bounds = np.iinfo(candidate)
            if bounds.min <= lowest and highest <= bounds.max:
                new_dtype = candidate
                break
    elif highest < 256:
        new_dtype = 'uint8'
    elif highest < 65536:
        new_dtype = 'uint16'
    elif highest >= 4294967296:
        # Too large for uint32; casting to it would overflow.
        new_dtype = 'uint64'
    else:
        new_dtype = 'uint32'
    df[col] = df[col].astype(new_dtype)

In [None]:
def make_parquet_df(path_list, train_df=None):
    """Read several per-stock parquet files into one combined DataFrame.

    Each file is read with ``pd.read_parquet`` and tagged with a ``stock_id``
    column (stored as ``uint8``) obtained by calling ``stock_key`` on the
    file's path. When ``train_df`` is given, every per-file frame is merged
    with it on ``['stock_id', 'time_id']`` before being collected.

    Parameters
    ----------
    path_list : iterable of paths
        Parquet files to load; progress is reported through ``tqdm``.
    train_df : pandas.DataFrame, optional
        Frame to merge onto each loaded file. Left as ``None`` to skip the merge.

    Returns
    -------
    pandas.DataFrame
        All loaded frames concatenated with a fresh index.
    """
    def _load_one(parquet_path):
        # Load a single file and attach its stock id.
        frame = pd.read_parquet(parquet_path)
        frame['stock_id'] = stock_key(parquet_path)
        frame['stock_id'] = frame['stock_id'].astype(np.uint8)
        # NOTE: adding the targets here currently ends in out of memory.
        if train_df is not None:
            frame = frame.merge(train_df, on=['stock_id', 'time_id'])
        return frame

    loaded_frames = [_load_one(parquet_path) for parquet_path in tqdm(path_list)]
    return pd.concat(loaded_frames, ignore_index=True)

# Load Data

In [None]:
# Read the training targets and shrink both id columns to compact integer dtypes.
train_df = pd.read_csv(f"{BASE_DIR}/train.csv")
for id_col in ('stock_id', 'time_id'):
    reduce_memory(train_df, id_col)
train_df.info()
gc.collect()
train_df.head()

In [None]:
# Measure how much precision the target loses when stored in a narrower float
# dtype. The error is taken in absolute value: the maximum of the signed
# difference would ignore the cases where the down-cast value is larger than
# the original.
for target_dtype in ('float32', 'float16'):
    max_abs_error = (train_df['target'] - train_df['target'].astype(target_dtype)).abs().max()
    print(f"Max error when converting target to {target_dtype}:", max_abs_error)
# The down-cast itself is left disabled:
# train_df['target'] = train_df['target'].astype(target_dtype)

In [None]:
# Recursively collect every parquet file below the book and trade training folders.
book_train_dir = base_path / 'book_train.parquet'
trade_train_dir = base_path / 'trade_train.parquet'
train_books_paths = list(book_train_dir.rglob('*.parquet'))
train_trades_paths = list(trade_train_dir.rglob('*.parquet'))

In [None]:
def stock_key(path):
    """Return the integer stock id encoded as ``stock_id=<digits>`` in ``path``.

    Parameters
    ----------
    path : str or path-like
        Path containing a ``stock_id=<digits>`` component; converted with ``str``.

    Returns
    -------
    int
        The digits following the first ``stock_id=`` occurrence.

    Raises
    ------
    ValueError
        If ``path`` contains no ``stock_id=`` followed by at least one digit.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    # "\d+" (not "\d*") so an empty id is reported here instead of by int('').
    match = re.search(r"stock_id=(\d+)", str(path))
    if match is None:
        raise ValueError(f"No 'stock_id=<digits>' component found in path: {path}")
    return int(match.group(1))
# Order both path lists in place by the numeric stock id found in each path.
for stock_path_list in (train_books_paths, train_trades_paths):
    stock_path_list.sort(key=stock_key)

In [None]:
# Load the order books of only the first 10 paths of the sorted list; no
# targets are merged in.
first_book_paths = train_books_paths[:10]
train_books_df = make_parquet_df(first_book_paths, train_df=None)
# train_trades_df = make_parquet_df(train_trades_paths)

In [None]:
# Downcast every integer-typed column of the book frame, then free memory and
# show the resulting dtypes.
integer_columns = [
    name for name in train_books_df.columns
    if is_integer_dtype(train_books_df[name].dtype)
]
for col in integer_columns:
    reduce_memory(train_books_df, col)
gc.collect()
train_books_df.info()
train_books_df.head()

# Data Validation

In [None]:
# Differences between the first and second level of the book.
# The integer size columns were downcast to unsigned dtypes by reduce_memory,
# and unsigned subtraction wraps around instead of producing a negative number.
# The sizes are therefore widened to signed 64-bit integers before differencing.
train_books_df['ask_price_diff'] = train_books_df['ask_price2'] - train_books_df['ask_price1']
train_books_df['ask_size_diff'] = (
    train_books_df['ask_size1'].astype('int64') - train_books_df['ask_size2'].astype('int64')
)
train_books_df['bid_price_diff'] = train_books_df['bid_price1'] - train_books_df['bid_price2']
train_books_df['bid_size_diff'] = (
    train_books_df['bid_size1'].astype('int64') - train_books_df['bid_size2'].astype('int64')
)

In [None]:
# Report the observed range of each level-1 vs level-2 difference column.
for col in ('ask_price_diff', 'bid_price_diff', 'ask_size_diff', 'bid_size_diff'):
    print(f"{col}: Maximum {train_books_df[col].max()} and Minimum {train_books_df[col].min()}")


In [None]:
# Compare the number of distinct time_ids each stock has against the number of
# distinct time_ids in the whole training table.
unique_time_ids = train_df['time_id'].nunique()
time_ids_per_stock = train_df.groupby('stock_id')['time_id'].nunique()
print(f"There are {unique_time_ids} unique time ids in the training sample, the following stocks are missing some time_ids")
has_missing_time_ids = time_ids_per_stock < unique_time_ids
time_ids_per_stock[has_missing_time_ids]

In [None]:
# List the stock ids below the largest one that never occur in train_df.
max_stock_id = train_df['stock_id'].max()
# Build the lookup once; recomputing unique() for every candidate id rescans
# the whole column each time.
present_stock_ids = set(train_df['stock_id'].unique().tolist())
missing_stock_ids = [idx for idx in range(max_stock_id) if idx not in present_stock_ids]
missing_stock_ids

The fact that some stocks are missing some of the time_ids might be a sign that these missing entries are part of the test set.

# Stock Correlation

It seems that the volatilities of many of the stocks are highly correlated.

In [None]:
# Reshape the targets to one row per time_id and one column per stock_id, then
# correlate the stock columns with each other and summarise the result.
target_wide = train_df.pivot(index='time_id', columns='stock_id', values='target')
stock_corr_df = target_wide.corr()
stock_corr_df.describe()

In [None]:
# Per stock, average the target over a 10-row window (at least 5 rows needed),
# following the row order of train_df.
grouped_target = train_df.groupby('stock_id')['target']
train_df['rolled_target'] = grouped_target.transform(
    lambda targets: targets.rolling(window=10, min_periods=5).mean()
)
# Keep only stock ids 0-14 and time ids 0-999.
in_stock_range = train_df['stock_id'].isin(range(0,15))
in_time_range = train_df['time_id'].isin(range(0,1000))
subset_df = train_df[in_stock_range & in_time_range]


### Warning: time_ids are not sequential; they have been randomly shuffled
More Info: https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/249564
This high level of correlation does seem to imply that a given time_id refers to the same time window for all stocks, as asked in the discussion.

In [None]:
# Interactive view: one rolled-target line per stock across time_id.
fig = px.line(
    data_frame=subset_df,
    x='time_id',
    y='rolled_target',
    color='stock_id',
)
fig.show()

In [None]:
# Static version of the same chart, one hue per stock.
sns.lineplot(
    x='time_id',
    y='rolled_target',
    hue='stock_id',
    data=subset_df,
)