## Install Libraries

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
# Use the fully-qualified option name: the short form 'max_columns' is treated
# as a pattern and is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises an OptionError.
pd.set_option('display.max_columns', 300)

import cudf

import os
import glob
from datetime import datetime
from joblib import Parallel, delayed
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from warnings import filterwarnings
filterwarnings('ignore')

## Set Config

In [None]:
class Config:
    """Notebook-wide settings: input location, debug switch and feature windows."""

    # Root directory that the book/trade parquet files and train.csv are read from.
    INPUT_PATH = Path('../input/optiver-realized-volatility-prediction/')

    # When True, fewer window starts and fewer stocks are used.
    IS_DEBUG = False

    # Lower bounds (in seconds) of the windows the features are aggregated over.
    SECONDS_IN_BUCKETS = list(range(0, 600, 100))
    if IS_DEBUG:
        SECONDS_IN_BUCKETS = [0, 100]

    # How many stock ids are taken from the front of the train stock-id list.
    NUM_STOCK_IDS = 200
    if IS_DEBUG:
        NUM_STOCK_IDS = 3

    # Number of clusters the stocks are grouped into.
    N_CLUSTERS = 7

## Define Utils

In [None]:
def calc_wap(df, bid_price_col, bid_size_col, ask_price_col, ask_size_col):
    """Return the weighted average price built from one bid/ask level.

    Each price is weighted by the size on the opposite side:
    ``(bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)``.

    Args:
        df: Frame holding the four columns named below.
        bid_price_col: Name of the bid price column.
        bid_size_col: Name of the bid size column.
        ask_price_col: Name of the ask price column.
        ask_size_col: Name of the ask size column.

    Returns:
        A column-like object with one weighted average price per row of ``df``.
    """
    bids, asks = df[bid_price_col], df[ask_price_col]
    bid_qty, ask_qty = df[bid_size_col], df[ask_size_col]

    cross_weighted = bids * ask_qty + asks * bid_qty
    return cross_weighted / (bid_qty + ask_qty)

def calc_log_return_squared(df, col):
    """Return the squared log return of ``df[col]`` computed within each ``time_id``.

    Args:
        df: Frame with a ``time_id`` column and the column ``col``; ``df[col]``
            must offer a ``.log()`` method.
        col: Name of the column whose log returns are squared.

    Returns:
        The squared difference between each row's log value and the previous
        row's log value inside the same ``time_id`` group.
    """
    log_values = df[col].log()

    # Previous log value inside the same time_id group.
    previous_log = log_values.groupby(df['time_id']).shift()
    # NOTE(review): the index is dropped so the subtraction lines up by position;
    # this presumably relies on `df` having a default 0..n-1 index - confirm.
    previous_log = previous_log.reset_index(drop=True)

    step = log_values - previous_log
    return step ** 2

def calc_wap_balance(df) -> np.array:
    """Return the absolute gap between the ``wap1`` and ``wap2`` columns of ``df``."""
    gap = df["wap1"] - df["wap2"]
    return np.abs(gap)

In [None]:
def calc_quoted_spread(df:pd.DataFrame, bid_price_col, ask_price_col) -> np.array:
    """Return the bid/ask spread as a percentage of the midpoint price.

    Args:
        df: Frame holding the two price columns.
        bid_price_col: Name of the bid price column.
        ask_price_col: Name of the ask price column.

    Returns:
        ``(ask - bid) / ((ask + bid) / 2) * 100`` for every row.
    """
    bids, asks = df[bid_price_col], df[ask_price_col]

    gap = asks - bids
    mid_price = (asks + bids) / 2
    return gap / mid_price * 100

def calc_spread(df:pd.DataFrame, col1, col2) -> np.array:
    """Return the row-wise difference ``df[col1] - df[col2]``."""
    minuend, subtrahend = df[col1], df[col2]
    return minuend - subtrahend

def calc_total_volume(df:pd.DataFrame) -> np.array:
    """Return the row-wise sum of both bid sizes and both ask sizes.

    Reads the columns ``bid_size1``, ``bid_size2``, ``ask_size1`` and
    ``ask_size2`` from ``df``.
    """
    bid_total = df['bid_size1'] + df['bid_size2']
    ask_total = df['ask_size1'] + df['ask_size2']
    return bid_total + ask_total

def calc_volume_imbalance(df:pd.DataFrame) -> np.array:
    """Return the absolute difference between total ask size and total bid size.

    Reads the columns ``bid_size1``, ``bid_size2``, ``ask_size1`` and
    ``ask_size2`` from ``df``.
    """
    ask_total = df['ask_size1'] + df['ask_size2']
    bid_total = df['bid_size1'] + df['bid_size2']
    return np.abs(ask_total - bid_total)

In [None]:
def get_stats_window(
    feature_df:pd.DataFrame, 
    feature_dict:dict,
    prefix:str,
    seconds_in_bucket:int=0) -> pd.DataFrame:
    """Aggregate per ``time_id`` over the rows at or after ``seconds_in_bucket``.

    Args:
        feature_df: Frame with ``time_id`` and ``seconds_in_bucket`` columns plus
            every column named in ``feature_dict``.
        feature_dict: Maps a column name to the list of aggregations to apply.
        prefix: Text placed at the front of every output column name.
        seconds_in_bucket: Inclusive lower bound of the window.

    Returns:
        A frame indexed by ``time_id`` whose columns are named
        ``{prefix}_{column}_{aggregation}>={seconds_in_bucket}``.
    """
    window_rows = feature_df[feature_df['seconds_in_bucket'] >= seconds_in_bucket]
    stats = window_rows.groupby('time_id').agg(feature_dict)

    # Flatten the (column, aggregation) pairs into single strings.
    suffix = f'>={seconds_in_bucket}'
    flat_names = []
    for name_parts in stats.columns:
        flat_names.append(f'{prefix}_' + '_'.join(name_parts) + suffix)
    stats.columns = flat_names

    return stats

## Book Preprocessor

In [None]:
def book_preprocessor(
    stock_id:int=0, 
    exec_type='train', 
    seconds_in_buckets:List[int]=None) -> pd.DataFrame:
    """Build per-``time_id`` order-book features for one stock.

    Reads the stock's book parquet partition with cudf, derives row-level
    columns (weighted average prices, squared log returns, spreads, volumes)
    and aggregates them per ``time_id`` over one window per entry of
    ``seconds_in_buckets``.

    Args:
        stock_id: Stock whose ``stock_id=<id>`` partition is read.
        exec_type: ``'train'`` or ``'test'``; selects ``book_<exec_type>.parquet``.
        seconds_in_buckets: Lower bounds of the aggregation windows. ``None``
            (the default) uses ``Config.SECONDS_IN_BUCKETS``. This argument used
            to be ignored in favour of the Config value.

    Returns:
        A cudf DataFrame (despite the ``pd.DataFrame`` annotation) indexed by
        ``(stock_id, time_id)``, with one column per feature and window.

    Raises:
        AssertionError: If ``exec_type`` is neither ``'train'`` nor ``'test'``.
    """
    assert exec_type in ['train', 'test']

    if seconds_in_buckets is None:
        seconds_in_buckets = Config.SECONDS_IN_BUCKETS

    BOOK_FILE_PATH = Config.INPUT_PATH / f'book_{exec_type}.parquet/stock_id={stock_id}'
    book_df = cudf.read_parquet(BOOK_FILE_PATH)

    # Weighted average price of the first and second book level.
    book_df["wap1"] = calc_wap(book_df, bid_price_col="bid_price1", bid_size_col="bid_size1", ask_price_col="ask_price1", ask_size_col="ask_size1")
    book_df["wap2"] = calc_wap(book_df, bid_price_col="bid_price2", bid_size_col="bid_size2", ask_price_col="ask_price2", ask_size_col="ask_size2")

    book_df["wap1_log_return_squared"] = calc_log_return_squared(book_df, "wap1")
    book_df["wap2_log_return_squared"] = calc_log_return_squared(book_df, "wap2")

    book_df["wap_balance"] = calc_wap_balance(book_df)

    book_df["quoted_spread"] = calc_quoted_spread(book_df, bid_price_col="bid_price1", ask_price_col="ask_price1")
    book_df["bid_spread"] = calc_spread(book_df, col1="bid_price1", col2="bid_price2")
    book_df["ask_spread"] = calc_spread(book_df, col1="ask_price1", col2="ask_price2")
    book_df["bid_ask_spread"] = np.abs(calc_spread(book_df, col1="bid_spread", col2="ask_spread"))

    book_df["total_volume"] = calc_total_volume(book_df)
    book_df["volume_imbalance"] = calc_volume_imbalance(book_df)

    # Aggregations for the window that starts at 0 seconds.
    book_feature_dict = {
        'seconds_in_bucket': ['nunique'],

        'wap1': ['max', 'min'],
        'wap2': ['max', 'min'],

        'wap1_log_return_squared': ['sum'],
        'wap2_log_return_squared': ['sum'],

        'quoted_spread': ['mean', 'max', 'min'],
        'bid_spread': ['mean', 'max', 'min'],
        'ask_spread': ['mean', 'max', 'min'],
        'bid_ask_spread': ['mean', 'max', 'min'],

        'wap_balance': ['mean', 'max', 'min'],
        'total_volume': ['sum', 'mean', 'max', 'min'],
        'volume_imbalance': ['mean', 'max', 'min'],
    }
    # Reduced aggregations for every window that starts later than 0 seconds.
    book_feature_dict_time = {
        'wap1_log_return_squared': ['sum'],
        'wap2_log_return_squared': ['sum'],
    }

    output_df = cudf.DataFrame()

    for seconds_in_bucket in tqdm(seconds_in_buckets):
        tmp_df = get_stats_window(
            feature_df=book_df,
            feature_dict=book_feature_dict if seconds_in_bucket == 0 else book_feature_dict_time,
            prefix='book',
            seconds_in_bucket=seconds_in_bucket
        )

        # Each window adds its own columns next to the ones collected so far.
        output_df = cudf.concat([
            output_df,
            tmp_df,
        ], axis=1)

    # Turn the time_id index into a column so stock_id can join it in the index.
    output_df.reset_index(inplace=True)
    output_df['stock_id'] = np.int8(stock_id)
    output_df.set_index(['stock_id', 'time_id'], inplace=True)

    return output_df

## Trade Preprocessor

In [None]:
def trade_preprocessor(
    stock_id:int=0, 
    exec_type='train', 
    seconds_in_buckets:List[int]=None) -> pd.DataFrame:
    """Build per-``time_id`` trade features for one stock.

    Reads the stock's trade parquet partition with cudf, adds the squared log
    return of ``price`` and the traded ``amount`` (``price * size``), then
    aggregates per ``time_id`` over one window per entry of
    ``seconds_in_buckets``.

    Args:
        stock_id: Stock whose ``stock_id=<id>`` partition is read.
        exec_type: ``'train'`` or ``'test'``; selects ``trade_<exec_type>.parquet``.
        seconds_in_buckets: Lower bounds of the aggregation windows. ``None``
            (the default) uses ``Config.SECONDS_IN_BUCKETS``. This argument used
            to be ignored in favour of the Config value.

    Returns:
        A cudf DataFrame (despite the ``pd.DataFrame`` annotation) indexed by
        ``(stock_id, time_id)``, with one column per feature and window.

    Raises:
        AssertionError: If ``exec_type`` is neither ``'train'`` nor ``'test'``.
    """
    assert exec_type in ['train', 'test']

    if seconds_in_buckets is None:
        seconds_in_buckets = Config.SECONDS_IN_BUCKETS

    TRADE_FILE_PATH = Config.INPUT_PATH / f'trade_{exec_type}.parquet/stock_id={stock_id}'
    trade_df = cudf.read_parquet(TRADE_FILE_PATH)

    trade_df["price_log_return_squared"] = calc_log_return_squared(trade_df, "price")
    trade_df["amount"] = trade_df["price"] * trade_df["size"]

    # Aggregations for the window that starts at 0 seconds.
    trade_feature_dict = {
        'seconds_in_bucket': ['nunique'],
        'price': ['mean', 'max', 'min'],
        'price_log_return_squared': ['sum', "mean"],
        'size': ['sum', 'mean', 'max', 'min'],
        'order_count': ['sum', 'mean', 'max', 'min'],
        'amount': ['sum', 'mean', 'max', 'min'],
    }
    # Reduced aggregations for every window that starts later than 0 seconds.
    trade_feature_dict_time = {
        'seconds_in_bucket': ['nunique'],
        'price_log_return_squared': ['sum', "mean"],
        'size': ['sum', 'mean', 'max', 'min'],
        'order_count': ['sum', 'mean', 'max', 'min'],
    }

    output_df = cudf.DataFrame()

    for seconds_in_bucket in seconds_in_buckets:
        tmp_df = get_stats_window(
            feature_df=trade_df,
            feature_dict=trade_feature_dict if seconds_in_bucket == 0 else trade_feature_dict_time,
            prefix='trade',
            seconds_in_bucket=seconds_in_bucket
        )

        # Each window adds its own columns next to the ones collected so far.
        output_df = cudf.concat([
            output_df,
            tmp_df,
        ], axis=1)

    # Turn the time_id index into a column so stock_id can join it in the index.
    output_df.reset_index(inplace=True)
    output_df['stock_id'] = np.int8(stock_id)
    output_df.set_index(['stock_id', 'time_id'], inplace=True)

    return output_df

## Parallel Preprocessor

In [None]:
def Preprocessor(
    stock_ids:List[int]=[0], 
    exec_type:str='train') -> pd.DataFrame:
    """Build book and trade features for several stocks and stack them.

    Every stock is handled by its own joblib job; the per-stock frames are
    concatenated with ``cudf.concat`` afterwards.

    Args:
        stock_ids: Stock ids to process.
        exec_type: ``'train'`` or ``'test'``; forwarded to both preprocessors.

    Returns:
        The concatenation of all per-stock feature frames.

    Raises:
        AssertionError: If ``exec_type`` is neither ``'train'`` nor ``'test'``.
    """
    assert exec_type in ['train', 'test']

    def parallel_preprocessor(stock_id:int, exec_type:str):
        """Left-join one stock's trade features onto its book features."""
        book_features = book_preprocessor(stock_id=stock_id, exec_type=exec_type)
        trade_features = trade_preprocessor(stock_id=stock_id, exec_type=exec_type)

        # Both frames share the same index, so the join is index-on-index.
        return cudf.merge(
            left=book_features,
            right=trade_features,
            how='left',
            left_index=True,
            right_index=True
        )

    jobs = (delayed(parallel_preprocessor)(stock_id, exec_type) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(jobs)

    # Stack the frames returned by the parallel jobs.
    return cudf.concat(per_stock_frames)

## Add_time_stock

In [None]:
def add_time_stock(df:pd.DataFrame) -> pd.DataFrame:
    """Append stock-level, time-level and cluster-level aggregate features.

    Three groups of columns are merged onto ``df`` (using ``cudf.merge`` with its
    default join type):

    * mean/max/min of every column containing ``'squared_sum'``, grouped by
      ``stock_id`` (suffix ``_stock_id``);
    * the same statistics grouped by ``time_id`` (suffix ``_time_id``);
    * the mean of every column containing ``'>=0'``, grouped by
      ``(cluster, time_id)`` and pivoted to one column per cluster, named
      ``{column}_{cluster}`` for clusters ``0 .. Config.N_CLUSTERS - 1``.

    Args:
        df: Feature frame with ``stock_id`` and ``time_id`` available after
            ``reset_index()`` and with a ``cluster`` column.

    Returns:
        The merged frame with a fresh positional index.
    """
    stats = ["mean", "max", "min"]
    df = df.reset_index()
    use_cols = [col for col in df.columns if 'squared_sum' in col]

    stock_id_df = df.groupby('stock_id')[use_cols].agg(stats)
    stock_id_df.columns = ["_".join(map(str, col)) + '_stock_id' for col in stock_id_df.columns]

    time_id_df = df.groupby('time_id')[use_cols].agg(stats)
    time_id_df.columns = ["_".join(map(str, col)) + '_time_id' for col in time_id_df.columns]

    # From here on use_cols refers to the features of the window starting at 0.
    use_cols = [col for col in df.columns if '>=0' in col]
    cluster_time_id_df = df.groupby(['cluster', 'time_id'])[use_cols].agg("mean")
    cluster_time_id_df.reset_index(inplace=True)
    cluster_time_id_df = cluster_time_id_df.pivot(index='time_id', columns='cluster')
    cluster_time_id_df.columns = ['_'.join(map(str, col)) for col in cluster_time_id_df.columns]

    # Clusters that do not occur in `df` get all-NaN columns so the output schema
    # always has one column per (feature, cluster) pair.
    for col in use_cols:
        for i in range(Config.N_CLUSTERS):
            if f'{col}_{i}' not in cluster_time_id_df.columns:
                cluster_time_id_df[f'{col}_{i}'] = np.nan
    # Keep exactly those columns, in a fixed order; any other cluster label
    # (for example the -1 given to unmapped stocks) is dropped here.
    cluster_time_id_df = cluster_time_id_df[[f'{col}_{i}' for col in use_cols for i in range(Config.N_CLUSTERS)]]

    df = cudf.merge(left=df, right=stock_id_df, left_on=['stock_id'], right_index=True)
    df = cudf.merge(left=df, right=time_id_df, left_on=['time_id'], right_index=True)
    df = cudf.merge(left=df, right=cluster_time_id_df, left_on=['time_id'], right_index=True)

    return df.reset_index(drop=True)

## Add Cluster

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering

class ClusterStock:
    """Assign each stock to a cluster based on how its target correlates with others.

    ``fit`` learns a ``stock_id -> cluster label`` mapping with KMeans and
    ``transform`` applies that mapping to a frame's ``stock_id`` column.
    """

    def __init__(
        self,
        n_clusters: int=7, 
        random_state: int=3665):
        """Store the KMeans settings.

        Args:
            n_clusters: Number of clusters passed to KMeans.
            random_state: Seed passed to KMeans.
        """
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(
        self, 
        X:pd.DataFrame):
        """Learn the stock-to-cluster mapping and store it in ``self.mapping_dict``.

        Args:
            X: Frame with ``time_id``, ``stock_id`` and ``target`` columns.
        """
        # One row per time_id, one column per stock_id, holding the target.
        pivot_df = X.pivot_table(index='time_id', columns='stock_id', values='target')
        # Stock-by-stock correlation matrix; each row is one stock's feature vector.
        corr_df = pivot_df.corr()

        clustering_method = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        # Alternative kept from the original notebook:
        # clustering_method = AgglomerativeClustering(n_clusters=self.n_clusters, affinity = 'euclidean', linkage = 'ward')
        clustering_method.fit(corr_df.to_numpy())

        self.mapping_dict = dict(zip(corr_df.index, clustering_method.labels_))

    def transform(
        self, 
        X:pd.DataFrame) -> np.array:
        """Return the cluster label of every row of ``X`` as a host array.

        Stocks missing from the fitted mapping get the label ``-1``.

        Args:
            X: Frame with a ``stock_id`` column.

        Returns:
            One label per row, in row order.
        """
        labels = X['stock_id'].map(self.mapping_dict).fillna(-1)

        # Prefer `to_numpy`, which pandas offers; fall back to `to_array` for
        # series types that only provide that older method name.
        to_host = getattr(labels, 'to_numpy', None)
        if to_host is None:
            to_host = labels.to_array
        return to_host()

    def fit_transform(
        self, 
        X:pd.DataFrame) -> np.array:
        """Fit on ``X`` and return the cluster label of each of its rows."""
        self.fit(X)
        return self.transform(X)

## Load Data

In [None]:
# Read the training targets with explicit, compact dtypes.
train_dtypes = {'stock_id': np.int8, 'time_id':np.int16, 'target':np.float32}
train_df = pd.read_csv(Config.INPUT_PATH / "train.csv", dtype=train_dtypes)

train_stock_ids = train_df['stock_id'].unique()

# Build book/trade features for the first NUM_STOCK_IDS stocks.
feature_train_df = Preprocessor(stock_ids=train_stock_ids[:Config.NUM_STOCK_IDS], exec_type='train')

# Learn the stock -> cluster mapping from the targets in train.csv.
# NOTE(review): random_state here is 3655 while the ClusterStock default is 3665 - confirm which is intended.
cluster_stock = ClusterStock(n_clusters=Config.N_CLUSTERS, random_state=3655)
cluster_stock.fit(train_df)

# reset_index() exposes the index levels as columns so transform can read 'stock_id'.
feature_train_df['cluster'] = cluster_stock.transform(feature_train_df.reset_index())
feature_train_df = add_time_stock(feature_train_df)

# Left-join the features onto the targets, keyed by (stock_id, time_id).
feature_train_df = cudf.merge(
    left=cudf.DataFrame(train_df).set_index(['stock_id', 'time_id']),
    right=feature_train_df.set_index(['stock_id', 'time_id']),
    how='left', 
    left_index=True, 
    right_index=True)

feature_train_df.sort_index(inplace=True)
feature_train_df.reset_index(inplace=True)

# Write the final training table to the working directory.
feature_train_df.to_feather('feature_train.feather')