In [None]:
import sys
sys.path.append('../input/rapids-kaggle-utils')

import pandas as pd
import numpy as np
import cupy as cp
import cudf
import cuml
import glob
from tqdm import tqdm
import cu_utils.transform as cutran

import gc
from joblib import Parallel, delayed

In [None]:
# Root folder of the competition data; later cells build file names by
# concatenating onto this string.
PATH = "../input/optiver-realized-volatility-prediction/"

# List the parquet part files of the training order book and trade data
# (two directory levels below each *.parquet folder).
order_book_training, trade_training = (
    glob.glob(pattern)
    for pattern in (
        f'{PATH}/book_train.parquet/*/*',
        f'{PATH}/trade_train.parquet/*/*',
    )
)

# Report how many part files were found for each data set.
print('size of order_book_training', len(order_book_training))
print('size of trade_training', len(trade_training))

# Steps/Code to reproduce bug

In [None]:
# Bug reproduction: load the order book of one stock, re-base seconds_in_bucket
# so that every time_id starts at 0, then reindex onto a dense
# (time_id, 0..599) grid and forward-fill the gaps.
data_df = cudf.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0/c439ef22282f412ba39e9137a3fdabac.parquet')
# Smallest seconds_in_bucket observed in each time_id; used as that bucket's offset.
offsets = data_df.groupby(['time_id'], as_index=False).agg({'seconds_in_bucket':'min'}).reset_index(drop=True)
offsets.columns = ['time_id', 'offset']
# Attach the per-time_id offset to every row and subtract it.
data_df = cudf.merge(data_df, offsets, on = ['time_id'], how = 'left')
data_df.seconds_in_bucket = data_df.seconds_in_bucket - data_df.offset
# According to the original author, MultiIndex.from_product relies on pandas
# internally, which is why the time_id level below is taken from a pandas copy
# of the frame (data_df.to_pandas()).
data_df = data_df.set_index(['time_id', 'seconds_in_bucket'])
columns = [col for col in data_df.columns.values]
# Reindex onto the full product of time_ids x seconds 0..599 and forward-fill missing rows.
data_df = data_df.reindex(cudf.MultiIndex.from_product([data_df.to_pandas().index.levels[0], np.arange(0,600)], names = ['time_id', 'seconds_in_bucket']), columns=columns).fillna(method='ffill')
# Turn the (time_id, seconds_in_bucket) index levels back into ordinary columns.
data_df = cudf.DataFrame(data_df.reset_index())

In [None]:
# Show the frame produced by the reindex + forward-fill reproduction.
data_df

# Workaround not working

This solution was given by @aerdem4.

In [None]:
# Workaround attempt: same preparation as the reproduction, but instead of
# DataFrame.reindex the dense (time_id, 0..599) grid is built as an empty frame
# and the book is left-joined onto it before forward-filling.
data_df = cudf.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0/c439ef22282f412ba39e9137a3fdabac.parquet')
# Smallest seconds_in_bucket observed in each time_id; used as that bucket's offset.
offsets = data_df.groupby(['time_id'], as_index=False).agg({'seconds_in_bucket':'min'}).reset_index(drop=True)
offsets.columns = ['time_id', 'offset']
# Attach the per-time_id offset to every row and subtract it.
data_df = cudf.merge(data_df, offsets, on = ['time_id'], how = 'left')
data_df.seconds_in_bucket = data_df.seconds_in_bucket - data_df.offset
# According to the original author, MultiIndex.from_product relies on pandas
# internally, which is why the time_id level below is taken from a pandas copy
# of the frame (data_df.to_pandas()).
data_df = data_df.set_index(['time_id', 'seconds_in_bucket'])
# NOTE: `columns` is not used by the join-based workaround below.
columns = [col for col in data_df.columns.values]
indices = cudf.MultiIndex.from_product([data_df.to_pandas().index.levels[0], np.arange(0,600)], names = ['time_id', 'seconds_in_bucket'])

# Left-join the book onto the empty grid frame, forward-fill, then discard the
# index: reset_index(drop=True) removes time_id and seconds_in_bucket from the result.
data_df = cudf.DataFrame().set_index(indices).join(data_df, how="left").fillna(method='ffill').reset_index(drop=True)

In [None]:
# Show the frame produced by the join-based workaround.
data_df

In [None]:
def fix_offsets_ffill(data_df, n_seconds=600):
    """Re-base every time bucket to start at second 0 and forward-fill it onto a dense grid.

    The frame passed in is the one that gets processed; nothing is re-read
    from disk, so each stock is handled with its own order book.

    Parameters
    ----------
    data_df : cudf.DataFrame
        Order-book rows containing at least the ``time_id`` and
        ``seconds_in_bucket`` columns.
    n_seconds : int, default 600
        Number of one-second slots generated per ``time_id`` (seconds
        ``0 .. n_seconds - 1``). Rows whose re-based second falls outside this
        range are not part of the grid and are therefore dropped by the left join.

    Returns
    -------
    cudf.DataFrame
        The grid rows with the original data columns (plus the helper
        ``offset`` column) forward-filled, and with ``time_id`` and
        ``seconds_in_bucket`` restored as ordinary columns so that callers can
        group by ``time_id``.
    """
    # Smallest seconds_in_bucket of each time_id; subtracting it guarantees that
    # every time_id owns a row at second 0, so the forward fill below always
    # has a value of the same time_id to start from.
    offsets = (
        data_df.groupby(['time_id'], as_index=False)
        .agg({'seconds_in_bucket': 'min'})
        .reset_index(drop=True)
    )
    offsets.columns = ['time_id', 'offset']
    data_df = cudf.merge(data_df, offsets, on=['time_id'], how='left')
    data_df['seconds_in_bucket'] = data_df['seconds_in_bucket'] - data_df['offset']

    # According to the original author, MultiIndex.from_product relies on
    # pandas internally, hence the time_id level is taken from a pandas copy.
    data_df = data_df.set_index(['time_id', 'seconds_in_bucket'])
    indices = cudf.MultiIndex.from_product(
        [data_df.to_pandas().index.levels[0], np.arange(0, n_seconds)],
        names=['time_id', 'seconds_in_bucket'],
    )

    # Left-join the book onto the empty grid frame and forward-fill the gaps.
    # NOTE(review): the forward fill depends on the join returning rows in grid
    # order - confirm this for the cuDF version in use.
    data_df = (
        cudf.DataFrame()
        .set_index(indices)
        .join(data_df, how="left")
        .fillna(method='ffill')
        # Keep the index levels as columns: downstream feature code needs time_id.
        .reset_index()
    )

    return data_df


def preprocess_book(book_path):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    book_path : str
        Location of the stock's book parquet data. The path must contain a
        ``stock_id=<n>`` component, from which the stock id is parsed.

    Returns
    -------
    cudf.DataFrame
        One row per ``time_id`` holding the realized-volatility features, the
        spread statistics and a constant ``stock_id`` column.
    """

    def flatten_columns(frame):
        """Rename two-level ``(column, stat)`` labels to ``column_stat`` (or ``column`` when stat is empty)."""
        frame.columns = [
            name if stat == "" else name + "_" + stat
            for name, stat in frame.columns
        ]
        return frame

    def rel_vol_fe(df, null_val=-9999):
        """Aggregate squared log-WAP differences of both book levels per ``time_id``.

        ``null_val`` is the sentinel the shift transform is asked to use for
        rows that have no predecessor; those rows are excluded from the
        statistics. Helper columns are added to ``df`` in place.
        """
        stats = ["sum", "mean", "std", "median", "max", "min"]

        for n in range(1, 3):
            bid_price = df[f"bid_price{n}"]
            ask_price = df[f"ask_price{n}"]
            bid_size = df[f"bid_size{n}"]
            ask_size = df[f"ask_size{n}"]

            # Weighted average price of level n and its logarithm.
            wap = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
            df[f"log_wap{n}"] = wap.log()

            # Previous log-WAP within the same time_id.
            df[f"log_wap_shifted{n}"] = (
                df[["time_id", f"log_wap{n}"]]
                .groupby("time_id")
                .apply_grouped(
                    cutran.get_cu_shift_transform(shift_by=1, null_val=null_val),
                    incols={f"log_wap{n}": 'x'},
                    outcols=dict(y_out=cp.float32),
                    tpb=32,
                )["y_out"]
            )

            # Squared difference between consecutive log-WAPs.
            df[f"diff_log_wap{n}"] = (df[f"log_wap{n}"] - df[f"log_wap_shifted{n}"]) ** 2

        # Drop the rows without a predecessor once, after both levels have been
        # computed. Filtering inside the loop would remove an additional row
        # per time_id from the level-1 values on the second pass.
        has_previous = (df["log_wap_shifted1"] != null_val) & (df["log_wap_shifted2"] != null_val)
        df = df[has_previous]

        # Summary statistics of the squared differences for both levels.
        sum_df = flatten_columns(
            df.groupby("time_id")
            .agg({"diff_log_wap1": stats, "diff_log_wap2": stats})
            .reset_index()
        )

        for level in (1, 2):
            sum_df[f"volatility{level}"] = (sum_df[f"diff_log_wap{level}_sum"]) ** 0.5
            for stat in ("mean", "std", "median", "max", "min"):
                sum_df[f"vol{level}_{stat}"] = sum_df[f"diff_log_wap{level}_{stat}"].fillna(0).values

        # Level-1 / level-2 ratios; nulls produced by the division become 0.
        sum_df["volatility_rate"] = (sum_df["volatility1"] / sum_df["volatility2"]).fillna(0)
        for stat in ("mean", "std", "median", "max"):
            sum_df[f"{stat}_volatility_rate"] = (sum_df[f"vol1_{stat}"] / sum_df[f"vol2_{stat}"]).fillna(0)

        return sum_df[["time_id", "volatility1", "volatility2",
                       "volatility_rate", "vol1_std", "vol2_std",
                       "vol1_mean", "vol2_mean", "vol1_median", "vol2_median",
                       "vol1_max", "vol2_max", "vol1_min", "vol2_min",
                       "mean_volatility_rate", "std_volatility_rate",
                       "median_volatility_rate", "max_volatility_rate"]]

    def spread_fe(df):
        """Aggregate bid/ask spread measures per ``time_id`` (columns are added to ``df`` in place)."""
        # Relative bid-ask spread: lowest ask over highest bid, minus one.
        df['bas'] = (df[['ask_price1', 'ask_price2']].min(axis=1)
                     / df[['bid_price1', 'bid_price2']].max(axis=1) - 1)

        # Horizontal (ask - bid per level) and vertical (level 1 - level 2 per side) spreads.
        df['h_spread_l1'] = df['ask_price1'] - df['bid_price1']
        df['h_spread_l2'] = df['ask_price2'] - df['bid_price2']
        df['v_spread_b'] = df['bid_price1'] - df['bid_price2']
        df['v_spread_a'] = df['ask_price1'] - df['ask_price2']

        spread_stats = ["mean", "std", "median", "max", "min"]

        # time_id is restored as a column by reset_index(), matching rel_vol_fe.
        spread_df = (
            df.groupby("time_id")
            .agg({"h_spread_l1": spread_stats,
                  "h_spread_l2": spread_stats,
                  "v_spread_b": spread_stats,
                  "v_spread_a": spread_stats,
                  "bas": ["mean"]})
            .reset_index()
        )

        return flatten_columns(spread_df)

    # Stats for book data
    book = cudf.read_parquet(book_path)
    stock_id = int(book_path.split("=")[1].split("/")[0])
    book = fix_offsets_ffill(book)
    rel_vol_data = rel_vol_fe(book)
    spread_data = spread_fe(book)

    transbook = cudf.merge(rel_vol_data,
                           spread_data,
                           on=['time_id'], how='left')
    transbook['stock_id'] = stock_id

    return transbook


def preprocess_trade(trade_path):
    """Build per-``time_id`` trade features for one stock.

    Parameters
    ----------
    trade_path : str
        Location of the stock's trade parquet data.

    Returns
    -------
    cudf.DataFrame
        The aggregated trade columns per ``time_id`` (without the intermediate
        ``diff_log_wap_*`` statistics) plus ``trade_volatility``,
        ``trade_vol_mean`` and ``trade_vol_std``.
    """

    def trade_fe(trade_df, null_val=-9999):
        """Aggregate squared price differences, order counts and sizes per ``time_id``.

        ``null_val`` is the sentinel the shift transform is asked to use for
        rows that have no predecessor; those rows are excluded.
        """
        # Previous trade price within the same time_id.
        # NOTE(review): despite the "log_wap" column names, the raw price is
        # used here, not its logarithm - confirm that this is intended.
        trade_df["log_wap_shifted"] = (
            trade_df[["time_id", "price"]]
            .groupby("time_id")
            .apply_grouped(
                cutran.get_cu_shift_transform(shift_by=1, null_val=null_val),
                incols={"price": 'x'},
                outcols=dict(y_out=cp.float32),
                tpb=32,
            )["y_out"]
        )
        # Honour the caller-supplied sentinel instead of a hard-coded -9999.
        trade_df = trade_df[trade_df["log_wap_shifted"] != null_val]

        # Squared difference between consecutive prices.
        trade_df["diff_log_wap"] = (trade_df["price"] - trade_df["log_wap_shifted"]) ** 2

        trade_features = trade_df.groupby('time_id', as_index=False).agg(
            {'diff_log_wap': ['sum', 'mean', 'std'],
             'order_count': ['mean'],
             'size': ['sum']}
        )

        # Flatten the two-level (column, stat) labels to "column_stat".
        trade_features.columns = [
            name if stat == "" else name + "_" + stat
            for name, stat in trade_features.columns
        ]

        trade_features['trade_volatility'] = (trade_features['diff_log_wap_sum'] ** 0.5)
        trade_features['trade_vol_mean'] = trade_features['diff_log_wap_mean'].fillna(0).values
        trade_features['trade_vol_std'] = trade_features['diff_log_wap_std'].fillna(0).values

        # The raw aggregates were only needed to derive the features above.
        intermediate = ['diff_log_wap_sum', 'diff_log_wap_std', 'diff_log_wap_mean']
        return trade_features[[col for col in trade_features.columns if col not in intermediate]]

    trade_df = cudf.read_parquet(trade_path)
    trade_stat = trade_fe(trade_df)

    return trade_stat


def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id and stack them.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process; each is converted with ``str`` to build the
        ``stock_id=<id>`` partition path.
    is_train : bool, default True
        Selects the ``*_train.parquet`` partitions when true, otherwise the
        ``*_test.parquet`` ones.

    Returns
    -------
    cudf.DataFrame
        Concatenation (with a fresh index) of the per-stock feature frames.
    """
    # Choose the partition prefixes for the requested split.
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    def for_joblib(stock_id):
        # Book features are the left side; trade features are matched on time_id.
        book_features = preprocess_book(PATH + book_prefix + str(stock_id))
        trade_features = preprocess_trade(PATH + trade_prefix + str(stock_id))
        return cudf.merge(book_features, trade_features, on='time_id', how='left')

    # One joblib task per stock, spread over all available workers.
    tasks = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(tasks)

    return cudf.concat(per_stock_frames, ignore_index = True)

In [None]:
%%time
# Load the training targets and collect the distinct stock ids as a host array.
train = cudf.read_csv(PATH + 'train.csv')
train_ids = train["stock_id"].unique().to_array()

# Build the book + trade feature table for every training stock.
train_process = preprocessor(train_ids, is_train=True)

In [None]:
# Attach the engineered features to the targets; rows without features get 0.
df_train = cudf.merge(train, train_process, how='left', on=['time_id', 'stock_id'])
df_train = df_train.fillna(0)

# Persist the training table and preview its first rows.
df_train.to_csv("./dtrain.csv", index=False)
df_train.head()