# Accelerating Trading on GPU via RAPIDS
## The best-scoring CPU kernel, accelerated on the GPU: a 3.5x speedup!

![](https://i.imgur.com/lkjVW2f.png)

In [None]:
# https://www.kaggle.com/aerdem4/optiver-pytorch-gpu-no-feature-eng

def nn_notebook():
    """Predict the test set with the pretrained PyTorch model and save the result.

    Loads ``test.csv`` and the test order-book / trade parquet files, averages
    the outputs of the saved fold checkpoints on the GPU, and writes
    ``nn_preds.csv`` (columns ``stock_id``, ``time_id``, ``nn_pred``) to the
    current working directory.  All imports and definitions are local to this
    function.
    """
    import glob
    import sys

    import numpy as np
    import pandas as pd
    from tqdm import tqdm

    def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
        # mode = "train"/"test"
        file_name = f'{path}/{mode}.csv'
        return pd.read_csv(file_name)

    def load_by_stock_and_time(paths):
        """Return ``{stock_id: {time_id: DataFrame}}`` built from parquet files."""
        by_stock = dict()
        for path in tqdm(paths):
            # The stock id is parsed from the ``...=<id>/`` part of the path.
            stock_id = int(path.split("=")[1].split("/")[0])
            frame = pd.read_parquet(path)
            # One groupby pass instead of one boolean filter per time_id.
            by_stock[stock_id] = {
                time_id: rows.reset_index(drop=True)
                for time_id, rows in frame.groupby("time_id")
            }
        return by_stock

    df = load_data("test")

    SCALE = 100
    PATH = "/kaggle/input/optiver-realized-volatility-prediction"

    order_book_paths = glob.glob(f'{PATH}/book_test.parquet/*/*')
    trade_paths = glob.glob(f'{PATH}/trade_test.parquet/*/*')

    order_books = load_by_stock_and_time(order_book_paths)
    trades = load_by_stock_and_time(trade_paths)

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, Dataset

    # Per-feature normalisation constants, in the same order as
    # OptiverDataset.order_features / OptiverDataset.trade_features.
    means_order = torch.FloatTensor([  0.9997,   1.0003, 769.9902, 766.7346,   0.9995,   1.0005, 959.3417,
            928.2203, 300])
    stds_order = torch.FloatTensor([3.6881e-03, 3.6871e-03, 5.3541e+03, 4.9549e+03, 3.7009e-03, 3.6991e-03,
            6.6838e+03, 5.7353e+03, 300])

    means_trade = torch.FloatTensor([300, 1.0, 100, 3.0])
    stds_trade = torch.FloatTensor([300, 0.004, 153, 3.5])

    class OptiverDataset(Dataset):
        """Yields normalised order-book and trade sequences for each row of ``df``."""

        def __init__(self, df, aug=False):
            super().__init__()
            self.df = df.reset_index(drop=True)
            self.aug = aug
            self.seq_len = 600
            self.order_features = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1','bid_price2',
                             'ask_price2', 'bid_size2', 'ask_size2', "seconds_in_bucket"]
            self.trade_features = ["seconds_in_bucket", "price", "size", "order_count"]

        def extract_features(self, data_dict, stock_id, time_id, features, means, stds):
            """Return a ``(seq_len, len(features))`` tensor for one stock/time pair.

            The normalised rows are written to the END of the tensor; rows
            that are not overwritten keep the fill value -1.  When no data is
            available the tensor is returned entirely filled with -1.
            """
            X = -torch.ones((self.seq_len, len(features)))
            try:
                df = data_dict[stock_id][time_id]
                feature_array = df[features].values
                X[-feature_array.shape[0]:] = (torch.FloatTensor(feature_array) - means)/stds
            except (KeyError, RuntimeError):
                # KeyError: no rows stored for this (stock_id, time_id).
                # RuntimeError: torch rejected the assignment because the
                # number of rows does not fit into the seq_len window.
                # In both cases the all -1 tensor is used (best effort).
                pass
            return X

        def __getitem__(self, index):
            row = self.df.iloc[index]

            X1 = self.extract_features(order_books, row.stock_id, row.time_id, self.order_features,
                                      means_order, stds_order)
            # extract_features already falls back to the -1 tensor itself, so
            # no extra guard is needed around the trade lookup.
            X2 = self.extract_features(trades, row.stock_id, row.time_id, self.trade_features,
                                      means_trade, stds_trade)
            # Dummy target: only inference is run in this function.
            target = torch.FloatTensor([0.0])
            stock = torch.LongTensor([row.stock_id])
            return X1, X2, stock, target

        def __len__(self):
            return self.df.shape[0]

    ds = OptiverDataset(df)

    class ConvBlock(nn.Module):
        """Conv1d followed by BatchNorm1d and ReLU."""

        def __init__(self, in_dim, out_dim, kernel_size, stride=1):
            super().__init__()
            self.lin = nn.Conv1d(in_dim, out_dim, kernel_size, stride=stride)
            self.bn = nn.BatchNorm1d(out_dim)
            self.activation = nn.ReLU()

        def forward(self, x):
            x = self.lin(x)
            x = self.bn(x)
            return self.activation(x)

    class SubModel(nn.Module):
        """Convolutional encoder for one sequence (order book or trades)."""

        def __init__(self, in_dim):
            super().__init__()
            self.convs1 = nn.Sequential(ConvBlock(in_dim, 16, 3),
                                       ConvBlock(16, 32, 3))
            # 36 input channels = 32 from convs1 + 4 from the stock embedding.
            self.stock_conv = ConvBlock(36, 64, 4, stride=4)
            self.avg_pool = nn.AdaptiveAvgPool1d(8)
            self.max_pool = nn.AdaptiveMaxPool1d(8)
            self.convs2 = nn.Sequential(ConvBlock(128, 128, 2, stride=2),
                                        ConvBlock(128, 32, 2, stride=2),
                                        ConvBlock(32, 8, 2, stride=2))

        def forward(self, x, s):
            x = self.convs1(x.transpose(2, 1))
            # Repeat the stock embedding along the sequence axis and append
            # it as extra channels.
            x = self.stock_conv(torch.cat([x, s.repeat(1, 1, x.shape[2])], axis=1))
            x = torch.cat([self.avg_pool(x), self.max_pool(x)], axis=1)
            x = self.convs2(x).squeeze(-1)
            return x

    class Model(nn.Module):
        """Two SubModels (order book, trades) joined by a linear output layer."""

        def __init__(self):
            super().__init__()
            self.order_model = SubModel(in_dim=9)
            self.trade_model = SubModel(in_dim=4)
            self.top = nn.Linear(16, 1)
            self.stock_emb = nn.Embedding(127, 4)

        def forward(self, inputs):
            x1, x2, s = inputs
            s = self.stock_emb(s).transpose(2, 1)

            x1 = self.order_model(x1, s)
            x2 = self.trade_model(x2, s)
            x = self.top(torch.cat([x1, x2], axis=1))
            return x

    def read_data(data):
        """Move a batch to the GPU and split it into (inputs, target)."""
        return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()

    def inference(model, loader, num_folds=5):
        """Return the fold-averaged predictions for every batch of ``loader``."""
        model.eval()

        tbar = tqdm(loader, file=sys.stdout)

        preds = []

        model_weights = {i: torch.load(f"/kaggle/input/optiver-nn/optiver_nn_v01_{i}.pth") for i in range(num_folds)}

        with torch.no_grad():
            for data in tbar:
                inputs, _ = read_data(data)

                # Average the predictions of all fold checkpoints by swapping
                # the weights of the single model instance.
                model.load_state_dict(model_weights[0])
                pred = model(inputs)/num_folds
                for i in range(1, num_folds):
                    model.load_state_dict(model_weights[i])
                    pred += model(inputs)/num_folds

                preds.append(pred.detach().cpu().numpy().ravel())

        return np.concatenate(preds)

    NW = 4
    BS = 256
    loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW, pin_memory=False, drop_last=False)

    model = Model()
    model = model.cuda()

    y = inference(model, loader)

    # Clip negative outputs to zero and divide by SCALE before saving.
    df["nn_pred"] = np.clip(y, 0.0, None)/SCALE

    df.to_csv("nn_preds.csv", index=False, columns=["stock_id", "time_id", "nn_pred"])


nn_notebook()

In [None]:
import cupy as cp
import cudf
import cuml
import glob
from tqdm import tqdm

cudf.__version__

In [None]:
PATH = "/kaggle/input/optiver-realized-volatility-prediction"


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a cuDF DataFrame.

    ``mode`` is the file stem, i.e. "train" or "test".
    """
    return cudf.read_csv(f'{path}/{mode}.csv')


dev_df = load_data("train", path=PATH)
dev_df.head()

In [None]:
# Work with the target multiplied by SCALE; the final predictions are divided
# by SCALE again before the submission is written.
SCALE = 100
dev_df["target"] *= SCALE

# Distinct stock ids in the training table (the count is the cell output).
stock_ids = dev_df["stock_id"].unique()
len(stock_ids)

In [None]:
# Collect the order-book parquet files for train and test.
# NOTE: glob.glob does not guarantee any ordering of the returned paths.
order_book_training = glob.glob(f'{PATH}/book_train.parquet/*/*')
order_book_test = glob.glob(f'{PATH}/book_test.parquet/*/*')

len(order_book_training), len(order_book_test)

In [None]:
# Collect the trade parquet files for train and test.
# NOTE: glob.glob does not guarantee any ordering of the returned paths.
trades_training = glob.glob(f'{PATH}/trade_train.parquet/*/*')
trades_test = glob.glob(f'{PATH}/trade_test.parquet/*/*')

len(trades_training), len(trades_test)

## Using rapids-kaggle-utils for missing cuDF aggregation functions

In [None]:
# IPython magic: switch the working directory to the rapids-kaggle-utils
# dataset — presumably so that `cu_utils` (imported in the next cell) can be
# found; TODO confirm.
%cd /kaggle/input/rapids-kaggle-utils/

In [None]:
import cu_utils.transform as cutran



def log_diff(df, in_col, null_val):
    """Return ``log(df[in_col])`` minus its shifted value, per ``time_id``.

    Side effects: adds/overwrites the columns ``logx``, ``logx_shifted`` and
    ``keep_row`` on ``df``.  ``keep_row`` is False wherever the shifted value
    equals ``null_val``.  The shift itself is delegated to
    ``cutran.get_cu_shift_transform`` (presumably a one-row shift inside each
    ``time_id`` group that fills the gap with ``null_val`` — confirm against
    rapids-kaggle-utils).
    """
    df["logx"] = df[in_col].log()

    shift_kernel = cutran.get_cu_shift_transform(shift_by=1, null_val=null_val)
    shifted = df[["time_id", "logx"]].groupby("time_id").apply_grouped(
        shift_kernel,
        incols={"logx": 'x'},
        outcols=dict(y_out=cp.float32),
        tpb=32,
    )
    df["logx_shifted"] = shifted["y_out"]

    df["keep_row"] = df["logx_shifted"] != null_val
    return df["logx"] - df["logx_shifted"]



def extract_raw_book_features(df, null_val=-9999):
    """Add row-level order-book features to ``df`` and drop the unusable rows.

    For both book levels this adds the weighted average price (``wap{n}``),
    its log return (``log_return{n}``) and the squared log return
    (``realized_vol{n}``), followed by spread and volume features and a
    constant column ``c``.  Only rows flagged by ``keep_row`` (set inside
    ``log_diff``) are returned.
    """
    for level in (1, 2):
        bid_price = df[f"bid_price{level}"]
        ask_price = df[f"ask_price{level}"]
        bid_size = df[f"bid_size{level}"]
        ask_size = df[f"ask_size{level}"]
        # Each price is weighted with the size of the opposite side.
        df[f"wap{level}"] = (bid_price*ask_size + ask_price*bid_size) / (bid_size + ask_size)
        df[f"log_return{level}"] = log_diff(df, in_col=f"wap{level}", null_val=null_val)
        df[f"realized_vol{level}"] = df[f"log_return{level}"]**2

    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = abs(ask_volume - bid_volume)
    # Constant column; its sum in extract_book_stats is the row count.
    df["c"] = 1

    return df[df["keep_row"]]


def extract_raw_trade_features(df, null_val=-9999):
    """Add ``realized_vol_trade`` (squared log return of ``price``) to ``df``.

    Returns only the rows flagged by ``keep_row``, which ``log_diff`` sets.
    """
    price_log_return = log_diff(df, in_col="price", null_val=null_val)
    df["realized_vol_trade"] = price_log_return**2
    return df[df["keep_row"]]


def agg(df, feature_dict):
    """Aggregate ``df`` per ``time_id`` and flatten the column names.

    ``feature_dict`` maps a column name to a list of aggregation names.  The
    resulting two-level column labels are flattened to ``<column>_<stat>``;
    labels whose second level is empty (the ``time_id`` key) keep their first
    level unchanged.
    """
    stats = df.groupby("time_id").agg(feature_dict).reset_index()
    stats.columns = [
        col[0] if col[1] == "" else col[0] + "_" + col[1]
        for col in stats.columns
    ]
    return stats


def extract_book_stats(df):
    """Aggregate the order-book features of ``df`` per ``time_id`` via ``agg``.

    Price, spread and volume features get sum/mean/std; the counter ``c`` and
    the squared log returns are only summed.
    """
    default_stats = ["sum", "mean", "std"]
    full_stat_columns = [
        'wap1',
        'wap2',
        'log_return1',
        'log_return2',
        'wap_balance',
        'price_spread',
        'bid_spread',
        'ask_spread',
        'total_volume',
        'volume_imbalance',
    ]

    feature_dict = {column: default_stats for column in full_stat_columns}
    for column in ('c', 'realized_vol1', 'realized_vol2'):
        feature_dict[column] = ["sum"]

    return agg(df, feature_dict)
    

    
    
def extract_trade_stats(df):
    """Aggregate the trade features of ``df`` per ``time_id`` via ``agg``."""
    feature_dict = dict([
        ('realized_vol_trade', ["sum"]),
        ('seconds_in_bucket', ["count"]),
        ('size', ["sum"]),
        ('order_count', ["mean"]),
    ])
    return agg(df, feature_dict)


def time_constraint_fe(df, stats_df, last_sec, fe_function, cols):
    """Append statistics of the last ``last_sec`` seconds to ``stats_df``.

    Rows with ``seconds_in_bucket >= 600 - last_sec`` are passed to
    ``fe_function``; its output is left-joined to ``stats_df`` on ``time_id``
    and clashing column names from the window get the suffix ``_<last_sec>``.
    If no row falls into the window, an empty frame with columns ``cols`` is
    joined instead.
    """
    window_start = 600 - last_sec
    in_window = df["seconds_in_bucket"] >= window_start
    sub_df = df[in_window].reset_index(drop=True)

    sub_stats = fe_function(sub_df) if sub_df.shape[0] > 0 else cudf.DataFrame(columns=cols)

    return stats_df.merge(sub_stats, on="time_id", how="left", suffixes=('', f'_{last_sec}'))
    

def feature_engineering(book_path, trade_path):
    """Return the per-``time_id`` feature table for one stock.

    Order-book and trade statistics are computed over the full bucket and
    again over the last 150, 300 and 450 seconds; the trade statistics are
    then left-joined to the book statistics on ``time_id``.
    """
    book_df = extract_raw_book_features(cudf.read_parquet(book_path))
    book_stats = extract_book_stats(book_df)
    # Column set of the full-window statistics, captured before the loop
    # below widens book_stats.
    book_cols = book_stats.columns

    trade_df = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_stats = extract_trade_stats(trade_df)
    trade_cols = trade_stats.columns

    for last_sec in (150, 300, 450):
        book_stats = time_constraint_fe(book_df, book_stats, last_sec, extract_book_stats, book_cols)
        trade_stats = time_constraint_fe(trade_df, trade_stats, last_sec, extract_trade_stats, trade_cols)

    return book_stats.merge(trade_stats, on="time_id", how="left")


def process_data(order_book_paths, trade_paths, stock_ids):
    """Build the feature table for every stock found in the given files.

    Book and trade files are paired by the stock id embedded in their paths
    rather than by list position: ``glob`` gives no ordering guarantee, so the
    two lists may be ordered differently and a positional ``zip`` could pair a
    book file with another stock's trade file.

    Args:
        order_book_paths: paths of the order-book parquet files; the stock id
            is parsed from the ``...=<id>/`` part of each path.
        trade_paths: paths of the trade parquet files (same path convention).
        stock_ids: unused; kept so that existing callers keep working.

    Returns:
        The concatenated ``feature_engineering`` output of every stock that
        has both a book and a trade file, with a ``stock_id`` column added.
        Stocks without a trade file are skipped with a warning.
    """
    import warnings

    def stock_id_of(path):
        # e.g. ".../stock_id=12/part.parquet" -> 12
        return int(path.split("=")[1].split("/")[0])

    trade_path_by_stock = {stock_id_of(path): path for path in trade_paths}

    stock_dfs = []
    for book_path in tqdm(list(order_book_paths)):
        stock_id = stock_id_of(book_path)
        trade_path = trade_path_by_stock.get(stock_id)
        if trade_path is None:
            warnings.warn(f"no trade file found for stock_id={stock_id}; stock skipped")
            continue

        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)
    return cudf.concat(stock_dfs)

In [None]:
# Build the engineered feature tables for the training and the test files.
past_volatility = process_data(order_book_training, trades_training, stock_ids)
past_test_volatility = process_data(order_book_test, trades_test, stock_ids)

past_volatility.shape, past_test_volatility.shape

### Get the predictions of the NN trained without feature engineering, from https://www.kaggle.com/aerdem4/optiver-pytorch-gpu-no-feature-eng

In [None]:
# Attach the NN predictions as an extra feature: the stored out-of-fold file
# for the training rows, and the nn_preds.csv written by nn_notebook() for
# the test rows.
past_volatility = past_volatility.merge(cudf.read_csv("/kaggle/input/optiver-nn/optiver_nn_v01_oof.csv"), on=["stock_id", "time_id"], how="left")
past_test_volatility = past_test_volatility.merge(cudf.read_csv("/kaggle/working/nn_preds.csv"), on=["stock_id", "time_id"], how="left")


# Clip negative predictions to zero and multiply by SCALE, the factor that is
# also applied to the target.
past_volatility["nn_pred"] = past_volatility["nn_pred"].clip(0.0, None)*SCALE
past_test_volatility["nn_pred"] = past_test_volatility["nn_pred"].clip(0.0, None)*SCALE

In [None]:
def stock_time_fe(df):
    """Add per-stock and per-time_id aggregates of the volatility features.

    For each grouping key (``stock_id``, then ``time_id``) and each statistic
    (mean, max, std, min) the selected columns are aggregated and left-joined
    back onto ``df`` under the name ``<key>_<stat>_<column>``.
    """
    cols = ['realized_vol1_sum', 'realized_vol2_sum', 'realized_vol_trade_sum',
            'realized_vol1_sum_150', 'realized_vol2_sum_150', 'realized_vol_trade_sum_150',
            'realized_vol1_sum_300', 'realized_vol2_sum_300', 'realized_vol_trade_sum_300',
            'realized_vol1_sum_450', 'realized_vol2_sum_450', 'realized_vol_trade_sum_450',
            'nn_pred']

    key_stat_pairs = [(key, stat)
                      for key in ("stock_id", "time_id")
                      for stat in ("mean", "max", "std", "min")]

    for key, stat in key_stat_pairs:
        group_stats = df.groupby(key)[cols].agg(stat)
        group_stats.columns = [f"{key}_{stat}_{col}" for col in group_stats.columns]
        df = df.merge(group_stats.reset_index(), on=key, how="left")

    return df

# Tag the rows so train and test can be separated again after the shared
# feature engineering.
past_volatility["is_test"] = False
past_test_volatility["is_test"] = True
# cudf.concat (already used in process_data) replaces the deprecated
# DataFrame.append.
all_df = cudf.concat([past_volatility, past_test_volatility]).reset_index(drop=True)

# The group statistics are computed over train and test rows together.
all_df = stock_time_fe(all_df)

past_volatility = all_df[~all_df["is_test"]]
past_test_volatility = all_df[all_df["is_test"]]

In [None]:
# Join the engineered features onto the labelled training rows.
dev_df = dev_df.merge(past_volatility, on=["stock_id", "time_id"], how="left")

# Every column except the identifiers, the target and the train/test flag is
# used as a model feature.
features = [col for col in list(dev_df.columns)
            if col not in {"stock_id", "time_id", "target", "is_test"}]
len(features)

## Train XGBoost model on GPU

In [None]:
import xgboost as xgb

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error, computed with CuPy."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = cp.mean(cp.square(relative_error))
    return cp.sqrt(mean_squared_error)


def rmspe_xgb(pred, dtrain):
    """Custom XGBoost evaluation metric: ``('rmspe', value)`` for ``pred``.

    The labels are read from ``dtrain`` via ``get_label()``.
    """
    labels = cp.array(dtrain.get_label())
    score = rmspe(labels, cp.array(pred))
    return 'rmspe', score


# Cross-validated GPU training: NUM_FOLDS models, each validated on the rows
# of one fold and used to predict the test set.
NUM_FOLDS = 5
param = {'objective': 'reg:squarederror',
         'learning_rate': 0.1,
         'max_depth': 3,
         "min_child_weight": 200,
         "reg_alpha": 10.0,
         "tree_method": 'gpu_hist', "gpu_id": 0,
         'disable_default_eval_metric': 1
    }

target = "target"

# Out-of-fold predictions for the training rows and the running fold average
# for the test rows.
oof_preds = cp.zeros(dev_df.shape[0])
test_preds = cp.zeros(past_test_volatility.shape[0])

for fold in range(NUM_FOLDS):
    print("Fold", fold)
    # Folds are assigned by time_id modulo NUM_FOLDS, so all rows of one
    # time_id end up in the same fold.
    train_ind = cp.where(dev_df["time_id"].values % NUM_FOLDS != fold)[0]
    val_ind = cp.where(dev_df["time_id"].values % NUM_FOLDS == fold)[0]
        
    train_df, val_df = dev_df.iloc[train_ind], dev_df.iloc[val_ind]

    # Weighting the squared error by 1/target^2 turns it into the squared
    # percentage error, i.e. the quantity measured by rmspe.
    d_train = xgb.DMatrix(train_df[features], train_df[target], weight=1/cp.square(train_df[target]))
    d_val = xgb.DMatrix(val_df[features], val_df[target], weight=1/cp.square(val_df[target]))

    # rmspe_xgb is the only evaluation metric (the default one is disabled in
    # param); training stops early after 200 rounds without improvement.
    model = xgb.train(param, d_train, evals=[(d_train, "train"), (d_val, "val")], 
                      num_boost_round=5000, verbose_eval=50, feval=rmspe_xgb,
                      early_stopping_rounds=200)
    
    oof_preds[val_ind] = model.predict(d_val)
    # Each fold contributes 1/NUM_FOLDS of the final test prediction.
    test_preds += cp.array(model.predict(xgb.DMatrix(past_test_volatility[features].astype("float")))/NUM_FOLDS)

In [None]:
# Score the out-of-fold predictions against the (scaled) training target.
dev_df["pred"] = oof_preds
print(f'The RMSPE score of XGB is {rmspe(dev_df["target"], dev_df["pred"])}')

In [None]:
# row_id has the form "<stock_id>-<time_id>".
past_test_volatility["row_id"] = past_test_volatility["stock_id"].astype(str) + "-" + past_test_volatility["time_id"].astype(str) 
# Clip the averaged predictions to [0, 100] in scaled units, then undo SCALE.
past_test_volatility["target"] = test_preds.clip(0.0, 100.0)/SCALE

In [None]:
# IPython magic: switch back to /kaggle/working, so the relative path
# "submission.csv" used in the following cells resolves there.
%cd /kaggle/working

In [None]:
# Left-join the predictions onto the rows of test.csv; missing values left
# after the join (e.g. rows without a prediction) are filled with 0.0.
sub_df = load_data("test", path=PATH).merge(past_test_volatility[["row_id", "target"]], 
                                            on="row_id", how="left").fillna(0.0)

sub_df.to_csv("submission.csv", index=False, columns=["row_id", "target"])

In [None]:
# Read the written file back; the resulting frame is the cell output.
cudf.read_csv("submission.csv")