# Accelerating Trading on GPU via RAPIDS
## Best scoring CPU kernel is accelerated on GPU. 3.5x Speedup!!!



This notebook builds on a GPU-accelerated version of that kernel, and also notes other potential items and considerations that could be done on the GPU.


In [None]:
import cupy as cp
import cudf
import cuml
import glob
from tqdm import tqdm

cudf.__version__

In [None]:
# Root directory of the competition dataset; every CSV/parquet path below is built from it.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` with cuDF and return the resulting DataFrame.

    Args:
        mode: File stem of the CSV to read; this notebook passes "train" or "test".
        path: Directory that contains the CSV file.

    Returns:
        The DataFrame produced by ``cudf.read_csv``.
    """
    return cudf.read_csv(f'{path}/{mode}.csv')

# Read train.csv into a cuDF DataFrame; the trailing expression previews its first rows.
dev_df = load_data("train", path=PATH)
dev_df.head()

In [None]:
# The target is multiplied by SCALE for training; the submission cell divides
# the predictions by SCALE again.
SCALE = 100
dev_df["target"] *= SCALE

# Distinct stock ids present in the training table (cell output: their count).
stock_ids = dev_df["stock_id"].unique()
len(stock_ids)

In [None]:
# Collect the order-book parquet files, two directory levels below each
# book_*.parquet folder, for train and test.
order_book_training = glob.glob(f'{PATH}/book_train.parquet/*/*')
order_book_test = glob.glob(f'{PATH}/book_test.parquet/*/*')

# Cell output: number of files found for each split.
len(order_book_training), len(order_book_test)

In [None]:
# Collect the trade parquet files, two directory levels below each
# trade_*.parquet folder, for train and test.
trades_training = glob.glob(f'{PATH}/trade_train.parquet/*/*')
trades_test = glob.glob(f'{PATH}/trade_test.parquet/*/*')

# Cell output: number of files found for each split.
len(trades_training), len(trades_test)

## Using rapids-kaggle-utils for missing cuDF aggregation functions

In [None]:
# IPython magic: change into the rapids-kaggle-utils dataset directory,
# presumably so that the `cu_utils` import in the next cell resolves from there.
%cd /kaggle/input/rapids-kaggle-utils/

In [None]:
import cu_utils.transform as cutran



def log_diff(df, in_col, null_val):
    """Return ``log(in_col)`` minus its per-``time_id`` shifted value.

    Side effects on ``df``: the helper columns ``logx``, ``logx_shifted`` and
    ``keep_row`` are (over)written. ``keep_row`` is False wherever the shifted
    value equals ``null_val``.

    Args:
        df: Frame with a ``time_id`` column and the column named by ``in_col``.
        in_col: Name of the column whose log-difference is computed.
        null_val: Sentinel handed to the shift kernel; presumably it fills the
            positions that have no previous row inside a group (confirm
            against ``cu_utils.transform``).

    Returns:
        ``df["logx"] - df["logx_shifted"]``.
    """
    df["logx"] = df[in_col].log()

    # Kernel supplied by rapids-kaggle-utils, applied separately to each time_id group.
    shift_kernel = cutran.get_cu_shift_transform(shift_by=1, null_val=null_val)
    per_bucket = df[["time_id", "logx"]].groupby("time_id")
    shifted = per_bucket.apply_grouped(
        shift_kernel,
        incols={"logx": 'x'},
        outcols={"y_out": cp.float32},
        tpb=32,
    )
    df["logx_shifted"] = shifted["y_out"]

    # Rows whose shifted value is the sentinel are flagged for removal by the callers.
    df["keep_row"] = df["logx_shifted"] != null_val
    return df["logx"] - df["logx_shifted"]



def extract_raw_book_features(df, null_val=-9999):
    """Add row-level order-book features to ``df`` and drop rows flagged by ``log_diff``.

    For book levels 1 and 2 this adds ``wap{n}``, ``log_return{n}`` and
    ``realized_vol{n}`` (squared log return). It then adds ``wap_balance``,
    ``price_spread``, ``bid_spread``, ``ask_spread``, ``total_volume``,
    ``volume_imbalance`` and a constant column ``c`` equal to 1.

    Args:
        df: Order-book frame with ``bid_price{1,2}``, ``ask_price{1,2}``,
            ``bid_size{1,2}``, ``ask_size{1,2}`` and ``time_id`` columns.
            It is modified in place before filtering.
        null_val: Sentinel forwarded to ``log_diff``.

    Returns:
        The rows of ``df`` where ``keep_row`` is True.
    """
    for level in (1, 2):
        bid_px, ask_px = df[f"bid_price{level}"], df[f"ask_price{level}"]
        bid_sz, ask_sz = df[f"bid_size{level}"], df[f"ask_size{level}"]
        # Each price is weighted by the size on the opposite side of the book.
        df[f"wap{level}"] = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)
        df[f"log_return{level}"] = log_diff(df, in_col=f"wap{level}", null_val=null_val)
        df[f"realized_vol{level}"] = df[f"log_return{level}"] ** 2

    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / mid_price
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = abs(ask_volume - bid_volume)
    # Constant column; summing it later yields a per-group row count.
    df["c"] = 1

    return df[df["keep_row"]]


def extract_raw_trade_features(df, null_val=-9999):
    """Add ``realized_vol_trade`` (squared log price change) and drop flagged rows.

    Args:
        df: Trade frame with ``price`` and ``time_id`` columns; modified in place
            before filtering.
        null_val: Sentinel forwarded to ``log_diff``.

    Returns:
        The rows of ``df`` where ``keep_row`` (set by ``log_diff``) is True.
    """
    price_log_return = log_diff(df, in_col="price", null_val=null_val)
    df["realized_vol_trade"] = price_log_return ** 2
    return df[df["keep_row"]]


def agg(df, feature_dict):
    """Aggregate ``df`` per ``time_id`` and flatten the resulting column names.

    Args:
        df: Frame with a ``time_id`` column.
        feature_dict: Mapping of column name to a list of aggregation names,
            passed straight to ``groupby(...).agg``.

    Returns:
        A frame whose two-part column labels ``(column, stat)`` are renamed to
        ``"<column>_<stat>"``; labels with an empty second part (the
        ``time_id`` key) keep just the first part.
    """
    grouped = df.groupby("time_id").agg(feature_dict)
    flat = grouped.reset_index()
    flat.columns = [
        label[0] if label[1] == "" else label[0] + "_" + label[1]
        for label in flat.columns
    ]
    return flat


def extract_book_stats(df):
    """Aggregate the row-level order-book features per ``time_id``.

    Ten feature columns get sum, mean and std; ``c``, ``realized_vol1`` and
    ``realized_vol2`` get only their sum.

    Args:
        df: Output of ``extract_raw_book_features``.

    Returns:
        The flattened aggregate frame produced by ``agg``.
    """
    default_stats = ["sum", "mean", "std"]
    full_stat_cols = (
        'wap1', 'wap2',
        'log_return1', 'log_return2',
        'wap_balance', 'price_spread',
        'bid_spread', 'ask_spread',
        'total_volume', 'volume_imbalance',
    )
    sum_only_cols = ('c', 'realized_vol1', 'realized_vol2')

    # Insertion order fixes the output column order, so the full-stat columns go first.
    feature_dict = {col: default_stats for col in full_stat_cols}
    feature_dict.update({col: ["sum"] for col in sum_only_cols})

    return agg(df, feature_dict)
    

    
    
def extract_trade_stats(df):
    """Aggregate the row-level trade features per ``time_id``.

    Args:
        df: Output of ``extract_raw_trade_features``.

    Returns:
        The flattened aggregate frame produced by ``agg``, with the sum of
        ``realized_vol_trade``, the count of ``seconds_in_bucket``, the sum of
        ``size`` and the mean of ``order_count``.
    """
    return agg(df, dict(
        realized_vol_trade=["sum"],
        seconds_in_bucket=["count"],
        size=["sum"],
        order_count=["mean"],
    ))


def time_constraint_fe(df, stats_df, last_sec, fe_function, cols, bucket_len=600):
    """Append statistics computed over only the last ``last_sec`` seconds of each bucket.

    Args:
        df: Row-level frame with ``seconds_in_bucket`` and ``time_id`` columns.
        stats_df: Per-``time_id`` statistics that the new columns are joined onto.
        last_sec: Length of the trailing window in seconds; rows with
            ``seconds_in_bucket >= bucket_len - last_sec`` are kept.
        fe_function: Callable that turns the filtered rows into a per-``time_id``
            statistics frame.
        cols: Column labels used to build an empty statistics frame when no
            row falls inside the window.
        bucket_len: Total bucket length in seconds. Defaults to 600, the value
            that was previously hard-coded.

    Returns:
        ``stats_df`` left-joined with the windowed statistics on ``time_id``.
        Column names that clash with ``stats_df`` get the suffix ``_<last_sec>``.
    """
    window_start = bucket_len - last_sec
    sub_df = df[df["seconds_in_bucket"] >= window_start].reset_index(drop=True)
    if sub_df.shape[0] > 0:
        sub_stats = fe_function(sub_df)
    else:
        # No rows in the window: join an empty frame so the suffixed columns still appear.
        sub_stats = cudf.DataFrame(columns=cols)
    return stats_df.merge(sub_stats, on="time_id", how="left", suffixes=('', f'_{last_sec}'))
    

def feature_engineering(book_path, trade_path):
    """Build the per-``time_id`` feature table for one stock.

    Reads the book and trade parquet files, derives row-level features,
    aggregates them over the whole bucket and over the last 150, 300 and 450
    seconds, and joins the trade aggregates onto the book aggregates.

    Args:
        book_path: Path of the stock's order-book parquet file.
        trade_path: Path of the stock's trade parquet file.

    Returns:
        Book statistics left-joined with trade statistics on ``time_id``.
    """
    windows = (150, 300, 450)

    book_rows = extract_raw_book_features(cudf.read_parquet(book_path))
    book_features = extract_book_stats(book_rows)
    # Column labels of the full-bucket stats, used when a window has no rows.
    book_schema = book_features.columns
    for window in windows:
        book_features = time_constraint_fe(
            book_rows, book_features, window, extract_book_stats, book_schema
        )

    trade_rows = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_features = extract_trade_stats(trade_rows)
    trade_schema = trade_features.columns
    for window in windows:
        trade_features = time_constraint_fe(
            trade_rows, trade_features, window, extract_trade_stats, trade_schema
        )

    return book_features.merge(trade_features, on="time_id", how="left")


def _stock_id_from_path(path):
    """Return the integer between the first ``=`` in ``path`` and the next ``/``."""
    return int(path.split("=")[1].split("/")[0])


def process_data(order_book_paths, trade_paths, stock_ids):
    """Run ``feature_engineering`` for every stock and concatenate the results.

    The two path lists come from independent ``glob.glob`` calls, whose result
    order is not guaranteed, so they are first sorted by stock id. Each pair
    is then checked to belong to the same stock, so that book data is never
    combined with another stock's trades.

    Args:
        order_book_paths: Order-book parquet paths containing ``...=<stock_id>/...``.
        trade_paths: Trade parquet paths with the same layout.
        stock_ids: Unused; kept so existing callers keep working.

    Returns:
        One frame with the features of all stocks plus a ``stock_id`` column.

    Raises:
        ValueError: If a book file and the trade file paired with it belong to
            different stocks.
    """
    def by_stock(path):
        # Ties on stock id are broken by the path itself, for a stable order.
        return _stock_id_from_path(path), path

    book_sorted = sorted(order_book_paths, key=by_stock)
    trade_sorted = sorted(trade_paths, key=by_stock)

    stock_dfs = []
    for book_path, trade_path in tqdm(list(zip(book_sorted, trade_sorted))):
        stock_id = _stock_id_from_path(book_path)
        if _stock_id_from_path(trade_path) != stock_id:
            raise ValueError(
                f"book/trade files are not aligned: {book_path!r} vs {trade_path!r}"
            )

        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)
    return cudf.concat(stock_dfs)

In [None]:
# Build the per-(stock_id, time_id) feature tables for the train and test files.
past_volatility = process_data(order_book_training, trades_training, stock_ids)
past_test_volatility = process_data(order_book_test, trades_test, stock_ids)

# Cell output: shapes of the two feature tables.
past_volatility.shape, past_test_volatility.shape

The time series provided for each window (indexed by seconds in the bucket) can usually be tested to determine whether it is stationary or not.

Testing time-series stationarity and creating features based on the ADF (Augmented Dickey-Fuller) test are potential areas that could be accelerated on the GPU.
These features could be added to the features below to see whether they add predictive power to the XGBoost model.
In addition, depending on the stationarity result, other time-series models could be run on the GPU.
Typically, if a time series is non-stationary, we could apply differencing (integer, or GPU fractional differencing) to it. Time-series libraries have become popular for this kind of analysis (looking at stationarity, seasonality and the p, d, q orders of an ARIMA model).

Additional neural-net models can be stacked with the XGBoost model below.

When there is a lot of data, GPUs can also be used to compute metrics such as the Hurst exponent (similar to the other time-series statistics above), because they can process large amounts of data in parallel. Some of these measures can be implemented in C++, or in CuPy (a GPU counterpart of NumPy providing similar features) or Numba (LLVM-based GPU compilation).

Another area is larger data. A GPU with more memory would be able to run the calculations above in parallel, so it is worth checking whether data processing, transformation and backtesting speed up further on bigger data sets. Considering that markets behave like a random walk and have a high noise-to-signal ratio, with many areas being simulated, GPUs are also well suited to generating synthetic data (Monte Carlo, bootstrapping, GANs, etc.) if you want to produce more data from the data pipeline and train your model on it. Here we proceed with XGBoost, which is accelerated on the GPU; the areas above are potential directions that could be analyzed at a later time to support trading on GPUs. We will discuss them in a separate thread when examining those areas closely.

XGBOOST feature engineering below

In [None]:
def stock_time_fe(df):
    """Add per-stock and per-time-bucket statistics of the realized-volatility sums.

    For each key in ``("stock_id", "time_id")`` and each statistic in
    ``("mean", "max", "std", "min")``, the twelve realized-volatility sum
    columns are aggregated by that key and joined back onto ``df`` as
    ``"<key>_<stat>_<column>"``.

    Args:
        df: Frame containing ``stock_id``, ``time_id`` and the twelve
            ``realized_vol*_sum[_150|_300|_450]`` columns.

    Returns:
        ``df`` with the 96 additional columns.
    """
    base_names = ('realized_vol1_sum', 'realized_vol2_sum', 'realized_vol_trade_sum')
    window_suffixes = ("", "_150", "_300", "_450")
    cols = [base + suffix for suffix in window_suffixes for base in base_names]

    combos = [
        (key, stat)
        for key in ("stock_id", "time_id")
        for stat in ("mean", "max", "std", "min")
    ]
    for key, stat in combos:
        group_stats = df.groupby(key)[cols].agg(stat)
        group_stats.columns = [f"{key}_{stat}_{col}" for col in group_stats.columns]
        df = df.merge(group_stats.reset_index(), on=key, how="left")

    return df

# Tag each row with its origin so the combined frame can be split apart again.
past_volatility["is_test"] = False
past_test_volatility["is_test"] = True
# cudf.concat is used instead of DataFrame.append, which is deprecated and
# removed in newer cuDF releases.
all_df = cudf.concat([past_volatility, past_test_volatility]).reset_index(drop=True)

# The group statistics are computed over train and test rows together.
all_df = stock_time_fe(all_df)

past_volatility = all_df[~all_df["is_test"]]
past_test_volatility = all_df[all_df["is_test"]]

In [None]:
# Attach the engineered features to the labelled training rows.
dev_df = dev_df.merge(past_volatility, on=["stock_id", "time_id"], how="left")

# Model inputs: every column except the join keys, the label and the train/test flag.
features = [col for col in list(dev_df.columns)
            if col not in {"stock_id", "time_id", "target", "is_test"}]
len(features)

## Train XGBoost model on GPU

In [None]:
import xgboost as xgb

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error, computed with CuPy.

    Args:
        y_true: Ground-truth values; used as the divisor of the relative error.
        y_pred: Predicted values.

    Returns:
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    return cp.sqrt(cp.mean(cp.square(relative_error)))


def rmspe_xgb(pred, dtrain):
    """Custom XGBoost evaluation function that reports RMSPE.

    Args:
        pred: Predictions passed in by XGBoost.
        dtrain: Matrix being evaluated; its labels are read with ``get_label()``.

    Returns:
        The tuple ``('rmspe', score)``.
    """
    labels = cp.array(dtrain.get_label())
    score = rmspe(labels, cp.array(pred))
    return 'rmspe', score


NUM_FOLDS = 5
param = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 3,
    "min_child_weight": 200,
    "reg_alpha": 10.0,
    "tree_method": 'gpu_hist',
    "gpu_id": 0,
    # Only the custom RMSPE metric is reported and used for early stopping.
    'disable_default_eval_metric': 1,
}

target = "target"

# Out-of-fold predictions for the training rows, and fold-averaged test predictions.
oof_preds = cp.zeros(dev_df.shape[0])
test_preds = cp.zeros(past_test_volatility.shape[0])

# Both values are identical for every fold, so they are computed once
# instead of being rebuilt inside the loop.
fold_ids = dev_df["time_id"].values % NUM_FOLDS
d_test = xgb.DMatrix(past_test_volatility[features].astype("float"))

for fold in range(NUM_FOLDS):
    print("Fold", fold)
    # Folds are assigned by time_id modulo NUM_FOLDS, so all rows of one
    # time bucket fall on the same side of the split.
    train_ind = cp.where(fold_ids != fold)[0]
    val_ind = cp.where(fold_ids == fold)[0]

    train_df, val_df = dev_df.iloc[train_ind], dev_df.iloc[val_ind]

    # Weighting squared error by 1 / target**2 turns it into a squared percentage error.
    d_train = xgb.DMatrix(train_df[features], train_df[target], weight=1/cp.square(train_df[target]))
    d_val = xgb.DMatrix(val_df[features], val_df[target], weight=1/cp.square(val_df[target]))

    model = xgb.train(param, d_train, evals=[(d_train, "train"), (d_val, "val")],
                      num_boost_round=5000, verbose_eval=50, feval=rmspe_xgb,
                      early_stopping_rounds=200)

    oof_preds[val_ind] = model.predict(d_val)
    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += cp.array(model.predict(d_test)/NUM_FOLDS)

In [None]:
# Store the out-of-fold predictions and print the RMSPE over all training rows.
dev_df["pred"] = oof_preds
print(f'The RMSPE score of XGB is {rmspe(dev_df["target"], dev_df["pred"])}')

In [None]:
# row_id is "<stock_id>-<time_id>"; the submission cell joins on it.
past_test_volatility["row_id"] = past_test_volatility["stock_id"].astype(str) + "-" + past_test_volatility["time_id"].astype(str) 
# Clip the averaged predictions to [0, 100], then divide by SCALE to undo the target scaling.
past_test_volatility["target"] = test_preds.clip(0.0, 100.0)/SCALE

In [None]:
# IPython magic: change into /kaggle/working, so the relative "submission.csv"
# path used below is written there.
%cd /kaggle/working

In [None]:
# Left-join the predictions onto test.csv by row_id; any values still missing
# after the join are filled with 0.0.
sub_df = load_data("test", path=PATH).merge(past_test_volatility[["row_id", "target"]], 
                                            on="row_id", how="left").fillna(0.0)

# Write only the row_id and target columns, without the index.
sub_df.to_csv("submission.csv", index=False, columns=["row_id", "target"])

In [None]:
# Read the written file back; as the cell's last expression it is displayed.
cudf.read_csv("submission.csv")