In [None]:
#import numpy as np
#import pandas as pd
#import sklearn
import cupy as cp
import cudf
import cuml
import glob
from tqdm import tqdm

# Echo the installed cuDF version as the cell output.
cudf.__version__

In [None]:
# Root directory of the competition data; the CSV loads and parquet globs below are built from it.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``{path}/{mode}.csv`` into a cuDF DataFrame.

    Parameters
    ----------
    mode : str
        File stem to load; the notebook calls this with "train" and "test".
    path : str
        Directory that contains the CSV file.

    Returns
    -------
    The frame produced by ``cudf.read_csv``.
    """
    return cudf.read_csv(f'{path}/{mode}.csv')

# Training table; later cells use its "stock_id", "time_id" and "target" columns.
dev_df = load_data("train", path=PATH)
dev_df.head()

In [None]:
# Targets are multiplied by SCALE for training; predictions are divided by SCALE before submission.
SCALE = 100
# NOTE(review): in-place scaling is not idempotent — re-running this cell scales the target again.
dev_df["target"] *= SCALE

# Distinct stock ids present in the training table.
stock_ids = dev_df["stock_id"].unique()
len(stock_ids)

In [None]:
# glob returns matches in a filesystem-dependent order; sort the listings so
# that the file order (and the [0] preview below) is reproducible across runs.
order_book_training = sorted(glob.glob(f'{PATH}/book_train.parquet/*/*'))
order_book_test = sorted(glob.glob(f'{PATH}/book_test.parquet/*/*'))

len(order_book_training), len(order_book_test)

In [None]:
# Preview the first listed order-book file.
cudf.read_parquet(order_book_training[0])

In [None]:
# glob returns matches in a filesystem-dependent order; sort the listings so
# that the file order (and the [0] preview below) is reproducible across runs.
trades_training = sorted(glob.glob(f'{PATH}/trade_train.parquet/*/*'))
trades_test = sorted(glob.glob(f'{PATH}/trade_test.parquet/*/*'))

len(trades_training), len(trades_test)

In [None]:
# Preview the first listed trade file.
cudf.read_parquet(trades_training[0])

In [None]:
from numba import cuda, float32, errors
import warnings

# Suppress NumbaPerformanceWarning messages.
warnings.filterwarnings("ignore", category=errors.NumbaPerformanceWarning)



def get_cu_shift_transform(shift_by, null_val):
    """Build a kernel function that shifts an array by ``shift_by`` positions.

    The returned function is handed to ``groupby(...).apply_grouped`` in
    ``log_diff``. It writes ``x[i - shift_by]`` into ``y_out[i]`` and writes
    ``null_val`` wherever that source index falls outside ``x``.

    NOTE(review): the inner parameter names ``x`` and ``y_out`` match the
    ``incols``/``outcols`` mapping used by the caller — presumably they must
    stay in sync; confirm before renaming.
    """
    def cu_shift_transform(x, y_out):
        # Each thread starts at its own threadIdx.x and advances by blockDim.x.
        for i in range(cuda.threadIdx.x, len(x), cuda.blockDim.x):
            # Default to the sentinel, then overwrite when the source index is valid.
            y_out[i] = null_val
            if 0 <= i - shift_by < len(x):
                y_out[i] = x[i - shift_by]
                
    return cu_shift_transform


def log_diff(df, in_col, null_val=9999):
    """Per-``time_id`` log return of ``df[in_col]``: ``log(x_t) - log(x_{t-1})``.

    Side effects on ``df`` (callers rely on them):
      * "logx"          – natural log of ``df[in_col]``
      * "logx_shifted"  – "logx" shifted by one row inside each ``time_id``
                          group, ``null_val`` where there is no previous row
      * "keep_row"      – False on rows whose shifted value equals ``null_val``

    Parameters
    ----------
    df : cuDF DataFrame with a "time_id" column and column ``in_col``.
    in_col : str
        Name of the (positive) price-like column to difference.
    null_val : number
        Sentinel written where no previous observation exists. It is compared
        for equality after being stored as float32, so it should be exactly
        representable and must not collide with a real log value.

    Returns
    -------
    Series of current-minus-previous log values. Rows with ``keep_row`` False
    hold a meaningless value derived from the sentinel and must be dropped by
    the caller.

    NOTE(review): the result of ``apply_grouped`` is assigned back to ``df``
    directly; this assumes the rows line up with ``df`` (e.g. input already
    ordered by ``time_id``) — confirm.
    """
    df["logx"] = df[in_col].log()

    shift_kernel = get_cu_shift_transform(shift_by=1, null_val=null_val)
    shifted = (df[["time_id", "logx"]]
               .groupby("time_id")
               .apply_grouped(shift_kernel,
                              incols={"logx": "x"},
                              outcols={"y_out": cp.float32},
                              tpb=32))
    df["logx_shifted"] = shifted["y_out"]

    df["keep_row"] = df["logx_shifted"] != null_val

    # Log return is current minus previous. The earlier previous-minus-current
    # form flipped the sign of every return (harmless once squared, but wrong
    # for the sum/mean statistics of the log_return columns).
    return df["logx"] - df["logx_shifted"]



def extract_raw_book_features(df):
    """Add row-level order-book features to ``df`` and drop unusable rows.

    For book levels 1 and 2 this adds "wap{n}" (size-weighted price),
    "log_return{n}" (from ``log_diff``) and "realized_vol{n}" (squared log
    return). It then adds spread/volume columns and a constant column "c"
    equal to 1, which ``extract_book_stats`` sums.

    The input frame is modified in place (new columns); the returned frame
    contains only the rows where "keep_row" (set by ``log_diff``) is True.
    """
    for level in (1, 2):
        bid_px, ask_px = df[f"bid_price{level}"], df[f"ask_price{level}"]
        bid_sz, ask_sz = df[f"bid_size{level}"], df[f"ask_size{level}"]

        # Each side's price is weighted by the opposite side's size.
        weighted_px = bid_px*ask_sz + ask_px*bid_sz
        df[f"wap{level}"] = weighted_px / (bid_sz + ask_sz)
        df[f"log_return{level}"] = log_diff(df, in_col=f"wap{level}", null_val=9999)
        df[f"realized_vol{level}"] = df[f"log_return{level}"]**2

    best_ask, best_bid = df['ask_price1'], df['bid_price1']
    ask_depth = df['ask_size1'] + df['ask_size2']
    bid_depth = df['bid_size1'] + df['bid_size2']

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    df['price_spread'] = (best_ask - best_bid) / ((best_ask + best_bid) / 2)
    df['bid_spread'] = best_bid - df['bid_price2']
    df['ask_spread'] = best_ask - df['ask_price2']
    df['total_volume'] = ask_depth + bid_depth
    df['volume_imbalance'] = abs(ask_depth - bid_depth)
    df["c"] = 1

    return df[df["keep_row"]]


def extract_raw_trade_features(df, null_val=-9999):
    """Add "realized_vol_trade" (squared per-``time_id`` log price change).

    Parameters
    ----------
    df : cuDF DataFrame of trades with "time_id" and "price" columns; it is
        modified in place by ``log_diff`` (helper columns and "keep_row").
    null_val : number
        Sentinel forwarded to ``log_diff`` to mark rows that have no previous
        trade in their ``time_id`` group. Previously this argument was
        accepted but ignored in favour of a hard-coded value.

    Returns
    -------
    The rows of ``df`` where "keep_row" is True, i.e. the sentinel rows are
    removed.
    """
    df["realized_vol_trade"] = log_diff(df, in_col="price", null_val=null_val)**2
    return df[df["keep_row"]]


def agg(df, feature_dict):
    """Aggregate ``df`` per "time_id" and flatten the resulting column names.

    ``feature_dict`` maps column name -> list of aggregation names. The
    two-level column labels produced by the aggregation are joined as
    "<column>_<stat>"; labels whose second level is empty (the "time_id" key
    after ``reset_index``) keep just the first level.
    """
    per_time = df.groupby("time_id").agg(feature_dict)
    agg_df = per_time.reset_index()

    flat_names = []
    for label in agg_df.columns:
        base, stat = label[0], label[1]
        flat_names.append(base if stat == "" else base + "_" + stat)

    agg_df.columns = flat_names
    return agg_df


def extract_book_stats(df):
    """Per-``time_id`` statistics of the order-book feature columns.

    Ten columns get sum/mean/std; "c", "realized_vol1" and "realized_vol2"
    get sum only. Aggregation and column flattening are delegated to ``agg``.
    """
    default_stats = ["sum", "mean", "std"]
    full_stat_cols = (
        'wap1', 'wap2',
        'log_return1', 'log_return2',
        'wap_balance', 'price_spread',
        'bid_spread', 'ask_spread',
        'total_volume', 'volume_imbalance',
    )
    sum_only_cols = ('c', 'realized_vol1', 'realized_vol2')

    # Insertion order is kept so the output columns appear in the same order.
    feature_dict = {name: default_stats for name in full_stat_cols}
    feature_dict.update({name: ["sum"] for name in sum_only_cols})

    return agg(df, feature_dict)
    

    
    
def extract_trade_stats(df):
    """Per-``time_id`` statistics of the trade feature columns.

    Sums "realized_vol_trade" and "size", counts "seconds_in_bucket" and
    averages "order_count"; aggregation is delegated to ``agg``.
    """
    wanted = [
        ('realized_vol_trade', ["sum"]),
        ('seconds_in_bucket', ["count"]),
        ('size', ["sum"]),
        ('order_count', ["mean"]),
    ]
    return agg(df, dict(wanted))


def time_constraint_fe(df, stats_df, last_sec, fe_function, cols):
    """Append statistics computed on the last ``last_sec`` seconds of a bucket.

    Rows with ``seconds_in_bucket >= 600 - last_sec`` are passed to
    ``fe_function``; its result is left-joined onto ``stats_df`` by
    "time_id", and columns that clash get the suffix ``_{last_sec}``. When no
    row falls inside the window, an empty frame with columns ``cols`` is
    joined instead.
    """
    window_start = 600 - last_sec
    in_window = df["seconds_in_bucket"] >= window_start
    sub_df = df[in_window].reset_index(drop=True)

    has_rows = sub_df.shape[0] > 0
    sub_stats = fe_function(sub_df) if has_rows else cudf.DataFrame(columns=cols)

    suffix_pair = ('', f'_{last_sec}')
    return stats_df.merge(sub_stats, on="time_id", how="left", suffixes=suffix_pair)
    

def feature_engineering(book_path, trade_path):
    """Build the per-``time_id`` feature table for one stock.

    Reads the order-book and trade parquet files, derives row-level features,
    aggregates them per ``time_id`` over the whole bucket and again over the
    last 150/300/450 seconds, and left-joins the trade statistics onto the
    book statistics by "time_id".
    """
    book_df = extract_raw_book_features(cudf.read_parquet(book_path))
    book_stats = extract_book_stats(book_df)
    book_cols = book_stats.columns

    trade_df = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_stats = extract_trade_stats(trade_df)
    trade_cols = trade_stats.columns

    # Each pass adds one suffixed set of columns for the given trailing window.
    for last_sec in (150, 300, 450):
        book_stats = time_constraint_fe(
            book_df, book_stats, last_sec, extract_book_stats, book_cols)
        trade_stats = time_constraint_fe(
            trade_df, trade_stats, last_sec, extract_trade_stats, trade_cols)

    return book_stats.merge(trade_stats, on="time_id", how="left")


def process_data(order_book_paths, trade_paths, stock_ids):
    """Run ``feature_engineering`` for every stock and stack the results.

    Book and trade files are matched by the stock id parsed from their paths
    rather than by list position: both lists come from independent ``glob``
    calls whose order is not guaranteed, so positional pairing could combine
    the book of one stock with the trades of another.

    Parameters
    ----------
    order_book_paths, trade_paths : list of str
        File paths in which the text between the first "=" and the following
        "/" is the integer stock id.
    stock_ids :
        Unused; kept so existing calls keep working.

    Returns
    -------
    cuDF DataFrame with one row per (stock, time_id) and a "stock_id" column.
    Stocks that have a book file but no trade file are skipped with a warning.
    """
    import warnings

    def _stock_id_of(path):
        # Same parsing rule as before: text after the first "=" up to the next "/".
        return int(path.split("=")[1].split("/")[0])

    trade_by_stock = {_stock_id_of(path): path for path in trade_paths}

    stock_dfs = []
    # Iterate in stock-id order so the output is deterministic.
    for book_path in tqdm(sorted(order_book_paths, key=_stock_id_of)):
        stock_id = _stock_id_of(book_path)
        trade_path = trade_by_stock.get(stock_id)
        if trade_path is None:
            warnings.warn(f"no trade file found for stock_id={stock_id}; skipping {book_path}")
            continue

        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)
    return cudf.concat(stock_dfs)

In [None]:
# Per-(stock, time_id) feature tables for the training and test files.
past_volatility = process_data(order_book_training, trades_training, stock_ids)
past_test_volatility = process_data(order_book_test, trades_test, stock_ids)

past_volatility.shape, past_test_volatility.shape

In [None]:
def stock_time_fe(df):
    """Attach per-stock and per-time_id summaries of the realized-vol sums.

    For each of the twelve realized-volatility sum columns, the mean, max,
    std and min are computed first per "stock_id" and then per "time_id",
    and left-joined back onto ``df`` as "<key>_<stat>_<column>". Returns the
    widened frame; the input is not modified.
    """
    cols = ['realized_vol1_sum', 'realized_vol2_sum', 'realized_vol_trade_sum',
            'realized_vol1_sum_150', 'realized_vol2_sum_150', 'realized_vol_trade_sum_150',
            'realized_vol1_sum_300', 'realized_vol2_sum_300', 'realized_vol_trade_sum_300',
            'realized_vol1_sum_450', 'realized_vol2_sum_450', 'realized_vol_trade_sum_450']

    # Same visiting order as a nested loop: all stats for stock_id, then time_id.
    plan = [(key, stat)
            for key in ("stock_id", "time_id")
            for stat in ("mean", "max", "std", "min")]

    for agg_col, agg_func in plan:
        summary = df.groupby(agg_col)[cols].agg(agg_func)
        summary.columns = [f"{agg_col}_{agg_func}_{col}" for col in summary.columns]
        df = df.merge(summary.reset_index(), on=agg_col, how="left")

    return df

# Tag the origin of each row so the combined frame can be split again below.
past_volatility["is_test"] = False
past_test_volatility["is_test"] = True
# DataFrame.append is deprecated and removed in newer cuDF releases; concat
# with ignore_index=True gives the same stacked frame with a fresh index.
all_df = cudf.concat([past_volatility, past_test_volatility], ignore_index=True)

# Cross-sectional statistics are computed over train and test rows together.
all_df = stock_time_fe(all_df)

past_volatility = all_df[~all_df["is_test"]]
past_test_volatility = all_df[all_df["is_test"]]

In [None]:
# Attach the engineered features to the training table.
# NOTE(review): rebinding dev_df makes this cell non-idempotent — a second run merges the feature columns again.
dev_df = dev_df.merge(past_volatility, on=["stock_id", "time_id"], how="left")

# Every column except the keys, the target and the is_test flag is a model feature.
features = [col for col in list(dev_df.columns)
            if col not in {"stock_id", "time_id", "target", "is_test"}]
len(features)

In [None]:
# Cast any feature column stored with object dtype to float.
for col in features:
    if dev_df[col].dtype == "object":
        dev_df[col] = dev_df[col].astype(float)

In [None]:
import xgboost as xgb

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, computed with CuPy.

    Evaluates ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    return cp.sqrt(cp.mean(cp.square(relative_error)))


def rmspe_xgb(pred, dtrain):
    """Custom XGBoost evaluation metric: RMSPE against the DMatrix labels.

    Parameters
    ----------
    pred : array-like
        Predictions handed over by XGBoost.
    dtrain : xgboost.DMatrix
        Matrix whose labels are the ground truth.

    Returns
    -------
    (str, float)
        The metric name 'rmspe' and its value as a plain Python float, so the
        evaluation log and early stopping work with a host scalar rather than
        a 0-d device array.
    """
    y = dtrain.get_label()
    return 'rmspe', float(rmspe(cp.array(y), cp.array(pred)))


NUM_FOLDS = 5
param = {'objective': 'reg:squarederror',
         'learning_rate': 0.1,
         'max_depth': 3,
         "min_child_weight": 200,
         "reg_alpha": 10.0,
         'disable_default_eval_metric': 1,
         "gpu_id": 0, "tree_method": "gpu_hist"
    }

target = "target"

# Out-of-fold predictions for the training rows and fold-averaged test predictions.
oof_preds = cp.zeros(dev_df.shape[0])
test_preds = cp.zeros(past_test_volatility.shape[0])

# The test matrix does not depend on the fold, so build it once.
d_test = xgb.DMatrix(past_test_volatility[features].astype("float"))

for fold in range(NUM_FOLDS):
    print("Fold", fold)
    # Folds are defined by time_id modulo NUM_FOLDS, so a time_id never
    # appears in both the training and the validation part.
    train_ind = cp.where(dev_df["time_id"].values % NUM_FOLDS != fold)[0]
    val_ind = cp.where(dev_df["time_id"].values % NUM_FOLDS == fold)[0]

    train_df, val_df = dev_df.iloc[train_ind], dev_df.iloc[val_ind]

    # Weighting squared error by 1/target^2 turns it into squared percentage error.
    d_train = xgb.DMatrix(train_df[features], train_df[target], weight=1/cp.square(train_df[target]))
    d_val = xgb.DMatrix(val_df[features], val_df[target], weight=1/cp.square(val_df[target]))

    model = xgb.train(param, d_train, evals=[(d_train, "train"), (d_val, "val")],
                      num_boost_round=5000,
                      verbose_eval=50, feval=rmspe_xgb,
                      early_stopping_rounds=200)

    # xgb.train returns the booster from the last round and Booster.predict
    # uses every tree by default, so without this the predictions would
    # include the rounds trained after the best validation score.
    best_range = (0, model.best_iteration + 1)

    oof_preds[val_ind] = model.predict(d_val, iteration_range=best_range)
    test_preds += cp.array(model.predict(d_test, iteration_range=best_range)/NUM_FOLDS)

In [None]:
# Store the out-of-fold predictions and report RMSPE over all training rows.
dev_df["pred"] = oof_preds
print(f'The RMSPE score of XGB is {rmspe(dev_df["target"], dev_df["pred"])}')

In [None]:
# "<stock_id>-<time_id>" key; the submission cell joins on it.
past_test_volatility["row_id"] = past_test_volatility["stock_id"].astype(str) + "-" + past_test_volatility["time_id"].astype(str) 
# Clip in scaled units to [0, 100], then undo the SCALE factor applied to the training target.
past_test_volatility["target"] = test_preds.clip(0.0, 100.0)/SCALE

In [None]:
# Switch to the working directory; submission.csv is written with a relative path below.
%cd /kaggle/working

In [None]:
# Left-join the predictions onto the test table by row_id; nulls remaining after the join are replaced by 0.0.
sub_df = load_data("test", path=PATH).merge(past_test_volatility[["row_id", "target"]], 
                                            on="row_id", how="left").fillna(0.0)

# Only the two submission columns are written.
sub_df.to_csv("submission.csv", index=False, columns=["row_id", "target"])

In [None]:
# Read the file back to inspect what was written.
cudf.read_csv("submission.csv")

In [None]:
!pip install lofo-importance

In [None]:
# NOTE(review): this mutates the `param` dict defined in the training cell.
param["n_estimators"] = 500

# Scikit-learn style estimator built from the same parameters, used for the importance run below.
model = xgb.XGBRegressor(**param)

In [None]:
# Convert to pandas and keep a fixed-seed 20% sample of the rows.
# NOTE(review): dev_df is rebound here, so this cell cannot be re-run without re-creating dev_df first.
dev_df = dev_df.to_pandas().sample(frac=0.2, random_state=0)
dev_df.shape

In [None]:
import numpy as np

# List of (train_positions, validation_positions) pairs, one per fold.
cv = []
NUM_FOLDS = 5


for fold in range(NUM_FOLDS):
    # np.where on .values yields positional indices, independent of the shuffled frame index.
    train_ind = np.where(dev_df["time_id"].values % NUM_FOLDS != fold)[0]
    val_ind = np.where(dev_df["time_id"].values % NUM_FOLDS == fold)[0]
    cv.append((train_ind, val_ind))

In [None]:
from lofo import Dataset, LOFOImportance, plot_importance

# Wrap the sampled frame for LOFO.
# NOTE(review): auto_group_threshold=0.8 presumably groups strongly correlated features — confirm against the lofo documentation.
ds = Dataset(df=dev_df, target="target", features=features, auto_group_threshold=0.8)

In [None]:
from sklearn.metrics import make_scorer

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, computed with NumPy.

    Evaluates ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

# greater_is_better=False marks RMSPE as a loss, so the scorer reports its negation.
# The needs_proba / needs_threshold arguments were passed at their default
# values and are deprecated in scikit-learn, so they are omitted here.
scoring = make_scorer(rmspe, greater_is_better=False)


lofo_imp = LOFOImportance(ds, model=model, cv=cv, scoring=scoring)

In [None]:
# Run the importance computation; the result has a "feature" column used below.
importance_df = lofo_imp.get_importance()

In [None]:
# Keep the untruncated name, then cut the displayed label to at most 100 characters.
importance_df["full_name"] = importance_df["feature"].values
importance_df["feature"] = importance_df["feature"].apply(lambda x: x[:100])

In [None]:
# Plot the importance of every feature.
plot_importance(importance_df, figsize=(16, 12))

In [None]:
# Box-plot view limited to the first five rows of the importance table.
plot_importance(importance_df.head(), figsize=(16, 12), kind="box")