This is a rather high scoring RAPIDS notebook that I was working on at the beginning of the competition. I decided to share it because it shows a couple of important ideas. Most importantly, it achieves a rather high score in only 20 minutes.

1. I create my own unique features. In particular resonance and vibration (I just came up with those)
2. I use FastICA to divide the analysis into categorical groups. The thinking here is to have 125 FastICA components, i.e. n-1, where n is the number of stocks in the sample (there were actually fewer, but at the time I wrote the notebook I thought I had 126).
3. Use of RAPIDS.

In [None]:
import cupy as cp
import cudf
import cuml
import glob
from tqdm import tqdm
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import random
import pickle
from sklearn.model_selection import GroupKFold
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
import pandas as pd
import gc
import xgboost as xgb
import pickle
from collections import Counter

In [None]:
# When True, the training feature matrix is rebuilt from the raw parquet files
# and pickled; when False, a previously pickled matrix is loaded instead.
REMAKE_TRAINING = False

In [None]:
# Root directory of the competition data set.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a cuDF DataFrame.

    mode: file stem; this notebook calls it with "train" and "test".
    path: directory that contains the CSV file.
    """
    csv_file = "{}/{}.csv".format(path, mode)
    return cudf.read_csv(csv_file)


dev_df = load_data("train", path=PATH)
dev_df.head()

In [None]:
# The target is multiplied by SCALE for training; predictions are divided by
# SCALE again when the submission is written.
SCALE = 100
dev_df["target"] *= SCALE

# Distinct stock ids present in the training CSV.
stock_ids = dev_df["stock_id"].unique()
len(stock_ids)

In [None]:
# Order-book parquet files, two directory levels below the *.parquet folder.
# NOTE: glob.glob does not guarantee any particular ordering of the paths.
order_book_training = glob.glob(f'{PATH}/book_train.parquet/*/*')
order_book_test = glob.glob(f'{PATH}/book_test.parquet/*/*')

len(order_book_training), len(order_book_test)

In [None]:
# Trade parquet files, two directory levels below the *.parquet folder.
# NOTE: glob.glob does not guarantee any particular ordering of the paths.
trades_training = glob.glob(f'{PATH}/trade_train.parquet/*/*')
trades_test = glob.glob(f'{PATH}/trade_test.parquet/*/*')

len(trades_training), len(trades_test)

## Using rapids-kaggle-utils for missing cuDF aggregation functions

In [None]:
%cd /kaggle/input/rapids-kaggle-utils/

In [None]:
import cu_utils.transform as cutran

def log_diff(df, in_col, null_val):
    """Return log(df[in_col]) minus its shifted-by-one value within each time_id.

    Side effects on ``df``: the helper columns "logx", "logx_shifted" and
    "keep_row" are (re)written on every call. "keep_row" is False on rows
    where the shifted value equals ``null_val``, the filler passed to the
    shift transform.
    """
    shift_transform = cutran.get_cu_shift_transform(shift_by=1, null_val=null_val)

    df["logx"] = df[in_col].log()
    per_time_id = df[["time_id", "logx"]].groupby("time_id", method='cudf')
    shifted = per_time_id.apply_grouped(
        shift_transform,
        incols={"logx": 'x'},
        outcols=dict(y_out=cp.float32),
        tpb=32,
    )
    df["logx_shifted"] = shifted["y_out"]
    df["keep_row"] = df["logx_shifted"] != null_val
    return df["logx"] - df["logx_shifted"]

def extract_raw_book_features(df, null_val=-9999):
    """Add row-level order-book features to ``df`` and return the kept rows.

    New columns are written onto ``df`` in place. The returned frame is ``df``
    filtered to rows whose "keep_row" flag is True, with "bid_size1" divided
    by 1,000,000 on that filtered frame.

    df: order-book frame with time_id, seconds_in_bucket and
        bid/ask price and size columns for levels 1 and 2.
    null_val: filler forwarded to log_diff for the shifted series.
    """
    for n in range(1, 3):
        p1 = df[f"bid_price{n}"]
        p2 = df[f"ask_price{n}"]
        s1 = df[f"bid_size{n}"]
        s2 = df[f"ask_size{n}"]
        # Weighted average price: each side's price is weighted by the opposite side's size.
        df[f"wap{n}"] = (p1*s2 + p2*s1) / (s1 + s2)
        # log_diff also rewrites df["keep_row"] on every call.
        df[f"log_return{n}"] = 100 * log_diff(df, in_col=f"wap{n}", null_val=null_val)
        df[f"realized_vol{n}"] = 100 * df[f"log_return{n}"] ** 2
    
    # Weight applied to the level-2 quantities inside full_wap.
    n2 = 0.99
    # Computed from the unscaled bid_size1 (the /1e6 rescale happens at the end).
    df['bid_size1_vibration'] = (df['bid_size1'] * (df['seconds_in_bucket'] / 300).exp())/10000000
    # Blend of wap1 and wap2, weighted by the total size at each level (level 2 damped by n2).
    df['full_wap'] = (df['wap1'] * (df['bid_size1'] + df['ask_size1']) + df['wap2'] * n2 * (df['bid_size2'] + df['ask_size2']))/ (df['bid_size1'] + df['ask_size1'] + n2 * df['bid_size2'] + n2 * df['ask_size2'])
    # Last log_diff call: the "keep_row" it leaves behind is the one used for filtering below.
    df['log_full_return'] = 100 * log_diff(df, in_col = "full_wap", null_val=null_val)
    df['realized_full_vol'] = 100 * df['log_full_return'] ** 2
    # "Resonance": squared full return weighted by the elapsed seconds (linear and exponential).
    df['resonance'] = df['realized_full_vol'] * df['seconds_in_bucket']
    df['exp_resonance'] = 1000 * df['realized_full_vol'] * (df['seconds_in_bucket'] / 600).exp()
    # Named "skewness" but computed per row as the cubed full log return (scaled by 10).
    df["skewness"] = 10 * df[f"log_full_return"] ** 3
    df['realized_full_abs'] = abs(df['log_full_return'])
    df['wap_balance'] = 10000 * abs(df['wap1'] - df['wap2'])
    #df['wap_balance_return'] = 10 * log_diff(df, in_col="wap_balance", null_val=null_val)
    df['wap_balance_rel'] = 10000 * abs(df['wap1'] - df['wap2']) / (df['wap1'] + df['wap2'])
    df['price_spread'] = 10000 * (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    # Include price_spread TWO? No. It's already included there indirectly
    # NOTE(review): bid_spread is scaled by 10000 while ask_spread is unscaled,
    # and the *_rel variants use different factors -- confirm this is intended.
    df['bid_spread'] = 10000 * (df['bid_price1'] - df['bid_price2'])
    df['bid_spread_rel'] = df['bid_spread'] / df['bid_price1']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['ask_spread_rel'] = 1000 * df['ask_spread'] / df['ask_price2']
    df['total_volume'] = ((df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])) / 10000
    df['volume_imbalance'] = 0.001 * abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2'])) / df['total_volume']
    df['real_action_imbalance'] = 0.001 * (abs(df['ask_size1'] + df['bid_size1']) - abs(df['ask_size2'] + df['bid_size2'])) / df['total_volume']
    # "Vibration" features: volume quantities weighted by functions of seconds_in_bucket.
    df['quad_volume_vibration'] = df['total_volume'] * (df['seconds_in_bucket'] / 600) ** 2
    df['exp_volume_vibration'] = df['total_volume'] * ((df['seconds_in_bucket'] - 300) / 300).exp()
    df['volume_vibration'] = df['total_volume'] * df['seconds_in_bucket'] / 10000
    df['volume_imbalance_vibration'] = df['volume_imbalance'] * (df['seconds_in_bucket'] / 600).exp()
    # Constant column; summing it per group yields a row count.
    df["c"] = 1
    # Drop the rows flagged by log_diff (shifted value equal to null_val).
    df = df[df["keep_row"]]
    df['bid_size1'] /= 1000000
    return df

def extract_raw_trade_features(df, null_val=-9999):
    """Add row-level trade features to ``df`` and return the kept rows.

    New columns are written onto ``df`` in place; the returned frame is ``df``
    restricted to rows whose "keep_row" flag (set by log_diff) is True.

    null_val: filler forwarded to log_diff for the shifted series.
    """
    seconds = df['seconds_in_bucket']
    trade_size = df['size']
    n_orders = df['order_count']

    # Average size per order (+1 in the denominator) and its time-weighted variant.
    df['per_order'] = trade_size / (n_orders + 1)
    df['per_order_vibration'] = df['per_order'] * (seconds / 1200).exp()

    # Log price change within each time_id, plus its square and absolute value.
    df['realized_price'] = log_diff(df, in_col="price", null_val=null_val)
    df["realized_vol_trade"] = df['realized_price']**2
    df["realized_abs_trade"] = abs(df['realized_price'])

    # Size / order count weighted by functions of the elapsed seconds.
    df["exp_vibration"] = (seconds / 600).exp() * trade_size
    df["vibration"] = seconds * trade_size
    df['order_count_vibration'] = (seconds / 400).exp() * n_orders

    return df[df["keep_row"]]


def agg(df, feature_dict):
    """Aggregate ``df`` per time_id and flatten the resulting column labels.

    feature_dict: mapping of column name -> list of aggregation names.
    Returns a frame with "time_id" plus one "<column>_<stat>" column per
    requested aggregation.
    """
    flat = df.groupby("time_id").agg(feature_dict).reset_index()
    # Labels are (column, stat) pairs; the group key has an empty stat part.
    flat.columns = [
        label[0] if label[1] == "" else label[0] + "_" + label[1]
        for label in flat.columns
    ]
    return flat

def extract_book_stats(df):
    """Per-time_id statistics of the order-book features.

    Aggregates the columns listed below with ``agg`` and left-joins the
    sum-only statistics from ``extract_book_sum_stats`` on "time_id".
    """
    mean_std = ["mean", "std"]
    mean_std_var = ["mean", "std", "var"]
    stats_per_column = {
        'bid_size1': mean_std,
        'bid_size1_vibration': ["mean"],
        'bid_price1': mean_std_var,
        'log_return1': mean_std,
        'log_return2': mean_std,
        'realized_vol1': ["mean", "std", "sum", "var"],
        'realized_vol2': ["mean", "std", "sum"],
        'wap1': mean_std,
        'wap2': mean_std,
        'full_wap': ["sum", "mean", "std", "max", "min"],
        'log_full_return': mean_std,
        'realized_full_vol': mean_std,
        'skewness': ["mean"],
        'realized_full_abs': ["mean"],
        'wap_balance': mean_std,
        # 'wap_balance_return' is disabled (its source column is not computed).
        'wap_balance_rel': mean_std_var,
        'price_spread': mean_std_var,
        'bid_spread': mean_std_var,
        'ask_spread': mean_std_var,
        'bid_spread_rel': mean_std_var,
        'ask_spread_rel': mean_std_var,
        'total_volume': mean_std_var,
        'volume_imbalance': mean_std_var,
        'real_action_imbalance': mean_std_var,
        'volume_vibration': ["mean", "var", "std"],
        'volume_imbalance_vibration': mean_std_var,
        'c': ["sum"],
        # Ideas not implemented yet: sum of kurtosis, a measure of amplitude.
    }
    per_time_stats = agg(df, stats_per_column)
    sum_stats = extract_book_sum_stats(df)
    return per_time_stats.merge(sum_stats, on="time_id", how="left")

def extract_book_sum_stats(df):
    """Per-time_id sums of the order-book features, with missing values zero-filled.

    These sums are kept separate because missing values cause trouble; they
    are filled with zeros before aggregating. The original code called
    ``fillna`` without keeping its result, so the fill never took effect.

    NOTE(review): a cached training matrix built before this fix may differ
    from freshly computed features in groups that contained missing values.

    Returns a frame with "time_id" and one "<column>_sum" column per feature.
    """
    sum_columns = [
        'bid_size1',
        'bid_size1_vibration',
        'bid_price1',
        'log_return1',
        'log_return2',
        'log_full_return',
        'realized_full_vol',
        'wap1',
        'wap2',
        'skewness',
        'realized_full_abs',
        'wap_balance',
        'wap_balance_rel',
        'price_spread',
        'bid_spread',
        'ask_spread',
        'bid_spread_rel',
        'ask_spread_rel',
        'total_volume',
        'volume_imbalance',
        'real_action_imbalance',
        'volume_vibration',
        'volume_imbalance_vibration',
        'exp_volume_vibration',
        'quad_volume_vibration',
        'exp_resonance',
        'resonance',
        # Ideas not implemented yet: sum of kurtosis, a measure of amplitude.
    ]
    feature_dict = {column: ["sum"] for column in sum_columns}
    # fillna returns a new frame; keep the result so the zeros are actually used.
    dfzero = df[["time_id"] + sum_columns].fillna(0.0)
    return agg(dfzero, feature_dict)
    
def extract_trade_stats(df):
    """Per-time_id statistics of the trade features, computed with ``agg``."""
    mean_only = ["mean"]
    mean_std = ["mean", "std"]
    stats_per_column = {
        'realized_vol_trade': ["sum", "std"],
        'realized_abs_trade': ["sum", "var"],
        # Open question from the author: also include a unique count?
        'seconds_in_bucket': ["count", "sum"],
        'size': ["mean", "sum"],
        'order_count': mean_only,
        'vibration': mean_only,
        'order_count_vibration': mean_only,
        'exp_vibration': mean_only,
        'price': mean_std,
        'per_order': mean_std,
        'per_order_vibration': mean_only,
    }
    return agg(df, stats_per_column)

def time_constraint_fe(df, stats_df, last_sec, fe_function, cols):
    """Left-join statistics of the last ``last_sec`` seconds onto ``stats_df``.

    Rows with seconds_in_bucket >= 600 - last_sec are passed to
    ``fe_function``; if no row qualifies, an empty cuDF frame with columns
    ``cols`` is joined instead. Columns that clash with ``stats_df`` get the
    suffix "_<last_sec>".
    """
    in_window = df["seconds_in_bucket"] >= (600 - last_sec)
    window_df = df[in_window].reset_index(drop=True)

    if window_df.shape[0] > 0:
        window_stats = fe_function(window_df)
    else:
        window_stats = cudf.DataFrame(columns=cols)

    window_suffix = "_{}".format(last_sec)
    return stats_df.merge(
        window_stats, on="time_id", how="left", suffixes=('', window_suffix)
    )

def feature_engineering(book_path, trade_path):
    """Build the per-time_id feature frame for one stock.

    Reads the book and trade parquet files, computes full-window statistics,
    adds the same statistics restricted to the last 150, 300 and 450 seconds,
    and left-joins the trade statistics onto the book statistics.
    """
    book_rows = extract_raw_book_features(cudf.read_parquet(book_path))
    book_features = extract_book_stats(book_rows)
    # Column sets of the full-window stats, used as fallback schema for empty windows.
    book_columns = book_features.columns

    trade_rows = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_features = extract_trade_stats(trade_rows)
    trade_columns = trade_features.columns

    for window in (150, 300, 450):
        book_features = time_constraint_fe(
            book_rows, book_features, window, extract_book_stats, book_columns
        )
        trade_features = time_constraint_fe(
            trade_rows, trade_features, window, extract_trade_stats, trade_columns
        )

    return book_features.merge(trade_features, on="time_id", how="left")

def last_touches(a):
    """Replace each "<x>max<y>" / "<x>min<y>" column pair by an oscillation column.

    For every column whose name contains 'max' and whose 'min' counterpart
    exists, adds "<x>oscillation<y>" = (max - min) / min and drops the two
    source columns. ``a`` is modified in place and also returned, so the
    result can be assigned (``smat = last_touches(smat)``).

    Columns containing 'max' without a matching 'min' column are left as-is.
    """
    existing = set(a.columns)
    max_names = [name for name in a.columns if 'max' in name]
    for max_name in max_names:
        min_name = max_name.replace('max', 'min')
        if min_name not in existing:
            # No partner column: nothing to compute for this one.
            continue
        a[max_name.replace('max', 'oscillation')] = (a[max_name] - a[min_name]) / a[min_name]
        a.drop([max_name, min_name], axis=1, inplace=True)
    return a
    
def _stock_id_from_path(path):
    """Return the integer found between the first '=' and the next '/' in ``path``."""
    return int(path.split("=")[1].split("/")[0])


def process_data(order_book_paths, trade_paths, stock_ids):
    """Run feature_engineering for every stock and concatenate the results.

    Book and trade files are paired by the stock id parsed from their paths
    rather than by list position: the two path lists come from independent
    glob calls, whose ordering is not guaranteed to match.

    order_book_paths: paths of the order-book parquet files.
    trade_paths: paths of the trade parquet files.
    stock_ids: unused; kept so existing callers keep working.

    Returns one cuDF frame with a "stock_id" column added. Book files without
    a trade file for the same stock id are skipped with a printed message.
    """
    # stock id -> trade paths for that stock, in the order they were given.
    trades_by_stock = {}
    for trade_path in trade_paths:
        trades_by_stock.setdefault(_stock_id_from_path(trade_path), []).append(trade_path)

    stock_dfs = []
    for book_path in tqdm(list(order_book_paths)):
        stock_id = _stock_id_from_path(book_path)
        candidates = trades_by_stock.get(stock_id)
        if not candidates:
            print(f"process_data: no trade file for stock_id={stock_id}, skipping {book_path}")
            continue
        trade_path = candidates.pop(0)

        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)
    smat = cudf.concat(stock_dfs)
    #smat = last_touches(smat)
    return smat

In [None]:
# Test features are always computed from the raw files.
past_test_volatility = process_data(order_book_test, trades_test, stock_ids)
if REMAKE_TRAINING:
    past_volatility = process_data(order_book_training, trades_training, stock_ids)
    # A bare tuple expression inside an `if` displays nothing; print it instead.
    print(past_volatility.shape, past_test_volatility.shape)
    with open("/kaggle/working/pastVolatility.pickle", "wb") as matrix_file:
        pickle.dump(past_volatility.to_pandas(), matrix_file)
else:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("/kaggle/input/optiver-trainingmatrix/pastVolatility.pickle", "rb") as matrix_file:
        past_volatility = cudf.from_pandas(pickle.load(matrix_file))

In [None]:
def stock_time_fe(df):
    """Add per-stock and per-time_id aggregates of selected feature columns.

    For each grouping key ("stock_id", then "time_id") and each statistic
    ("mean", "max", "std", "min"), the selected columns are aggregated and
    left-joined back as "<key>_<stat>_<column>".
    """
    base_cols = [
        'realized_vol1_sum', 'realized_vol2_sum', 'realized_vol_trade_sum',
        'realized_vol1_sum_150', 'realized_vol2_sum_150', 'realized_vol_trade_sum_150',
        'realized_vol1_sum_300', 'realized_vol2_sum_300', 'realized_vol_trade_sum_300',
        'realized_vol1_sum_450', 'realized_vol2_sum_450', 'realized_vol_trade_sum_450',
        'order_count_mean', 'real_action_imbalance_mean',
        'volume_imbalance_mean', 'skewness_mean', 'volume_imbalance_mean_150',
        'full_wap_mean', 'real_action_imbalance_mean_300',
        'real_action_imbalance_mean_450', 'volume_imbalance_mean_450',
        'full_wap_mean_300', 'volume_vibration_mean', 'vibration_mean',
        'volume_imbalance_vibration_sum',
        'volume_imbalance_vibration_sum_450', 'order_count_vibration_mean', 'exp_resonance_sum_450',
        'exp_resonance_sum', 'resonance_sum', 'exp_vibration_mean', 'quad_volume_vibration_sum',
        'per_order_vibration_mean', 'bid_size1_std',
    ]
    # Open question from the author: include other columns here?
    key_stat_pairs = [
        (key, stat)
        for key in ("stock_id", "time_id")
        for stat in ("mean", "max", "std", "min")
    ]
    for key, stat in key_stat_pairs:
        grouped = df.groupby(key)[base_cols].agg(stat)
        grouped.columns = [key + "_" + stat + "_" + col for col in grouped.columns]
        df = df.merge(grouped.reset_index(), on=key, how="left")

    return df
print(past_volatility.shape)
# Tag the rows so train and test can be separated again after the joint
# stock/time aggregation.
past_volatility["is_test"] = False
past_test_volatility["is_test"] = True
# cudf.concat (already used in process_data) instead of DataFrame.append.
all_df = cudf.concat([past_volatility, past_test_volatility]).reset_index(drop=True)
all_df = stock_time_fe(all_df)
all_df.dropna(axis=1, how='all', inplace = True) # Drop columns with all positions NAN

# Convert stock_id to categorical using one-hot encoding (choose only a few
# that work, since they're not that useful; candidates are on version 123).
# codes = all_df['stock_id'].unique()
# all_df = all_df.one_hot_encoding('stock_id', 'dummy_id', codes)
# di = [f for f in all_df.columns if 'dummy_id' in f]
# all_df[di] = all_df[di].astype('int8') # Reduce memory size
with open("/kaggle/working/stayingcolumns.pickle", "wb") as columns_file:
    pickle.dump(all_df.columns, columns_file)
past_volatility = all_df[~all_df["is_test"]]
past_test_volatility = all_df[all_df["is_test"]]
del all_df
gc.collect()
past_volatility.shape

# Just drop the features that have NaNs from any analysis

#Just drop the features that have NANs from any analysis

In [None]:
# Feature names stored at index 1 of the pickled "champion" object.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("/kaggle/input/optiver-trainingmatrix/featureschampion.pickle", "rb") as champion_file:
    fs = set(pickle.load(champion_file)[1])
with open("/kaggle/working/fslist.pickle", "wb") as fs_file:
    pickle.dump(fs, fs_file)

# Feature columns available in the freshly built matrix (ids, target and the
# train/test flag excluded).
fspv = {col for col in past_volatility.columns
        if col not in {"time_id", "stock_id", "target", "is_test"}}
with open("/kaggle/working/fspvlist.pickle", "wb") as fspv_file:
    pickle.dump(fspv, fspv_file)

print('tools that I erased', fs - fspv)
print('tools that I created that are new', len(fspv - fs))
print(len(fs), len(fspv))

In [None]:
# features_PCA will be deleted
# Columns not in the champion feature set; they are later dropped and
# replaced by their PCA / FastICA components.
# sorted() gives a stable column order: the iteration order of a set of
# strings can differ from one interpreter run to the next.
features_PCA = sorted(fspv - fs)
len(features_PCA)

In [None]:
# Missing-value check on the training matrix: any NA at all, and any
# column that is entirely NA.
missing_mask = past_volatility.isna()
has_some_missing = missing_mask.any().any()
has_all_missing = missing_mask.all().any()
del missing_mask
print('are there columns with some missing values?', has_some_missing)
print('are there columns with all missing values?', has_all_missing)

In [None]:
# Display the columns that will be compressed with PCA / FastICA.
features_PCA

In [None]:
# mydicts = Counter()
# for i in range(5):
#     thismodel = pickle.load(open("/kaggle/input/optiver-trainingmatrix/modelv1-fold-" + str(i) + ".pickle", "rb"))
#     mydicts += Counter(thismodel[3])
# # The features below are the features with less information. These will build the PCA
# features_PCA = list(set([item[0] for item in mydicts.most_common()[-350:]]).intersection(set(past_volatility.columns)))
# print(len(features_PCA))
# features_PCA[:5]

In [None]:
# Change data type
# di = [[f, 'float32'] for f in past_volatility.columns if past_volatility[f].dtype == 'float64']
# past_volatility = past_volatility.astype(dict(di))
# print([f for f in past_volatility.columns if past_volatility[f].dtype == 'float64'])

# di = [[f, 'float32'] for f in past_test_volatility.columns if past_test_volatility[f].dtype == 'float64']
# past_test_volatility = past_test_volatility.astype(dict(di))
# print([f for f in past_test_volatility.columns if past_test_volatility[f].dtype == 'float64'])

# di = [f for f in past_test_volatility.columns if past_test_volatility[f].dtype == 'float64']
# for f in di:
#     print('statistics for', f)
#     print(past_volatility[f].min(), past_volatility[f].mean(), past_volatility[f].max())
# print('list the features with memory float64')

In [None]:
# Keep a record of the features for the PCA analysis for later, in case I want to review in a cheap CPU
# pickle.dump(past_volatility[features_PCA].to_pandas(), open("/kaggle/working/pastVolatility4PCA.pickle", "wb"))

In [None]:
#Let's move to the pandas world to run PCA
# Copy the train and test matrices to pandas: the PCA / FastICA used below
# come from scikit-learn.
pv_pandas = past_volatility.to_pandas()
ptv_pandas = past_test_volatility.to_pandas()

In [None]:
# Fill values for missing entries. NOTE: despite the "means" in the name,
# these are the per-column medians of the training data.
column_means_dict = dict(pv_pandas[features_PCA].median())
with open("/kaggle/working/column_means_dict.pickle", "wb") as fill_values_file:
    pickle.dump(column_means_dict, fill_values_file)
#print(len(column_means_dict), column_means_dict)

Fill NAs

In [None]:
# Missing-value check on the PCA input columns before filling.
missing_mask = pv_pandas[features_PCA].isna()
has_some_missing = missing_mask.any().any()
has_all_missing = missing_mask.all().any()
del missing_mask
print('are there columns with some missing values?', has_some_missing)
print('are there columns with all missing values?', has_all_missing)

In [None]:
# Fill missing values in train and test with the same per-column values,
# which were computed from the training data only.
pv_pandas[features_PCA] = pv_pandas[features_PCA].fillna(column_means_dict)
ptv_pandas[features_PCA] = ptv_pandas[features_PCA].fillna(column_means_dict)

In [None]:
# Same missing-value check, repeated after filling.
missing_mask = pv_pandas[features_PCA].isna()
has_some_missing = missing_mask.any().any()
has_all_missing = missing_mask.all().any()
del missing_mask
print('are there columns with some missing values?', has_some_missing)
print('are there columns with all missing values?', has_all_missing)

Create the PCA columns

In [None]:
# Both decompositions are fitted on the training rows only and then applied
# to train and test.
NCOMP = 2
pca = PCA(n_components = NCOMP)
pca.fit(pv_pandas[features_PCA])
print(pca.explained_variance_ratio_)
columns_pca = ['pc_' + str(i) for i in range(NCOMP)]
pc = pd.DataFrame(pca.transform(pv_pandas[features_PCA]), columns = columns_pca)
ptc = pd.DataFrame(pca.transform(ptv_pandas[features_PCA]), columns = columns_pca)

NFPCA = 125
# Fixed seed: FastICA starts from a random initialisation, so without it the
# components differ from run to run.
fpca = FastICA(n_components = NFPCA, random_state = 42)
fpca.fit(pv_pandas[features_PCA])
columns_fastica = ['fpca_' + str(i) for i in range(NFPCA)]
fpc = pd.DataFrame(fpca.transform(pv_pandas[features_PCA]), columns = columns_fastica)
fptc = pd.DataFrame(fpca.transform(ptv_pandas[features_PCA]), columns = columns_fastica)

# Persist the fitted transformers; `with` closes the files deterministically.
with open("/kaggle/working/pca.pickle", "wb") as pca_file:
    pickle.dump(pca, pca_file)
with open("/kaggle/working/fpca.pickle", "wb") as fpca_file:
    pickle.dump(fpca, fpca_file)

In [None]:
# Peek at the FastICA components computed for the test rows.
fptc.head()

Concatenate the results

In [None]:
# from __future__ import print_function  # for Python2
# import sys

# local_vars = list(locals().items())
# for var, obj in local_vars:
#     print(var, sys.getsizeof(obj))

Remove the columns from pandas. Make sure that they get replaced by the PCA and fastICA columns

In [None]:
# Remove the raw columns that were compressed into PCA / FastICA components.
for frame in (pv_pandas, ptv_pandas):
    frame.drop(features_PCA, axis = 1, inplace = True)
# Do not keep an extra reference to the last frame alive.
del frame
del column_means_dict, past_volatility, past_test_volatility
gc.collect()

In [None]:
# Put the remaining columns and the PCA / FastICA components side by side.
# reset_index(drop=True) aligns the frames on a fresh 0..n-1 index without
# first creating 'index' columns that then have to be dropped again.
pv_pandas = pd.concat(
    [frame.reset_index(drop=True) for frame in (pv_pandas, pc, fpc)], axis = 1
)
ptv_pandas = pd.concat(
    [frame.reset_index(drop=True) for frame in (ptv_pandas, ptc, fptc)], axis = 1
)

In [None]:
# Inspect the training matrix after the component columns were added.
pv_pandas.head()

In [None]:
# Inspect the test matrix after the component columns were added.
ptv_pandas.head()

In [None]:
# di = [[f, 'float32'] for f in pv_pandas.columns if pv_pandas[f].dtype == 'float64']
# pv_pandas = pv_pandas.astype(dict(di))
# print([f for f in pv_pandas.columns if pv_pandas[f].dtype == 'float64'])

# di = [[f, 'float32'] for f in ptv_pandas.columns if ptv_pandas[f].dtype == 'float64']
# ptv_pandas = ptv_pandas.astype(dict(di))
# print([f for f in ptv_pandas.columns if ptv_pandas[f].dtype == 'float64'])

In [None]:
# Move the pandas frames back into cuDF DataFrames.
past_volatility = cudf.DataFrame(pv_pandas)
past_test_volatility = cudf.DataFrame(ptv_pandas)

In [None]:
# The pandas copies and component frames are no longer needed; release them.
del pv_pandas, ptv_pandas, pc, ptc, fpc, fptc
gc.collect()

In [None]:
# Attach the engineered features to the training rows.
dev_df = dev_df.merge(past_volatility, on=["stock_id", "time_id"], how="left")

# Every remaining column is a model input.
# NOTE(review): "time_id" is not excluded here, so it is used as a feature --
# confirm that this is intended.
non_feature_cols = {"stock_id", "target", "is_test"}
features = [name for name in list(dev_df.columns) if name not in non_feature_cols]
len(features)

## Train XGBoost model on GPU

Optuna regression setup

In [None]:
# pandas copy of the training frame; used further down to build the
# GroupKFold splits.
pd_df_full = dev_df.to_pandas()
#limit the fine-tuning to only a seventh of the data
#num1 = random.randint(0, 4)
#pd_df = pd_df_full[pd_df_full["time_id"].values % 5 == num1]
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true`` (NumPy)."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


In [None]:
# Long Model
# def objective(trial):
#     num1 = 3 #random.randint(0, 4)
#     #limit the fine-tuning to only a seventh of the data
#     train_x, test_x, train_y, test_y = train_test_split(pd_df_full[features], pd_df_full['target'], test_size=0.22, random_state=42)
#     # To select which parameters to optimize, please look at the XGBoost documentation:
#     # https://xgboost.readthedocs.io/en/latest/parameter.html
#     param = {
#         'booster': trial.suggest_categorical('booster', ['gbtree']),
#         'tree_method':'gpu_hist',  # Use GPU acceleration
#         'lambda': trial.suggest_loguniform(
#             'lambda', 0.0001, 0.15
#         ),
#         'alpha': trial.suggest_loguniform(
#             'alpha', 5e-1, 10.0
#         ),
#         'colsample_bytree': trial.suggest_float(
#             'colsample_bytree', 0.6, 0.85 #Default is 1.0
#         ),
#         'colsample_bylevel': trial.suggest_float(
#             'colsample_bylevel', 0.6, 0.8 #Default is 1.0
#         ),
#         'colsample_bynode': trial.suggest_float(
#             'colsample_bynode', 0.65, 0.99 #Default is 1.0
#         ),
#         'subsample': trial.suggest_float("subsample", 0.6,  0.95),
#         'learning_rate': trial.suggest_float(
#             'learning_rate', 0.002, 0.07
#         ),
#         'n_estimators': trial.suggest_categorical(	
#             "n_estimators", [3000, 3500]
#         ),
#         'max_depth': trial.suggest_int(
#             'max_depth', 22, 27
#         ),
#         'random_state': 42,
#         'min_child_weight': trial.suggest_int(
#             'min_child_weight', 55, 160
#         ),
#     }
#     model = XGBRegressor(**param)
    
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds = 50, verbose=False)
    
#     preds = model.predict(test_x)
#     #rmse = mean_squared_error(test_y, preds, squared=False)
#     rmspeval = rmspe(test_y, preds)
#     return rmspeval

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials = 90)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

# fig = optuna.visualization.plot_param_importances(study)
# fig.show()

# # Assign best model
# best_params = study.best_params
# best_params['tree_method'] = 'gpu_hist'
# best_params['random_state'] = 42

# best_params = {
#          "booster": 'gbtree',
#          "objective": 'reg:squarederror',
#          "lambda": 0.003692070094677976,
#          "alpha": 0.843665639617332,
#          "colsample_bytree": 0.6890567485273894,
#          "colsample_bylevel": 0.7522102593718745,
#          "colsample_bynode": 0.6643596703827361,
#          "subsample": 0.8033471108457755,
#          "learning_rate": 0.027930261435649854,
#          "max_depth": 26,
#          "min_child_weight": 127,
#          #"reg_alpha": 10.0,
#          "tree_method": 'gpu_hist', "gpu_id": 0,
#          'disable_default_eval_metric': 1
#     }

In [None]:
# Short Model
# def objective(trial):
#     num1 = 3 #random.randint(0, 4)
#     #limit the fine-tuning to only a seventh of the data
#     pd_df = pd_df_full
#     train_x, test_x, train_y, test_y = train_test_split(pd_df[features], pd_df['target'], test_size=0.22, random_state=42)
#     # To select which parameters to optimize, please look at the XGBoost documentation:
#     # https://xgboost.readthedocs.io/en/latest/parameter.html
#     param = {
#         'booster': trial.suggest_categorical('booster', ['gbtree']),
#         'tree_method':'gpu_hist',  # Use GPU acceleration
#         'lambda': trial.suggest_loguniform(
#             'lambda', 0.0001, 0.5
#         ),
#         'alpha': trial.suggest_loguniform(
#             'alpha', 5e-1, 20.0
#         ),
#         'reg_alpha': trial.suggest_float(
#             'reg_alpha', 5e-1, 20.0
#         ),
#         'colsample_bytree': trial.suggest_float(
#             'colsample_bytree', 0.7, 0.9 #Default is 1.0
#         ),
#         'colsample_bylevel': trial.suggest_float(
#             'colsample_bylevel', 0.75, 0.99 #Default is 1.0
#         ),
#         'colsample_bynode': trial.suggest_float(
#             'colsample_bynode', 0.9, 1.0 #Default is 1.0
#         ),
#         'subsample': trial.suggest_float("subsample", 0.6,  0.95),
#         'learning_rate': trial.suggest_float(
#             'learning_rate', 0.002, 0.06
#         ),
#         'n_estimators': trial.suggest_categorical(	
#             "n_estimators", [3500, 5000]
#         ),
#         'max_depth': trial.suggest_int(
#             'max_depth', 6, 10
#         ),
#         'random_state': 42,
#         'min_child_weight': trial.suggest_int(
#             'min_child_weight', 80, 200
#         ),
#     }
#     model = XGBRegressor(**param)
    
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds = 50, verbose=False)
    
#     preds = model.predict(test_x)
#     #rmse = mean_squared_error(test_y, preds, squared=False)
#     rmspeval = rmspe(test_y, preds)
#     return rmspeval

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials = 50)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

# fig = optuna.visualization.plot_param_importances(study)
# fig.show()

# # Assign best model
# best_params = study.best_params
# best_params['tree_method'] = 'gpu_hist'
# best_params['random_state'] = 43

# best_params = {
#          "booster": 'gbtree',
#          "objective": 'reg:squarederror',
#          "lambda": 0.3900133957228146,
#          "alpha": 1.6143973425818008,
#          "colsample_bytree": 0.8240236558740445,
#          "colsample_bylevel": 0.7055628112659318,
#          "colsample_bynode": 0.9858168527166616,
#          "subsample": 0.8164845919960058,
#          "learning_rate": 0.023404748889126782,
#          "max_depth": 16,
#          "min_child_weight": 187,    
#          "reg_alpha": 5.0,
#          "tree_method": 'gpu_hist', "gpu_id": 0,
#          'disable_default_eval_metric': 1
#     }

# best_params = {
#          "booster": 'gbtree',
#          "objective": 'reg:squarederror',
#          "max_depth": 7,
#          "min_child_weight": 156,    
#          "lambda": 0.006654377522518237,
#          "alpha": 3.398238828590107,
#          "colsample_bytree": 0.6,
#          "subsample": 0.9134709113526331,
#          "learning_rate": 0.010751601995412153,
#          "reg_alpha": 10.0,
#          "tree_method": 'gpu_hist', "gpu_id": 0,
#          'disable_default_eval_metric': 1
#     }
# # This is 22092
# gbtree parameter set. NOTE: this dict is overwritten by the `best_params`
# assignment that follows it, so it is not the one used for training.
# NOTE(review): XGBoost documents `reg_alpha` as an alias of `alpha`; both
# are set here with different values -- confirm which one takes effect.
best_params = {
         "booster": 'gbtree',
         "objective": 'reg:squarederror',
         "lambda": 0.000120494603280191,
         "alpha": 4.655248899194473,
         "reg_alpha": 5.75750136407511,
         "colsample_bytree": 0.7033853760069456,
         "colsample_bylevel": 0.9241863146227871,
         "colsample_bynode": 0.9902499881033184,
         "subsample": 0.926724493495993,
         "learning_rate": 0.059258258818345,
         "max_depth": 10,
         "min_child_weight": 97,    
         "tree_method": 'gpu_hist', "gpu_id": 0,
         'disable_default_eval_metric': 1
    }

# DART parameter set; being the last assignment to `best_params`, this is the
# one passed to xgb.train below.
# NOTE(review): XGBoost documents `reg_alpha` as an alias of `alpha`; both
# are set here with different values -- confirm which one takes effect.
best_params = {
         "booster": 'dart',
         "rate_drop": 0.10,
         "skip_drop": 0.5,
         "objective": 'reg:squarederror',
         "max_depth": 9,
         "min_child_weight": 147,    
         "lambda": 0.006654377522518237,
         "alpha": 3.398238828590107,
         "colsample_bytree": 0.7033853760069456,
         "colsample_bylevel": 0.9241863146227871,
         "colsample_bynode": 0.9902499881033184,
         "subsample": 0.9134709113526331,
         "learning_rate": 0.012751601995412153,
         "reg_alpha": 10.0,
         "tree_method": 'gpu_hist', "gpu_id": 0,
         # The built-in metric is switched off; a custom one is passed via feval.
         'disable_default_eval_metric': 1
    }

optuna selection

In [None]:
# Change data type and save some memory
# di = [[f, 'float32'] for f in pd_df_full.columns if pd_df_full[f].dtype == 'float64']
# pd_df_full = pd_df_full.astype(dict(di))
# print([f for f in pd_df_full.columns if pd_df_full[f].dtype == 'float64'])
# gc.collect()

In [None]:
import xgboost as xgb

def get_xgb_imp(xgb, feat_names):
    """Return the normalised split-count importance of each feature.

    xgb: fitted model exposing ``get_booster().get_fscore()``.
    feat_names: feature names, in the column order used for training.

    Returns a dict name -> share of the total score (the shares sum to 1).
    Features missing from the booster's score dict get 0.0; if no feature
    has a score at all, every share is 0.0.
    """
    scores = xgb.get_booster().get_fscore()
    # The score dict is keyed by 'f<i>' when the booster has no feature names,
    # and by the feature name itself otherwise; accept both.
    imp_dict = {
        name: float(scores.get('f' + str(i), scores.get(name, 0.)))
        for i, name in enumerate(feat_names)
    }
    # sum() works on a dict view; wrapping the view in a NumPy array does not.
    total = sum(imp_dict.values())
    if total == 0:
        return {name: 0.0 for name in imp_dict}
    return {name: score / total for name, score in imp_dict.items()}

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, computed with CuPy functions."""
    relative_error = (y_true - y_pred) / y_true
    return cp.sqrt(cp.mean(cp.square(relative_error)))


def rmspe_xgb(pred, dtrain):
    """Custom XGBoost evaluation function: returns ('rmspe', score).

    pred: predictions for the rows of ``dtrain``.
    dtrain: DMatrix whose labels are the true targets.
    """
    labels = cp.array(dtrain.get_label())
    predictions = cp.array(pred)
    return 'rmspe', rmspe(labels, predictions)


NUM_FOLDS = 5

target = "target"

# Out-of-fold predictions for the training rows, and the fold-averaged test
# predictions.
oof_preds = cp.zeros(dev_df.shape[0])
test_preds = cp.zeros(past_test_volatility.shape[0])
# Folds are grouped by time_id, so a time_id never appears in both the
# training and the validation part of a fold.
kfold = GroupKFold(n_splits = NUM_FOLDS)

for fold, (train_ind, val_ind) in enumerate(kfold.split(pd_df_full[features], pd_df_full[target].values, pd_df_full["time_id"].values)):
    print("Fold", fold)    
    train_df, val_df = dev_df.iloc[train_ind], dev_df.iloc[val_ind]
    
    # Weights of 1/target^2 turn the squared-error objective into a squared
    # percentage error.
    d_train = xgb.DMatrix(train_df[features], train_df[target], weight=1/cp.square(train_df[target]))
    d_val = xgb.DMatrix(val_df[features], val_df[target], weight=1/cp.square(val_df[target]))
    
    model = xgb.train(best_params, d_train, evals = [(d_train, "train"), (d_val, "val")], 
                      num_boost_round = 3500, 
                      verbose_eval = 50, feval = rmspe_xgb,
                      early_stopping_rounds = 200)
    importances = {k: v for k, v in sorted(model.get_fscore().items(), key=lambda item: item[1], reverse = True)}
    print('Importances for this iteration:', importances)
    
    # Save everything needed to reuse this fold's model; `with` makes sure the
    # file is flushed and closed before the next fold starts.
    with open("/kaggle/working/modelv2-fold-" + str(fold) + ".pickle", "wb") as model_file:
        pickle.dump({"model": model, "features": features, "best_params": best_params, "importances":importances, 
                     "features_PCA": features_PCA, "pca": pca, "fpca": fpca}, 
                    model_file)
    
    oof_preds[val_ind] = model.predict(d_val)
    test_preds += cp.array(model.predict(xgb.DMatrix(past_test_volatility[features].astype("float")))/NUM_FOLDS)

In [None]:
# Store the out-of-fold predictions and report their RMSPE on the training set.
dev_df["pred"] = oof_preds
oof_score = rmspe(dev_df["target"], dev_df["pred"])
print(f'The RMSPE score of XGB is {oof_score}')

In [None]:
# Build the submission key "<stock_id>-<time_id>".
past_test_volatility["row_id"] = past_test_volatility["stock_id"].astype(str) + "-" + past_test_volatility["time_id"].astype(str) 
# Clip the (scaled) predictions to [0, 100] and undo the SCALE factor.
past_test_volatility["target"] = test_preds.clip(0.0, 100.0)/SCALE

In [None]:
%cd /kaggle/working

In [None]:
# Join the predictions onto the official test rows.
test_rows = load_data("test", path=PATH)
predicted_targets = past_test_volatility[["row_id", "target"]]
sub_df = test_rows.merge(predicted_targets, on="row_id", how="left")
# Test rows without a prediction fall back to a constant.
sub_df['target'] = sub_df['target'].fillna(0.00373)
sub_df.to_csv("submission.csv", index=False, columns=["row_id", "target"])

In [None]:
# Read the submission file back to check what was written.
cudf.read_csv("submission.csv")