<a href="https://colab.research.google.com/github/saintblue97/DeepNN_Optiver/blob/main/optiver_tseries_clustering_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Comment this out if running on Kaggle
from google.colab import drive; drive.mount('/content/drive')

In [None]:
# Comment this cell out if running on Kaggle

!pip install --upgrade --force-reinstall --no-deps kaggle > log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                                           # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
#!cp kaggle.json ~/.kaggle/kaggle.json > log                   # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                              # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v optiver-trading-at-the-close        # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log                          # download competition dataset as a zip file
!unzip -o *.zip >> log                                        # Kaggle dataset is copied as a single file and needs to be unzipped.
# !kaggle competitions leaderboard --show                       # print public leaderboard

In [None]:
%%time
%%capture
%reset -f
import numpy as np, pandas as pd, time, matplotlib.pyplot as plt, seaborn as sns, os, tqdm, re, sys, cv2, skimage, xgboost, lightgbm as lgb, librosa

ToCSV = lambda df, fname: df.round(2).to_csv(f'{fname}.csv', index_label='id') # rounds values to 2 decimals

class Timer():
    """Wall-clock stopwatch that flags when a run-time budget has been exceeded."""

    def __init__(self, lim:'RunTimeLimit'=14400):
        """Remember the start time and the budget `lim` (seconds), then announce the budget."""
        started_at = time.time()
        print(f'⏳ started. You have {lim} sec. Good luck!')
        self.t0 = started_at
        self.lim = lim

    def ShowTime(self):
        """Print the elapsed run time; in bold red, with the limit, when over budget."""
        msg = f'Runtime is {time.time()-self.t0:.0f} sec'
        # One second of slack is granted before the run counts as over the limit.
        over_budget = (time.time() - self.t0 - 1) > self.lim
        if over_budget:
            print('\033[91m\033[1m' + msg + f' > {self.lim} sec limit!!!\033[0m')
        else:
            print(msg)

np.set_printoptions(linewidth=100, precision=2, edgeitems=2, suppress=True)
pd.set_option('display.max_columns', 20, 'display.precision', 2, 'display.max_rows', 4)

CPU times: user 2.21 s, sys: 469 ms, total: 2.68 s
Wall time: 2.99 s


In [None]:
def is_google_colab():
    """Report whether the notebook is running inside Google Colab.

    Detection works by attempting to import the ``google.colab`` package.

    Returns:
        bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab
    except ImportError:
        return False
    else:
        return True

# Prefix for every data file: empty on Colab (files are read relative to the
# working directory), the competition input folder otherwise.
file_path = '' if is_google_colab() else '/kaggle/input/optiver-trading-at-the-close/'

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler,OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import ARDRegression, BayesianRidge, LinearRegression, Ridge, Lars
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from itertools import combinations
import gc
import os
import time
from tqdm import tqdm
from operator import itemgetter

In [None]:
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

In [None]:
# Number of Time Series, each with 55 time slots (To use all data set it to -1)
SAMPLE_SIZE = 50000
COMPUTE_TRAINING_METRICS = True   # Compute Training MAE error
UNIT_TESTS = False

In [None]:
sample_submission_path = file_path + 'example_test_files/sample_submission.csv'

sample_submission = pd.read_csv(sample_submission_path); sample_submission

Unnamed: 0,time_id,row_id,target
0,26290,478_0_0,0
1,26290,478_0_1,0
...,...,...,...
32998,26454,480_540_198,0
32999,26454,480_540_199,0


# Load Data

In [None]:
tmr = Timer()

⏳ started. You have 14400 sec. Good luck!


In [None]:
train_data_path = file_path + 'train.csv'
df = pd.read_csv(train_data_path);
df = df.dropna(subset=['target']); df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3.18e+06,1,1.0,1.34e+07,,,1.0,60651.50,1.0,8493.03,1.0,-3.03,0,0_0_0
1,1,0,0,1.67e+05,-1,1.0,1.64e+06,,,1.0,3233.04,1.0,20605.09,1.0,-5.52,0,0_0_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237978,198,480,540,1.00e+06,1,1.0,9.48e+07,1.0,1.0,1.0,125631.72,1.0,669893.00,1.0,-1.54,26454,480_540_198
5237979,199,480,540,1.88e+06,-1,1.0,2.41e+07,1.0,1.0,1.0,250081.44,1.0,300167.56,1.0,-6.53,26454,480_540_199


In [None]:
test_data_path = file_path + 'example_test_files/test.csv'
df_test  = pd.read_csv(test_data_path); df_test

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id,row_id,currently_scored
0,0,478,0,3.75e+06,-1,1.0,1.15e+07,,,1.0,22940.00,1.0,9177.60,1.0,26290,478_0_0,False
1,1,478,0,9.86e+05,-1,1.0,3.85e+06,,,1.0,1967.90,1.0,19692.00,1.0,26290,478_0_1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32998,198,480,540,1.00e+06,1,1.0,9.48e+07,1.0,1.0,1.0,125631.72,1.0,669893.00,1.0,26454,480_540_198,False
32999,199,480,540,1.88e+06,-1,1.0,2.41e+07,1.0,1.0,1.0,250081.44,1.0,300167.56,1.0,26454,480_540_199,False


In [None]:
print('Shape of training data =', df.shape)
print('Shape of testing data  =', df_test.shape)

Shape of training data = (5237892, 17)
Shape of testing data  = (33000, 17)


# Preprocessing

In [None]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed list of columns from a DataFrame.

    Names in `columns_to_drop` that are absent from the input are ignored, so
    the same step can be applied to frames with different column sets.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return `X` without the configured columns that it actually contains."""
        present = list(filter(lambda name: name in X.columns, self.columns_to_drop))
        return X.drop(columns=present)

class DataFrameSimpleImputer(BaseEstimator, TransformerMixin):
    """Fill missing values column-wise while keeping the data as a DataFrame.

    Parameters
    ----------
    strategy : {'mean', 'median', 'constant'}, default='mean'
        How the per-column replacement value is chosen during `fit`.
    fill_value : scalar, optional
        Replacement used for every column when ``strategy='constant'``.

    Attributes
    ----------
    fill_values_ : pandas.Series
        Replacement value per column, set by `fit`.
    """

    def __init__(self, strategy='mean', fill_value=None):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # through attributes of the *same* name, so they must not be renamed.
        self.strategy = strategy
        self.fill_value = fill_value

    @property
    def strategy_(self):
        """Read-only alias kept for code that used the former attribute name."""
        return self.strategy

    @property
    def fill_value_(self):
        """Read-only alias kept for code that used the former attribute name."""
        return self.fill_value

    def fit(self, X, y=None):
        """Compute the per-column replacement values from `X`.

        Raises
        ------
        ValueError
            If the strategy is unknown, or 'constant' is used without `fill_value`.
        """
        if self.strategy == 'mean':
            # numeric_only skips text columns (e.g. identifiers), which have no mean.
            self.fill_values_ = X.mean(numeric_only=True)
        elif self.strategy == 'median':
            self.fill_values_ = X.median(numeric_only=True)
        elif self.strategy == 'constant':
            if self.fill_value is None:
                raise ValueError("fill_value must be provided for strategy='constant'")
            self.fill_values_ = pd.Series(self.fill_value, index=X.columns)
        else:
            raise ValueError(f"Unknown strategy type: {self.strategy}")

        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with NaNs replaced by the fitted per-column values."""
        return X.fillna(self.fill_values_)

class MovingAverageTransformer(BaseEstimator, TransformerMixin):
    """Append a per-stock simple moving average (``<col>_sma``) for selected columns.

    Parameters
    ----------
    window_size : int
        Number of consecutive rows averaged by the rolling window.
    columns : list of str
        Columns for which a ``<col>_sma`` feature is created.

    NOTE(review): rows are grouped by ``stock_id`` only, so a window can span
    the boundary between two ``date_id`` values of the same stock - confirm
    this is intended.
    """

    def __init__(self, window_size, columns):
        self.window_size = window_size  # The size of the window for moving average
        self.columns = columns          # The columns on which to apply moving average

    def fit(self, X, y=None):
        return self  # No fitting process, so just return self

    def calculate_moving_average(self, series):
        """Rolling mean of `series`; the first ``window_size - 1`` values are NaN."""
        return series.rolling(window=self.window_size).mean()

    def transform(self, X):
        """Return a copy of `X` with one ``<col>_sma`` column per configured column."""
        X_new = X.copy()

        # dict.fromkeys drops repeated names (keeping order) so that a column
        # listed twice is not computed twice.
        for col in dict.fromkeys(self.columns):
            sma = X_new.groupby('stock_id')[col].transform(self.calculate_moving_average)
            # Rows without a full window fall back to the raw value. Assigning the
            # result (instead of a chained inplace fillna) also works when pandas
            # Copy-on-Write is enabled.
            X_new[f'{col}_sma'] = sma.fillna(X_new[col])

        return X_new


In [None]:
# generate imbalance features
import numpy as np
import pandas as pd
from itertools import combinations

def imbalance_features(df):
    """Return a copy of `df` extended with order-book imbalance features.

    Added columns: volume, mid price, liquidity imbalance, one normalised
    difference per pair of price columns, the stock weight (looked up in the
    notebook-level ``weights_dict``) and weighted WAP, spread features, and the
    row-wise mean/std over the price columns and over the size columns.
    """
    out = df.copy()
    prices = ["reference_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size"]

    bid_size, ask_size = out["bid_size"], out["ask_size"]
    bid_price, ask_price = out["bid_price"], out["ask_price"]

    out["volume"] = ask_size + bid_size
    out["mid_price"] = (ask_price + bid_price) / 2
    out["liquidity_imbalance"] = (bid_size - ask_size) / (bid_size + ask_size)

    # (p1 - p2) / (p1 + p2) for every unordered pair of price columns
    for first, second in combinations(prices, 2):
        out[f"{first}_{second}_imb"] = (out[first] - out[second]) / (out[first] + out[second])

    out["stock_weights"] = out["stock_id"].map(weights_dict)
    out["weighted_wap"] = out["stock_weights"] * out["wap"]
    out["price_spread"] = ask_price - bid_price
    out["spread_depth_ratio"] = out["price_spread"] / (bid_size + ask_size)

    # Row-wise summary statistics, first across prices and then across sizes
    for label, cols in (("prices", prices), ("sizes", sizes)):
        out[f"all_{label}_mean"] = out[cols].mean(axis=1)
        out[f"all_{label}_std"] = out[cols].std(axis=1)

    return out



def other_features(df):
    """Return a copy of `df` with calendar-style and per-stock global features.

    ``dow``/``dom`` are ``date_id`` modulo 5 and 20; ``seconds``/``minute`` split
    ``seconds_in_bucket`` into remainder and quotient of 60. Every entry of the
    notebook-level ``global_stock_id_feats`` is mapped onto the rows through
    ``stock_id`` as ``global_<name>``.
    """
    out = df.copy()
    day, bucket_seconds = out["date_id"], out["seconds_in_bucket"]

    out["dow"] = day % 5
    out["dom"] = day % 20
    out["seconds"] = bucket_seconds % 60
    out["minute"] = bucket_seconds // 60

    for feat_name, per_stock in global_stock_id_feats.items():
        out[f"global_{feat_name}"] = out["stock_id"].map(per_stock.to_dict())

    return out

# generate all features
def generate_all_features(df):
    """Build the full feature frame from `df`.

    Identifier/label columns (``row_id``, ``time_id``, ``target``) are removed,
    then `imbalance_features` and `other_features` are applied in that order.
    The returned frame never contains the removed columns.
    """
    excluded = ("row_id", "time_id", "target")
    kept = [name for name in df.columns if name not in excluded]

    features = other_features(imbalance_features(df.copy()[kept]))
    gc.collect()

    # Filter once more in case a feature step introduced an excluded name.
    return features[[name for name in features.columns if name not in excluded]]

def compute_triplet_imbalance(df_values, comb_indices):
    """Compute ``(max - mid) / (mid - min)`` for column triplets of a 2-D array.

    Args:
        df_values: array of shape (num_rows, num_columns).
        comb_indices: sequence of ``(a, b, c)`` column-index triplets.

    Returns:
        float ndarray of shape (num_rows, len(comb_indices)). An entry is NaN
        when the middle and smallest values coincide (zero denominator) or
        when any of the three inputs is NaN.
    """
    values = np.asarray(df_values, dtype=float)
    imbalance = np.empty((values.shape[0], len(comb_indices)))

    # Loop over the (few) triplets; all rows of a triplet are handled at once.
    for i, (a, b, c) in enumerate(comb_indices):
        # Sorting each row's three values yields min / mid / max directly,
        # avoiding the rounding error of mid = a + b + c - min - max.
        ordered = np.sort(values[:, [a, b, c]], axis=1)
        min_val, mid_val, max_val = ordered[:, 0], ordered[:, 1], ordered[:, 2]
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = (max_val - mid_val) / (mid_val - min_val)
        # Prevent division by zero from leaking inf into the features
        imbalance[:, i] = np.where(mid_val == min_val, np.nan, ratio)

    return imbalance

def calculate_triplet_imbalance_numba(price, df):
    """Triplet-imbalance features for every 3-combination of the `price` columns.

    Args:
        price: list of column names present in `df`.
        df: DataFrame holding those columns.

    Returns:
        DataFrame with one ``<a>_<b>_<c>_imb2`` column per combination. It is
        built from a plain array, so it carries a fresh default index rather
        than the index of `df`.
    """
    # Position of the first occurrence of each name, as list.index() would give
    first_position = {}
    for position, name in enumerate(price):
        first_position.setdefault(name, position)

    triplets = list(combinations(price, 3))
    comb_indices = [tuple(first_position[name] for name in triplet) for triplet in triplets]

    # The computation works on the raw array of the selected columns
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)

    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(features_array, columns=columns)

In [None]:
class ImbalanceFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step wrapping `imbalance_features`."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the DataFrame produced by `imbalance_features(X)`."""
        return imbalance_features(X)


class OtherFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step wrapping `other_features`."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the DataFrame produced by `other_features(X)`."""
        with_other_features = other_features(X)
        return with_other_features


class AllFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step wrapping `generate_all_features`."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the DataFrame produced by `generate_all_features(X)`."""
        all_features = generate_all_features(X)
        return all_features


In [None]:
# Per-stock weights (200 values). The list position is used as the stock_id
# when the list is turned into `weights_dict` in a later cell.
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

# Per-stock summary statistics (each value is a Series indexed by stock_id);
# other_features() maps them onto rows as "global_<key>" columns.
# NOTE(review): these are computed from the full `df`, i.e. before the
# train/test split made later in the notebook - confirm that is acceptable.
global_stock_id_feats = {
        # Sum of the bid-side and ask-side medians / standard deviations
        "median_size": df.groupby("stock_id")["bid_size"].median() + df.groupby("stock_id")["ask_size"].median(),
        "std_size": df.groupby("stock_id")["bid_size"].std() + df.groupby("stock_id")["ask_size"].std(),
        # NOTE(review): range of bid_size only; ask_size is not involved - confirm.
        "ptp_size": df.groupby("stock_id")["bid_size"].max() - df.groupby("stock_id")["bid_size"].min(),
        "median_price": df.groupby("stock_id")["bid_price"].median() + df.groupby("stock_id")["ask_price"].median(),
        "std_price": df.groupby("stock_id")["bid_price"].std() + df.groupby("stock_id")["ask_price"].std(),
        # NOTE(review): max bid minus min ask, unlike ptp_size which stays on one side - confirm.
        "ptp_price": df.groupby("stock_id")["bid_price"].max() - df.groupby("stock_id")["ask_price"].min(),
    }

In [None]:
# Map each stock_id to its weight; ids are taken to be the list positions 0..len(weights)-1
stock_ids = range(len(weights))  # Swap in the real stock IDs if they are not simply positional
weights_dict = {stock_id: weight for stock_id, weight in zip(stock_ids, weights)}

In [None]:
from sklearn.compose import ColumnTransformer

# Set up preprocessing pipeline
# Steps run in the listed order; each one takes and returns a pandas DataFrame.
preprocess_steps = [
    # Remove these columns when present (DropColumns ignores absent names).
    ('drop_columns', DropColumns(['time_id', 'currently_scored'])),
    # Replace every NaN with 0 before the rolling averages are computed.
    ('imputer1', DataFrameSimpleImputer(strategy='constant', fill_value=0)),
    # NOTE(review): 'ask_size' is listed twice, so only five distinct *_sma columns
    # are produced; the second entry may have been meant to be 'bid_size' - confirm.
    ('moving_average', MovingAverageTransformer(window_size=2, columns=['ask_size', 'ask_price', 'bid_price', 'ask_size', 'reference_price', 'wap'])),
    ('imbalance_features', ImbalanceFeaturesTransformer()),
    ('other_features', OtherFeaturesTransformer()),
    # NOTE(review): generate_all_features() itself calls imbalance_features() and
    # other_features(), so the two preceding steps are recomputed here. It also
    # removes row_id / time_id / target.
    ('all_features', AllFeaturesTransformer()),
    # Zero-fill any NaN introduced by the feature steps.
    ('imputer2', DataFrameSimpleImputer(strategy='constant', fill_value=0)),
]
pp_pipeline = Pipeline(preprocess_steps)

In [None]:
def get_grouped_df(original_df:pd.DataFrame, columns_lst:list, min_length:int=55):
    """Split `original_df` into groups and separate complete from short ones.

    Args:
        original_df: frame to split.
        columns_lst: column name(s) to group by.
        min_length: minimum number of rows a group needs to count as complete
            (default 55, the length used throughout this notebook).

    Returns:
        (grouped_df_lst, incomplete_ts) where
        - grouped_df_lst holds one ``[key_1, ..., key_n, sub_frame]`` list per
          group with at least `min_length` rows, and
        - incomplete_ts holds one ``{column: key, ..., "length": n_rows}`` dict
          per shorter group.
    """
    key_names = [columns_lst] if isinstance(columns_lst, str) else list(columns_lst)
    # Group by a scalar when there is a single key, to avoid the pandas future warning
    group_keys = key_names[0] if len(key_names) == 1 else key_names

    grouped_df_lst = []
    incomplete_ts = []
    for k, subdf in tqdm(original_df.groupby(group_keys)):
        # Single-column grouping yields a scalar key; normalise to a list so the
        # complete and the incomplete branch can treat both cases alike.
        key_values = list(k) if isinstance(k, tuple) else [k]
        if subdf.shape[0] >= min_length:
            grouped_df_lst.append(key_values + [subdf])
        else:
            incomplete_dict = dict(zip(key_names, key_values))
            incomplete_dict["length"] = len(subdf)
            incomplete_ts.append(incomplete_dict)
    return grouped_df_lst, incomplete_ts

In [None]:
stock_grp_lst, incomplete_ts = get_grouped_df(df, ['stock_id', 'date_id']) ; del df

100%|██████████| 95235/95235 [00:06<00:00, 14134.85it/s]


In [None]:
MAX_NUM_SAMPLES = len(stock_grp_lst)
if SAMPLE_SIZE < 0:
    SAMPLE_SIZE = MAX_NUM_SAMPLES

In [None]:
stock_grp_lst[0][2].head(2)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180000.0,1,1.0,13400000.0,,,1.0,60651.5,1.0,8493.03,1.0,-3.03,0,0_0_0
191,0,0,10,1300000.0,1,1.0,15300000.0,,,1.0,13996.5,1.0,23519.16,1.0,0.39,1,0_10_0


In [None]:
%%time
dataset_folds = KFold(n_splits=10, shuffle=True, random_state=0)
# Only the first of the ten folds is used.
train_index, test_index = next(dataset_folds.split(stock_grp_lst))
# Each entry of stock_grp_lst is [stock_id, date_id, sub_frame]; keep the frame.
# Indexing with the already-truncated index arrays avoids building (and keeping
# alive as globals) tuples that reference every group.
# NOTE(review): the slice keeps the *first* SAMPLE_SIZE positions of the fold; if
# the fold indices come back in ascending order this favours low stock_ids - confirm.
XY_train_lst = [stock_grp_lst[i][2] for i in train_index[:SAMPLE_SIZE]]
XY_test_lst = [stock_grp_lst[i][2] for i in test_index[:SAMPLE_SIZE//10]]
XY_train_df = pd.concat(XY_train_lst, axis=0)
XY_test_df = pd.concat(XY_test_lst, axis=0)

CPU times: user 20.7 s, sys: 308 ms, total: 21.1 s
Wall time: 21.1 s


In [None]:
del stock_grp_lst

In [None]:
XY_train_df.shape

(2750000, 17)

In [None]:
XY_train_df.head(2)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180000.0,1,1.0,13400000.0,,,1.0,60651.5,1.0,8493.03,1.0,-3.03,0,0_0_0
191,0,0,10,1300000.0,1,1.0,15300000.0,,,1.0,13996.5,1.0,23519.16,1.0,0.39,1,0_10_0


In [None]:
XY_test_df.shape

(275000, 17)

In [None]:
# Fit the preprocessing on the training split only ...
X_train_pp = pp_pipeline.fit_transform(XY_train_df)
Y_train = XY_train_df.target
# ... and apply the already-fitted pipeline to the held-out split, so nothing is
# ever learned from test rows (this matters as soon as a step has fitted state,
# e.g. a mean/median imputer).
X_test_pp = pp_pipeline.transform(XY_test_df)
Y_test = XY_test_df.target
# The raw frames are no longer needed; release them before clustering.
del XY_train_df
del XY_test_df
gc.collect()

0

In [None]:
X_train_pp.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'ask_size_sma', 'ask_price_sma', 'bid_price_sma',
       'reference_price_sma', 'wap_sma', 'volume', 'mid_price',
       'liquidity_imbalance', 'reference_price_ask_price_imb',
       'reference_price_bid_price_imb', 'reference_price_wap_imb',
       'ask_price_bid_price_imb', 'ask_price_wap_imb', 'bid_price_wap_imb',
       'stock_weights', 'weighted_wap', 'price_spread', 'spread_depth_ratio',
       'all_prices_mean', 'all_prices_std', 'all_sizes_mean', 'all_sizes_std',
       'dow', 'dom', 'seconds', 'minute', 'global_median_size',
       'global_std_size', 'global_ptp_size', 'global_median_price',
       'global_std_price', 'global_ptp_price'],
      dtype='object')

In [None]:
X_train_pp.shape

(2750000, 46)

In [None]:
%%time
# Columns whose per-series trajectories are used for clustering
tseries_columns = ['ask_price_sma', 'bid_price_sma', 'wap_sma']

# One flat vector per (stock_id, date_id) series: all values of the first
# column, followed by all values of the second, and so on.
stocks_tseries_data = [
    series_df[tseries_columns].to_numpy().T.ravel()
    for _, series_df in tqdm(X_train_pp.groupby(['stock_id', 'date_id']))
]
stocks_tseries = np.array(stocks_tseries_data)

100%|██████████| 50000/50000 [01:06<00:00, 746.80it/s]


CPU times: user 1min 15s, sys: 1.41 s, total: 1min 16s
Wall time: 1min 16s


In [None]:
stocks_tseries.shape

(50000, 165)

In [None]:
%%time
pca = PCA(n_components=3, random_state=0)
stocks_tseries_transformed = pca.fit_transform(stocks_tseries)

CPU times: user 1.75 s, sys: 532 ms, total: 2.29 s
Wall time: 621 ms


In [None]:
stocks_tseries_transformed.shape

(50000, 3)

In [None]:
%%time
kmeans_tseries = KMeans(n_clusters=5, n_init=10,
                        random_state=0,
                        max_iter=5000)
tseries_clusters = kmeans_tseries.fit_predict(stocks_tseries_transformed)

CPU times: user 23.8 s, sys: 2.8 s, total: 26.6 s
Wall time: 21.9 s


In [None]:
len(tseries_clusters)

50000

In [None]:
tseries_clusters[:20]

array([3, 1, 1, 1, 3, 1, 0, 3, 3, 3, 3, 3, 1, 1, 3, 0, 3, 1, 3, 3], dtype=int32)

In [None]:
%%time
# Add cluster information to stock DataFrame
# ngroup() numbers the (stock_id, date_id) groups in the order in which iterating
# the groupby yields them - the order tseries_clusters was built in - so it can
# index the cluster labels directly, without writing into groupby slices or
# concatenating one frame per series.
series_number = X_train_pp.groupby(['stock_id', 'date_id']).ngroup().to_numpy()
# A stable sort by group number keeps the rows in group order.
row_order = np.argsort(series_number, kind='stable')
X_train_pp_c = X_train_pp.iloc[row_order].assign(
    cluster_id=tseries_clusters[series_number[row_order]]
)
X_train_pp_c.shape

CPU times: user 1min 40s, sys: 2.73 s, total: 1min 43s
Wall time: 1min 43s


(2750000, 47)

In [None]:
X_train_pp_c.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'ask_size_sma', 'ask_price_sma', 'bid_price_sma',
       'reference_price_sma', 'wap_sma', 'volume', 'mid_price',
       'liquidity_imbalance', 'reference_price_ask_price_imb',
       'reference_price_bid_price_imb', 'reference_price_wap_imb',
       'ask_price_bid_price_imb', 'ask_price_wap_imb', 'bid_price_wap_imb',
       'stock_weights', 'weighted_wap', 'price_spread', 'spread_depth_ratio',
       'all_prices_mean', 'all_prices_std', 'all_sizes_mean', 'all_sizes_std',
       'dow', 'dom', 'seconds', 'minute', 'global_median_size',
       'global_std_size', 'global_ptp_size', 'global_median_price',
       'global_std_price', 'global_ptp_price', 'cluster_id'],
      dtype='object')

In [None]:
XY_train_df = X_train_pp_c.copy()
XY_train_df['target'] = Y_train

In [None]:
grp_df_lst_clstr, _ = get_grouped_df(XY_train_df, ['cluster_id'])

100%|██████████| 5/5 [00:00<00:00,  7.87it/s]


In [None]:
num_tslots = []
for cluster_id, stock_df in grp_df_lst_clstr:
    #display(grp_df_lst_clstr[3][1])
    print(f"cluster:{cluster_id}")
    print(f"  Number of time series: {len(stock_df)/55}")
    print(f"  Number of time slots:  {len(stock_df)}")
    num_tslots.append(len(stock_df))

cluster:0
  Number of time series: 10939.0
  Number of time slots:  601645
cluster:1
  Number of time series: 13780.0
  Number of time slots:  757900
cluster:2
  Number of time series: 2187.0
  Number of time slots:  120285
cluster:3
  Number of time series: 19601.0
  Number of time slots:  1078055
cluster:4
  Number of time series: 3493.0
  Number of time slots:  192115


In [None]:
num_tslots

[601645, 757900, 120285, 1078055, 192115]

In [None]:
# Sample clustered DataFrames
# Builds, for every cluster, a train and a test frame from the first fold of a
# K-fold split over that cluster's (stock_id, date_id) series.
# NOTE(review): CLSTR_SAMPLE_SIZE is derived from num_tslots, i.e. a number of
# rows, but below it slices tuples of *series*; confirm the intended unit.
MAX_CLSTR_SAMPLE_SIZE = min(num_tslots)
CLSTR_SAMPLE_SIZE = int(0.99*MAX_CLSTR_SAMPLE_SIZE)
print(f"Cluster Sample size: {CLSTR_SAMPLE_SIZE}")

# cluster_id -> {"XY_train": DataFrame, "XY_test": DataFrame}, both without the cluster_id column
cluster_dataframes = {}
# Iterate each cluster in grp_df_lst_clstr
for cluster_id, stock_df in tqdm(grp_df_lst_clstr):
    # Group each DataFrame by ['stock_id', 'date_id']
    stock_grp_lst, _ = get_grouped_df(stock_df, ['stock_id', 'date_id'])
    # Generate Train and Test Dataset per cluster
    # Never ask for more folds than there are series in the cluster
    num_splits = min(len(stock_grp_lst), 10)
    dataset_folds = KFold(n_splits=num_splits, shuffle=True, random_state=0)
    clstr_XY_train_df_lst = []
    clstr_XY_test_df_lst = []
    for fold_num, (train_index, test_index) in enumerate(dataset_folds.split(stock_grp_lst)):
        # itemgetter(*indices) returns a tuple holding the selected [stock_id, date_id, frame] entries
        clstr_XY_train_df_getter = itemgetter(*list(train_index))(stock_grp_lst)    # check if we can use generators
        clstr_XY_test_df_getter = itemgetter(*list(test_index))(stock_grp_lst)
        for stock_id, date_id, clstr_XY_train_df_item in clstr_XY_train_df_getter[:CLSTR_SAMPLE_SIZE] :
            clstr_XY_train_df_lst.append(clstr_XY_train_df_item)
        for stock_id, date_id, clstr_XY_test_df_item in clstr_XY_test_df_getter[:CLSTR_SAMPLE_SIZE//10]:
            clstr_XY_test_df_lst.append(clstr_XY_test_df_item)
        break # Get one fold
    clstr_XY_train_df = pd.concat(clstr_XY_train_df_lst, axis=0)
    clstr_XY_test_df = pd.concat(clstr_XY_test_df_lst, axis=0)
    # NOTE(review): both names were just assigned from pd.concat, so the else branch looks unreachable - confirm.
    if clstr_XY_train_df is not None and clstr_XY_test_df is not None:
        cluster_dataframes[cluster_id] = {
            "XY_train" : clstr_XY_train_df.copy().drop(['cluster_id'], axis=1),
            "XY_test" : clstr_XY_test_df.copy().drop(['cluster_id'], axis=1),
        }
    else:
        print(f"we did a boo boo. cluster_id: {cluster_id}")
    # break # Get one cluster

Cluster Sample size: 119082


  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/10939 [00:00<?, ?it/s][A
  0%|          | 1/10939 [00:00<32:50,  5.55it/s][A
 27%|██▋       | 2903/10939 [00:00<00:00, 12838.26it/s][A
 53%|█████▎    | 5808/10939 [00:00<00:00, 19069.67it/s][A
100%|██████████| 10939/10939 [00:00<00:00, 19731.51it/s][A
 20%|██        | 1/5 [00:06<00:26,  6.64s/it]
  0%|          | 0/13780 [00:00<?, ?it/s][A
  0%|          | 1/13780 [00:00<52:41,  4.36it/s][A
 21%|██        | 2847/13780 [00:00<00:01, 10922.81it/s][A
 39%|███▊      | 5318/13780 [00:00<00:00, 15803.82it/s][A
 60%|█████▉    | 8219/13780 [00:00<00:00, 20239.14it/s][A
100%|██████████| 13780/13780 [00:00<00:00, 19089.82it/s][A
 40%|████      | 2/5 [00:15<00:23,  7.75s/it]
  0%|          | 0/2187 [00:00<?, ?it/s][A
100%|██████████| 2187/2187 [00:00<00:00, 20735.16it/s][A
 60%|██████    | 3/5 [00:16<00:09,  4.84s/it]
  0%|          | 0/19601 [00:00<?, ?it/s][A
  0%|          | 1/19601 [00:00<1:47:56,  3.03it/s][A
 13%|█▎     

In [None]:
if UNIT_TESTS:
    if cluster_dataframes:
        cluster_id = 0
        print(f"Cluster #{cluster_id}")
        print(f"    keys: {cluster_dataframes[cluster_id].keys()}")
        display(cluster_dataframes[cluster_id]['XY_test'].head(2))
        print(f"    columns:{cluster_dataframes[cluster_id]['XY_test'].columns}")

In [None]:
def get_transformed_datasets(cluster_dataframes:dict):
    """Split every cluster's train/test frame into model inputs and targets.

    Args:
        cluster_dataframes: ``{cluster_id: {"XY_train": df, "XY_test": df}}``.

    Returns:
        ``{cluster_id: {"X_train_pp", "Y_train", "X_test_pp", "Y_test"}}`` where
        the X frames have the target and the identifying columns removed and
        the Y entries are the ``target`` column.
    """
    non_feature_cols = ['target', 'stock_id', 'date_id', 'seconds_in_bucket']
    transformed_df = {}
    for cluster_id, df_data in tqdm(cluster_dataframes.items()):
        train_df, test_df = df_data['XY_train'], df_data['XY_test']
        transformed_df[cluster_id] = dict(
            X_train_pp=train_df.drop(columns=non_feature_cols),
            Y_train=train_df.target,
            X_test_pp=test_df.drop(columns=non_feature_cols),
            Y_test=test_df.target,
        )
    return transformed_df

In [None]:
cluster_dataframes_tr = get_transformed_datasets(cluster_dataframes)

100%|██████████| 5/5 [00:00<00:00, 17.71it/s]


In [None]:
if UNIT_TESTS:
    cluster_id = 0
    print(f"Cluster #{cluster_id}")
    display(cluster_dataframes_tr[cluster_id]['X_train_pp'])
    print(f"  columns: {cluster_dataframes_tr[cluster_id]['X_train_pp'].columns}")
elif not COMPUTE_TRAINING_METRICS:
    # Release memory
    del cluster_dataframes

In [None]:
print("Number of timeslots per cluster:")
for cluster_id, all_dataframes in cluster_dataframes_tr.items():
    print(f"cluster_id: {cluster_id}")
    print(f"  X_train_pp - {all_dataframes['X_train_pp'].shape[0]}")
    print(f"  X_test_pp  - {all_dataframes['X_test_pp'].shape[0]}")

Number of timeslots per cluster:
cluster_id: 0
  X_train_pp - 541475
  X_test_pp  - 60170
cluster_id: 1
  X_train_pp - 682110
  X_test_pp  - 75790
cluster_id: 2
  X_train_pp - 108240
  X_test_pp  - 12045
cluster_id: 3
  X_train_pp - 970200
  X_test_pp  - 107855
cluster_id: 4
  X_train_pp - 172865
  X_test_pp  - 19250


# Make submission

In [None]:
cluster_dataframes_tr[0].keys()

dict_keys(['X_train_pp', 'Y_train', 'X_test_pp', 'Y_test'])

In [None]:
def build_train_models(cluster_dataframes_tf:dict):
    """Train a model for each Time Series Cluster

    Args:
        cluster_dataframes_tf: ``{cluster_id: {"X_train_pp": df, "Y_train": series, ...}}``.

    Returns:
        ``{cluster_id: {model_name: fitted_model}}``; currently a single
        CatBoost regressor (key ``'catboost'``) per cluster.
    """
    ctb_params = {
        'iterations': 1000,
        'learning_rate': 0.1,
        'depth': 8,
        'l2_leaf_reg': 30,
        'bootstrap_type': 'Bernoulli',
        'subsample': 0.66,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'metric_period': 100,
        'od_type': 'Iter',
        'od_wait': 30,
        'allow_writing_files': False,
    }

    all_models = {}
    for cluster_id, df_data in tqdm(cluster_dataframes_tf.items()):
        features = df_data['X_train_pp']
        targets = df_data['Y_train']
        # Further regressors (e.g. LinearRegression, lgb.LGBMRegressor) can be
        # added to this dict; every entry is fitted on the same cluster data.
        models = {'catboost': CatBoostRegressor(**ctb_params)}
        for name, model in models.items():
            print(f'Training {name} model for cluster {cluster_id}')
            model.fit(features, targets.values.ravel())
        all_models[cluster_id] = models
    return all_models

In [None]:
%%time
all_models = build_train_models(cluster_dataframes_tr)

  0%|          | 0/5 [00:00<?, ?it/s]

Training catboost model for cluster 0
0:	learn: 6.5756380	total: 249ms	remaining: 4m 8s
100:	learn: 6.1172157	total: 13.8s	remaining: 2m 2s
200:	learn: 6.0233538	total: 27.6s	remaining: 1m 49s
300:	learn: 5.9466370	total: 40.8s	remaining: 1m 34s
400:	learn: 5.8822843	total: 54.1s	remaining: 1m 20s
500:	learn: 5.8276980	total: 1m 7s	remaining: 1m 7s
600:	learn: 5.7770323	total: 1m 20s	remaining: 53.7s
700:	learn: 5.7292293	total: 1m 34s	remaining: 40.3s
800:	learn: 5.6868274	total: 1m 47s	remaining: 26.8s
900:	learn: 5.6457090	total: 2m 1s	remaining: 13.4s


 20%|██        | 1/5 [02:15<09:03, 135.88s/it]

999:	learn: 5.6111918	total: 2m 14s	remaining: 0us
Training catboost model for cluster 1
0:	learn: 6.1664697	total: 219ms	remaining: 3m 38s
100:	learn: 5.7588122	total: 17.2s	remaining: 2m 32s
200:	learn: 5.6881352	total: 33.5s	remaining: 2m 13s
300:	learn: 5.6286112	total: 50s	remaining: 1m 56s
400:	learn: 5.5787066	total: 1m 5s	remaining: 1m 38s
500:	learn: 5.5330427	total: 1m 22s	remaining: 1m 22s
600:	learn: 5.4900598	total: 1m 39s	remaining: 1m 5s
700:	learn: 5.4501708	total: 1m 55s	remaining: 49.3s
800:	learn: 5.4141995	total: 2m 11s	remaining: 32.8s
900:	learn: 5.3814997	total: 2m 28s	remaining: 16.3s


 40%|████      | 2/5 [05:01<07:39, 153.33s/it]

999:	learn: 5.3512506	total: 2m 44s	remaining: 0us
Training catboost model for cluster 2
0:	learn: 10.9067318	total: 53.7ms	remaining: 53.6s
100:	learn: 9.6603331	total: 4.12s	remaining: 36.7s
200:	learn: 9.1770572	total: 8.02s	remaining: 31.9s
300:	learn: 8.8297321	total: 12.3s	remaining: 28.7s
400:	learn: 8.5534662	total: 16.3s	remaining: 24.3s
500:	learn: 8.3185489	total: 20.2s	remaining: 20.2s
600:	learn: 8.1208507	total: 24.2s	remaining: 16.1s
700:	learn: 7.9429640	total: 28.2s	remaining: 12s
800:	learn: 7.7866729	total: 32.2s	remaining: 8.01s
900:	learn: 7.6417544	total: 36.2s	remaining: 3.97s


 60%|██████    | 3/5 [05:41<03:23, 101.74s/it]

999:	learn: 7.5186540	total: 40s	remaining: 0us
Training catboost model for cluster 3
0:	learn: 5.4536125	total: 411ms	remaining: 6m 50s
100:	learn: 5.1591172	total: 23.5s	remaining: 3m 29s
200:	learn: 5.1106575	total: 46.9s	remaining: 3m 6s
300:	learn: 5.0700473	total: 1m 9s	remaining: 2m 41s
400:	learn: 5.0331233	total: 1m 31s	remaining: 2m 16s
500:	learn: 5.0018671	total: 1m 54s	remaining: 1m 53s
600:	learn: 4.9735586	total: 2m 16s	remaining: 1m 30s
700:	learn: 4.9470581	total: 2m 39s	remaining: 1m 7s
800:	learn: 4.9218699	total: 3m 1s	remaining: 45.1s
900:	learn: 4.8993031	total: 3m 24s	remaining: 22.4s


 80%|████████  | 4/5 [09:29<02:31, 151.62s/it]

999:	learn: 4.8774927	total: 3m 46s	remaining: 0us
Training catboost model for cluster 4
0:	learn: 9.2075747	total: 70.6ms	remaining: 1m 10s
100:	learn: 8.3257119	total: 5.6s	remaining: 49.9s
200:	learn: 8.0352706	total: 10.9s	remaining: 43.4s
300:	learn: 7.8078002	total: 16.3s	remaining: 37.8s
400:	learn: 7.6257338	total: 21.6s	remaining: 32.3s
500:	learn: 7.4690212	total: 27s	remaining: 26.9s
600:	learn: 7.3258142	total: 32.8s	remaining: 21.8s
700:	learn: 7.2045377	total: 38.3s	remaining: 16.3s
800:	learn: 7.1011432	total: 43.7s	remaining: 10.9s
900:	learn: 7.0001880	total: 49.2s	remaining: 5.4s


100%|██████████| 5/5 [10:24<00:00, 124.96s/it]

999:	learn: 6.9141582	total: 54.5s	remaining: 0us
CPU times: user 32min 43s, sys: 26.7 s, total: 33min 10s
Wall time: 10min 24s





In [None]:
all_models.keys()

dict_keys([0, 1, 2, 3, 4])

In [None]:
if UNIT_TESTS:
    x_test_df = cluster_dataframes[0]['XY_test'].drop(['target'], axis=1)
    display(x_test_df.head(4))
    print(f"  columns: {x_test_df.columns}")
    print(f"  Moving average features: {[col_name for col_name in x_test_df.columns if 'sma' in col_name]}")

In [None]:
def get_ts_cluster_data(target_df_:pd.DataFrame, tseries_columns:list, kmeans_tseries):
    """
    Generate cluster_id for each Time Series grouped by ['stock_id', 'date_id'].

    Every (stock_id, date_id) group is flattened into one vector built from
    `tseries_columns`, passed to `kmeans_tseries.predict`, and the predicted
    label is written to the rows of that group only.

    Parameters
    ----------
    target_df_ : pd.DataFrame
        Frame containing 'stock_id', 'date_id' and every column listed in
        `tseries_columns`. It is copied, never modified.
    tseries_columns : list
        Columns whose values make up the time series of a group.
    kmeans_tseries
        Fitted clustering model exposing `predict(X)` for a 2-D array.

    Returns
    -------
    pd.DataFrame
        A single 'cluster_id' column indexed like `target_df_`. Rows that end up
        in no group (e.g. a missing key, which groupby drops by default) keep
        the sentinel value -1. An empty input yields an empty frame.

    Notes
    -----
    When the flattened vector of a group is longer than `len(tseries_columns)`
    (i.e. the group has more than one row) it is first projected with the
    notebook-level `pca` object before being handed to the clustering model.
    """
    target_df = target_df_.copy()
    n_columns = len(tseries_columns)
    # One label slot per row, addressed by position so the frame's index
    # (default or not, unique or not) never affects the assignment.
    cluster_ids = np.full(len(target_df), -1, dtype=int)
    group_positions = target_df.groupby(['stock_id', 'date_id']).indices
    for _, positions in tqdm(group_positions.items()):
        series_vector = target_df.iloc[positions][tseries_columns].values.ravel().reshape(1, -1)
        if series_vector.shape[1] > n_columns:
            series_vector = pca.transform(series_vector)
        # Predict this group once and label only its own rows.
        cluster_ids[positions] = kmeans_tseries.predict(series_vector)[0]
    target_df['cluster_id'] = cluster_ids
    return target_df[['cluster_id']]

In [None]:
# Unit test: assign clusters to the test frame using only the 'ask_price_sma' series.
if UNIT_TESTS:
    cluster_data = get_ts_cluster_data(
        target_df_=x_test_df,
        tseries_columns=['ask_price_sma'],
        kmeans_tseries=kmeans_tseries,
    )
    display(cluster_data.head(4))
    print(f"Length of cluster_data: {len(cluster_data)}")

In [None]:
def get_prediction(X, all_models:dict):
    """
    Predict the target for a single row by averaging the models of its cluster.

    X is one row (a Series) holding 'cluster_id' together with the model
    features. `all_models` maps a cluster id to a dict whose values are fitted
    models exposing `predict`. The row is turned into a one-row DataFrame
    without 'cluster_id', every model of the row's cluster predicts on it, and
    the mean of those predictions is returned.
    """
    features = X.drop(labels=['cluster_id']).to_frame().T  # Series -> 1-row frame
    cluster_models = all_models[X['cluster_id']]
    per_model_preds = []
    for model in cluster_models.values():
        per_model_preds.append(model.predict(features))
    return np.mean(per_model_preds)

In [None]:
def get_all_preds(X_inputs:pd.DataFrame, all_models:dict, cluster_data:pd.DataFrame):
    """
    Predict the target for every row of `X_inputs` with the models of its cluster.

    The result equals applying `get_prediction` row by row (mean over the
    models registered for the row's cluster), but each model is called once per
    cluster on the whole batch instead of once per row.

    Parameters
    ----------
    X_inputs : pd.DataFrame
        Model features, one row per prediction.
    all_models : dict
        Maps a cluster id to a dict whose values are fitted models exposing
        `predict`.
    cluster_data : pd.DataFrame
        Frame with a 'cluster_id' column, index-aligned with `X_inputs`.

    Returns
    -------
    pd.Series
        Float predictions indexed like the column-wise concatenation of
        `cluster_data` and `X_inputs` (empty Series for empty input).

    Raises
    ------
    KeyError
        If a row's cluster id (including NaN produced by misaligned indexes)
        has no entry in `all_models`.
    """
    # Add cluster information
    X_inputs_clstr = pd.concat([cluster_data, X_inputs], axis=1)
    features = X_inputs_clstr.drop(columns=['cluster_id'])
    cluster_ids = X_inputs_clstr['cluster_id'].to_numpy()

    preds = np.full(len(X_inputs_clstr), np.nan, dtype=float)
    for cluster_id in pd.unique(cluster_ids):
        cluster_models = all_models[cluster_id]
        in_cluster = cluster_ids == cluster_id  # positional boolean mask
        batch = features.loc[in_cluster]
        # Average the models row-wise: axis 0 runs over models, axis 1 over rows.
        preds[in_cluster] = np.mean(
            [np.asarray(model.predict(batch)).ravel() for model in cluster_models.values()],
            axis=0,
        )
    return pd.Series(preds, index=X_inputs_clstr.index)

In [None]:
%%time
# Unit test: run the full prediction path on the cluster-0 test frame.
if UNIT_TESTS:
    x_test_inputs = x_test_df.drop(columns=['stock_id', 'date_id', 'seconds_in_bucket'])
    print(f"x_test_inputs.shape: {x_test_inputs.shape}")
    all_preds = get_all_preds(
        X_inputs=x_test_inputs,
        all_models=all_models,
        cluster_data=cluster_data,
    )
    print(f"Number of predictions: {len(all_preds)}")

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10 µs


In [None]:
%%time
# Evaluate every cluster's held-out split: re-derive cluster ids, predict, and
# record the MAE per cluster. The reported average is the unweighted mean of
# the per-cluster MAEs.
if COMPUTE_TRAINING_METRICS:
    all_metrics = {}
    for cluster_id, datasets in cluster_dataframes.items():
        X_test_clstr = datasets['XY_test'].drop(columns=['target'])
        clstr_y_true = datasets['XY_test']['target']
        cluster_data = get_ts_cluster_data(X_test_clstr, tseries_columns, kmeans_tseries)
        clstr_y_preds = get_all_preds(
            X_test_clstr.drop(columns=['stock_id', 'date_id', 'seconds_in_bucket']),
            all_models,
            cluster_data,
        )
        all_metrics[cluster_id] = {"MAE": mean_absolute_error(clstr_y_true, clstr_y_preds)}
    all_errors = [metrics_data["MAE"] for metrics_data in all_metrics.values()]
    print(f"Average MAE: {np.mean(all_errors)}")
    print(all_metrics)

100%|██████████| 1094/1094 [00:52<00:00, 20.66it/s]
100%|██████████| 1378/1378 [01:16<00:00, 18.02it/s]
100%|██████████| 219/219 [00:00<00:00, 692.60it/s]
100%|██████████| 1961/1961 [02:08<00:00, 15.31it/s]
100%|██████████| 350/350 [00:04<00:00, 84.45it/s] 


Average MAE: 7.256611244817909
{0: {'MAE': 6.362073373681628}, 1: {'MAE': 5.6978156919703045}, 2: {'MAE': 10.303579083397088}, 3: {'MAE': 5.302556966090376}, 4: {'MAE': 8.617031108950155}}
CPU times: user 31min 8s, sys: 7min 33s, total: 38min 42s
Wall time: 24min 47s


In [None]:
# Create the competition environment and the iterator of test batches that the
# prediction loops below consume.
# NOTE(review): the platform guard is commented out, so this cell runs
# unconditionally — confirm `optiver2023` is importable wherever it is executed.
# if not is_google_colab():
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
%%time
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Colab-only evaluation loop: predict every test batch, submit it, and compare
# the predictions against the revealed targets to report an overall MAE.
if is_google_colab():

    # Lists to store true and predicted targets
    true_targets = []
    predicted_targets = []

    for (test, revealed_targets, sample_prediction) in iter_test:
        # Preprocess test DataFrame first: the clustering below must see the same
        # preprocessed columns as the Kaggle submission loop and the metrics cell.
        test_pp = pp_pipeline.transform(test)

        # Get Cluster predictions (on the preprocessed frame, not the raw one)
        cluster_data = get_ts_cluster_data(test_pp, tseries_columns, kmeans_tseries)

        # Predict target values
        predictions = get_all_preds(test_pp.drop(['stock_id', 'date_id', 'seconds_in_bucket'], axis=1), all_models, cluster_data)

        # Only take as many predictions and true targets as the minimum of the two
        # NOTE(review): assumes the rows of revealed_targets line up with this
        # batch's predictions — confirm against the competition API.
        min_len = min(len(predictions), len(revealed_targets))

        # Trim down the predictions list so that it is the same size as the revealed targets
        current_predictions = predictions[:min_len]
        current_true_targets = revealed_targets['revealed_target'].values[:min_len]

        # Extend the lists of true targets and predictions with the ones from the current batch
        true_targets.extend(current_true_targets)
        predicted_targets.extend(current_predictions)

        # Make the actual prediction and add to sample_prediction df
        sample_prediction['target'] = predictions

        # Submit the predictions for this chunk of test data
        env.predict(sample_prediction)

    if true_targets and predicted_targets:
        # Convert lists to dataframes
        df_true = pd.DataFrame(true_targets, columns=['true_target'])
        df_pred = pd.DataFrame(predicted_targets, columns=['predicted_target'])

        # Calculate and print the MAE (missing revealed targets are scored as 0)
        mae = mean_absolute_error(df_true.fillna(0), df_pred)
        print("Mean Absolute Error:", mae)
        # 5.261823197997517
    else:
        print("No targets available to compute metrics.")

CPU times: user 0 ns, sys: 249 µs, total: 249 µs
Wall time: 346 µs


In [None]:
%%time
# Kaggle submission loop: preprocess each test batch, assign clusters, predict
# with the per-cluster models and submit; report how many batches were served.
if not is_google_colab():
    cnt = 0
    for (test, revealed_targets, sample_prediction) in tqdm(iter_test):
        # Feature engineering for this batch
        test_pp = pp_pipeline.transform(test)
        # Cluster ids are derived from the preprocessed frame
        cluster_data = get_ts_cluster_data(test_pp, tseries_columns, kmeans_tseries)
        # Identifier columns are not model features
        model_inputs = test_pp.drop(columns=['stock_id', 'date_id', 'seconds_in_bucket'])
        sample_prediction['target'] = get_all_preds(model_inputs, all_models, cluster_data)
        # Hand the predictions for this batch back to the environment
        env.predict(sample_prediction)
        cnt += 1
    print(f"iterations: {cnt}")

0it [00:00, ?it/s]

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.



  0%|          | 0/200 [00:00<?, ?it/s][A
 44%|████▍     | 88/200 [00:00<00:00, 871.44it/s][A
100%|██████████| 200/200 [00:00<00:00, 816.51it/s][A
1it [00:04,  4.84s/it]
  0%|          | 0/200 [00:00<?, ?it/s][A
 44%|████▍     | 88/200 [00:00<00:00, 874.17it/s][A
100%|██████████| 200/200 [00:00<00:00, 804.51it/s][A
2it [00:09,  4.68s/it]
  0%|          | 0/200 [00:00<?, ?it/s][A
 43%|████▎     | 86/200 [00:00<00:00, 852.36it/s][A
100%|██████████| 200/200 [00:00<00:00, 808.74it/s][A
3it [00:13,  4.58s/it]
  0%|          | 0/200 [00:00<?, ?it/s][A
 42%|████▏     | 84/200 [00:00<00:00, 835.43it/s][A
100%|██████████| 200/200 [00:00<00:00, 794.75it/s][A
4it [00:18,  4.54s/it]
  0%|          | 0/200 [00:00<?, ?it/s][A
 42%|████▏     | 84/200 [00:00<00:00, 837.91it/s][A
100%|██████████| 200/200 [00:00<00:00, 782.91it/s][A
5it [00:22,  4.51s/it]
  0%|          | 0/200 [00:00<?, ?it/s][A
 44%|████▍     | 88/200 [00:00<00:00, 873.77it/s][A
100%|██████████| 200/200 [00:00<00:00,

iterations: 165
CPU times: user 12min 38s, sys: 11.2 s, total: 12min 50s
Wall time: 12min 24s





In [None]:
tmr.ShowTime()    # report the elapsed runtime tracked by `tmr` (prints "Runtime is N sec").

Runtime is 3183 sec
