In [1]:
import os
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np
from tqdm import tqdm

def get_cols_inf(df):
    """
    Returns a list of column names that contain positive or negative infinity.

    Only numeric columns are inspected: non-numeric columns (strings, datetimes,
    generic objects) cannot hold +/-inf and would make ``np.isinf`` raise a
    ``TypeError`` if they were included in the array being tested.

    Parameters:
        df: pandas DataFrame to inspect (not modified).

    Returns:
        list of column labels, in their original column order.
    """
    numeric = df.select_dtypes(include=[np.number])
    # Cast to float so integer and float columns share one array dtype.
    has_inf = np.isinf(numeric.to_numpy(dtype=float)).any(axis=0)
    return numeric.columns[has_inf].tolist()

def get_cols_zerostd(df):
    """
    List the columns that carry no variation.

    A column is reported when it has at most one distinct non-NaN value, so
    constant columns and columns made up entirely of NaN are both included.
    """
    distinct_counts = df.nunique(dropna=True)
    return [column for column, n_distinct in distinct_counts.items() if n_distinct <= 1]

def get_nan_columns(df):
    """
    List the columns in which at least one value is missing (NaN/None).
    """
    any_missing = df.isna().any()
    return [column for column, flagged in any_missing.items() if flagged]

def preprocess_train(train, columns_to_drop=None):
    """
    Clean the training frame and append order-book / trade-flow features.

    Steps:
      1. Drop every column that contains +/-inf, contains NaN, or has at most
         one distinct non-NaN value. The three column lists are printed.
      2. Add the derived features ``bidask_ratio``, ``buysell_ratio``,
         ``bidask_delta``, ``buysell_delta``, ``buysell_size`` and
         ``bidask_size`` from ``bid_qty``, ``ask_qty``, ``buy_qty``,
         ``sell_qty`` and ``volume``.
      3. Drop the columns named in ``columns_to_drop``.

    Parameters:
        train: pandas DataFrame holding the raw columns; it is copied, not modified.
        columns_to_drop: column label(s) removed after feature engineering
            (e.g. the target and the raw quantities). ``None`` drops nothing.

    Returns:
        A new DataFrame with the cleaned and engineered columns.

    Raises:
        KeyError: if a column needed for feature engineering is missing (for
            example because step 1 removed it) or a label in
            ``columns_to_drop`` does not exist.
    """
    # None instead of a mutable [] default, which would be shared across calls.
    if columns_to_drop is None:
        columns_to_drop = []

    df = train.copy()

    #### Preprocessing
    # Identify the problematic columns once, on the untouched copy.
    cols_inf = get_cols_inf(df)
    print("Columns with infinite values:", cols_inf)
    cols_nan = get_nan_columns(df)
    print("Columns with NaN values:", cols_nan)
    cols_zerostd = get_cols_zerostd(df)
    print("Columns with zero standard deviation:", cols_zerostd)

    # A column can appear in several lists; the set union removes duplicates
    # so everything is dropped in a single call.
    cols_to_drop = set(cols_inf) | set(cols_nan) | set(cols_zerostd)
    df = df.drop(columns=list(cols_to_drop))

    #### Feature Engineering
    # NOTE(review): both ratios become +/-inf on rows where the denominator is 0
    # and the numerator is not; only the volume == 0 case is neutralised below.
    df['bidask_ratio'] = df['bid_qty'] / df['ask_qty']
    df['buysell_ratio'] = np.where(df['volume'] == 0, 0, df['buy_qty'] / df['sell_qty'])

    # df.loc[:, 'buysell_ratio_shift1'] = df['buysell_ratio'].shift(-1)

    df['bidask_delta'] = df['bid_qty'] - df['ask_qty']
    df['buysell_delta'] = df['buy_qty'] - df['sell_qty']

    df['buysell_size'] = df['buy_qty'] + df['sell_qty']
    df['bidask_size'] = df['bid_qty'] + df['ask_qty']

    # Final drop of caller-specified columns.
    df = df.drop(columns=columns_to_drop)
    return df

def evaluate_model(y_true, y_pred, X=None, linear=False, verbose=True):
    """
    General evaluation of regression models.

    Inputs:
        y_true: True target values
        y_pred: Predicted target values
        X: Feature matrix (optional, for adj_r2 and n_features); only its
           ``shape[1]`` is read
        linear: If True, AIC and BIC will be computed (meaningful for linear models only)
        verbose: Print the results (this also prints the Spearman correlation)

    Outputs (dict):
        n_obs: Number of observations
        n_features: Number of features (None if X is not provided)
        r2: R^2 score
        adj_r2: Adjusted R^2 (NaN unless X is provided and n_obs > n_features + 1)
        rmse: Root Mean Squared Error
        mae: Mean Absolute Error
        medae: Median Absolute Error
        pearson_corr, pearson_pvalue
        aic: Akaike Information Criterion (NaN unless linear=True and X provided)
        bic: Bayesian Information Criterion (NaN unless linear=True and X provided)

    The Spearman correlation is not part of the returned dict; it is only
    computed and printed when ``verbose`` is True.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_obs = len(y_true)
    n_features = X.shape[1] if X is not None else None

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson_corr, pearson_p = pearsonr(y_true, y_pred)

    # The adjustment is undefined when there are not more observations than
    # fitted parameters, so NaN is reported in that case.
    if X is not None and n_obs > n_features + 1:
        adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)
    else:
        adj_r2 = np.nan

    if linear and X is not None:
        n_params = n_features + 1  # +1 for intercept
        rss = np.sum((y_true - y_pred)**2)
        aic = n_obs * np.log(rss / n_obs) + 2 * n_params
        bic = n_obs * np.log(rss / n_obs) + n_params * np.log(n_obs)
    else:
        aic = np.nan
        bic = np.nan

    results = {
        "n_obs": n_obs,
        "n_features": n_features,
        "r2": r2,
        "adj_r2": adj_r2,
        "rmse": rmse,
        "mae": mae,
        "medae": medae,
        "pearson_corr": pearson_corr,
        "pearson_pvalue": pearson_p,
        "aic": aic,
        "bic": bic,
    }

    if verbose:
        # Spearman is only reported here, so the rank computation is skipped
        # entirely for silent calls (e.g. inside a grid evaluation loop).
        spearman_corr, spearman_p = spearmanr(y_true, y_pred)

        print(f"Observations:            {n_obs}")
        if n_features is not None:
            print(f"Features:                {n_features}")
        print(f"R^2:                     {r2:.5f}")
        print(f"Adjusted R^2:            {adj_r2:.5f}")
        print(f"RMSE:                    {rmse:.5f}")
        print(f"MAE:                     {mae:.5f}")
        print(f"Median Absolute Error:   {medae:.5f}")
        print(f"Pearson Corr:            {pearson_corr:.5f} (p={pearson_p:.3g})")
        print(f"Spearman Corr:           {spearman_corr:.5f} (p={spearman_p:.3g})")
        if linear and X is not None:
            print(f"AIC:                     {aic:.2f}")
            print(f"BIC:                     {bic:.2f}")
    return results

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


# Functions

## Time Series Splitting

In [2]:
from sklearn.model_selection import TimeSeriesSplit

def sklearn_timeseries_split(X, n_splits=5, **kwargs):
    """
    Yield (train_index, test_index) pairs from scikit-learn's TimeSeriesSplit.

    Parameters:
    - X: data to split; passed straight to ``TimeSeriesSplit.split``
    - n_splits: int, number of splits
    - **kwargs: forwarded to the ``TimeSeriesSplit`` constructor
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, **kwargs)
    yield from splitter.split(X)
        
# 2. Time Series
def walk_forward_split(X, initial_train_size, test_size, step_size=1):
    """
    Generate expanding-window walk-forward validation splits.

    Every training fold starts at position 0 and ends right before its test
    fold, so the training set grows by ``step_size`` observations per split
    while the test fold keeps a fixed length.

    Parameters:
    - X: any sized container (only ``len(X)`` is used)
    - initial_train_size: int, number of observations in the first training fold
    - test_size: int, number of observations in each test fold
    - step_size: int, how far the train/test boundary moves per iteration

    Yields:
    - (train_index, test_index): tuple of numpy arrays of positional indices
    """
    n_samples = len(X)
    # Latest boundary (exclusive) that still leaves room for a full test fold.
    boundary_stop = n_samples - test_size + 1

    for boundary in range(initial_train_size, boundary_stop, step_size):
        yield np.arange(0, boundary), np.arange(boundary, boundary + test_size)

## Tree Based Models

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# def fit_decision_tree_regression(X, y, **kwargs):
#     """Decision Tree Regression: greedy MSE splits, high variance, low bias"""
#     model = DecisionTreeRegressor(**kwargs)
#     model.fit(X, y)
#     return model


# def fit_bagged_trees(X, y, base_estimator=None, n_estimators=100, **kwargs):
#     """Bagged Trees: bootstrap averaging to reduce variance"""
#     base = base_estimator or DecisionTreeRegressor()
#     model = BaggingRegressor(base_estimator=base,
#                              n_estimators=n_estimators,
#                              **kwargs)
#     model.fit(X, y)
#     return model


def fit_random_forest(X, y, n_estimators=100, max_features='auto', **kwargs):
    """
    Random Forest: bagging + random feature subsets for decorrelation.

    Parameters:
        X, y: training features and targets, passed to ``fit``.
        n_estimators: number of trees.
        max_features: features considered per split. The legacy value
            ``'auto'`` is translated to ``1.0`` (use all features), which is
            what ``'auto'`` meant for regressors before scikit-learn
            deprecated it (1.1) and removed it (1.3).
        **kwargs: forwarded to ``RandomForestRegressor``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    if isinstance(max_features, str) and max_features == 'auto':
        # Same behaviour on old releases, no FutureWarning/error on new ones.
        max_features = 1.0

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        **kwargs
    )
    model.fit(X, y)
    return model


# def fit_extra_trees(X, y, n_estimators=100, max_features='auto', **kwargs):
#     """Extra-Trees: extreme randomness in features and thresholds"""
#     model = ExtraTreesRegressor(
#         n_estimators=n_estimators,
#         max_features=max_features,
#         **kwargs
#     )
#     model.fit(X, y)
#     return model


def fit_adaboost_regression(X, y, n_estimators=50, learning_rate=1.0, base_estimator=None, **kwargs):
    """
    AdaBoost Regression: sequentially fitted weak learners on reweighted samples.

    Parameters:
        X, y: training features and targets, passed to ``fit``.
        n_estimators: maximum number of boosting rounds.
        learning_rate: weight applied to each weak learner.
        base_estimator: weak learner to boost; defaults to a depth-1
            ``DecisionTreeRegressor`` (a stump) when None.
        **kwargs: forwarded to ``AdaBoostRegressor``.

    Returns:
        The fitted ``AdaBoostRegressor``.
    """
    # Explicit None check: `x or default` evaluates the estimator's truthiness,
    # which calls __len__ on estimators that define it (e.g. unfitted ensembles).
    base = DecisionTreeRegressor(max_depth=1) if base_estimator is None else base_estimator

    # The weak learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
    # in 1.4); it is the first positional parameter under both names.
    model = AdaBoostRegressor(
        base,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        **kwargs
    )
    model.fit(X, y)
    return model


def fit_gradient_boosting(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, **kwargs):
    """
    Fit a scikit-learn GradientBoostingRegressor and return it.

    The named hyperparameters and any extra ``**kwargs`` are forwarded to the
    estimator constructor; ``X`` and ``y`` are passed to ``fit``.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **kwargs
    )
    gbm = GradientBoostingRegressor(**hyperparams)
    gbm.fit(X, y)
    return gbm


def fit_xgboost(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, reg_lambda=1, reg_alpha=0, **kwargs):
    """
    Fit an XGBRegressor and return it.

    ``reg_lambda`` and ``reg_alpha`` are passed through as the regularisation
    settings; any extra ``**kwargs`` go to the estimator constructor, and
    ``X`` and ``y`` are passed to ``fit``.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        **kwargs
    )
    booster = XGBRegressor(**hyperparams)
    booster.fit(X, y)
    return booster


def fit_lightgbm_regression(X, y, n_estimators=100, learning_rate=0.1, num_leaves=31, **kwargs):
    """
    Fit an LGBMRegressor and return it.

    The named hyperparameters and any extra ``**kwargs`` are forwarded to the
    estimator constructor; ``X`` and ``y`` are passed to ``fit``.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        **kwargs
    )
    lgbm = LGBMRegressor(**hyperparams)
    lgbm.fit(X, y)
    return lgbm


def fit_catboost_regression(X, y, iterations=1000, learning_rate=0.1, depth=6, verbose=False, **kwargs):
    """
    Fit a CatBoostRegressor and return it.

    ``verbose`` defaults to False and is passed to the constructor along with
    the other named hyperparameters and any extra ``**kwargs``; ``X`` and
    ``y`` are passed to ``fit``.
    """
    hyperparams = dict(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        verbose=verbose,
        **kwargs
    )
    cat = CatBoostRegressor(**hyperparams)
    cat.fit(X, y)
    return cat

## Training

In [4]:
def evaluate_grid(X, y, params_data, params_split, params_model, is_linear=False, metric_func=evaluate_model):
    """
    Evaluate every (data window, splitter, model) combination.

    For each combination the model is refit on every train fold, scored on the
    matching test fold with ``metric_func``, and the per-fold metrics are
    averaged into one result row.

    Parameters:
        X: pandas DataFrame of features; its index is compared against the
           ``start``/``end`` values of each data window.
        y: pandas Series of targets for the same observations as ``X``.
        params_data: dict ``{key: {"start": ..., "end": ...}}`` of inclusive
            index windows.
        params_split: dict ``{key: {"splitter_func": f, "splitter_args": {...}}}``;
            ``f(X_window, **splitter_args)`` must yield positional
            ``(train_idx, test_idx)`` pairs.
        params_model: dict ``{key: {"model_func": f, "model_args": {...}}}``;
            ``f(X_train, y_train, **model_args)`` must return an object with
            ``predict``.
        is_linear: forwarded to ``metric_func`` as ``linear``.
        metric_func: called as
            ``metric_func(y_test, y_pred, X=X_test, linear=is_linear, verbose=False)``
            and expected to return a dict of metrics.

    Returns:
        list of dicts, one per combination, holding ``data_key``,
        ``split_key``, ``model_key``, the mean of every metric over the folds,
        and ``n_splits``.
    """
    # The window mask below is positional, so X and y must be in the same row
    # order. When they hold the same unique labels in a different order (e.g.
    # X was sorted after y was extracted), realign y by label. In any other
    # case the inputs are used as given.
    if not X.index.equals(y.index):
        can_realign = (
            len(X) == len(y)
            and X.index.is_unique
            and y.index.is_unique
            and X.index.isin(y.index).all()
        )
        if can_realign:
            y = y.loc[X.index]

    all_results = []

    # One progress-bar tick per (window, splitter, model) combination.
    total_iters = len(params_data) * len(params_split) * len(params_model)

    with tqdm(total=total_iters, desc="Total Model Runs") as pbar:
        for data_key, data_val in params_data.items():
            mask = (X.index >= data_val['start']) & (X.index <= data_val['end'])
            X_window = X.loc[mask]
            y_window = y.loc[mask]

            for split_key, split_val in params_split.items():
                splitter_func = split_val['splitter_func']
                splitter_args = split_val['splitter_args']
                # Materialised once so every model sees identical folds.
                splits = list(splitter_func(X_window, **splitter_args))

                for model_key, model_val in params_model.items():
                    model_func = model_val['model_func']
                    model_args = model_val['model_args']

                    split_results = []

                    for i, (train_idx, test_idx) in enumerate(splits):
                        X_train, X_test = X_window.iloc[train_idx], X_window.iloc[test_idx]
                        y_train, y_test = y_window.iloc[train_idx], y_window.iloc[test_idx]

                        model = model_func(X_train, y_train, **model_args)
                        y_pred = model.predict(X_test)
                        metric_dict = metric_func(y_test, y_pred, X=X_test, linear=is_linear, verbose=False)
                        metric_dict['split_num'] = i
                        split_results.append(metric_dict)

                    split_df = pd.DataFrame(split_results)
                    # Average every metric across folds; the fold counter is not a metric.
                    metrics_to_agg = [col for col in split_df.columns if col != "split_num"]
                    overall_results = split_df[metrics_to_agg].mean().to_dict()
                    model_output = {
                        "data_key": data_key,
                        "split_key": split_key,
                        "model_key": model_key,
                        **overall_results,
                        "n_splits": len(split_df),
                    }
                    all_results.append(model_output)
                    pbar.update(1)
    return all_results

# Data

In [5]:
# Sort chronologically BEFORE separating the target from the features, so that
# X and y are guaranteed to share the same row order.
data = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet').sort_index(ascending=True)
y = data['label']
X = preprocess_train(data, columns_to_drop=['label', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty'])
X

Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717', 'X864', 'X867', 'X869', 'X870', 'X871', 'X872']


Unnamed: 0_level_0,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X887,X888,X889,X890,bidask_ratio,buysell_ratio,bidask_delta,buysell_delta,buysell_size,bidask_size
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-01 00:00:00,221.389,0.121263,-0.417690,0.005399,0.125948,0.058359,0.027359,0.035780,0.068219,1.034825,...,0.377630,0.210153,0.159183,0.530636,1.814006,3.921505,6.858,131.421,221.389,23.708
2023-03-01 00:01:00,847.796,0.302841,-0.049576,0.356667,0.481087,0.237954,0.208359,0.217057,0.249624,0.948694,...,0.374515,0.209573,0.158963,0.530269,16.519692,1.633316,36.254,203.896,847.796,40.926
2023-03-01 00:02:00,295.596,0.167462,-0.291212,0.083138,0.206881,0.101727,0.072778,0.081564,0.114166,0.896459,...,0.371424,0.208993,0.158744,0.529901,0.007336,1.167619,-59.808,22.858,295.596,60.692
2023-03-01 00:03:00,460.705,0.072944,-0.436590,-0.102483,0.017551,0.007149,-0.021681,-0.012936,0.019634,0.732634,...,0.368358,0.208416,0.158524,0.529534,0.231490,2.686731,-16.151,210.779,460.705,25.881
2023-03-01 00:04:00,142.818,0.173820,-0.213489,0.096067,0.215709,0.107133,0.078976,0.087818,0.120426,0.763537,...,0.365314,0.207839,0.158304,0.529167,7.869603,2.216115,23.707,54.004,142.818,30.609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-29 23:55:00,94.388,0.020155,0.076565,0.228994,0.288856,0.151634,0.108347,0.088073,0.073729,0.071211,...,0.393726,0.212651,0.136494,0.243172,0.611756,0.705263,-2.642,-16.314,94.388,10.968
2024-02-29 23:56:00,177.372,0.016262,0.062527,0.214072,0.276463,0.146521,0.104164,0.084063,0.069788,0.024066,...,0.390476,0.212063,0.136305,0.243004,0.564317,1.640604,-1.768,43.030,177.372,6.348
2024-02-29 23:57:00,101.252,0.045407,0.109834,0.263577,0.329266,0.174214,0.132940,0.113052,0.098865,-0.057370,...,0.387252,0.211477,0.136117,0.242836,1.438736,2.292427,1.597,39.746,101.252,8.877
2024-02-29 23:58:00,74.560,0.124783,0.244168,0.408704,0.480016,0.251493,0.211727,0.192160,0.178116,0.111335,...,0.384054,0.210892,0.135928,0.242668,1.169353,0.428489,0.830,-29.830,74.560,10.632


# Training

In [6]:
# Evaluation windows (inclusive index bounds). Only "d4" is active; the longer
# windows are kept commented out for reference.
params_data = {
    # "d1": dict(
    #     start=pd.Timestamp('2023-03-01 00:00:00'),
    #     end=pd.Timestamp('2024-02-29 23:59:00'),
    # ),
    # "d2": dict(
    #     start=pd.Timestamp('2023-06-01 00:00:00'),
    #     end=pd.Timestamp('2024-02-29 23:59:00'),
    # ),
    # "d3": dict(
    #     start=pd.Timestamp('2023-09-01 00:00:00'),
    #     end=pd.Timestamp('2024-02-29 23:59:00'),
    # ),
    "d4": dict(
        start=pd.Timestamp('2023-12-01 00:00:00'),
        end=pd.Timestamp('2024-02-29 23:59:00'),
    ),
}

# Cross-validation schemes. Only the 5-fold TimeSeriesSplit ("ts1") is active;
# the alternatives are kept commented out for reference.
params_split = {
    "ts1": dict(
        splitter_func=sklearn_timeseries_split,
        splitter_args=dict(n_splits=5),
    ),
    # "ts2": dict(
    #     splitter_func=sklearn_timeseries_split,
    #     splitter_args=dict(n_splits=10),
    # ),
    # "wf1": dict(
    #     splitter_func=walk_forward_split,
    #     splitter_args=dict(initial_train_size=500, test_size=100, step_size=100),
    # ),
    # "wf2": dict(
    #     splitter_func=walk_forward_split,
    #     splitter_args=dict(initial_train_size=1000, test_size=200, step_size=200),
    # ),
}

# # Parameter grid for tree-based models (balanced)
# params_model_tree = 
#     "dt_m1": {
#         "model_func": fit_decision_tree_regression,
#         "model_args": {}
#     },
#     "bag_m1": {
#         "model_func": fit_bagged_trees,
#         "model_args": {"n_estimators": 100}
#     },
#     "rf_m1": {
#         "model_func": fit_random_forest,
#         "model_args": {"n_estimators": 100, "max_features": "sqrt", "n_jobs": -1}
#     },
    # "et_m1": {
    #     "model_func": fit_extra_trees,
    #     "model_args": {"n_estimators": 100, "max_features": "auto"}
#     "ada_m1": {
#         "model_func": fit_adaboost_regression,
#         "model_args": {"n_estimators": 50, "learning_rate": 1.0}
#     },
#     "gbm_m1": {
#         "model_func": fit_gradient_boosting,
#         "model_args": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3, "subsample": 1.0}
#     },
#     "xgb_m1": {
#         "model_func": fit_xgboost,
#         "model_args": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3, "subsample": 1.0, "tree_method": "heap" , "n_jobs": -1}
#     },
#     "lgbm_m1": {
#         "model_func": fit_lightgbm_regression,
#         "model_args": {"n_estimators": 100, "learning_rate": 0.1, "num_leaves": 31, "n_jobs": -1}
#     },
#     "cat_m1": {
#         "model_func": fit_catboost_regression,
#         "model_args": {"iterations": 1000, "learning_rate": 0.1, "depth": 6, "thread_count": -1, "verbose": False}
#     }
# }

# Quick-training parameter grid: small ensembles so the whole grid runs fast.
params_model_tree_fast = {
    "rf_m1": dict(
        model_func=fit_random_forest,
        model_args=dict(n_estimators=10, max_depth=3, n_jobs=-1),
    ),
    "ada_m1": dict(
        model_func=fit_adaboost_regression,
        model_args=dict(n_estimators=10, learning_rate=1.0),
    ),
    "gbm_m1": dict(
        model_func=fit_gradient_boosting,
        model_args=dict(n_estimators=10, learning_rate=0.3, max_depth=3, subsample=0.5),
    ),
    "xgb_m1": dict(
        model_func=fit_xgboost,
        model_args=dict(
            n_estimators=10,
            learning_rate=0.3,
            max_depth=3,
            subsample=0.5,
            tree_method="hist",
            n_jobs=-1,
        ),
    ),
    "lgbm_m1": dict(
        model_func=fit_lightgbm_regression,
        model_args=dict(n_estimators=10, learning_rate=0.3, num_leaves=31, n_jobs=-1),
    ),
    "cat_m1": dict(
        model_func=fit_catboost_regression,
        model_args=dict(iterations=100, learning_rate=0.3, depth=4, thread_count=-1, verbose=False),
    ),
}

# Run the full grid: every data window x splitter x fast tree model.
results = evaluate_grid(
    X,
    y,
    params_data,
    params_split,
    params_model_tree_fast,
    is_linear=False,
    metric_func=evaluate_model,
)

  warn(
  warn(
  warn(
  warn(
  warn(
Total Model Runs:  67%|██████▋   | 4/6 [26:06<11:42, 351.09s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.465371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 220925
[LightGBM] [Info] Number of data points in the train set: 21792, number of used features: 869
[LightGBM] [Info] Start training from score 0.119399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.817341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 221595
[LightGBM] [Info] Number of data points in the train set: 43581, number of used features: 869
[LightGBM] [Info] Start training from score 0.062742
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.194092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 221850
[LightGBM] [Info] Number of data points in the train set: 65370, number of used features: 870
[LightGBM] [Info] 

Total Model Runs: 100%|██████████| 6/6 [28:25<00:00, 284.20s/it]


In [7]:
# One row per grid combination, best Pearson correlation first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values('pearson_corr', ascending=False)
df_results

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,data_key,split_key,model_key,n_obs,n_features,r2,adj_r2,rmse,mae,medae,pearson_corr,pearson_pvalue,aic,bic,n_splits
4,d4,ts1,lgbm_m1,21789.0,870.0,-0.441618,-0.501576,1.209268,0.872783,0.656271,0.053919,0.1051012,,,5
0,d4,ts1,rf_m1,21789.0,870.0,-0.089884,-0.135213,1.063608,0.730151,0.50913,0.053884,0.07463381,,,5
3,d4,ts1,xgb_m1,21789.0,870.0,-0.234392,-0.285731,1.119457,0.781256,0.556787,0.039813,0.006428268,,,5
5,d4,ts1,cat_m1,21789.0,870.0,-0.583709,-0.649577,1.278494,0.91633,0.672497,0.03183,4.713897e-08,,,5
2,d4,ts1,gbm_m1,21789.0,870.0,-0.130569,-0.177591,1.082537,0.741349,0.511475,0.012258,0.1992111,,,5
1,d4,ts1,ada_m1,21789.0,870.0,-0.021593,-0.064082,1.032997,0.704245,0.490364,-0.000185,0.3228324,,,5


In [8]:
from datetime import datetime

# Timestamp the output so successive runs do not overwrite each other.
current_datetime = datetime.now().strftime("%d%m%y_%H%M")
# The table is written with to_csv, so the file gets a .csv extension
# (it was previously mislabelled as a notebook).
iteration_name = f"/kaggle/working/treebasedmodel_results_{current_datetime}.csv"

df_results.to_csv(iteration_name, index=False)