In [None]:
import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np
from tqdm import tqdm

def get_cols_inf(df):
    """
    Returns a list of column names that contain positive or negative infinity.

    Only numeric columns are inspected; non-numeric columns (strings,
    datetimes, ...) cannot hold an infinity and are skipped instead of making
    ``np.isinf`` raise on an object array. Columns are checked one at a time,
    so no frame-sized temporary array is created.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - list of column labels, in the same order as ``df.columns``
    """
    cols_with_inf = []
    # ``items()`` walks the columns positionally, so duplicate labels are fine.
    for name, column in df.items():
        if not pd.api.types.is_numeric_dtype(column.dtype):
            continue
        if bool(np.isinf(column).any()):
            cols_with_inf.append(name)
    return cols_with_inf

def get_cols_zerostd(df):
    """
    Returns the names of columns that carry no variation.

    A column qualifies when it has at most one distinct non-NaN value, which
    also includes columns that are entirely NaN (zero distinct values).
    """
    distinct_counts = df.nunique(dropna=True)
    is_constant = distinct_counts.le(1)
    return distinct_counts.index[is_constant.to_numpy()].tolist()

def get_nan_columns(df):
    """
    Returns the names of all columns holding at least one missing value
    (anything ``DataFrame.isna`` reports as missing).
    """
    has_missing = df.isna().any(axis=0)
    return has_missing.index[has_missing.to_numpy()].tolist()

def preprocess_train(train, columns_to_drop=None):
    """
    Clean the training frame and add order-book / trade-flow features.

    Steps:
    1. Report and drop every column that contains +/-inf, contains NaN, or has
       at most one distinct value.
    2. Add ratio, delta and size features built from ``bid_qty``, ``ask_qty``,
       ``buy_qty``, ``sell_qty`` and ``volume``. These columns must survive
       step 1, otherwise a ``KeyError`` is raised.
    3. Drop the caller-supplied ``columns_to_drop``.

    Parameters:
    - train: pandas DataFrame; it is not modified.
    - columns_to_drop: column label(s) to remove at the very end
      (default: nothing).

    Returns:
    - a new DataFrame with the cleaned and engineered columns.
    """
    # None instead of a mutable ``[]`` default.
    final_drops = [] if columns_to_drop is None else columns_to_drop

    #### Preprocessing
    # Identify once at the start
    cols_inf = get_cols_inf(train)
    print("Columns with infinite values:", cols_inf)
    cols_nan = get_nan_columns(train)
    print("Columns with NaN values:", cols_nan)
    cols_zerostd = get_cols_zerostd(train)
    print("Columns with zero standard deviation:", cols_zerostd)

    # Drop all at once. ``drop`` already returns a new frame, so no separate
    # ``copy()`` of the (potentially large) input is needed.
    flagged = set(cols_inf) | set(cols_nan) | set(cols_zerostd)
    df = train.drop(columns=[col for col in train.columns if col in flagged])

    #### Feature Engineering
    bid, ask = df['bid_qty'], df['ask_qty']
    buy, sell = df['buy_qty'], df['sell_qty']
    infinities = [np.inf, -np.inf]

    # A zero denominator would put +/-inf back into a frame that was just
    # screened for infinities; mark those ratios as missing (NaN) instead.
    bidask_ratio = (bid / ask).replace(infinities, np.nan)
    buysell_raw = (buy / sell).replace(infinities, np.nan)

    df = df.assign(
        bidask_ratio=bidask_ratio,
        # No traded volume -> ratio defined as 0 (original convention).
        buysell_ratio=np.where(df['volume'] == 0, 0, buysell_raw),
        bidask_delta=bid - ask,
        buysell_delta=buy - sell,
        buysell_size=buy + sell,
        bidask_size=bid + ask,
    )

    # Final Drop
    df = df.drop(columns=final_drops)
    return df

def evaluate_model(y_true, y_pred, X=None, linear=False, verbose=True):
    """
    General evaluation of regression models.

    Inputs:
        y_true: True target values (1-D array-like)
        y_pred: Predicted target values (1-D array-like, same length)
        X: Feature matrix (optional, for adj_r2 and n_features). A 1-D X is
           counted as a single feature.
        linear: If True, AIC and BIC will be computed (meaningful for linear models only)
        verbose: Print the results

    Outputs (dict):
        n_obs: Number of observations
        n_features: Number of features (None if X is not provided)
        r2: R^2 score
        adj_r2: Adjusted R^2 (NaN unless X is provided and n_obs > n_features + 1)
        rmse: Root Mean Squared Error
        mae: Mean Absolute Error
        medae: Median Absolute Error
        pearson_corr, pearson_pvalue
        aic: Akaike Information Criterion (NaN unless linear=True and X provided)
        bic: Bayesian Information Criterion (NaN unless linear=True and X provided)

    The Spearman correlation is only printed (verbose=True); it is not part of
    the returned dict, so it is not computed for silent calls.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_obs = len(y_true)

    if X is None:
        n_features = None
    else:
        x_shape = np.shape(X)
        n_features = x_shape[1] if len(x_shape) > 1 else 1

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson_corr, pearson_p = pearsonr(y_true, y_pred)

    # Adjusted R^2 needs positive residual degrees of freedom.
    if X is not None and n_obs > n_features + 1:
        adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)
    else:
        adj_r2 = np.nan

    if linear and X is not None:
        n_params = n_features + 1  # +1 for intercept
        rss = np.sum((y_true - y_pred)**2)
        # Both criteria share the n * log(RSS / n) fit term and differ only in
        # the penalty applied to the number of parameters.
        fit_term = n_obs * np.log(rss / n_obs)
        aic = fit_term + 2 * n_params
        bic = fit_term + n_params * np.log(n_obs)
    else:
        aic = np.nan
        bic = np.nan

    results = {
        "n_obs": n_obs,
        "n_features": n_features,
        "r2": r2,
        "adj_r2": adj_r2,
        "rmse": rmse,
        "mae": mae,
        "medae": medae,
        "pearson_corr": pearson_corr,
        "pearson_pvalue": pearson_p,
        "aic": aic,
        "bic": bic,
    }

    if verbose:
        # Rank correlation is report-only, so pay for it only when printing.
        spearman_corr, spearman_p = spearmanr(y_true, y_pred)
        print(f"Observations:            {n_obs}")
        if n_features is not None:
            print(f"Features:                {n_features}")
        print(f"R^2:                     {r2:.5f}")
        print(f"Adjusted R^2:            {adj_r2:.5f}")
        print(f"RMSE:                    {rmse:.5f}")
        print(f"MAE:                     {mae:.5f}")
        print(f"Median Absolute Error:   {medae:.5f}")
        print(f"Pearson Corr:            {pearson_corr:.5f} (p={pearson_p:.3g})")
        print(f"Spearman Corr:           {spearman_corr:.5f} (p={spearman_p:.3g})")
        if linear and X is not None:
            print(f"AIC:                     {aic:.2f}")
            print(f"BIC:                     {bic:.2f}")
    return results

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


# Functions

## Time Series Splitting

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def sklearn_timeseries_split(X, n_splits=5, **kwargs):
    """
    Generator wrapper around scikit-learn's ``TimeSeriesSplit``.

    Parameters:
    - X: data to split (passed to ``TimeSeriesSplit.split``)
    - n_splits: int, number of splits
    - **kwargs: forwarded to the ``TimeSeriesSplit`` constructor

    Yields:
    - (train_index, test_index) pairs as produced by ``TimeSeriesSplit.split``
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, **kwargs)
    yield from splitter.split(X)
        
# Walk-forward validation with an expanding training window
def walk_forward_split(X, initial_train_size, test_size, step_size=1):
    """
    Generate positional index pairs for walk-forward validation.

    The training block always starts at position 0 and ends right before the
    test block, so it grows by ``step_size`` rows from one fold to the next.
    The test block is the ``test_size`` rows that immediately follow it.
    Only ``len(X)`` is used; the contents of ``X`` are never read.

    Parameters:
    - X: any sized object (e.g. a DataFrame or Series ordered by time)
    - initial_train_size: int, size of the first training block
    - test_size: int, number of rows in every test block
    - step_size: int, how far the train/test boundary advances per fold

    Yields:
    - (train_index, test_index): tuple of numpy integer arrays
    """
    total_rows = len(X)
    last_boundary = total_rows - test_size  # latest boundary with a full test block

    for boundary in range(initial_train_size, last_boundary + 1, step_size):
        yield np.arange(0, boundary), np.arange(boundary, boundary + test_size)

## Tree Based Models

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# def fit_decision_tree_regression(X, y, **kwargs):
#     """Decision Tree Regression: greedy MSE splits, high variance, low bias"""
#     model = DecisionTreeRegressor(**kwargs)
#     model.fit(X, y)
#     return model


# def fit_bagged_trees(X, y, base_estimator=None, n_estimators=100, **kwargs):
#     """Bagged Trees: bootstrap averaging to reduce variance"""
#     base = base_estimator or DecisionTreeRegressor()
#     model = BaggingRegressor(base_estimator=base,
#                              n_estimators=n_estimators,
#                              **kwargs)
#     model.fit(X, y)
#     return model


def fit_random_forest(X, y, n_estimators=100, max_features=1.0, **kwargs):
    """
    Random Forest: bagging + random feature subsets for decorrelation.

    Parameters:
    - X, y: training features and target
    - n_estimators: int, number of trees
    - max_features: forwarded to ``RandomForestRegressor``. The default 1.0
      (consider all features) is what the legacy ``'auto'`` setting meant for
      regressors; ``'auto'`` itself is still accepted here and translated,
      because recent scikit-learn releases reject it.
    - **kwargs: forwarded to the ``RandomForestRegressor`` constructor

    Returns:
    - the fitted ``RandomForestRegressor``
    """
    if isinstance(max_features, str) and max_features == 'auto':
        # Legacy alias removed from scikit-learn; same meaning as 1.0 here.
        max_features = 1.0

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        **kwargs
    )
    model.fit(X, y)
    return model


# def fit_extra_trees(X, y, n_estimators=100, max_features='auto', **kwargs):
#     """Extra-Trees: extreme randomness in features and thresholds"""
#     model = ExtraTreesRegressor(
#         n_estimators=n_estimators,
#         max_features=max_features,
#         **kwargs
#     )
#     model.fit(X, y)
#     return model


def fit_adaboost_regression(X, y, n_estimators=50, learning_rate=1.0, base_estimator=None, **kwargs):
    """
    AdaBoost Regression: a sequence of weak learners, by default depth-1
    decision trees (stumps).

    Parameters:
    - X, y: training features and target
    - n_estimators: int, maximum number of boosting rounds
    - learning_rate: float, weight applied to each learner
    - base_estimator: weak learner to boost; defaults to
      ``DecisionTreeRegressor(max_depth=1)`` when None
    - **kwargs: forwarded to the ``AdaBoostRegressor`` constructor

    Returns:
    - the fitted ``AdaBoostRegressor``
    """
    # Explicit None check: truthiness of an estimator object is not reliable
    # (estimators that define ``__len__`` may be falsy or raise).
    weak_learner = DecisionTreeRegressor(max_depth=1) if base_estimator is None else base_estimator

    # scikit-learn renamed the constructor argument from ``base_estimator`` to
    # ``estimator``; use whichever name the installed version exposes.
    supported = AdaBoostRegressor().get_params(deep=False)
    learner_arg = 'estimator' if 'estimator' in supported else 'base_estimator'

    model = AdaBoostRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        **{learner_arg: weak_learner},
        **kwargs
    )
    model.fit(X, y)
    return model


def fit_gradient_boosting(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, **kwargs):
    """
    Fit scikit-learn's ``GradientBoostingRegressor`` on (X, y).

    ``n_estimators``, ``learning_rate``, ``max_depth`` and any extra keyword
    arguments are passed to the constructor. Returns the fitted model.
    """
    settings = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
    )
    settings.update(kwargs)

    gbm = GradientBoostingRegressor(**settings)
    gbm.fit(X, y)
    return gbm


def fit_xgboost(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, reg_lambda=1, reg_alpha=0, **kwargs):
    """
    Fit an ``XGBRegressor`` on (X, y).

    The tree settings (``n_estimators``, ``learning_rate``, ``max_depth``),
    the regularisation settings (``reg_lambda``, ``reg_alpha``) and any extra
    keyword arguments are passed to the constructor. Returns the fitted model.
    """
    settings = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
    )
    settings.update(kwargs)

    booster = XGBRegressor(**settings)
    booster.fit(X, y)
    return booster


def fit_lightgbm_regression(X, y, n_estimators=100, learning_rate=0.1, num_leaves=31, **kwargs):
    """
    Fit an ``LGBMRegressor`` on (X, y).

    ``n_estimators``, ``learning_rate``, ``num_leaves`` and any extra keyword
    arguments are passed to the constructor. Returns the fitted model.
    """
    settings = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
    )
    settings.update(kwargs)

    booster = LGBMRegressor(**settings)
    booster.fit(X, y)
    return booster


def fit_catboost_regression(X, y, iterations=1000, learning_rate=0.1, depth=6, verbose=False, **kwargs):
    """
    Fit a ``CatBoostRegressor`` on (X, y).

    ``iterations``, ``learning_rate``, ``depth``, ``verbose`` (off by default)
    and any extra keyword arguments are passed to the constructor. Returns the
    fitted model.
    """
    settings = dict(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        verbose=verbose,
    )
    settings.update(kwargs)

    booster = CatBoostRegressor(**settings)
    booster.fit(X, y)
    return booster

## Training

In [None]:
def evaluate_grid(X, y, params_data, params_split, params_model, is_linear=False, metric_func=evaluate_model):
    """
    Evaluate every (data window, splitter, model) combination.

    For each data window the rows of ``X``/``y`` whose index lies in
    ``[start, end]`` are selected. Each splitter produces positional
    train/test folds on that window, each model is refit on every fold, and
    the per-fold metrics are averaged.

    Parameters:
    - X: pandas DataFrame of features; its index is compared against the
      window bounds.
    - y: pandas Series of targets. If its index differs from ``X.index`` it is
      aligned to ``X`` by label first, so a different row order cannot
      silently pair features with the wrong target.
    - params_data: dict key -> {"start": ..., "end": ...}
    - params_split: dict key -> {"splitter_func": callable,
      "splitter_args": dict}; the callable is invoked as
      ``splitter_func(X_window, **splitter_args)`` and must yield
      (train_idx, test_idx) positional index pairs.
    - params_model: dict key -> {"model_func": callable, "model_args": dict};
      the callable is invoked as ``model_func(X_train, y_train, **model_args)``
      and must return an object with ``predict``.
    - is_linear: forwarded to ``metric_func`` as ``linear``.
    - metric_func: callable returning a dict of metrics for one fold.

    Returns:
    - list of dicts, one per combination, holding the three keys, the mean of
      every metric over the folds, and ``n_splits``.

    Raises:
    - KeyError / ValueError if ``y`` cannot be aligned row-for-row to ``X``.
    """
    # The window mask below is built from X.index but applied to y as well,
    # which is only valid when both share the same row order.
    if not X.index.equals(y.index):
        y = y.loc[X.index]
        if len(y) != len(X):
            raise ValueError("X and y cannot be aligned row-for-row by index")

    all_results = []

    # Calculate total iterations for tqdm progress bar
    total_iters = len(params_data) * len(params_split) * len(params_model)

    with tqdm(total=total_iters, desc="Total Model Runs") as pbar:
        for data_key, data_val in params_data.items():
            mask = (X.index >= data_val['start']) & (X.index <= data_val['end'])
            X_window = X.loc[mask]
            y_window = y.loc[mask]

            for split_key, split_val in params_split.items():
                splitter_func = split_val['splitter_func']
                splitter_args = split_val['splitter_args']
                # Materialise the folds once so every model sees the same ones.
                splits = list(splitter_func(X_window, **splitter_args))

                for model_key, model_val in params_model.items():
                    model_func = model_val['model_func']
                    model_args = model_val['model_args']

                    split_results = []

                    for i, (train_idx, test_idx) in enumerate(splits):
                        X_train, X_test = X_window.iloc[train_idx], X_window.iloc[test_idx]
                        y_train, y_test = y_window.iloc[train_idx], y_window.iloc[test_idx]

                        model = model_func(X_train, y_train, **model_args)
                        y_pred = model.predict(X_test)
                        metric_dict = metric_func(y_test, y_pred, X=X_test, linear=is_linear, verbose=False)
                        metric_dict['split_num'] = i
                        split_results.append(metric_dict)

                    # Average every metric (everything except the fold number).
                    split_df = pd.DataFrame(split_results)
                    metrics_to_agg = [col for col in split_df.columns if col != "split_num"]
                    overall_results = split_df[metrics_to_agg].mean().to_dict()
                    model_output = {
                        "data_key": data_key,
                        "split_key": split_key,
                        "model_key": model_key,
                        **overall_results,
                        "n_splits": len(split_df),
                    }
                    all_results.append(model_output)
                    pbar.update(1)
    return all_results

# Data

In [None]:
data = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
# Sort the raw frame once, before the target is split off, so that X and y
# share the same row order. Sorting only X afterwards would leave y in file
# order, and evaluate_grid applies a mask built from X.index to y as well.
data = data.sort_index(ascending=True)
y = data['label']
X = preprocess_train(data, columns_to_drop=['label', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty'])
X

# Training

In [None]:
# Evaluation windows. Every window ends on the same minute; only the start
# date (i.e. how much history is included) differs.
params_data = {
    window_key: {
        "start": pd.Timestamp(window_start),
        "end": pd.Timestamp('2024-02-29 23:59:00'),
    }
    for window_key, window_start in (
        ("d1", '2023-03-01 00:00:00'),
        ("d2", '2023-06-01 00:00:00'),
        ("d3", '2023-09-01 00:00:00'),
        ("d4", '2023-12-01 00:00:00'),
    )
}

# Cross-validation schemes: two TimeSeriesSplit variants and two walk-forward
# variants with different block sizes.
params_split = {
    "ts1": dict(
        splitter_func=sklearn_timeseries_split,
        splitter_args=dict(n_splits=5),
    ),
    "ts2": dict(
        splitter_func=sklearn_timeseries_split,
        splitter_args=dict(n_splits=10),
    ),
    "wf1": dict(
        splitter_func=walk_forward_split,
        splitter_args=dict(initial_train_size=500, test_size=100, step_size=100),
    ),
    "wf2": dict(
        splitter_func=walk_forward_split,
        splitter_args=dict(initial_train_size=1000, test_size=200, step_size=200),
    ),
}

# Parameter grid for tree-based models. Only XGBoost and LightGBM are active;
# the other candidates are kept below, commented out.
params_model_tree = {
    # "dt_m1": {"model_func": fit_decision_tree_regression, "model_args": {}},
    # "bag_m1": {"model_func": fit_bagged_trees, "model_args": {"n_estimators": 100}},
    # "rf_m1": {"model_func": fit_random_forest, "model_args": {"n_estimators": 100, "max_features": "sqrt"}},
    # "et_m1": {"model_func": fit_extra_trees, "model_args": {"n_estimators": 100, "max_features": "auto"}},
    # "ada_m1": {"model_func": fit_adaboost_regression, "model_args": {"n_estimators": 50, "learning_rate": 1.0}},
    # "gbm_m1": {"model_func": fit_gradient_boosting, "model_args": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}},
    "xgb_m1": dict(
        model_func=fit_xgboost,
        model_args=dict(n_estimators=100, learning_rate=0.1, max_depth=3),
    ),
    "lgbm_m1": dict(
        model_func=fit_lightgbm_regression,
        model_args=dict(n_estimators=100, learning_rate=0.1),
    ),
    # "cat_m1": {"model_func": fit_catboost_regression, "model_args": {"iterations": 1000, "learning_rate": 0.1, "depth": 6}},
}


# Run every window x splitter x model combination.
results = evaluate_grid(
    X=X,
    y=y,
    params_data=params_data,
    params_split=params_split,
    params_model=params_model_tree,
    is_linear=False,
    metric_func=evaluate_model,
)

In [None]:
# Rank all combinations by their mean Pearson correlation, best first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='pearson_corr', ascending=False)
df_results