# DRW - Tree-Based Approach

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from functions import preprocess_train, evaluate_model

# Train and Test

In [2]:
# Sort once, up front, so the target and the feature matrix are derived from
# the same row order. Previously only X was sorted, which would leave y
# positionally misaligned with X whenever the parquet file is not already
# ordered by its index.
data = pd.read_parquet('data/train.parquet').sort_index(ascending=True)
y = data['label']
X = preprocess_train(data, columns_to_drop=['label', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty'])
# NOTE(review): assumes preprocess_train keeps the row index of `data` — confirm.
X = X.sort_index(ascending=True)
X

Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717', 'X864', 'X867', 'X869', 'X870', 'X871', 'X872']


Unnamed: 0_level_0,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X887,X888,X889,X890,bidask_ratio,buysell_ratio,bidask_delta,buysell_delta,buysell_size,bidask_size
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-01 00:00:00,221.389,0.121263,-0.417690,0.005399,0.125948,0.058359,0.027359,0.035780,0.068219,1.034825,...,0.377630,0.210153,0.159183,0.530636,1.814006,3.921505,6.858,131.421,221.389,23.708
2023-03-01 00:01:00,847.796,0.302841,-0.049576,0.356667,0.481087,0.237954,0.208359,0.217057,0.249624,0.948694,...,0.374515,0.209573,0.158963,0.530269,16.519692,1.633316,36.254,203.896,847.796,40.926
2023-03-01 00:02:00,295.596,0.167462,-0.291212,0.083138,0.206881,0.101727,0.072778,0.081564,0.114166,0.896459,...,0.371424,0.208993,0.158744,0.529901,0.007336,1.167619,-59.808,22.858,295.596,60.692
2023-03-01 00:03:00,460.705,0.072944,-0.436590,-0.102483,0.017551,0.007149,-0.021681,-0.012936,0.019634,0.732634,...,0.368358,0.208416,0.158524,0.529534,0.231490,2.686731,-16.151,210.779,460.705,25.881
2023-03-01 00:04:00,142.818,0.173820,-0.213489,0.096067,0.215709,0.107133,0.078976,0.087818,0.120426,0.763537,...,0.365314,0.207839,0.158304,0.529167,7.869603,2.216115,23.707,54.004,142.818,30.609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-29 23:55:00,94.388,0.020155,0.076565,0.228994,0.288856,0.151634,0.108347,0.088073,0.073729,0.071211,...,0.393726,0.212651,0.136494,0.243172,0.611756,0.705263,-2.642,-16.314,94.388,10.968
2024-02-29 23:56:00,177.372,0.016262,0.062527,0.214072,0.276463,0.146521,0.104164,0.084063,0.069788,0.024066,...,0.390476,0.212063,0.136305,0.243004,0.564317,1.640604,-1.768,43.030,177.372,6.348
2024-02-29 23:57:00,101.252,0.045407,0.109834,0.263577,0.329266,0.174214,0.132940,0.113052,0.098865,-0.057370,...,0.387252,0.211477,0.136117,0.242836,1.438736,2.292427,1.597,39.746,101.252,8.877
2024-02-29 23:58:00,74.560,0.124783,0.244168,0.408704,0.480016,0.251493,0.211727,0.192160,0.178116,0.111335,...,0.384054,0.210892,0.135928,0.242668,1.169353,0.428489,0.830,-29.830,74.560,10.632


# Model Training Workflow
1. **Data Subsampling (Temporal Strategies)**

* **Full history:** Train on all available data.
* **Recent window:** Train on most recent $N$ rows (e.g. last 50k, 100k, etc.).
* **Rolling/Expanding window:** Slide a fixed-size window forward in time; train/validate at each step.
* **Seasonal/Block:** Segment by time periods (e.g. month, quarter, year) and train on specific blocks.
* **Event-based:** Subset around key events/regimes.

2. **Feature Reduction Methods**

* **Filter:** Variance, correlation, missingness, mutual information.
* **Embedded:** Lasso, Elastic Net, Stability Selection.
* **Wrapper:** RFE, SFS.
* **Projection:** PCA, PLS, Truncated SVD.
* **Clustering:** Correlation clustering, hierarchical feature grouping.
* **Domain/Statistical:** Univariate tests, expert filtering.

3. **Training Methods**

* **Linear regression (OLS)**
* **Ridge/Lasso/Elastic Net**
* **Partial Least Squares (PLS)**
* **Tree-based (for comparison):** Decision Tree, Random Forest, XGBoost
* **Regularized/Robust variants:** Bayesian Ridge, Huber, etc.

4. **Cross-Validation for Time Series**

* **Expanding/rolling window CV**
* **Blocked time CV**
* **Walk-forward validation**
* **Purged CV (to avoid leakage)**


| Temporal Split | Feature Reduction | Model Type        | Cross-Validation         |
| -------------- | ----------------- | ----------------- | ------------------------ |
| Recent window  | Filter            | OLS               | Rolling window           |
| Recent window  | Embedded          | Lasso/Elastic Net | Rolling window           |
| Recent window  | Projection        | PLS/PCA + OLS     | Rolling window           |
| Rolling window | Filter            | Ridge             | Rolling window           |
| Rolling window | Embedded          | Lasso/Elastic Net | Rolling window           |
| Seasonal/Block | Filter/Embedded   | OLS/Lasso         | Blocked time-series      |
| Event-based    | Any               | OLS/Lasso/PLS     | Custom event-based split |
| Any            | Clustering        | OLS/Lasso         | Rolling/Expanding        |
| Any            | Domain            | OLS/Lasso         | Walk-forward             |
<!-- Not Possible because no prediction data
| Full history   | Filter            | OLS               | Expanding window         |
| Full history   | Embedded          | Lasso/Elastic Net | Expanding window         |
| Full history   | Projection        | PLS/PCA + OLS     | Expanding window         |
--> 

**Prioritization:**

1. Vary temporal split first (recent vs. full vs. rolling).
2. Within each, iterate over feature reduction (filter, embedded, projection, clustering, domain).
3. For each, try linear, regularized, and projection-based models.
4. Use time-series-appropriate cross-validation throughout.

## Feature Reduction

In [3]:
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def reduction_truncatedsvd(X, n_components):
    """Fit a TruncatedSVD with `n_components` on X.

    Returns a tuple (X projected onto the components, fitted TruncatedSVD).
    """
    reducer = TruncatedSVD(n_components=n_components)
    projected = reducer.fit_transform(X)
    return projected, reducer

def reduction_pca(X, n_components):
    """Fit a PCA with `n_components` on X.

    Returns a tuple (X projected onto the principal components, fitted PCA).
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(X)
    return projected, reducer


def apply_kernel_pca(X, n_components, kernel='rbf'):
    """Fit a KernelPCA (default kernel: 'rbf') with `n_components` on X.

    Returns a tuple (transformed X, fitted KernelPCA).
    """
    reducer = KernelPCA(n_components=n_components, kernel=kernel)
    projected = reducer.fit_transform(X)
    return projected, reducer

def reduction_pls(X, Y, n_components):
    """Supervised reduction: fit a PLSRegression with `n_components` on (X, Y).

    The projection is chosen using Y, so the reduced features are biased
    towards the target and may overfit it.

    Returns a tuple (first element of `fit_transform(X, Y)`, fitted PLSRegression).
    """
    reducer = PLSRegression(n_components=n_components)
    scores = reducer.fit_transform(X, Y)
    # fit_transform is given Y as well; only its first element is kept.
    return scores[0], reducer

def apply_lda(X, y, n_components):
    """Supervised reduction: fit a LinearDiscriminantAnalysis on (X, y).

    The projection is chosen to separate the classes in y, so the reduced
    dimensions are biased towards the target by construction.

    Returns a tuple (transformed X, fitted LinearDiscriminantAnalysis).
    """
    reducer = LinearDiscriminantAnalysis(n_components=n_components)
    projected = reducer.fit_transform(X, y)
    return projected, reducer

from sklearn.feature_selection import mutual_info_classif
from scipy.cluster.hierarchy import linkage, fcluster
import numpy as np

def correlation_clustering(X, threshold=0.9):
    """
    Drop redundant features by clustering them on correlation distance.

    Features are grouped with average-linkage hierarchical clustering on the
    distance ``1 - |corr|``; from every cluster the feature with the lowest
    column position is kept.

    Parameters:
    - X: 2-D numpy array or pandas DataFrame of shape (n_samples, n_features)
    - threshold: distance cut passed to ``fcluster`` (criterion='distance');
      features whose clusters merge at or below it share one group

    Returns:
    - X restricted to the representative columns (DataFrame in, DataFrame out;
      otherwise a numpy array). Columns follow cluster-id order.
    """
    # Local import keeps the function usable on its own.
    from scipy.spatial.distance import squareform

    is_frame = hasattr(X, "iloc")
    values = X.to_numpy() if is_frame else np.asarray(X)
    if values.shape[1] < 2:
        # Nothing to cluster: linkage() needs at least one pairwise distance.
        return X

    corr = np.corrcoef(values.T)
    distance = 1 - np.abs(corr)
    # linkage() treats a square 2-D array as raw observation vectors and would
    # cluster on Euclidean distances between the *rows* of the matrix. Passing
    # the condensed form makes it use the correlation distances themselves.
    condensed = squareform(distance, checks=False)
    linkage_matrix = linkage(condensed, method='average')
    labels = fcluster(linkage_matrix, t=threshold, criterion='distance')
    selected_features = [int(np.flatnonzero(labels == i)[0]) for i in np.unique(labels)]
    if is_frame:
        return X.iloc[:, selected_features]
    return values[:, selected_features]

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

def hierarchical_feature_grouping(X: pd.DataFrame, method='average', threshold=0.7):
    """
    Perform hierarchical clustering on features based on correlation distance,
    and select one representative feature from each cluster.

    Parameters:
    - X: DataFrame of shape (n_samples, n_features)
    - method: linkage method for clustering (e.g., 'average', 'complete')
    - threshold: float in (0, 1). Distance threshold for cluster formation.

    Returns:
    - reduced_X: DataFrame with selected features
    - selected_columns: list of selected feature names
    - cluster_labels: array of cluster labels for each feature
    """
    n_features = X.shape[1]
    if n_features < 2:
        # linkage() needs at least one pairwise distance; every feature is
        # trivially its own representative.
        return X.copy(), list(X.columns), np.ones(n_features, dtype=int)

    # Step 1: Correlation distance as a fresh, writable numpy array.
    # Writing into `DataFrame.values` in place fails under pandas copy-on-write,
    # where that array is read-only; the subtraction below allocates a new one.
    distance = 1.0 - X.corr().abs().to_numpy()
    # Floating-point noise can push |corr| marginally above 1.
    distance = np.clip(distance, 0.0, None)
    np.fill_diagonal(distance, 0)

    # Step 2: Condensed distance matrix for linkage
    condensed_dist = squareform(distance, checks=False)
    linkage_matrix = linkage(condensed_dist, method=method)

    # Step 3: Form flat clusters
    cluster_labels = fcluster(linkage_matrix, t=threshold, criterion='distance')

    # Step 4: Select one representative per cluster (first occurrence)
    selected_columns = [
        X.columns[np.flatnonzero(cluster_labels == cluster_id)[0]]
        for cluster_id in np.unique(cluster_labels)
    ]

    reduced_X = X[selected_columns].copy()
    return reduced_X, selected_columns, cluster_labels

# from sklearn.feature_selection import VarianceThreshold
# selector = VarianceThreshold(threshold=1e-5)
# X_reduced = selector.fit_transform(X)
# X.shape, X_reduced.shape

## Train Test Splitting

In [4]:
# 1. Expanding-window splits via scikit-learn's TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

def sklearn_timeseries_split(X, n_splits=5, **kwargs):
    """Yield (train_idx, test_idx) pairs produced by scikit-learn's TimeSeriesSplit.

    `n_splits` and any extra keyword arguments are forwarded to the
    TimeSeriesSplit constructor.
    """
    yield from TimeSeriesSplit(n_splits=n_splits, **kwargs).split(X)
        
# 2. Walk-forward splits (expanding training window, fixed-size test block)
def walk_forward_split(X, initial_train_size, test_size, step_size=1):
    """
    Lazily produce walk-forward validation folds as positional indices.

    The training set always starts at position 0 and grows by `step_size`
    per fold; the test set is the `test_size` positions right after it.

    Parameters:
    - X: any sized container (only `len(X)` is used)
    - initial_train_size: int, length of the first training set
    - test_size: int, length of every test fold
    - step_size: int, how far the train/test boundary advances per fold

    Yields:
    - (train_index, test_index): tuple of numpy integer arrays
    """
    # Last admissible boundary still leaves room for a full test fold.
    boundaries = range(initial_train_size, len(X) - test_size + 1, step_size)
    for boundary in boundaries:
        yield np.arange(0, boundary), np.arange(boundary, boundary + test_size)

# splits = []
# for train_idx, test_idx in walk_forward_split(X, initial_train_size=500, test_size=100, step_size=100):
#     splits.append((train_idx, test_idx))

# train_idx, test_idx = splits[0]
# X_train = X.iloc[train_idx]
# X_test = X.iloc[test_idx]

## Tree-Based Model Building

**Summary**
Tree-based methods recursively partition the feature space into axis-aligned regions and fit simple models (usually constants) within each region. Ensembles—bagging, boosting, or randomization—combine many such trees to reduce variance (bagging, Random Forest, Extra-Trees) or bias (boosting, AdaBoost, Gradient Boosting, XGBoost, LightGBM, CatBoost). Splits are chosen to minimize a loss (e.g. mean squared error), and regularization is introduced via limiting tree depth, subsampling, shrinkage, or random feature selection.


### 1. Decision Tree Regression

A single tree partitions $\mathbb{R}^p$ into $M$ disjoint regions $\{R_1, \dots, R_M\}$ by choosing splits $(j,s)$ on feature $j$ at threshold $s$ to minimize

$$
\sum_{i\in R_{\text{left}}} (y_i - \bar y_{\text{left}})^2 + \sum_{i\in R_{\text{right}}} (y_i - \bar y_{\text{right}})^2,
$$

where $\bar y_{\text{left}}$ is the mean response in the left node. Prediction for $x$ is the mean of $y_i$ in the leaf containing $x$. Control of complexity is via maximum depth, minimum samples per leaf, or impurity decrease thresholds.

### 2. Bagged Trees

An ensemble of $B$ trees trained on bootstrap samples. Each tree is fully grown (or lightly pruned), and predictions are averaged:

$$
\hat f(x) = \frac1B \sum_{b=1}^B T_b(x).
$$

Bagging reduces variance by averaging uncorrelated tree errors; each tree’s splits still minimize the usual within-node MSE.

### 3. Random Forest Regression

Extends bagging by, at each candidate split, considering a random subset of $m\ll p$ features. This decorrelates trees further, yielding lower ensemble variance. The prediction remains the average of $T_b(x)$.

### 4. Extra-Trees (Extremely Randomized Trees)

Similar to Random Forest but randomizes split thresholds as well as feature selection. At each node, a random subset of features is considered; for each of these features a split point is drawn uniformly between its min and max, and the best of these random splits is kept. This adds bias but further reduces variance and computational cost.

### 5. AdaBoost Regression

A stage-wise additive model that fits small trees (“weak learners,” e.g. depth-1 stumps) to weighted residuals. At iteration $m$, residuals $r_i^{(m)}=y_i - \hat f^{(m-1)}(x_i)$ are fit by $h_m$, and the ensemble updates

$$
\hat f^{(m)}(x) = \hat f^{(m-1)}(x) + \alpha_m\,h_m(x),
$$

where the weights $\alpha_m$ depend on the fit’s error. This focuses subsequent learners on previously mispredicted points.

### 6. Gradient Boosting Machines (GBM)

Generalizes AdaBoost by fitting each learner $h_m(x)$ to the negative gradient of a differentiable loss $\ell(y,\hat f)$ in function space. For squared-error loss, the gradient is simply the residual. A shrinkage parameter $\nu$ scales each update to prevent overfitting:

$$
\hat f^{(m)}(x) = \hat f^{(m-1)}(x) + \nu \,h_m(x).
$$

Regularization via $\nu$, tree depth, and subsampling of rows or features.

### 7. XGBoost

An efficient implementation of GBM that adds regularization on leaf weights. At iteration $m$, it minimizes

$$
\sum_{i}\ell\bigl(y_i,\hat f^{(m-1)}(x_i)+h_m(x_i)\bigr) + \Omega(h_m),
$$

where $\Omega(h)=\gamma T + \tfrac12\lambda\sum_{j=1}^T w_j^2$ penalizes the number of leaves $T$ and the leaf weights $\{w_j\}$. Supports column subsampling and approximate split finding for large data.

### 8. LightGBM (and CatBoost)

LightGBM speeds up GBM by growing trees leaf‐wise (best gain) rather than level‐wise, and using histogram binning for continuous features. CatBoost further addresses categorical variables via ordered target statistics and combats prediction shift. Both implement regularization analogous to XGBoost and offer fast, scalable training.

---

| Model                 | Ensemble Type        | Split Selection               | Loss / Objective                        | Regularization                             | Variance Control           | Bias Control                  |
| --------------------- | -------------------- | ----------------------------- | --------------------------------------- | ------------------------------------------ | -------------------------- | ----------------------------- |
| **Decision Tree**     | Single               | Greedy MSE reduction          | MSE                                     | Depth, min samples, impurity decrease      | Low                        | High (overfits easily)        |
| **Bagged Trees**      | Bagging              | Greedy MSE on bootstraps      | MSE                                     | Tree hyperparams                           | High (averaging)           | Same as base                  |
| **Random Forest**     | Bagging + Randomness | Random feature subsets        | MSE                                     | As bagging + feature subsample             | Higher (decorrelation)     | Same as base                  |
| **Extra-Trees**       | Bagging + Randomness | Random features & thresholds  | MSE                                     | As Random Forest                           | Highest (heavy randomness) | Increased                     |
| **AdaBoost**          | Boosting             | Weighted residuals (stumps)   | Exponential / squared loss              | Shrinkage via stump weight                 | Moderate                   | Lower (focus on hard cases)   |
| **GBM**               | Boosting             | Gradient of loss              | Any differentiable (e.g. MSE)           | Shrinkage, subsample, depth                | Moderate                   | Lower                         |
| **XGBoost**           | Boosting             | Gradient + regularized splits | Any differentiable + $\Omega$ penalty | $\gamma,T,\lambda$, shrinkage, subsample | Moderate                   | Controlled via regularization |
| **LightGBM/CatBoost** | Boosting             | Leaf-wise, histogram, ordered | Same as GBM/XGBoost                     | As XGBoost + categorical handling          | Moderate–High              | Balanced via advanced schemes |

**Key points:**

* **Variance reduction** is achieved by averaging uncorrelated trees (bagging, Random Forest, Extra-Trees).
* **Bias reduction** comes from sequentially fitting residuals (boosting).
* **Regularization** in boosting is multifaceted: shrinkage, tree complexity, subsampling, and explicit penalties on leaf weights.
* **Interpretability** declines as ensembles and randomness increase, though individual tree inspection remains possible.

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


def fit_decision_tree_regression(X, y, **kwargs):
    """Fit a single DecisionTreeRegressor on (X, y) and return it.

    All keyword arguments are forwarded to the estimator's constructor.
    """
    tree = DecisionTreeRegressor(**kwargs)
    tree.fit(X, y)
    return tree


def fit_bagged_trees(X, y, base_estimator=None, n_estimators=100, **kwargs):
    """Bagged Trees: bootstrap averaging to reduce variance.

    Parameters:
    - X, y: training features and targets, passed to `fit`
    - base_estimator: estimator to bag; a default DecisionTreeRegressor when None
    - n_estimators: number of bagged estimators
    - **kwargs: forwarded to the BaggingRegressor constructor

    Returns:
    - the fitted BaggingRegressor
    """
    import inspect

    # Explicit None check: `or` would rely on the estimator's truthiness,
    # which is not a meaningful signal for estimator objects.
    base = DecisionTreeRegressor() if base_estimator is None else base_estimator

    # scikit-learn 1.2 renamed the constructor argument `base_estimator` to
    # `estimator` and 1.4 removed the old name; use whichever one exists.
    ctor_params = inspect.signature(BaggingRegressor.__init__).parameters
    base_kw = "estimator" if "estimator" in ctor_params else "base_estimator"

    model = BaggingRegressor(n_estimators=n_estimators, **{base_kw: base}, **kwargs)
    model.fit(X, y)
    return model


def fit_random_forest(X, y, n_estimators=100, max_features='auto', **kwargs):
    """Random Forest: bagging + random feature subsets for decorrelation.

    Parameters:
    - X, y: training features and targets, passed to `fit`
    - n_estimators: number of trees
    - max_features: features considered per split; the legacy value 'auto'
      is translated to 1.0 (all features)
    - **kwargs: forwarded to the RandomForestRegressor constructor

    Returns:
    - the fitted RandomForestRegressor
    """
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # regressors it meant "use all features", which the float 1.0 expresses
    # in both old and new releases.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 1.0
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        **kwargs
    )
    model.fit(X, y)
    return model


def fit_extra_trees(X, y, n_estimators=100, max_features='auto', **kwargs):
    """Extra-Trees: extreme randomness in features and thresholds.

    Parameters:
    - X, y: training features and targets, passed to `fit`
    - n_estimators: number of trees
    - max_features: features considered per split; the legacy value 'auto'
      is translated to 1.0 (all features)
    - **kwargs: forwarded to the ExtraTreesRegressor constructor

    Returns:
    - the fitted ExtraTreesRegressor
    """
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # regressors it meant "use all features", which the float 1.0 expresses
    # in both old and new releases.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 1.0
    model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        **kwargs
    )
    model.fit(X, y)
    return model


def fit_adaboost_regression(X, y, n_estimators=50, learning_rate=1.0, base_estimator=None, **kwargs):
    """AdaBoost Regression: sequentially boosted weak learners.

    Parameters:
    - X, y: training features and targets, passed to `fit`
    - n_estimators: number of boosting rounds
    - learning_rate: weight applied to each learner's contribution
    - base_estimator: weak learner; a depth-1 DecisionTreeRegressor when None
    - **kwargs: forwarded to the AdaBoostRegressor constructor

    Returns:
    - the fitted AdaBoostRegressor
    """
    import inspect

    # Explicit None check: `or` would rely on the estimator's truthiness,
    # which is not a meaningful signal for estimator objects.
    base = DecisionTreeRegressor(max_depth=1) if base_estimator is None else base_estimator

    # scikit-learn 1.2 renamed the constructor argument `base_estimator` to
    # `estimator` and 1.4 removed the old name; use whichever one exists.
    ctor_params = inspect.signature(AdaBoostRegressor.__init__).parameters
    base_kw = "estimator" if "estimator" in ctor_params else "base_estimator"

    model = AdaBoostRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        **{base_kw: base},
        **kwargs
    )
    model.fit(X, y)
    return model


def fit_gradient_boosting(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, **kwargs):
    """Fit a GradientBoostingRegressor on (X, y) and return it.

    The named hyper-parameters and any extra keyword arguments are forwarded
    to the estimator's constructor.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **kwargs,
    )
    gbm = GradientBoostingRegressor(**hyperparams)
    gbm.fit(X, y)
    return gbm


def fit_xgboost(X, y, n_estimators=100, learning_rate=0.1, max_depth=3, reg_lambda=1, reg_alpha=0, **kwargs):
    """Fit an XGBRegressor on (X, y) and return it.

    The named hyper-parameters (including the `reg_lambda` / `reg_alpha`
    regularisation terms) and any extra keyword arguments are forwarded to
    the estimator's constructor.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        **kwargs,
    )
    booster = XGBRegressor(**hyperparams)
    booster.fit(X, y)
    return booster


def fit_lightgbm_regression(X, y, n_estimators=100, learning_rate=0.1, num_leaves=31, **kwargs):
    """Fit an LGBMRegressor on (X, y) and return it.

    The named hyper-parameters and any extra keyword arguments are forwarded
    to the estimator's constructor.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        **kwargs,
    )
    booster = LGBMRegressor(**hyperparams)
    booster.fit(X, y)
    return booster


def fit_catboost_regression(X, y, iterations=1000, learning_rate=0.1, depth=6, verbose=False, **kwargs):
    """Fit a CatBoostRegressor on (X, y) and return it.

    The named hyper-parameters (training output is silenced by default via
    `verbose=False`) and any extra keyword arguments are forwarded to the
    estimator's constructor.
    """
    hyperparams = dict(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        verbose=verbose,
        **kwargs,
    )
    booster = CatBoostRegressor(**hyperparams)
    booster.fit(X, y)
    return booster

In [6]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# model = fit_random_forest(X=X_train, y=y_train, max_features="sqrt")
# y_pred = model.predict(X_test)
# evaluate_model(y_test=y_test, y_pred=y_pred)

In [7]:
# from functions import plot_actual_vs_pred

# plot_actual_vs_pred(y_train, y_test, y_pred)

In [8]:
# pd.concat([pd.DataFrame(y_test).reset_index(drop=True).rename(columns={'label': 'Actual'}), pd.DataFrame(y_pred).rename(columns={0: 'Predicted'})], axis=1).plot(figsize=(12, 6), title='Actual vs Predicted', grid=True)

## Prediction

In [11]:
# Training windows: name -> inclusive [start, end] timestamp bounds.
# Only the full-history window is active; shorter windows are kept for reference.
params_data = {
    "d1": dict(
        start=pd.Timestamp('2023-03-01 00:00:00'),
        end=pd.Timestamp('2024-02-29 23:59:00'),
    ),
    # "d2": dict(start=pd.Timestamp('2023-06-01 00:00:00'), end=pd.Timestamp('2024-02-29 23:59:00')),
    # "d3": dict(start=pd.Timestamp('2023-09-01 00:00:00'), end=pd.Timestamp('2024-02-29 23:59:00')),
    # "d4": dict(start=pd.Timestamp('2023-12-01 00:00:00'), end=pd.Timestamp('2024-02-29 23:59:00')),
}

# Cross-validation schemes: name -> splitter callable plus its keyword arguments.
# Only the 5-fold TimeSeriesSplit is active; the rest are kept for reference.
params_split = {
    "ts1": dict(
        splitter_func=sklearn_timeseries_split,
        splitter_args=dict(n_splits=5),
    ),
    # "ts2": dict(splitter_func=sklearn_timeseries_split, splitter_args=dict(n_splits=10)),
    # "wf1": dict(
    #     splitter_func=walk_forward_split,
    #     splitter_args=dict(initial_train_size=500, test_size=100, step_size=100),
    # ),
    # "wf2": dict(
    #     splitter_func=walk_forward_split,
    #     splitter_args=dict(initial_train_size=1000, test_size=200, step_size=200),
    # ),
}

# Parameter grid for tree-based models
# Tree-based model grid: name -> fitting function plus its keyword arguments.
# Only the random forest is active; the other candidates are kept for reference.
params_model_tree = {
    # "dt_m1": dict(model_func=fit_decision_tree_regression, model_args={}),
    # "bag_m1": dict(model_func=fit_bagged_trees, model_args={"n_estimators": 100}),
    "rf_m1": dict(
        model_func=fit_random_forest,
        model_args=dict(n_estimators=100, max_features="sqrt"),
    ),
    # "et_m1": dict(model_func=fit_extra_trees,
    #               model_args={"n_estimators": 100, "max_features": "auto"}),
    # "ada_m1": dict(model_func=fit_adaboost_regression,
    #                model_args={"n_estimators": 50, "learning_rate": 1.0}),
    # "gbm_m1": dict(model_func=fit_gradient_boosting,
    #                model_args={"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}),
    # "xgb_m1": dict(model_func=fit_xgboost,
    #                model_args={"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}),
    # "lgbm_m1": dict(model_func=fit_lightgbm_regression,
    #                 model_args={"n_estimators": 100, "learning_rate": 0.1}),
    # "cat_m1": dict(model_func=fit_catboost_regression,
    #                model_args={"iterations": 1000, "learning_rate": 0.1, "depth": 6}),
}

In [12]:
from evaluate import evaluate_model
from tqdm import tqdm

def evaluate_grid(X, y, params_data, params_split, params_model, is_linear=True, metric_func=evaluate_model):
    """
    Run every (data window, splitter, model) combination and average the
    per-split metrics.

    Parameters:
    - X: DataFrame of features; its index is compared against the
      'start'/'end' bounds of each data window
    - y: Series of targets carrying the same index labels as X
    - params_data: dict name -> {'start': ..., 'end': ...} (inclusive bounds)
    - params_split: dict name -> {'splitter_func': callable, 'splitter_args': dict};
      splitter_func(X_window, **splitter_args) must yield
      (train_idx, test_idx) positional index arrays
    - params_model: dict name -> {'model_func': callable, 'model_args': dict};
      model_func(X_train, y_train, **model_args) must return an object
      exposing `.predict`
    - is_linear: forwarded to metric_func as `linear`
    - metric_func: called as
      metric_func(y_test, y_pred, X=X_test, linear=is_linear, verbose=False);
      must return a dict of metrics

    Returns:
    - list of dicts, one per combination, with the keys 'data_key',
      'split_key', 'model_key', the mean of every metric over the splits,
      and 'n_splits'
    """
    # The window mask below is positional, so X and y must share the same row
    # order. Align y to X by label when they differ, instead of silently
    # pairing features with the wrong targets.
    if not y.index.equals(X.index):
        y = y.reindex(X.index)

    all_results = []

    # Calculate total iterations for tqdm progress bar
    total_iters = len(params_data) * len(params_split) * len(params_model)

    with tqdm(total=total_iters, desc="Total Model Runs") as pbar:
        for data_key, data_val in params_data.items():
            mask = (X.index >= data_val['start']) & (X.index <= data_val['end'])
            X_window = X.loc[mask]
            y_window = y.loc[mask]

            for split_key, split_val in params_split.items():
                splitter_func = split_val['splitter_func']
                splitter_args = split_val['splitter_args']
                # Materialise once so every model is scored on identical folds.
                splits = list(splitter_func(X_window, **splitter_args))

                for model_key, model_val in params_model.items():
                    model_func = model_val['model_func']
                    model_args = model_val['model_args']

                    split_results = []

                    for i, (train_idx, test_idx) in enumerate(splits):
                        X_train, X_test = X_window.iloc[train_idx], X_window.iloc[test_idx]
                        y_train, y_test = y_window.iloc[train_idx], y_window.iloc[test_idx]

                        model = model_func(X_train, y_train, **model_args)
                        y_pred = model.predict(X_test)
                        metric_dict = metric_func(y_test, y_pred, X=X_test, linear=is_linear, verbose=False)
                        metric_dict['split_num'] = i
                        split_results.append(metric_dict)

                    split_df = pd.DataFrame(split_results)
                    # 'split_num' is bookkeeping only and must not be averaged.
                    metrics_to_agg = [col for col in split_df.columns if col != "split_num"]
                    overall_results = split_df[metrics_to_agg].mean().to_dict()
                    model_output = {
                        "data_key": data_key,
                        "split_key": split_key,
                        "model_key": model_key,
                        **overall_results,
                        "n_splits": len(split_df),
                    }
                    all_results.append(model_output)
                    pbar.update(1)
    return all_results

# The grid below contains tree ensembles only, so the models are not linear;
# evaluate_grid defaults to is_linear=True, which was being passed through.
# NOTE(review): assumes evaluate_model accepts linear=False — confirm.
results = evaluate_grid(
    X=X, 
    y=y, 
    params_data=params_data, 
    params_split=params_split, 
    params_model=params_model_tree, 
    is_linear=False,
    metric_func=evaluate_model
)

Total Model Runs:   0%|          | 0/1 [38:55<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Rank the grid results, highest Pearson correlation first.
results_frame = pd.DataFrame(results)
df_results = results_frame.sort_values('pearson_corr', ascending=False)
df_results

Unnamed: 0,data_key,split_key,model_key,n_obs,n_features,r2,adj_r2,rmse,mae,medae,pearson_corr,pearson_pvalue,aic,bic,n_splits
0,d1,ts1,ols_m1,87647.0,870.0,-37773510000.0,-38152220000.0,92268.965887,67485.939444,51282.61952,0.032965,6.343861e-09,654129.044145,662299.958435,5


In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
import os
os.makedirs('results', exist_ok=True)
df_results.to_csv('results/treemodels_regression_results.csv', index=False)