In [1]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
# Show the USER environment variable. load_data() prepends "." to the data
# paths when this is set to a non-empty value, and the feature-selection class
# switches to its GPU branch when it is unset.
os.environ.get("USER")

# DRW - Crypto Market Prediction

This notebook documents every step taken in this project.

Timeline:
- 10/06/25: 0.05031
    - Reorganize notebooks.
    - Test training on GPU - much faster than on CPU.
    - Implement feature elimination using the GPU.
    - Test linear models - iterating with them is extremely slow.
    - Develop feature engineering pipeline
- 14/06/25
    - Redevelop the feature engineering pipeline - write its results to downloadable files.

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr
import polars as pl
import numpy as np
from tqdm import tqdm
from datetime import datetime

# Named lower-bound cutoffs, stored as polars datetime expressions. They are
# used to keep only the rows at or after a cutoff, e.g.
# `pl.col("timestamp") >= train_splits["last_2w"]`.
# "full" (2023-03-01) matches the earliest timestamp shown in the printed
# training frame; the later keys select progressively shorter trailing windows.
train_splits = {
    "full" : pl.datetime(2023, 3, 1, 0, 0, 0),
    "last_9m" : pl.datetime(2023, 6, 1, 0, 0, 0),
    "last_6m" : pl.datetime(2023, 9, 1, 0, 0, 0),
    "last_3m" : pl.datetime(2023, 12, 1, 0, 0, 0),
    "last_1m": pl.datetime(2024, 2, 1, 0, 0, 0),
    "last_2w": pl.datetime(2024, 2, 15, 0, 0, 0),
}

# Absolute locations of the competition files inside the Kaggle input mount.
# load_data() prefixes the train/test paths with "." when USER is set, turning
# them into paths relative to the working directory.
PATHS = {
    "TRAIN_PATH" :"/kaggle/input/drw-crypto-market-prediction/train.parquet",
    "TEST_PATH" : "/kaggle/input/drw-crypto-market-prediction/test.parquet",
    "SUBMISSION_PATH" : "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv",
}

# features = []

def load_data(TRAIN_PATH: str, TEST_PATH: str):
    """
    Read the train and test parquet files into polars DataFrames.

    When the USER environment variable is set to a non-empty value, both paths
    are prefixed with "." so they resolve relative to the working directory.

    Returns
    -------
    (train_data, test_data)
        The training frame sorted ascending by "timestamp", and the test frame
        in its on-disk row order.
    """
    prefix = "." if os.environ.get("USER") else ""

    train_frame = pl.read_parquet(prefix + TRAIN_PATH).sort("timestamp", descending=False)
    test_frame = pl.read_parquet(prefix + TEST_PATH)
    return train_frame, test_frame

# Load both frames from the configured paths and show the training frame.
train_data, test_data = load_data(PATHS["TRAIN_PATH"], PATHS["TEST_PATH"])
print(train_data)

shape: (525_887, 897)
┌─────────┬─────────┬─────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────────┐
│ bid_qty ┆ ask_qty ┆ buy_qty ┆ sell_qty ┆ … ┆ X889     ┆ X890     ┆ label    ┆ timestamp    │
│ ---     ┆ ---     ┆ ---     ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---          │
│ f64     ┆ f64     ┆ f64     ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ datetime[ns] │
╞═════════╪═════════╪═════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════════╡
│ 15.283  ┆ 8.425   ┆ 176.405 ┆ 44.984   ┆ … ┆ 0.159183 ┆ 0.530636 ┆ 0.562539 ┆ 2023-03-01   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆          ┆ 00:00:00     │
│ 38.59   ┆ 2.336   ┆ 525.846 ┆ 321.95   ┆ … ┆ 0.158963 ┆ 0.530269 ┆ 0.533686 ┆ 2023-03-01   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆          ┆ 00:01:00     │
│ 0.442   ┆ 60.25   ┆ 159.227 ┆ 136.369  ┆ … ┆ 0.158744 ┆ 0.529901 ┆ 0.546505 ┆ 2023-03-01   │
│         ┆         ┆       

# 1 Data

## 1.1 Pre-processing / Feature Engineering

**Pre-Processing**
1. inf/-inf columns: `['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']`
2. columns with NaN values: `[]`
3. 0 std columns : `['X864', 'X867', 'X869', 'X870', 'X871', 'X872']`


**Feature Engineering**
1. `bidask_ratio`
2. `buysell_ratio`
3. `bidask_delta`
4. `buysell_delta`
5. `buysell_size`
6. `bidask_size`

In [4]:
def get_cols_inf(df: pl.DataFrame) -> list[str]:
    """
    Return the names of columns that contain any positive or negative infinity.

    Only floating-point columns are inspected: other dtypes cannot hold an
    infinite value, so they are skipped up front instead of relying on a
    blanket `except Exception` that would also hide unrelated errors.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to scan.

    Returns
    -------
    list[str]
        Column names, in frame order, with at least one +inf or -inf entry.
    """
    cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        if not dtype.is_float():
            continue
        # Series.is_infinite() -> Boolean Series; .any() -> Python bool
        if df[col].is_infinite().any():
            cols.append(col)
    return cols

def get_nan_columns(df: pl.DataFrame) -> list[str]:
    """
    Return the names of columns that contain any null or NaN value.

    polars distinguishes null (missing) from floating-point NaN, so both are
    checked: nulls on every column, NaN on float columns only (`is_nan()` is
    not defined for non-float dtypes).

    Parameters
    ----------
    df : pl.DataFrame
        Frame to scan.

    Returns
    -------
    list[str]
        Column names, in frame order, with at least one null or NaN entry.
    """
    cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        series = df[col]
        has_missing = series.null_count() > 0
        if not has_missing and dtype.is_float():
            # e.g. 0 / 0 in a ratio feature yields NaN, which is_null() misses
            has_missing = bool(series.is_nan().any())
        if has_missing:
            cols.append(col)
    return cols

def get_cols_zerostd(df: pl.DataFrame) -> list[str]:
    """
    List the numeric columns that carry no variation.

    A column is reported when its standard deviation equals zero or when
    `std()` yields None. Columns with a non-numeric dtype (e.g. datetime)
    are ignored.
    """
    constant_cols = []
    for name, dtype in zip(df.columns, df.dtypes):
        # std() is only attempted on numeric dtypes
        if not dtype.is_numeric():
            continue
        spread = df[name].std()
        if spread is None or spread == 0.0:
            constant_cols.append(name)
    return constant_cols


def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    """
    Append six derived columns built from the quantity columns.

    Added columns (in this order):
      - bidask_ratio  = bid_qty / ask_qty
      - buysell_ratio = 0 where volume == 0, otherwise buy_qty / sell_qty
      - bidask_delta  = bid_qty - ask_qty
      - buysell_delta = buy_qty - sell_qty
      - buysell_size  = buy_qty + sell_qty
      - bidask_size   = bid_qty + ask_qty

    Apart from the volume == 0 case, the ratios are plain divisions with no
    guard on the denominator.
    """
    bid, ask = pl.col("bid_qty"), pl.col("ask_qty")
    buy, sell = pl.col("buy_qty"), pl.col("sell_qty")

    no_volume = pl.col("volume") == 0
    guarded_buysell_ratio = pl.when(no_volume).then(0).otherwise(buy / sell)

    derived = [
        (bid / ask).alias("bidask_ratio"),
        guarded_buysell_ratio.alias("buysell_ratio"),
        (bid - ask).alias("bidask_delta"),
        (buy - sell).alias("buysell_delta"),
        (buy + sell).alias("buysell_size"),
        (bid + ask).alias("bidask_size"),
    ]
    return df.with_columns(derived)
def preprocess_train(train: pl.DataFrame, columns_to_drop: list[str] = []) -> tuple[pl.DataFrame, list[str]]:
    """
    Add the engineered features to the training frame, then drop unusable columns.

    Steps:
    1. Apply `feature_engineering`.
    2. Detect columns containing infinite values, NaN/null values, or zero
       standard deviation (each list is printed).
    3. Drop those columns together with the caller-supplied `columns_to_drop`
       (e.g. the label or raw order-book columns).

    Parameters
    ----------
    train : pl.DataFrame
        Raw training frame; it is not modified.
    columns_to_drop : list[str]
        Extra columns to remove. The default list is only read, never mutated.

    Returns
    -------
    (df, drop_columns)
        The reduced frame and the de-duplicated list of dropped column names.
        The list keeps a stable order (infinite, NaN, zero-std, then
        user-specified) so repeated runs return the same list.
    """
    df = train.clone()

    df = feature_engineering(df)

    #### Preprocessing
    cols_inf = get_cols_inf(df)
    print("Columns with infinite values:", cols_inf)

    cols_nan = get_nan_columns(df)
    print("Columns with NaN values:", cols_nan)

    cols_zerostd = get_cols_zerostd(df)
    print("Columns with zero standard deviation:", cols_zerostd)

    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
    # set would make the returned order vary from run to run.
    drop_columns = list(dict.fromkeys([*cols_inf, *cols_nan, *cols_zerostd, *columns_to_drop]))
    if drop_columns:
        df = df.drop(drop_columns)
    return df, drop_columns

def preprocess_test(test: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    """
    Apply the feature engineering step to the test frame and remove the given columns.

    Unlike the training counterpart, no column detection happens here: only
    the columns named in `columns_to_drop` are dropped, and that list is
    printed afterwards.
    """
    engineered = feature_engineering(test.clone())
    reduced = engineered.drop(columns_to_drop)
    print("Columns dropped from test set:", columns_to_drop)
    return reduced

# 2 Feature Selection Model

In [13]:
import pandas as pd
from sklearn.feature_selection import (
    VarianceThreshold, SelectKBest, SelectPercentile,
    GenericUnivariateSelect, SelectFpr, SelectFdr, SelectFwe,
    RFE, RFECV, SelectFromModel, SequentialFeatureSelector,
    f_regression, mutual_info_regression
)
from sklearn.decomposition import (
    PCA, IncrementalPCA, TruncatedSVD,
    FastICA, SparsePCA, MiniBatchSparsePCA,
    DictionaryLearning, MiniBatchDictionaryLearning,
    FactorAnalysis, NMF, LatentDirichletAllocation
)
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from tqdm.auto import tqdm
from xgboost import XGBRegressor
from sklearn.base import clone

class SklearnFeatureEngineeringRegression:
    """
    Run a battery of feature-selection / decomposition methods on one
    regression dataset and record, per method, a 0/1 mask over the features.

    Every private `_method` returns a `pd.Series` indexed by feature name with
    1 for "selected" and 0 otherwise. `run()` executes the requested methods
    and stacks the masks into `results_df`, which the `get_*` helpers
    summarise.
    """

    def __init__(self, X: pd.DataFrame, y: pd.Series):
        """
        Parameters
        ----------
        X : pd.DataFrame
            Predictor matrix (n_samples × P features)
        y : pd.Series
            Target vector (n_samples,)

        Side effects: prints a banner and fits `self.tree_model` (an
        XGBRegressor) on the full data during construction.
        """
        self.X = X.copy()
        self.y = y.copy()
        self.X_np = X.to_numpy()
        self.y_np = y.to_numpy()
        self.features = X.columns.tolist()
        self.results_df: pd.DataFrame = pd.DataFrame()
        # NOTE(review): GPU mode is inferred from USER being unset; presumably
        # that is how the hosted GPU session differs from a local run - confirm.
        self.use_gpu = os.environ.get("USER") is None

        if self.use_gpu:
            print("*" * 20 + "GPU " * 5 + "*" * 20)
            import cupy as cp
            # Keep the arrays on the device so the initial XGBoost fit avoids a copy.
            self.X_np = cp.asarray(self.X_np)
            self.y_np = cp.asarray(self.y_np)
            self.tree_model = XGBRegressor(
                tree_method ="hist", device="cuda",
                n_estimators=10, max_depth=10,
                learning_rate=0.1, random_state=42, n_jobs=-1
            )
        else:
            print("*" * 20 + "CPU " * 5 + "*" * 20)
            self.tree_model = XGBRegressor(
                tree_method="hist", n_estimators=10, max_depth=10,
                learning_rate=0.05, random_state=42, n_jobs=-1
            )
        print(f"""SklearnFeatureEngineering Instantiated""")
        print(f"... Fitting {self.tree_model.__class__.__name__} on {self.X.shape[0]} rows and {self.X.shape[1]} features.")
        self.tree_model.fit(self.X_np, self.y_np)

    # ---------- Shared helpers ----------
    def _host_arrays(self):
        """Return (X, y) as host arrays, copying back from the GPU when needed."""
        if self.use_gpu:
            return self.X_np.get(), self.y_np.get()
        return self.X_np, self.y_np

    def _support_mask(self, selector) -> pd.Series:
        """Convert a fitted selector's boolean support into a 0/1 Series over the features."""
        return pd.Series(selector.get_support(), index=self.features).astype(int)

    def _decomp_mask(self, load: pd.DataFrame) -> pd.Series:
        """
        Turn a (features × components) loading matrix into a 0/1 mask.

        A feature is kept when its largest absolute loading on any component is
        at least the mean absolute loading over the whole matrix.
        """
        magnitudes = load.abs()
        thresh = magnitudes.mean().mean()
        return (magnitudes.max(axis=1) >= thresh).astype(int)

    def _components_mask(self, fitted) -> pd.Series:
        """Build the loading matrix from a fitted estimator's `components_` and mask it."""
        components = fitted.components_
        # SparseRandomProjection stores components_ as a SciPy sparse matrix,
        # which the DataFrame constructor does not accept directly.
        if hasattr(components, "toarray"):
            components = components.toarray()
        load = pd.DataFrame(components.T, index=self.features)
        return self._decomp_mask(load)

    # ---------- Filter methods ----------
    # VarianceThreshold is a filter method that removes features with low variance.
    def _variance_threshold(self, thresh: float = 0.0) -> pd.Series:
        return self._support_mask(VarianceThreshold(threshold=thresh).fit(self.X))

    # SelectKBest and SelectPercentile are filter methods that select features based on univariate statistical tests.
    def _select_kbest_freg(self, k: int = 10) -> pd.Series:
        sel = SelectKBest(score_func=f_regression, k=min(k, self.X.shape[1]))
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_kbest_mutualinfo(self, k: int = 10) -> pd.Series:
        sel = SelectKBest(score_func=mutual_info_regression, k=min(k, self.X.shape[1]))
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_percentile_freg(self, p: float = 10) -> pd.Series:
        sel = SelectPercentile(score_func=f_regression, percentile=min(p, 100))
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_percentile_mutualinfo(self, p: float = 10) -> pd.Series:
        sel = SelectPercentile(score_func=mutual_info_regression, percentile=min(p, 100))
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_fpr_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFpr(score_func=f_regression, alpha=alpha)
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_fdr_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFdr(score_func=f_regression, alpha=alpha)
        return self._support_mask(sel.fit(self.X, self.y))

    def _select_fwe_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFwe(score_func=f_regression, alpha=alpha)
        return self._support_mask(sel.fit(self.X, self.y))

    # ---------- Wrapper and Embedded methods ----------
    def _rfe_xgb(self, n_features: int = 10) -> pd.Series:
        """
        Recursive Feature Elimination with a fresh clone of self.tree_model.
        """
        # clone preserves GPU/CPU config + hyper-parameters
        estimator = clone(self.tree_model)
        sel = RFE(estimator, n_features_to_select=min(n_features, self.X.shape[1]))
        X_fit, y_fit = self._host_arrays()
        return self._support_mask(sel.fit(X_fit, y_fit))

    def _rfecv_xgb(self, cv: int = 5) -> pd.Series:
        """
        RFECV with a fresh clone of self.tree_model.
        """
        estimator = clone(self.tree_model)
        sel = RFECV(estimator, cv=cv, scoring="neg_mean_squared_error")
        X_fit, y_fit = self._host_arrays()
        return self._support_mask(sel.fit(X_fit, y_fit))

    def _select_from_model_tree(self, threshold="median") -> pd.Series:
        """
        Uses the pre-fitted self.tree_model via prefit=True.
        """
        sel = SelectFromModel(self.tree_model, threshold=threshold, prefit=True)
        return self._support_mask(sel)

    def _sequential_tree(
        self,
        n_features: int = 10,
        direction: str = "forward",
        cv: int = 5,
        n_jobs: int = -1
    ) -> pd.Series:
        """
        SequentialFeatureSelector with XGBRegressor.

        - Clones self.tree_model to preserve GPU/CPU config.
        - direction: 'forward' or 'backward'
        - cv: number of cross-validation folds
        - n_jobs: parallel jobs for CV
        """
        # ensure we don't modify the original fitted model
        estimator = clone(self.tree_model)

        sfs = SequentialFeatureSelector(
            estimator,
            n_features_to_select=min(n_features, self.X.shape[1]),
            direction=direction,
            cv=cv,
            n_jobs=n_jobs
        )
        X_fit, y_fit = self._host_arrays()
        return self._support_mask(sfs.fit(X_fit, y_fit))

    # ---------- Decomposition methods ----------
    def _pca(self, n_components: float = 10, svd_solver : str = "covariance_eigh") -> pd.Series:
        # NOTE(review): `svd_solver` is accepted for interface compatibility but
        # is not forwarded to PCA (the original never forwarded it either);
        # "covariance_eigh" is only accepted by recent scikit-learn releases,
        # so forwarding it could raise on older installs - confirm before wiring it in.
        return self._components_mask(PCA(n_components=n_components).fit(self.X))

    def _incremental_pca(self, n_components: float = 10) -> pd.Series:
        return self._components_mask(IncrementalPCA(n_components=n_components).fit(self.X))

    def _truncated_svd(self, n_components: int = 10) -> pd.Series:
        k = min(n_components, self.X.shape[1])
        return self._components_mask(TruncatedSVD(n_components=k).fit(self.X))

    def _fast_ica(self, n_components: int = 10) -> pd.Series:
        ic = FastICA(n_components=min(n_components, self.X.shape[1]), max_iter=200)
        return self._components_mask(ic.fit(self.X))

    def _sparse_pca(self, n_components: int = 10) -> pd.Series:
        spca = SparsePCA(n_components=min(n_components, self.X.shape[1]), alpha=1, max_iter=1000)
        return self._components_mask(spca.fit(self.X))

    def _minibatch_sparse_pca(self, n_components: int = 10) -> pd.Series:
        mbspca = MiniBatchSparsePCA(n_components=min(n_components, self.X.shape[1]), alpha=1)
        return self._components_mask(mbspca.fit(self.X))

    def _dict_learning(self, n_components: int = 10) -> pd.Series:
        dl = DictionaryLearning(n_components=min(n_components, self.X.shape[1]), alpha=1, max_iter=1000)
        return self._components_mask(dl.fit(self.X))

    def _minibatch_dict_learning(self, n_components: int = 10) -> pd.Series:
        mbdl = MiniBatchDictionaryLearning(n_components=min(n_components, self.X.shape[1]), alpha=1)
        return self._components_mask(mbdl.fit(self.X))

    def _factor_analysis(self, n_components: int = 10) -> pd.Series:
        fa = FactorAnalysis(n_components=min(n_components, self.X.shape[1]))
        return self._components_mask(fa.fit(self.X))

    def _nmf(self, n_components: int = 10) -> pd.Series:
        nmf = NMF(n_components=min(n_components, self.X.shape[1]), init='nndsvda', max_iter=500)
        # NMF needs non-negative input, so negative values are clipped to 0.
        data_pos = self.X.clip(lower=0)
        return self._components_mask(nmf.fit(data_pos))

    def _lda(self, n_components: int = 10) -> pd.Series:
        lda = LatentDirichletAllocation(n_components=min(n_components, self.X.shape[1]), max_iter=5)
        # Same non-negativity clipping as for NMF.
        data_pos = self.X.clip(lower=0)
        return self._components_mask(lda.fit(data_pos))

    # ---- Random projections ----
    def _gaussian_random_projection(self, n_components: int = 10) -> pd.Series:
        rp = GaussianRandomProjection(n_components=min(n_components, self.X.shape[1]))
        return self._components_mask(rp.fit(self.X))

    def _sparse_random_projection(self, n_components: int = 10) -> pd.Series:
        srp = SparseRandomProjection(n_components=min(n_components, self.X.shape[1]))
        return self._components_mask(srp.fit(self.X))

    # ---- Feature grouping ----
    def _feature_agglomeration(self, n_clusters: int = 10) -> pd.Series:
        """
        Cluster the features and keep one representative per cluster: the
        member with the highest absolute correlation to the cluster's pooled
        component.
        """
        agg = FeatureAgglomeration(n_clusters=min(n_clusters, self.X.shape[1]))
        agg.fit(self.X)
        labels       = agg.labels_
        cluster_data = agg.transform(self.X)

        # Temporary frame holding one pooled column per cluster
        df_clust = pd.DataFrame(
            cluster_data,
            columns=[f"clus_{i}" for i in range(cluster_data.shape[1])],
            index=self.X.index
        )

        mask = pd.Series(0, index=self.features)
        for j in range(cluster_data.shape[1]):
            members = [f for f, l in zip(self.features, labels) if l == j]
            corrs   = self.X[members].corrwith(df_clust.iloc[:, j]).abs().dropna()
            # A cluster whose correlations are all undefined (e.g. constant
            # members) has no usable representative; skip it rather than
            # letting idxmax fail or write a NaN label into the mask.
            if corrs.empty:
                continue
            mask.loc[corrs.idxmax()] = 1

        return mask

    def run(self, methods=None) -> None:
        """
        Execute each selection/decomposition method with a progress bar,
        accepting either:
          - a dict mapping method names to parameter dicts, or
          - a list of method names (no parameters).

        Assembles `results_df` with columns ['model', features..., 'total_score'].
        A method that raises is reported via print and skipped; the remaining
        methods still run.

        Raises
        ------
        ValueError
            If `methods` is not None, a list, or a dict.
        """
        # Imported here because the notebook's import cells do not provide `time`.
        import time

        # default methods with default params
        default_list = [
            '_variance_threshold',
            '_select_kbest_freg','_select_percentile_freg',
            '_select_kbest_mutualinfo','_select_percentile_mutualinfo',
            '_select_fpr_freg','_select_fdr_freg','_select_fwe_freg',
            '_rfe_xgb','_rfecv_xgb',
            '_select_from_model_tree','_sequential_tree',
            '_pca','_incremental_pca','_truncated_svd','_fast_ica',
            '_sparse_pca','_minibatch_sparse_pca','_dict_learning',
            '_minibatch_dict_learning','_factor_analysis','_nmf','_lda',
            '_gaussian_random_projection','_sparse_random_projection',
            '_feature_agglomeration'
        ]
        if methods is None:
            methods = {m: {} for m in default_list}
        elif isinstance(methods, list):
            methods = {m: {} for m in methods}
        elif not isinstance(methods, dict):
            raise ValueError("`methods` must be None, list, or dict")

        records = []
        for m, params in tqdm(methods.items(), desc='Running feature selection'):
            try:
                if not hasattr(self, m):
                    raise KeyError(f"Method {m} not found in class")

                start = time.time()

                fn = getattr(self, m)
                mask = fn(**params)
                records.append({'model': m.lstrip('_'), **mask.to_dict(), 'total_score': int(mask.sum())})
                end = time.time()
                print(f"""Function : `{m.lstrip('_')}` completed. Time taken : {(end - start) / 60} minutes.""")
            except Exception as e:
                # Best effort by design: one failing method must not abort the sweep.
                print(f"Error running method {m}: {e}")
        self.results_df = pd.DataFrame.from_records(records)

    def get_results(self) -> pd.DataFrame:
        """Return the raw selection matrix (n_models × (P + 2))."""
        return self.results_df

    def get_top_features(self, N: int) -> pd.DataFrame:
        """
        Aggregate across models (vote count = sum of 1's per feature),
        and return the top N features in a one-column frame named 'score'.
        Returns an empty 'score' frame when no results have been recorded.
        """
        if self.results_df.empty:
            return pd.DataFrame({"score": pd.Series(dtype="int64")})
        votes = self.results_df.loc[:, self.features].sum().sort_values(ascending=False)
        return votes.to_frame("score").head(N)

    def get_model_featuresincluded(self) -> pd.DataFrame:
        """
        Returns a DataFrame with number of features included per model
        (columns: 'model', 'features_included').
        """
        if self.results_df.empty:
            return pd.DataFrame(columns=["model", "features_included"])
        # Sum only the feature columns: including 'total_score' would count
        # every selected feature twice.
        per_model = self.results_df.set_index("model")[self.features].sum(axis=1)
        model_features = per_model.reset_index()
        model_features.columns = ["model", "features_included"]
        return model_features

# 3 Workflow

In [6]:
# Keep only the rows at or after the "last_2w" cutoff.
train_data_filtered = train_data.filter(pl.col("timestamp") >= train_splits["last_2w"])

# Target is taken before preprocessing, which drops the label column from X.
y = train_data_filtered.get_column("label")
X, drop_columns = preprocess_train(
    train_data_filtered,
    columns_to_drop=["label", "bid_qty", "ask_qty", "buy_qty", "sell_qty"],
)
print(X)

Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X864', 'X867', 'X869', 'X870', 'X871', 'X872']
shape: (21_599, 871)
┌─────────┬───────────┬───────────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ volume  ┆ X1        ┆ X2        ┆ X3       ┆ … ┆ bidask_del ┆ buysell_de ┆ buysell_s ┆ bidask_si │
│ ---     ┆ ---       ┆ ---       ┆ ---      ┆   ┆ ta         ┆ lta        ┆ ize       ┆ ze        │
│ f64     ┆ f64       ┆ f64       ┆ f64      ┆   ┆ ---        ┆ ---        ┆ ---       ┆ ---       │
│         ┆           ┆           ┆          ┆   ┆ f64        ┆ f64        ┆ f64       ┆ f64       │
╞═════════╪═══════════╪═══════════╪══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 181.331 ┆ -0.606128 ┆ -0.026132 ┆ 0.240236 ┆ … ┆ 1.125   

In [None]:
# Build the selector on pandas copies of the data; the timestamp column is
# removed first so only the predictor columns are passed in. Construction
# itself fits the XGBoost model.
fe_regression = SklearnFeatureEngineeringRegression(
    X = X.drop(["timestamp"]).to_pandas(),
    y = y.to_pandas()
)
# With no argument, run() executes every method in its built-in default list.
fe_regression.run()

********************GPU GPU GPU GPU GPU ********************
SklearnFeatureEngineering Instantiated
... Fitting XGBRegressor on 21599 rows and 870 features.


Running feature selection:   0%|          | 0/26 [00:00<?, ?it/s]

Error running method _rfe_xgb: name 'use_gpu' is not defined
Error running method _rfecv_xgb: name 'use_gpu' is not defined
Error running method _sequential_tree: name 'use_gpu' is not defined




In [None]:
# Display the per-method selection matrix assembled by run().
fe_regression.get_results()

In [None]:
# Persist the selection matrix as CSV. pandas has no DataFrame.to_xlsx, and the
# target file carries a .csv extension, so to_csv is the matching writer.
fe_regression.results_df.to_csv("/kaggle/working/feature_engineering_results.csv", index=False)

In [None]:
# Show the 50 features selected by the most methods (vote count per feature).
fe_regression.get_top_features(50)