# Fundamental Feature Selection

In [1]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr
import polars as pl
import numpy as np
from tqdm import tqdm
from datetime import datetime
import os

train_splits = {
    "full" : pl.datetime(2023, 3, 1, 0, 0, 0),
    "last_9m" : pl.datetime(2023, 6, 1, 0, 0, 0),
    "last_6m" : pl.datetime(2023, 9, 1, 0, 0, 0),
    "last_3m" : pl.datetime(2023, 12, 1, 0, 0, 0),
    "last_1m": pl.datetime(2024, 2, 1, 0, 0, 0),
    "last_2w": pl.datetime(2024, 2, 15, 0, 0, 0),
}

PATHS = {
    "TRAIN_PATH" :"./kaggle/kaggle/input/drw-crypto-market-prediction/train.parquet",
    "TEST_PATH" : "./kaggle/kaggle/input/drw-crypto-market-prediction/test.parquet",
    "SUBMISSION_PATH" : "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv",
}

train_data = pl.read_parquet(PATHS["TRAIN_PATH"]).sort("timestamp", descending = False)
print(train_data)

shape: (525_887, 897)
┌─────────┬─────────┬─────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────────┐
│ bid_qty ┆ ask_qty ┆ buy_qty ┆ sell_qty ┆ … ┆ X889     ┆ X890     ┆ label    ┆ timestamp    │
│ ---     ┆ ---     ┆ ---     ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---          │
│ f64     ┆ f64     ┆ f64     ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ datetime[ns] │
╞═════════╪═════════╪═════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════════╡
│ 15.283  ┆ 8.425   ┆ 176.405 ┆ 44.984   ┆ … ┆ 0.159183 ┆ 0.530636 ┆ 0.562539 ┆ 2023-03-01   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆          ┆ 00:00:00     │
│ 38.59   ┆ 2.336   ┆ 525.846 ┆ 321.95   ┆ … ┆ 0.158963 ┆ 0.530269 ┆ 0.533686 ┆ 2023-03-01   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆          ┆ 00:01:00     │
│ 0.442   ┆ 60.25   ┆ 159.227 ┆ 136.369  ┆ … ┆ 0.158744 ┆ 0.529901 ┆ 0.546505 ┆ 2023-03-01   │
│         ┆         ┆       

In [2]:
def get_cols_inf(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names that contain any positive or negative infinity.
    """
    cols = []
    for col in df.columns:
        # df[col] is a Series; .is_infinite() → Boolean Series; .any() → Python bool
        try:
            if df[col].is_infinite().any():
                cols.append(col)
        except Exception:
            # if the column isn’t numeric, .is_infinite() might error—just skip it
            continue
    return cols

def get_nan_columns(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names with any NaN/null values.
    """
    cols = []
    for col in df.columns:
        if df.select(pl.col(col).is_null().any()).item():
            cols.append(col)
    return cols

def get_cols_zerostd(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names whose standard deviation is zero
    (or whose std returns None because all values are null).
    Non-numeric columns (e.g. datetime) are skipped.
    """
    cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        # Only attempt std() on numeric dtypes
        if dtype.is_numeric():  
            # df[col] is a Series; .std() returns a Python float or None
            std_val = df[col].std()
            if std_val == 0.0 or std_val is None:
                cols.append(col)
    return cols


def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    # Feature engineering
    df = df.with_columns([
        # bidask_ratio = bid_qty / ask_qty
        (pl.col("bid_qty") / pl.col("ask_qty")).alias("bidask_ratio"),

        # buysell_ratio = 0 if volume == 0 else buy_qty / sell_qty
        pl.when(pl.col("volume") == 0)
        .then(0)
        .otherwise(pl.col("buy_qty") / pl.col("sell_qty"))
        .alias("buysell_ratio"),

        # bidask_delta = bid_qty - ask_qty
        (pl.col("bid_qty") - pl.col("ask_qty")).alias("bidask_delta"),

        # buysell_delta = buy_qty - sell_qty
        (pl.col("buy_qty") - pl.col("sell_qty")).alias("buysell_delta"),

        # buysell_size = buy_qty + sell_qty
        (pl.col("buy_qty") + pl.col("sell_qty")).alias("buysell_size"),

        # bidask_size = bid_qty + ask_qty
        (pl.col("bid_qty") + pl.col("ask_qty")).alias("bidask_size"),
    ])
    return df
def preprocess_train(train: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    """
    Mirror of the original pandas workflow, but using polars.
    1. Identify columns with infinite, NaN, or zero‐std and drop them.
    2. Drop any user‐specified columns (e.g. label or order‐book columns).
    3. (You can add normalized/scaling steps here if needed.)
    """
    df = train.clone()

    df = feature_engineering(df)
    
    #### Preprocessing
    cols_inf = get_cols_inf(df)
    print("Columns with infinite values:", cols_inf)

    cols_nan = get_nan_columns(df)
    print("Columns with NaN values:", cols_nan)

    cols_zerostd = get_cols_zerostd(df)
    print("Columns with zero standard deviation:", cols_zerostd)
    # Drop columns with infinite, NaN, or zero‐std values
    drop_columns = list(set(cols_inf) | set(cols_nan) | set(cols_zerostd) | set(columns_to_drop))
    if drop_columns:
        df = df.drop(drop_columns)
    # df = df.sort("timestamp", descending=False)
    return df, drop_columns

def preprocess_test(test: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    df = test.clone()
    df = feature_engineering(df)
    df = df.drop(columns_to_drop)
    print("Columns dropped from test set:", columns_to_drop)
    return df

y = train_data[["timestamp", "label"]]
X, drop_columns = preprocess_train(
    train_data,
    columns_to_drop=["label", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
)
print(X)

Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X864', 'X867', 'X869', 'X870', 'X871', 'X872']
shape: (525_887, 871)
┌─────────┬──────────┬───────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ volume  ┆ X1       ┆ X2        ┆ X3        ┆ … ┆ bidask_del ┆ buysell_de ┆ buysell_s ┆ bidask_si │
│ ---     ┆ ---      ┆ ---       ┆ ---       ┆   ┆ ta         ┆ lta        ┆ ize       ┆ ze        │
│ f64     ┆ f64      ┆ f64       ┆ f64       ┆   ┆ ---        ┆ ---        ┆ ---       ┆ ---       │
│         ┆          ┆           ┆           ┆   ┆ f64        ┆ f64        ┆ f64       ┆ f64       │
╞═════════╪══════════╪═══════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 221.389 ┆ 0.121263 ┆ -0.41769  ┆ 0.005399  ┆ … ┆ 6.858  

In [3]:
X = X.filter([
    pl.col("timestamp") >= train_splits["last_2w"]
]).to_pandas()

y = y.filter([
    pl.col("timestamp") >= train_splits["last_2w"]
]).select(
    pl.col("label")
).to_pandas()
X

Unnamed: 0,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X888,X889,X890,timestamp,bidask_ratio,buysell_ratio,bidask_delta,buysell_delta,buysell_size,bidask_size
0,181.331,-0.606128,-0.026132,0.240236,0.226543,0.009880,-0.207929,-0.308342,-0.414538,-0.952021,...,0.209209,0.137915,0.319477,2024-02-15 00:00:00,1.307461,0.502154,1.125,-60.097,181.331,8.443
1,110.604,-0.613099,-0.036430,0.218995,0.209116,0.002885,-0.214304,-0.314875,-0.421216,-0.687147,...,0.208630,0.137725,0.319256,2024-02-15 00:01:00,0.017929,0.400831,-10.517,-47.308,110.604,10.901
2,70.052,-0.475340,0.217352,0.478561,0.476689,0.139481,-0.076333,-0.176871,-0.283260,-0.614209,...,0.208053,0.137534,0.319035,2024-02-15 00:02:00,1.998738,1.051123,3.957,1.746,70.052,11.881
3,124.021,-0.518950,0.118303,0.378718,0.383085,0.095079,-0.119611,-0.220175,-0.326643,-0.833682,...,0.207478,0.137344,0.318814,2024-02-15 00:03:00,0.476843,1.597897,-4.846,28.543,124.021,13.680
4,131.966,-0.693000,-0.208907,0.029630,0.034411,-0.078318,-0.292848,-0.393678,-0.500346,-0.841456,...,0.206905,0.137154,0.318594,2024-02-15 00:04:00,3.877078,0.917581,6.577,-5.672,131.966,11.149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21594,94.388,0.020155,0.076565,0.228994,0.288856,0.151634,0.108347,0.088073,0.073729,0.071211,...,0.212651,0.136494,0.243172,2024-02-29 23:55:00,0.611756,0.705263,-2.642,-16.314,94.388,10.968
21595,177.372,0.016262,0.062527,0.214072,0.276463,0.146521,0.104164,0.084063,0.069788,0.024066,...,0.212063,0.136305,0.243004,2024-02-29 23:56:00,0.564317,1.640604,-1.768,43.030,177.372,6.348
21596,101.252,0.045407,0.109834,0.263577,0.329266,0.174214,0.132940,0.113052,0.098865,-0.057370,...,0.211477,0.136117,0.242836,2024-02-29 23:57:00,1.438736,2.292427,1.597,39.746,101.252,8.877
21597,74.560,0.124783,0.244168,0.408704,0.480016,0.251493,0.211727,0.192160,0.178116,0.111335,...,0.210892,0.135928,0.242668,2024-02-29 23:58:00,1.169353,0.428489,0.830,-29.830,74.560,10.632


# 1 Sklearn Feature Selection Decomposition + Selection

In [4]:
import pandas as pd
from sklearn.feature_selection import (
    VarianceThreshold, SelectKBest, SelectPercentile,
    GenericUnivariateSelect, SelectFpr, SelectFdr, SelectFwe,
    RFE, RFECV, SelectFromModel, SequentialFeatureSelector,
    f_regression, mutual_info_regression
)
from sklearn.decomposition import (
    PCA, IncrementalPCA, TruncatedSVD,
    FastICA, SparsePCA, MiniBatchSparsePCA,
    DictionaryLearning, MiniBatchDictionaryLearning,
    FactorAnalysis, NMF, LatentDirichletAllocation
)
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from tqdm.auto import tqdm
from xgboost import XGBRegressor
from sklearn.base import clone

class SklearnFeatureEngineeringRegression:
    def __init__(self, X: pd.DataFrame, y: pd.Series):
        """
        Parameters
        ----------
        X : pd.DataFrame
            Predictor matrix (n_samples × P features)
        y : pd.Series
            Target vector (n_samples,)
        """
        self.X_df = X.copy()
        self.X = X.to_numpy()
        self.y = y.to_numpy()
        self.features = X.columns.tolist()
        self.results_df: pd.DataFrame = pd.DataFrame()

        # … your import logic deciding CPU vs GPU …
        if os.environ.get("USER"):
            self.tree_model = XGBRegressor(
                tree_method="hist", n_estimators=1000, max_depth=100,
                learning_rate=0.05, random_state=42, n_jobs=-1
            )
        else:
            import cupy as cp
            self.X = cp.asarray(X)
            self.y = cp.asarray(y)
            self.tree_model = XGBRegressor(
                tree_method ="hist", device="cuda",
                n_estimators=10, max_depth=10,
                learning_rate=0.1, random_state=42, n_jobs=-1
            )
        print(f"""SklearnFeatureEngineering Instantiated""")
        print(f"... Fitting {self.tree_model.__class__.__name__} on {self.X_df.shape[0]} rows and {self.X_df.shape[1]} features.")
        self.tree_model.fit(self.X, self.y)

    # ---------- Filter methods ----------
    # VarianceThreshold is a filter method that removes features with low variance.
    def _variance_threshold(self, thresh: float = 0.0) -> pd.Series:
        sel = VarianceThreshold(threshold=thresh).fit(self.X)
        return pd.Series(sel.get_support(), index=self.features).astype(int)
    # SelectKBest and SelectPercentile are filter methods that select features based on univariate statistical tests.
    def _select_kbest_freg(self, k: int = 10) -> pd.Series:
        sel = SelectKBest(score_func=f_regression, k=min(k, self.X.shape[1]))
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)
    def _select_kbest_mutualinfo(self, k: int = 10) -> pd.Series:
        sel = SelectKBest(score_func=mutual_info_regression, k=min(k, self.X.shape[1]))
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)
    def _select_percentile_freg(self, p: float = 10) -> pd.Series:
        sel = SelectPercentile(score_func=f_regression, percentile=min(p, 100))
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)
    def _select_percentile_mutualinfo(self, p: float = 10) -> pd.Series:
        sel = SelectPercentile(score_func=mutual_info_regression, percentile=min(p, 100))
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)
    # # GenericUnivariateSelect is a filter method that allows for more flexible selection criteria.
    # def _generic_univariate(self, k: int = 10) -> pd.Series:
    #     sel = GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=min(k, self.X.shape[1]))
    #     return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    def _select_fpr_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFpr(score_func=f_regression, alpha=alpha)
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    def _select_fdr_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFdr(score_func=f_regression, alpha=alpha)
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    def _select_fwe_freg(self, alpha: float = 0.05) -> pd.Series:
        sel = SelectFwe(score_func=f_regression, alpha=alpha)
        return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    # ---------- Wrapper and Embedded methods ----------
    # def _rfe_lasso(self, n_features: int = 10) -> pd.Series:
    #     estimator = LassoCV(cv=5, max_iter=5000).fit(self.X, self.y)
    #     sel = RFE(estimator, n_features_to_select=min(n_features, self.X.shape[1]))
    #     return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    # def _rfecv_lasso(self) -> pd.Series:
    #     estimator = LassoCV(cv=5, max_iter=5000).fit(self.X, self.y)
    #     sel = RFECV(estimator, cv=5)
    #     return pd.Series(sel.fit(self.X, self.y).get_support(), index=self.features).astype(int)

    def _rfe_xgb(self, n_features: int = 10) -> pd.Series:
        """
        Recursive Feature Elimination with a fresh clone of self.tree_model.
        """
        # clone preserves GPU/CPU config + hyper‐parameters
        estimator = clone(self.tree_model)
        sel = RFE(estimator, n_features_to_select=min(n_features, self.X.shape[1]))
        mask = sel.fit(self.X, self.y).get_support()
        return pd.Series(mask.astype(int), index=self.features).astype(int)

    def _rfecv_xgb(self, cv: int = 5) -> pd.Series:
        """
        RFECV with a fresh clone of self.tree_model.
        """
        estimator = clone(self.tree_model)
        sel = RFECV(estimator, cv=cv, scoring="neg_mean_squared_error")
        mask = sel.fit(self.X, self.y).get_support()
        return pd.Series(mask.astype(int), index=self.features).astype(int)
    
    def _select_from_model_tree(self, threshold="median") -> pd.Series:
        """
        Uses the pre‐fitted self.tree_model via prefit=True.
        """
        sel = SelectFromModel(self.tree_model, threshold=threshold, prefit=True)
        mask = sel.get_support()
        return pd.Series(mask.astype(int), index=self.features)

    def _sequential_tree(
        self,
        n_features: int = 10,
        direction: str = "forward",
        cv: int = 5,
        n_jobs: int = -1
    ) -> pd.Series:
        """
        SequentialFeatureSelector with XGBRegressor.

        - Clones self.tree_model to preserve GPU/CPU config.
        - direction: 'forward' or 'backward'
        - cv: number of cross‐validation folds
        - n_jobs: parallel jobs for CV
        """
        # ensure we don't modify the original fitted model
        estimator = clone(self.tree_model)

        sfs = SequentialFeatureSelector(
            estimator,
            n_features_to_select=min(n_features, self.X.shape[1]),
            direction=direction,
            cv=cv,
            n_jobs=n_jobs
        )
        mask = sfs.fit(self.X, self.y).get_support()
        return pd.Series(mask.astype(int), index=self.features)

    # ---------- Decomposition methods ----------
    def _pca(self, n_components: float = 0.95) -> pd.Series:
        pca = PCA(n_components=n_components).fit(self.X)
        load = pd.DataFrame(pca.components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _incremental_pca(self, n_components: float = 0.95) -> pd.Series:
        ipca = IncrementalPCA(n_components=n_components)
        load = pd.DataFrame(ipca.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _truncated_svd(self, n_components: int = 10) -> pd.Series:
        k = min(n_components, self.X.shape[1])
        ts = TruncatedSVD(n_components=k)
        load = pd.DataFrame(ts.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _fast_ica(self, n_components: int = 10) -> pd.Series:
        ic = FastICA(n_components=min(n_components, self.X.shape[1]), max_iter=200)
        load = pd.DataFrame(ic.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _sparse_pca(self, n_components: int = 10) -> pd.Series:
        spca = SparsePCA(n_components=min(n_components, self.X.shape[1]), alpha=1, max_iter=1000)
        load = pd.DataFrame(spca.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _minibatch_sparse_pca(self, n_components: int = 10) -> pd.Series:
        mbspca = MiniBatchSparsePCA(n_components=min(n_components, self.X.shape[1]), alpha=1)
        load = pd.DataFrame(mbspca.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _dict_learning(self, n_components: int = 10) -> pd.Series:
        dl = DictionaryLearning(n_components=min(n_components, self.X.shape[1]), alpha=1, max_iter=1000)
        load = pd.DataFrame(dl.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _minibatch_dict_learning(self, n_components: int = 10) -> pd.Series:
        mbdl = MiniBatchDictionaryLearning(n_components=min(n_components, self.X.shape[1]), alpha=1)
        load = pd.DataFrame(mbdl.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _factor_analysis(self, n_components: int = 10) -> pd.Series:
        fa = FactorAnalysis(n_components=min(n_components, self.X.shape[1]))
        load = pd.DataFrame(fa.fit(self.X).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _nmf(self, n_components: int = 10) -> pd.Series:
        nmf = NMF(n_components=min(n_components, self.X.shape[1]), init='nndsvda', max_iter=500)
        data_pos = self.X.clip(lower=0)
        load = pd.DataFrame(nmf.fit(data_pos).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    def _lda(self, n_components: int = 10) -> pd.Series:
        lda = LatentDirichletAllocation(n_components=min(n_components, self.X.shape[1]), max_iter=5)
        data_pos = self.X.clip(lower=0)
        load = pd.DataFrame(lda.fit(data_pos).components_.T, index=self.features)
        thresh = load.abs().mean().mean()
        return (load.abs().max(axis=1) >= thresh).astype(int)

    # ---- Random projections ----
    def _gaussian_random_projection(self, n_components: int = 10) -> pd.Series:
        rp = GaussianRandomProjection(n_components=min(n_components, self.X.shape[1]))
        comp = pd.DataFrame(rp.fit(self.X).components_.T, index=self.features)
        return self._decomp_mask(comp)

    def _sparse_random_projection(self, n_components: int = 10) -> pd.Series:
        srp = SparseRandomProjection(n_components=min(n_components, self.X.shape[1]))
        comp = pd.DataFrame(srp.fit(self.X).components_.T, index=self.features)
        return self._decomp_mask(comp)

    # ---- Feature grouping ----
    def _feature_agglomeration(self, n_clusters: int = 10) -> pd.Series:
        # Fit agglomeration on the array
        agg = FeatureAgglomeration(n_clusters=min(n_clusters, self.X.shape[1]))
        agg.fit(self.X)
        labels       = agg.labels_
        cluster_data = agg.transform(self.X)

        # Build a temporary DataFrame for cluster components
        df_clust = pd.DataFrame(
            cluster_data,
            columns=[f"clus_{i}" for i in range(cluster_data.shape[1])],
            index=self.X_df.index
        )

        # For each cluster, pick the feature with highest corr to its component
        mask = pd.Series(0, index=self.features)
        for j in range(cluster_data.shape[1]):
            members = [f for f, l in zip(self.features, labels) if l == j]
            corrs   = self.X_df[members].corrwith(df_clust.iloc[:, j]).abs()
            mask[corrs.idxmax()] = 1

        return mask
    
    def run(self, methods=None) -> None:
        """
        Execute each selection/decomposition method with a progress bar,
        accepting either:
          - a dict mapping method names to parameter dicts, or
          - a list of method names (no parameters).

        Assembles `results_df` with columns ['model', features..., 'total_score'].
        """
        # default methods with default params
        default_list = [
            '_variance_threshold',
            '_select_kbest_freg','_select_percentile_freg',
            '_select_kbest_mutualinfo','_select_percentile_mutualinfo',
            '_select_fpr_freg','_select_fdr_freg','_select_fwe_freg',
            '_rfe_xgb','_rfecv_xgb',
            '_select_from_model_tree','_sequential_tree',
            '_pca','_incremental_pca','_truncated_svd','_fast_ica',
            '_sparse_pca','_minibatch_sparse_pca','_dict_learning',
            '_minibatch_dict_learning','_factor_analysis','_nmf','_lda',
            '_gaussian_random_projection','_sparse_random_projection',
            '_feature_agglomeration'
        ]
        if methods is None:
            methods = {m: {} for m in default_list}
        elif isinstance(methods, list):
            methods = {m: {} for m in methods}
        elif not isinstance(methods, dict):
            raise ValueError("`methods` must be None, list, or dict")

        records = []
        for m, params in tqdm(methods.items(), desc='Running feature selection'):
            if not hasattr(self, m):
                raise KeyError(f"Method {m} not found in class")
            fn = getattr(self, m)
            mask = fn(**params)
            records.append({'model': m.lstrip('_'), **mask.to_dict(), 'total_score': int(mask.sum())})
        self.results_df = pd.DataFrame.from_records(records)

    def get_results(self) -> pd.DataFrame:
        """Return the raw selection matrix (n_models × (P + 2))."""
        return self.results_df

    def get_top_features(self, N: int) -> pd.DataFrame:
        """
        Aggregate across models (vote count = sum of 1’s per feature),
        and return the top N feature names.
        """
        return pd.DataFrame(features.results_df.loc[:, features.features].sum().sort_values(ascending=False), columns=['score']).head(N)
    
    def get_model_featuresincluded(self) -> pd.DataFrame:
        """
        Returns a DataFrame with number of features included per model.
        """
        model_features = self.results_df.set_index("model").sum(axis=1).reset_index()
        model_features.columns = ["model", "features_included"]
        return model_features

features = SklearnFeatureEngineeringRegression(
    X = X.drop(columns=["timestamp"]),
    y = y
)
features.run(methods=["_pca"])

SklearnFeatureEngineering Instantiated
... Fitting XGBRegressor on 21599 rows and 870 features.


Running feature selection:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
features.get_results()

Unnamed: 0,model,volume,X1,X2,X3,X4,X5,X6,X7,X8,...,X888,X889,X890,bidask_ratio,buysell_ratio,bidask_delta,buysell_delta,buysell_size,bidask_size,total_score
0,pca,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,1,0,5


In [9]:
features.get_top_features(20)

Unnamed: 0,score
volume,1
buysell_size,1
buysell_delta,1
bidask_delta,1
bidask_ratio,1
X577,0
X581,0
X580,0
X579,0
X578,0


In [7]:
features.get_model_featuresincluded()

Unnamed: 0,model,features_included
0,pca,10


**Principal Component Analysis (PCA)-based selection**

  1. Fit PCA on the centered feature matrix.
  2. Examine the loading matrix $L\in\mathbb R^{p\times k}$, where each column $L_j$ gives coefficients of the $j$ th principal component in the original feature basis.
  3. For each original feature $i$, compute its overall importance score, e.g.\ $\sum_{j=1}^k |L_{ij}|\times\lambda_j$, where $\lambda_j$ is the variance explained by component $j$.
  4. Select the top-$m$ features by that score or apply a threshold on absolute loading magnitude.


* **Factor Analysis (FA)-based selection**

  1. Fit a common‐factor model to capture the shared covariance among features.
  2. Obtain the factor loading matrix $\Lambda\in\mathbb R^{p\times q}$ and communalities $h_i^2=\sum_{j}\Lambda_{ij}^2$.
  3. Use each feature’s communality $h_i^2$ as its importance (higher = more explained by common factors).
  4. Retain features with highest communalities up to your budget.



**Independent Component Analysis (ICA)-based selection**

  1. Perform ICA to decompose features into statistically independent sources: $X = A\,S$.
  2. Use the absolute mixing‐matrix weights $|A_{i\,j}|$ as indications of how strongly feature $i$ contributes to source $j$.
  3. Aggregate per-feature scores, e.g.\ $\sum_j|A_{ij}|$, and select those above a threshold.

* **Canonical Correlation Analysis (CCA)-based selection**

  1. Given feature block $X$ and target $y$, form a second “block” $Y=y$ (or include engineered target lags).
  2. Solve for weight vectors $u,v$ that maximize $\mathrm{corr}(X u, Y v)$.
  3. The canonical weights $u_i$ indicate feature relevance; select features with largest $|u_i|$.


* **Correlation (Pearson) filter**

  1. Compute Pearson correlation $r_i = \mathrm{corr}(X_i, y)$ for each feature $X_i$.
  2. Rank by $|r_i|$ and choose top-$m$ features or those exceeding a predefined $|r|$ threshold.


* **Mutual Information (MI) filter**

  1. Estimate $I(X_i;y)$ nonparametrically (e.g. via k-nearest neighbors).
  2. Rank features by MI score and select the top fraction.



* **F-regression (univariate linear F-test)**

  1. For each feature $X_i$, fit the simple linear model $y = \beta_i X_i + \varepsilon$.
  2. Compute the F statistic $F_i = \frac{\mathrm{SSR}/1}{\mathrm{SSE}/(n-2)}$.
  3. Select features with the highest $F_i$ values (or lowest p-values).



* **RReliefF (regression version of ReliefF)**

  1. For a random sample of observations, find nearest‐neighbor pairs weighted by distance.
  2. Update feature weights by how well differences in $X_i$ predict differences in $y$.
  3. After many iterations, keep features with the largest Relief scores.