In [1]:
import pandas as pd
from predictables.feature_selection.src._backward_stepwise import (
    backward_stepwise_feature_selection,
    initialize_feature_set,
    calculate_all_feature_correlations,
    identify_highly_correlated_pairs,
    generate_X_y,
    evaluate_feature_removal_impact,
    select_feature_to_remove,
)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    PolynomialFeatures,
    PowerTransformer,
    FunctionTransformer,
)

import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Seeded generator so the synthetic dataset is identical on every run.
rng = np.random.default_rng(42)

# Generate a dataset with explicitly correlated features.
# Fold labels: 100 rows for each of folds 1-10 ...
fold_numbers = np.tile(np.arange(1, 11), 100)

# ... plus 500 rows labelled fold 0, so every fold (0-10) has data.
# np.zeros yields floats, so the resulting "fold" column is float64.
fold_numbers = np.concatenate([fold_numbers, np.zeros(500)])
rng.shuffle(fold_numbers)

# Single source of truth for the row count (1500): every random draw below
# must have exactly one value per fold label.
n_samples = len(fold_numbers)

base_feature = rng.normal(0, 1, size=n_samples)

# The draws inside the dict literal are evaluated top to bottom; keep this
# order, because changing it changes which random numbers each column gets.
X = pd.DataFrame(
    {
        "fold": fold_numbers,
        "feature1": base_feature,  # Base feature
        "feature2": base_feature * 1.01
        + rng.normal(0, 0.01, size=n_samples),  # Almost the same as feature1
        "feature3": base_feature * 0.99
        + rng.normal(0, 0.01, size=n_samples),  # Almost the same as feature1
        "feature4": rng.lognormal(0, 1, size=n_samples),  # Independent high-impact feature
        "feature5": rng.beta(2, 5, size=n_samples),  # Another independent feature
    }
)

# Binary target built only from feature4, feature5 and noise, so the three
# correlated features (feature1-3) carry no signal about it.
# Computed directly instead of being written into X and dropped again, which
# keeps X free of the target at every point in the cell.
y = (
    (2 * X["feature4"] + X["feature5"] + rng.normal(0, 1, size=n_samples)) > 1.5
).astype(int).rename("y")
# Classifier handed to the feature-selection helpers; the fixed random_state
# keeps its fits repeatable.
model = RandomForestClassifier(random_state=42)

# Every column of X before any selection happens. This is all of X's columns,
# so it contains "fold" as well as feature1-feature5.
original_features = {*X.columns}

# The three near-duplicate columns (feature2 and feature3 are noisy rescalings
# of feature1). Reference set for checking that the selection does not retain
# all of them; no assertion is made in this cell.
correlated_features = {f"feature{k}" for k in (1, 2, 3)}

In [3]:
# Step-by-step check of the helpers that back the stepwise selection:
# compute the correlations for X, then list the pairs the helper flags as
# highly correlated (shown as this cell's output).
# NOTE(review): the cut-off used by identify_highly_correlated_pairs is its
# default and is not visible here -- confirm it is the intended threshold.
corr = calculate_all_feature_correlations(X)
highly_correlated_pairs = identify_highly_correlated_pairs(corr)
highly_correlated_pairs

[('feature1', 'feature2'), ('feature1', 'feature3'), ('feature2', 'feature3')]

In [4]:
# For each near-duplicate feature, score the model with and without it and
# keep both score collections in `res` under the keys "with" / "without".
# NOTE(review): the positional 5 and 9 are presumably start_fold / end_fold
# (the same values the backward_stepwise_feature_selection call in this
# notebook passes by keyword) -- confirm against the helper's signature.
res = {f"feature{k}": {} for k in (1, 2, 3)}
for idx in (1, 2, 3):
    name = f"feature{idx}"
    scores_with, scores_without = evaluate_feature_removal_impact(
        X, y, model, name, 5, 9
    )

    # Summary statistics reused by the report below; "lower bound" is one
    # standard deviation under the mean score with the feature present.
    mean_with = np.mean(scores_with)
    sd_with = np.std(scores_with)
    print(
        f"\n\nFEATURE {idx}:"
        f"\nmean without: {np.mean(scores_without):.1%}"
        f"\nmean with: {mean_with:.1%}"
        f"\nsd with: {sd_with:.1%}"
        f"\nlower bound: {mean_with - sd_with:.1%}"
    )

    res[name].update({"with": scores_with, "without": scores_without})



FEATURE 1:
mean without: 85.5%
mean with: 84.7%
sd with: 3.1%
lower bound: 81.6%


FEATURE 2:
mean without: 85.7%
mean with: 84.7%
sd with: 3.1%
lower bound: 81.6%


FEATURE 3:
mean without: 85.2%
mean with: 84.7%
sd with: 3.1%
lower bound: 81.6%


In [7]:
# Head-to-head comparison of two correlated features, using the scores
# collected in `res`; the cell output is the feature the helper picks for
# removal.
i, j = 2, 3
first, second = res[f"feature{i}"], res[f"feature{j}"]

select_feature_to_remove(
    first["with"],
    first["without"],
    f"feature{i}",
    second["with"],
    second["without"],
    f"feature{j}",
    1e-5,  # NOTE(review): presumably a tolerance -- confirm against the helper
)

'feature2'

In [3]:
# Run the full backward stepwise selection over folds 5 through 9.
selected_features = backward_stepwise_feature_selection(
    X, y, model, tolerance=0.1, start_fold=5, end_fold=9
)

# Split the starting columns into what the selection kept and what it dropped.
# NOTE(review): original_features is built from all of X's columns, so it
# contains "fold"; if the selector does not return "fold", it will show up in
# removed_features -- confirm that is intended.
retained_features = set(selected_features)
removed_features = original_features.difference(retained_features)