### Import necessary libraries

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

# Import the reverse feature selection function
from reverse_feature_selection.reverse_random_forests import select_feature_subset

### Generate example dataset

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same
# synthetic dataset; every later draw from `rng` becomes reproducible as well.
SEED = 42

# Set up a seeded random number generator
rng = np.random.default_rng(SEED)

# Number of total samples
n_samples = 30

# Number of irrelevant features
n_irrelevant_features = 100

# Create a DataFrame with `n_irrelevant_features` noise columns named
# "feature1", "feature2", ...; each column holds `n_samples` random floats.
data_df = pd.DataFrame(
    {f"feature{i+1}": rng.random(n_samples) for i in range(n_irrelevant_features)}
)

### Add relevant features with stronger signals

In [3]:
# How many informative (class-dependent) features to add
n_relevant_features = 2

# Each relevant feature is built from two halves with different offsets:
# the first half is shifted by 2 * k, the second half by k (k = 1, 2, ...),
# so the two halves are separated by k.
half_size = n_samples // 2
for feature_number in range(1, n_relevant_features + 1):
    # Draw order matters for the random stream: regulated half first, then unregulated
    regulated_class = rng.random(half_size) + feature_number * 2
    unregulated_class = rng.random(half_size) + feature_number
    # Stack both halves into one column of length 2 * half_size
    data_df[f"relevant_feature{feature_number}"] = np.concatenate(
        (regulated_class, unregulated_class)
    )

### Insert labels

In [4]:
# Construct binary class labels: the first half of the samples is class 0,
# the second half is class 1 (15 + 15 for n_samples = 30)
label = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

# `DataFrame.insert` raises a ValueError when the column already exists, so drop
# a stale "label" column first; this keeps the cell safe to re-run.
if "label" in data_df.columns:
    data_df = data_df.drop(columns="label")

# Insert label column at the beginning of the DataFrame
data_df.insert(0, "label", label)

### Set training indices (leave-one-out cross-validation)

In [5]:
# Simulate one leave-one-out cross-validation fold: draw all samples but one
# (without replacement) for training, leaving a single sample held out.
# The training size is derived from the DataFrame instead of being hard-coded,
# so it stays correct if the number of samples is changed.
n_train_samples = len(data_df.index) - 1
train_indices = rng.choice(data_df.index, size=n_train_samples, replace=False)

### Define meta information

In [6]:
# Hand-picked list of distinct integer random seeds handed to the selection routine
seeds = [
    29, 10, 17, 42, 213, 34, 1, 5, 19, 3,
    23, 9, 7, 123, 234, 345, 456, 567, 678, 789,
    890, 15, 333, 37, 45, 56,
]

# Meta configuration for the feature selection
meta_data = dict(
    n_cpus=4,
    random_seeds=seeds,
    # threshold for removing correlated features
    train_correlation_threshold=0.7,
)

### Run reverse feature selection

In [7]:
# Run the reverse feature selection algorithm, passing the full DataFrame
# (label column first), the training indices and the meta configuration.
# This may take a minute or two (about 1.2 min with 4 workers in the recorded run).
result_df = select_feature_subset(data_df, train_indices, meta_data)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   34.9s
[Parallel(n_jobs=4)]: Done 102 out of 102 | elapsed:  1.2min finished


### Display selected features

In [8]:
# A feature counts as selected when its 'feature_subset_selection' score is above 0;
# pull that column once and keep only the positive entries.
print("Selected features:")
selection_scores = result_df["feature_subset_selection"]
selection_scores[selection_scores > 0]

Selected features:


relevant_feature1    0.756731
relevant_feature2    0.879927
Name: feature_subset_selection, dtype: float64