# Backward feature elimination

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the California housing dataset; as_frame=True yields pandas objects
# (feature table X and target y) instead of NumPy arrays.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# Hold out a test partition. The fixed random_state keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

## Sklearn

In [3]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [4]:
# Backward elimination with scikit-learn: start from all features and drop
# them one at a time, comparing candidate subsets by cross-validated R2.
# fit() returns the fitted selector, so the construction and fit are chained.
sfs = SFS(
    estimator=RandomForestRegressor(random_state=10, n_estimators=5),
    direction='backward',  # begin with every feature and remove one per step
    scoring='r2',  # metric used to compare candidate feature subsets
    cv=3,  # number of cross-validation folds
    n_features_to_select="auto",  # no fixed target size; `tol` decides when to stop
    # Minimum score gain a removal must bring for elimination to continue;
    # once the best removal improves the score by less than this, stop.
    # (A negative tol would instead tolerate a small drop in the score.)
    tol=0.001,
).fit(X_train, y_train)

In [5]:
# Names of the features kept by the fitted scikit-learn selector
sfs.get_feature_names_out()

array(['MedInc', 'HouseAge', 'AveBedrms', 'AveOccup', 'Latitude',
       'Longitude'], dtype=object)

In [6]:
# Reduce the train and test sets to the features the selector retained.
# The same fitted selector is applied to both partitions, train first.
X_train_t, X_test_t = map(sfs.transform, (X_train, X_test))

## MLXtend

In [7]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [8]:
# Backward elimination with mlxtend.
# NOTE: `SFS` now refers to mlxtend's selector -- the import just before this
# cell rebinds the alias that was previously used for scikit-learn's class --
# and the assignment below replaces the scikit-learn selector held in `sfs`.
# fit() returns the fitted selector, so the construction and fit are chained.
sfs = SFS(
    estimator=RandomForestRegressor(random_state=10, n_estimators=5),
    forward=False,  # backward: start from all features and remove one per step
    k_features=5,  # stop once this many features remain
    scoring='r2',  # metric used to compare candidate feature subsets
    cv=3,  # number of cross-validation folds
    verbose=1,  # log progress for every elimination round
).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.8s finished
Features: 7/5[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    5.7s finished
Features: 6/5[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.9s finished
Features: 5/5

In [9]:
# Names of the features kept by the fitted mlxtend selector

sfs.k_feature_names_

('MedInc', 'HouseAge', 'AveOccup', 'Latitude', 'Longitude')

In [10]:
# Reduce both partitions to the features the mlxtend selector retained.
X_train_t, X_test_t = sfs.transform(X_train), sfs.transform(X_test)

# Show the reduced test set (rendered as the cell's output).
X_test_t

array([[   4.1518    ,   22.        ,    4.18059299,   32.58      ,
        -117.05      ],
       [   5.7796    ,   32.        ,    3.02097902,   33.92      ,
        -117.97      ],
       [   4.3487    ,   29.        ,    2.91011236,   38.65      ,
        -121.84      ],
       ...,
       [   3.6296    ,   16.        ,    1.88631579,   34.2       ,
        -118.61      ],
       [   5.5133    ,   37.        ,    3.00847458,   33.9       ,
        -118.34      ],
       [   4.7639    ,   36.        ,    2.90545455,   37.66      ,
        -122.44      ]])