# Forward feature selection

- [Feature Selection in Machine Learning Book](https://www.trainindata.com/p/feature-selection-in-machine-learning-book)

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# load dataset
# return_X_y=True gives (features, target) directly; as_frame=True returns
# them as pandas objects, so the feature names are kept as column names.

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Separate data into train and test sets
# random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# preview the first rows of the training features
X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
293,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
332,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
278,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
489,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,0.1846,0.05325,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623


## Sklearn

In [3]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [4]:
# Forward selection with scikit-learn's SequentialFeatureSelector.
# With n_features_to_select='auto' and a tol value, features are added one at
# a time until the cross-validated score stops improving by at least tol.
sfs = SFS(
    estimator=RandomForestClassifier(n_estimators=5, random_state=0),
    n_features_to_select='auto',
    tol=0.001,  # minimum score improvement required to keep adding features
    direction='forward',  # the direction of the selection procedure
    scoring='roc_auc',  # the metric to evaluate
    cv=3,  # the number of cross-validation folds
)

# run the search on the training set only
sfs = sfs.fit(X_train, y_train)

In [5]:
# the selected features
# (names of the columns kept by the fitted selector)

sfs.get_feature_names_out()

array(['mean smoothness', 'mean concavity', 'worst texture',
       'worst perimeter', 'worst concavity'], dtype=object)

In [6]:
# transform data
# Keep only the selected columns in both splits. The result is a NumPy
# array (see the output below), so the column names are not carried over.

X_train_t = sfs.transform(X_train)
X_test_t = sfs.transform(X_test)

X_test_t

array([[1.106e-01, 1.445e-01, 2.966e+01, 1.133e+02, 5.106e-01],
       [8.791e-02, 2.772e-02, 3.423e+01, 9.129e+01, 1.390e-01],
       [7.966e-02, 2.087e-02, 1.931e+01, 9.653e+01, 6.260e-02],
       [6.576e-02, 2.475e-02, 2.526e+01, 1.058e+02, 1.565e-01],
       [8.983e-02, 2.562e-02, 2.281e+01, 8.446e+01, 4.833e-02],
       [1.049e-01, 4.302e-02, 2.303e+01, 7.915e+01, 1.624e-01],
       [6.935e-02, 7.943e-02, 2.871e+01, 8.736e+01, 2.912e-01],
       [8.814e-02, 1.583e-02, 2.655e+01, 8.092e+01, 7.915e-02],
       [8.098e-02, 0.000e+00, 3.092e+01, 5.717e+01, 0.000e+00],
       [9.882e-02, 3.581e-02, 1.948e+01, 7.089e+01, 7.162e-02],
       [1.031e-01, 1.450e-01, 1.834e+01, 1.141e+02, 3.219e-01],
       [8.974e-02, 3.102e-02, 2.911e+01, 1.029e+02, 9.189e-02],
       [7.969e-02, 3.735e-02, 2.799e+01, 6.661e+01, 1.868e-01],
       [7.445e-02, 5.150e-02, 2.807e+01, 1.203e+02, 1.882e-01],
       [1.132e-01, 9.966e-02, 1.824e+01, 1.094e+02, 2.604e-01],
       [1.257e-01, 2.032e-01, 1.704e+01,

## MLXtend

In [7]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [8]:
# step forward feature selection
# NOTE: from this section on, `SFS` is the MLXtend class and `sfs` the
# MLXtend selector; both names replace the scikit-learn objects used earlier.

sfs = SFS(
    estimator=RandomForestClassifier(n_estimators=5, random_state=0),
    k_features=10,  # the number of features to retain
    forward=True,  # the direction of the search
    verbose=1,  # print out intermediate steps
    scoring='roc_auc',  # the metric to evaluate
    cv=3,  # the number of cross-validation folds
)

# run the search on the training set only
sfs = sfs.fit(X_train, y_train)

Features: 10/10

In [9]:
# the selected features
# (MLXtend exposes them as a tuple of column names)

sfs.k_feature_names_

('mean area',
 'mean smoothness',
 'mean concavity',
 'mean concave points',
 'area error',
 'worst texture',
 'worst perimeter',
 'worst concavity',
 'worst symmetry',
 'worst fractal dimension')

In [10]:
# transform data
# Keep only the selected columns in both splits. As the output below shows,
# the result is a NumPy array, so the column names are not carried over.

X_train_t = sfs.transform(X_train)
X_test_t = sfs.transform(X_test)

X_test_t

array([[5.567e+02, 1.106e-01, 1.445e-01, ..., 5.106e-01, 3.585e-01,
        1.109e-01],
       [5.379e+02, 8.791e-02, 2.772e-02, ..., 1.390e-01, 2.444e-01,
        6.788e-02],
       [6.065e+02, 7.966e-02, 2.087e-02, ..., 6.260e-02, 2.136e-01,
        6.710e-02],
       ...,
       [4.685e+02, 9.003e-02, 2.958e-02, ..., 1.791e-01, 3.110e-01,
        7.592e-02],
       [5.592e+02, 1.291e-01, 6.877e-02, ..., 8.539e-02, 2.710e-01,
        7.191e-02],
       [1.214e+03, 1.120e-01, 2.508e-01, ..., 6.810e-01, 3.643e-01,
        9.223e-02]])