# Recursive feature elimination - feature importance

- [Feature Selection in Machine Learning Book](https://www.trainindata.com/p/feature-selection-in-machine-learning-book)

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [2]:
# Load the breast cancer data: features as a DataFrame, target as a Series.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [3]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
293,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
332,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
278,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
489,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,0.1846,0.05325,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623


### Select features recursively

In [4]:
# Recursive feature elimination driven by random forest feature importances:
# the forest is refit on the remaining features each round, and the 2 least
# important ones are dropped (step=2) until only 8 are left.
sel_ = RFE(
    estimator=RandomForestClassifier(n_estimators=10, random_state=10),
    n_features_to_select=8,
    step=2,
)

# Make transform() return a DataFrame with the original column names.
sel_.set_output(transform="pandas")

sel_.fit(X_train, y_train)

In [5]:
# get_support() yields a boolean mask over the input columns;
# use it to recover the names of the features that were kept.
selected_feat = X_train.columns[sel_.get_support()]

# number of retained features
len(selected_feat)

8

In [6]:
# Display the names of the features retained by the selector.
selected_feat

Index(['mean area', 'mean concave points', 'area error', 'worst texture',
       'worst perimeter', 'worst area', 'worst compactness',
       'worst concave points'],
      dtype='object')

In [7]:
# Apply the fitted selector to both splits so that only the
# selected features remain in the training and test sets.
X_train_selected, X_test_selected = (
    sel_.transform(X_train),
    sel_.transform(X_test),
)

# Shapes of the reduced training and test sets.
X_train_selected.shape, X_test_selected.shape

((426, 8), (143, 8))

In [8]:
# Preview the first five rows of the reduced training set.
X_train_selected.head(n=5)

Unnamed: 0,mean area,mean concave points,area error,worst texture,worst perimeter,worst area,worst compactness,worst concave points
293,432.7,0.0228,13.88,25.75,84.35,517.8,0.1758,0.0914
332,387.3,0.007583,19.62,25.78,76.91,436.1,0.09669,0.02022
565,1261.0,0.09791,99.04,38.25,155.0,1731.0,0.1922,0.1628
278,572.3,0.01238,22.22,26.1,98.91,739.1,0.07622,0.05185
489,857.6,0.02307,22.95,26.56,127.3,1084.0,0.292,0.08737
