# Exhaustive Search

- [Feature Selection in Machine Learning Book](https://www.trainindata.com/p/feature-selection-in-machine-learning-book)

In [1]:
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [2]:
# Load the breast cancer dataset and wrap the feature matrix in a DataFrame
# so that the columns carry the feature names.
breast_cancer = load_breast_cancer()
X = pd.DataFrame(
    data=breast_cancer.data,
    columns=breast_cancer.feature_names,
)
y = breast_cancer.target

# Hold out a test set. No test_size is given, so the split proportion is
# train_test_split's default; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Preview the training features
X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
293,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
332,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
278,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
489,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,0.1846,0.05325,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623


In [3]:
# Exhaustive search: every feature combination of size min_features..max_features
# is scored with cross-validated ROC-AUC using a small random forest.
efs = EFS(
    estimator=RandomForestClassifier(
        n_estimators=3,
        random_state=0,
    ),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    cv=2,
)

# Run the search on the training data. The result is assigned back so the
# cell does not echo the fitted object.
efs = efs.fit(X_train, y_train)

Features: 31537/31930

In [5]:
# the selected features: names of the columns in the best subset found by
# the search (a tuple of column names from X_train)

efs.best_feature_names_

('compactness error', 'worst area', 'worst concavity', 'worst symmetry')

In [19]:
# the subsets evaluated and their score

# NOTE(review): presumably left commented out because the dictionary holds one
# entry per evaluated subset (31,930 in this run) and would flood the output.
# Inspect individual entries by key instead.
# efs.subsets_

In [9]:
# First evaluated subset: each record holds the feature indices, the per-fold
# CV scores, their average, and the feature names.
efs.subsets_[0]

{'feature_idx': (0,),
 'cv_scores': array([0.83105044, 0.86616541]),
 'avg_score': 0.8486079287582265,
 'feature_names': ('mean radius',)}

In [13]:
# Record stored under key 60: in this run it is a two-feature combination.
efs.subsets_[60]

{'feature_idx': (1, 3),
 'cv_scores': array([0.86057056, 0.90953947]),
 'avg_score': 0.8850550192906221,
 'feature_names': ('mean texture', 'mean area')}

In [16]:
# How many feature subsets the search evaluated (one dictionary entry each)

len(efs.subsets_)

31930

In [18]:
# Reduce both splits to the selected features; the fitted selector is applied
# to the training set first, then to the test set.
X_train_t, X_test_t = efs.transform(X_train), efs.transform(X_test)

# Show the reduced test set
X_test_t

array([[2.265e-02, 8.444e+02, 5.106e-01, 3.585e-01],
       [8.082e-03, 6.329e+02, 1.390e-01, 2.444e-01],
       [9.238e-03, 6.889e+02, 6.260e-02, 2.136e-01],
       [1.377e-02, 8.197e+02, 1.565e-01, 2.636e-01],
       [4.899e-03, 5.459e+02, 4.833e-02, 1.987e-01],
       [1.641e-02, 4.786e+02, 1.624e-01, 3.060e-01],
       [4.560e-02, 4.884e+02, 2.912e-01, 2.191e-01],
       [9.110e-03, 4.831e+02, 7.915e-02, 3.487e-01],
       [9.692e-03, 2.480e+02, 0.000e+00, 3.058e-01],
       [4.671e-02, 3.571e+02, 7.162e-02, 2.434e-01],
       [5.244e-02, 8.092e+02, 3.219e-01, 2.827e-01],
       [1.818e-02, 8.037e+02, 9.189e-02, 2.522e-01],
       [2.674e-02, 3.010e+02, 1.868e-01, 2.376e-01],
       [2.661e-02, 1.032e+03, 1.882e-01, 2.527e-01],
       [4.412e-02, 8.036e+02, 2.604e-01, 3.151e-01],
       [2.321e-02, 1.102e+03, 5.830e-01, 3.216e-01],
       [6.590e-02, 2.971e+02, 4.609e-01, 3.135e-01],
       [2.062e-02, 1.872e+03, 4.146e-01, 2.437e-01],
       [4.097e-02, 1.313e+03, 3.829e-01, 2.576