# Feature Selection
Discard features that have little to no impact on the model's predictions.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE
import pandas as pd

def feature_importances_dataframe(
    importances: pd.Series, names: 'list[str] | None' = None
) -> pd.DataFrame:
    """Tabulate per-feature importance scores, highest score first.

    Args:
        importances: One score per feature (a ``pd.Series``, NumPy array or
            list), given in the same order as ``names``.
        names: Feature labels, one per score. When omitted, the notebook-level
            ``feature_names`` list is used, so existing one-argument calls
            keep working unchanged.

    Returns:
        A DataFrame with ``feature`` and ``importance`` columns, sorted by
        ``importance`` in descending order. The index is not reset, so each
        row keeps its pre-sort label.

    Raises:
        ValueError: If the number of names differs from the number of scores.
    """
    if names is None:
        # Looked up at call time, so the global may be defined after this function.
        names = feature_names
    names = list(names)

    if len(names) != len(importances):
        raise ValueError(
            f"Got {len(names)} feature names but {len(importances)} importance scores"
        )

    return pd.DataFrame(
        {'feature': names, 'importance': importances}
    ).sort_values(by='importance', ascending=False)

# Synthetic classification dataset: 1000 rows and 10 columns, of which
# 5 are informative and 2 are redundant; seeded so every run is identical.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    random_state=42,
)

# Wrap the raw arrays in labelled pandas objects so columns can be referred to by name
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
X = pd.DataFrame(data=X, columns=feature_names)
y = pd.Series(data=y, name='target')

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

1. **Tree-based Feature Importance**: Tree-based models such as Random Forests and Gradient Boosted Trees provide feature importance scores that indicate the relative importance of each feature.

In [7]:
# Fit a random forest on the training split (fit() returns the fitted estimator)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# Importance scores exposed by the fitted forest, shown as a ranked table
importances = model.feature_importances_
feature_importances_dataframe(importances)

Accuracy: 0.94


Unnamed: 0,feature,importance
4,feature_4,0.215216
9,feature_9,0.167064
0,feature_0,0.148667
5,feature_5,0.136675
3,feature_3,0.113382
6,feature_6,0.071625
1,feature_1,0.069917
2,feature_2,0.026659
8,feature_8,0.026264
7,feature_7,0.02453


2. **Permutation Feature Importance**: Evaluates the importance of a feature by measuring the decrease in model performance when the values of that feature are randomly shuffled.

In [10]:
# Shuffle each feature 10 times on the held-out split and record the score drop
result = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)

# Average the drop over the repeats and show it as a ranked table
importances = result.importances_mean
feature_importances_dataframe(importances)

Unnamed: 0,feature,importance
4,feature_4,0.200333
5,feature_5,0.109333
0,feature_0,0.105333
9,feature_9,0.043
3,feature_3,0.026667
1,feature_1,0.007
6,feature_6,-0.000667
2,feature_2,-0.001667
8,feature_8,-0.002333
7,feature_7,-0.002333


3. **Recursive Feature Elimination (RFE)**: Repeatedly fits a model and removes the weakest feature(s) until the specified number of features is reached.

In [13]:
# Give RFE its own estimator instead of rebinding `model`: RFE fits internal
# clones, so the object passed in stays unfitted. Reusing the name would
# replace the forest fitted earlier and break a re-run of the
# permutation-importance cell.
rfe_estimator = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=5)
rfe.fit(X_train, y_train)

# support_ is a boolean mask over the columns of the frame RFE was fitted on
selected_features = X_train.columns[rfe.support_]
selected_features

Index(['feature_0', 'feature_3', 'feature_4', 'feature_5', 'feature_9'], dtype='object')