# Feature Selection - Wrapper methods

In [4]:
import pandas as pd

# Read the filtered Titanic train/test CSVs (paths are relative to the
# notebook's working directory).
train_filtered = pd.read_csv("filtered_titanic_train.csv")
test_filtered = pd.read_csv("filtered_titanic_test.csv")

# Pull the label column out of each split first ...
y_train, y_test = train_filtered["target"], test_filtered["target"]

# ... then keep every remaining column as the feature matrix. `drop` returns
# a new frame, so the loaded frames themselves are left untouched.
X_train = train_filtered.drop(columns="target")
X_test = test_filtered.drop(columns="target")


In [5]:
# Wrapper methods
# Sequential forward floating selection: start from an empty feature set and
# add one feature at a time, keeping the addition that gives the best
# cross-validated score. With floating=True the selector may also drop a
# previously added feature after each addition if that improves the score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# random_state pins the forest's randomness so that re-running the cell
# selects the same subset and reports the same scores.
sfs1 = SFS(RandomForestClassifier(n_jobs=4, random_state=42),
           k_features=5,       # stop once 5 features are selected
           floating=True,
           forward=True,
           # A single metric only: the former `'roc_auc' or 'accuracy'` was a
           # Python boolean expression that always evaluated to 'roc_auc'.
           scoring='roc_auc',
           verbose=2,
           cv=3)               # 3-fold cross-validation per candidate subset
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   14.5s finished

[2025-08-20 07:54:38] Features: 1/5 -- score: 0.7837948312236286[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:   14.0s finished

[2025-08-20 07:54:52] Features: 2/5 -- score: 0.8385696861814346[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   13.4s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s finished

[2025-08-20 07:55:07] Features: 3/5 -- score: 0.8581001120780591[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:   13.2s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.4s finished

[2025-08-20 07:55:23] Features: 4/5 -- score: 0.8654261850606542[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   16.0s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.9s finished

[2025-08-20 07:55:43] Features: 5/5 -- score: 0.8748121044303797

In [6]:
# Results of the forward selection run.
# sfs1 is scored with ROC AUC, so k_score_ is the cross-validated ROC AUC of
# the selected subset -- it is not an accuracy value.
print("Best ROC AUC score:", sfs1.k_score_)
print('Best subset (indices):', sfs1.k_feature_idx_)
print('Best subset (corresponding names):', sfs1.k_feature_names_)

Best accuracy score: 0.8748121044303797
Best subset (indices): (3, 4, 9, 10, 14)
Best subset (corresponding names): ('3', '4', '9', '10', '14')


In [7]:
# Sequential backward floating selection: start from the full feature set and
# remove one feature at a time, keeping the removal that gives the best
# cross-validated score. With floating=True the selector may also re-add a
# previously removed feature after each removal if that improves the score.
# Reuses SFS and RandomForestClassifier imported in the forward-selection cell.

# random_state pins the forest's randomness so that re-running the cell
# selects the same subset and reports the same scores.
sfs2 = SFS(RandomForestClassifier(n_jobs=4, random_state=42),
           k_features=5,       # stop once only 5 features remain
           floating=True,
           forward=False,
           # A single metric only: the former `'roc_auc' or 'accuracy'` was a
           # Python boolean expression that always evaluated to 'roc_auc'.
           scoring='roc_auc',
           verbose=2,
           cv=3)               # 3-fold cross-validation per candidate subset
sfs2 = sfs2.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   16.1s finished

[2025-08-20 07:56:40] Features: 19/5 -- score: 0.8513016712816457[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:   15.2s finished

[2025-08-20 07:56:55] Features: 18/5 -- score: 0.85616985594673[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   14.4s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s finished

[2025-08-20 07:57:12] Features: 17/5 -- score: 0.8569191719409283[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:   13.6s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.5s finished

[2025-08-20 07:57:28] Features: 16/5 -- score: 0.8561465750263713[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   15.1s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.1s finished

[2025-08-20 07:57:46] Features: 15/5 -- score: 0.8567683610232067[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   16.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 

In [8]:
# Results of the backward elimination run.
# sfs2 is scored with ROC AUC, so k_score_ is the cross-validated ROC AUC of
# the selected subset -- it is not an accuracy value.
print("Best ROC AUC score:", sfs2.k_score_)
print('Best subset (indices):', sfs2.k_feature_idx_)
print('Best subset (corresponding names):', sfs2.k_feature_names_)

Best accuracy score: 0.870111830828059
Best subset (indices): (4, 9, 10, 13, 14)
Best subset (corresponding names): ('4', '9', '10', '13', '14')


In [9]:
# Exhaustive feature selection: score every feature combination whose size is
# between min_features and max_features and keep the best-scoring one.
# This cell only configures the selector; fitting happens in the next cell.
from mlxtend.feature_selection import ExhaustiveFeatureSelector

# random_state pins the forest's randomness so that all candidate subsets are
# compared under the same conditions and re-runs give the same winner.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=4, random_state=42),
                                min_features=1,
                                max_features=5,  # limit max features for speed
                                scoring='accuracy',
                                print_progress=True,
                                cv=5)

In [None]:
# Run the exhaustive search (potentially slow: every subset of 1..max_features
# columns is cross-validated), then report the winning subset.
efs = efs.fit(X_train, y_train)
print('Best accuracy score:', efs.best_score_)  # best cross-validated score
print('Best subset (indices):', efs.best_idx_)  # column indices of the best subset
# Fitted attributes carry a trailing underscore; `best_feature_names` (without
# it) does not exist and raises AttributeError.
print('Best subset (corresponding names):', efs.best_feature_names_)

In [None]:
# Recursive Feature Elimination (RFE)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Seeded random forest that RFE uses as its underlying estimator.
rf_model = RandomForestClassifier(random_state=42, n_jobs=4)

# Configure RFE to keep 5 features.
rfe = RFE(rf_model, n_features_to_select=5)

# Fit on the training split. As the last expression of the cell, the fitted
# selector is also what the notebook displays.
rfe.fit(X_train, y_train)


In [None]:
# Report the RFE outcome: the boolean mask of kept features, followed by the
# per-feature ranking (rank 1 marks the most important features).
for label, value in (
    ("RFE Selected feature mask:", rfe.support_),
    ("RFE Feature ranking (1 = most important):", rfe.ranking_),
):
    print(label, value)
