# Feature Selection

## 作業程式碼

本作業將請學員完成以下要求：
1. 請至 Kaggle 平台找尋欲探索的資料集，進行本次作業。
2. 實作 Exhaustive Search
3. 實作 Sequential Forward/Backward Feature Selection
4. 實作 Sequential Floating Forward/Backward Feature Selection
5. 實作 Recursive Feature Elimination
6. 實作 Recursive Feature Elimination with Cross-Validation

> 注意：由於目前尚未教學建立機器學習模型，資料集請以「預測類別特徵」為主，以利參考範例程式碼進行實作

# Import packages

In [24]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv("disaster_train.csv") # 此行要填入資料路徑
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [7]:
# 把遺失值過多的欄位排除掉
df = df.dropna(axis = 1, thresh = int(round(df.shape[0]*0.5)) )
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [10]:
selected_columns = [
    "Pclass",   
    "Sex",     
    "Age",     
    "Fare",    
    "SibSp",    
    "Embarked" 
]
x_small = df[selected_columns]
y = df["Survived"]

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [26]:
numeric_features = ["Age", "Fare", "SibSp", "Pclass"]
categorical_features = ["Sex", "Embarked"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Exhaustive Feature Selection
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [49]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.neighbors import KNeighborsClassifier


In [48]:
knn = KNeighborsClassifier(n_neighbors=3)

In [50]:
efs = ExhaustiveFeatureSelector(
    estimator=knn,
    min_features=1,
    max_features=6,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

In [51]:
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("efs", efs)
    ]
)

In [52]:
pipeline.fit(x_small, y)

Features: 126/126

In [53]:
print("Best feature subset:", pipeline.named_steps["efs"].best_feature_names_)
print("Best CV accuracy:", pipeline.named_steps["efs"].best_score_)

Best feature subset: ('0', '1', '3', '4', '6')
Best CV accuracy: 0.810344611135522


# Sequential Forward Selection

程式碼參考連結：http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#overview    
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [54]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [59]:
sfs = SequentialFeatureSelector(
    estimator=knn,
    k_features=4,
    forward=True,
    floating=False,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

pipeline_sfs = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("sfs", sfs),
    ("model", knn)
])

pipeline_sfs.fit(x_small, y)

print("Selected feature indices:", pipeline_sfs.named_steps["sfs"].k_feature_idx_)
print("Best CV accuracy:", pipeline_sfs.named_steps["sfs"].k_score_)

Selected feature indices: (2, 4, 5, 6)
Best CV accuracy: 0.7688594564057498


# Sequential Backward Selection

In [None]:
sbs = SequentialFeatureSelector(
    estimator=knn,
    k_features=4,          
    forward=False,         
    floating=False,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

pipeline_sbs = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("sbs", sbs)
])

pipeline_sbs.fit(x_small, y)

print("Selected feature indices:", pipeline_sbs.named_steps["sbs"].k_feature_idx_)
print("Best CV accuracy:", pipeline_sbs.named_steps["sbs"].k_score_)

Selected feature indices: (0, 2, 3, 4)
Best CV accuracy: 0.7991337643587972


# Sequential Floating Forward Selection

In [61]:
# Sequential Floating Forward Selection
sffs = SequentialFeatureSelector(
    estimator=knn,
    k_features=4,          
    forward=True,          
    floating=True,         
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

pipeline_sffs = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("sffs", sffs)
])

pipeline_sffs.fit(x_small, y)

print("Selected feature indices:", pipeline_sffs.named_steps["sffs"].k_feature_idx_)
print("Best CV accuracy:", pipeline_sffs.named_steps["sffs"].k_score_)

Selected feature indices: (2, 4, 5, 6)
Best CV accuracy: 0.7688594564057498


# Sequential Floating Backward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [62]:
# Sequential Floating Forward Selection
sfbs = SequentialFeatureSelector(
    estimator=knn,
    k_features=4,          
    forward=False,          
    floating=True,         
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

pipeline_sffs = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("sffs", sffs)
])

pipeline_sffs.fit(x_small, y)

print("Selected feature indices:", pipeline_sffs.named_steps["sffs"].k_feature_idx_)
print("Best CV accuracy:", pipeline_sffs.named_steps["sffs"].k_score_)

Selected feature indices: (2, 4, 5, 6)
Best CV accuracy: 0.7688594564057498


# Recursive Feature Elimination

In [64]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [65]:
log_reg = LogisticRegression(
    max_iter=1000,
    solver="liblinear"
)

rfe = RFE(
    estimator=log_reg,
    n_features_to_select=4,   
    step=1                  
)

pipeline_rfe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("rfe", rfe),
    ("model", log_reg)
])

pipeline_rfe.fit(x_small, y)

print("Selected features mask:", pipeline_rfe.named_steps["rfe"].support_)
print("Feature ranking:", pipeline_rfe.named_steps["rfe"].ranking_)

Selected features mask: [ True False False  True  True False  True]
Feature ranking: [1 3 2 1 1 4 1]


# Recursive Feature Elimination with Cross-Validation

In [67]:
from sklearn.feature_selection import RFECV

In [68]:
rfecv = RFECV(
    estimator=log_reg,
    step=1,                
    cv=5,                  
    scoring="accuracy",
    min_features_to_select=1
)

pipeline_rfecv = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("rfecv", rfecv),
    ("model", log_reg)
])

pipeline_rfecv.fit(x_small, y)

print("Optimal number of features:", pipeline_rfecv.named_steps["rfecv"].n_features_)
print("Feature ranking:", pipeline_rfecv.named_steps["rfecv"].ranking_)

Optimal number of features: 5
Feature ranking: [1 2 1 1 1 3 1]
