In [2]:
!pip install -U mlxtend

Collecting mlxtend
  Downloading mlxtend-0.17.2-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 1.1 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.2
You should consider upgrading via the '/Users/jananiravi/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
heart_data = pd.read_csv("datasets/heart.csv")

heart_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
x = heart_data.drop("target", axis = 1)

y = heart_data["target"]

### Recursive Feature Elimination

Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. 

Then, the least important features are pruned from current set of features.That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

In [9]:
model = RandomForestClassifier(n_estimators = 10)

rfe = RFE(model, n_features_to_select = 5)

In [10]:
fit = rfe.fit(x, y)

In [11]:
feature_rank = pd.DataFrame({"Features" : x.columns,
                             "Selected" : fit.support_,
                             "Feature Rank" : fit.ranking_})

feature_rank = feature_rank.sort_values(by="Feature Rank")

feature_rank

Unnamed: 0,Features,Selected,Feature Rank
2,cp,True,1
7,thalach,True,1
9,oldpeak,True,1
11,ca,True,1
12,thal,True,1
0,age,False,2
4,chol,False,3
3,trestbps,False,4
10,slope,False,5
8,exang,False,6


In [12]:
selected_feature_names = feature_rank.loc[feature_rank["Selected"] == True]

selected_feature_names

Unnamed: 0,Features,Selected,Feature Rank
2,cp,True,1
7,thalach,True,1
9,oldpeak,True,1
11,ca,True,1
12,thal,True,1


In [16]:
selected_features = x[selected_feature_names["Features"].values]

selected_features.head()

Unnamed: 0,cp,thalach,oldpeak,ca,thal
0,3,150,2.3,0,1
1,2,187,3.5,0,2
2,1,172,1.4,0,2
3,1,178,0.8,0,2
4,0,163,0.6,0,2


### Sequential Feature Selection

Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d. 

The motivation behind feature selection algorithms is to automatically select a subset of features that is most relevant to the problem. The goal of feature selection is two-fold: We want to improve the computational efficiency and reduce the generalization error of the model by removing irrelevant features or noise.

http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#:~:text=Overview,feature%20subspace%20where%20k%20%3C%20d

In a nutshell, SFAs remove or add one feature at the time based on the classifier performance until a feature subset of the desired size k is reached. 

In [17]:
feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_estimators = 10),
                                             k_features = 5,
                                             forward = True,
                                             scoring = "accuracy",
                                             cv = 4)

In [18]:
features = feature_selector.fit(np.array(x), y)

In [19]:
forward_elimination_feature_names = list(x.columns[list(features.k_feature_idx_)])

forward_elimination_feature_names

['sex', 'restecg', 'exang', 'ca', 'thal']

In [20]:
forward_elimination_features = x[forward_elimination_feature_names]

forward_elimination_features.head()

Unnamed: 0,sex,restecg,exang,ca,thal
0,1,0,0,0,1
1,1,1,0,0,2
2,0,0,0,0,2
3,1,1,0,0,2
4,0,1,1,0,2


In [21]:
feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_estimators = 10),
                                             k_features = 5,
                                             forward = False,
                                             scoring = "accuracy",
                                             cv = 4)

In [22]:
features = feature_selector.fit(np.array(x), y)

In [23]:
back_elimination_feature_names = list(x.columns[list(features.k_feature_idx_)])

back_elimination_feature_names

['cp', 'fbs', 'exang', 'slope', 'ca']

In [24]:
back_elimination_features = x[back_elimination_feature_names]

back_elimination_features.head()

Unnamed: 0,cp,fbs,exang,slope,ca
0,3,1,0,0,0
1,2,0,0,0,0
2,1,0,0,2,0
3,1,0,0,2,0
4,0,0,1,2,0


In [25]:
def build_model(x, y, test_frac):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_frac)
    
    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    print("Test_score : ", accuracy_score(y_test, y_pred))

In [26]:
build_model(x, y, 0.2)

Test_score :  0.7377049180327869


In [27]:
build_model(selected_features, y, 0.2)

Test_score :  0.8852459016393442


In [28]:
build_model(forward_elimination_features, y, 0.2)

Test_score :  0.7704918032786885


In [29]:
build_model(back_elimination_features, y, 0.2)

Test_score :  0.7868852459016393
