In [1]:
!pip install mlxtend



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the dataset. The first 34 columns are used as features and the last
# column as the target.
data = pd.read_csv('dermatology_csv.csv')
y = data.iloc[:, -1]

# Replace missing feature values (NaN) with the mean of their column.
# fit_transform learns the column means and applies them in a single call.
# The result is a NumPy array, so the DataFrame's column labels are not
# carried into X.
# NOTE(review): the imputer is fitted on every row before any cross-validation
# split, so fold-level scores computed on X have seen the held-out rows' means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(data.iloc[:, 0:34])

# Preview the raw frame. This must be the last expression of the cell to be
# rendered; placed mid-cell its result is silently discarded.
data.head()

In [6]:
# Sequential forward selection: grow the feature subset one feature at a time
# until exactly 9 features are selected, scoring each candidate subset by
# cross-validated accuracy.
sfs = SFS(
    # The forest is kept single-threaded because SFS already distributes the
    # candidate subsets over every core (n_jobs=-1 below); requesting all cores
    # at both levels oversubscribes the CPU. random_state is fixed, so the
    # scores should not depend on the number of workers.
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1),
    k_features=9,
    forward=True,
    verbose=2,
    scoring='accuracy',
    cv=4,  # number of cross-validation folds
    n_jobs=-1,
).fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   26.5s finished

[2020-05-25 03:06:26] Features: 1/9 -- score: 0.5041208791208791[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:   23.1s finished

[2020-05-25 03:06:50] Features: 2/9 -- score: 0.6768096034400382[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   21.9s finished

[2020-05-25 03:07:12] Features: 3/9 -- score: 0.7946428571428571[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:   21.5s finished

[2020-05-25 03:07:33] Features: 4/9 -- score: 0.8712075967510751[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   21.8s finished

[202

In [6]:
# Names of the features in the subset chosen by the selector bound to `sfs`.
# NOTE(review): X is the imputer's output rather than the original DataFrame,
# so these appear to be stringified column positions, not column labels — confirm.
sfs.k_feature_names_

('4', '7', '13', '14', '20', '23', '27', '29', '32')

In [7]:
# Cross-validation score of the selected feature subset.
sfs.k_score_

0.9644648829431439

In [8]:
# Tabulate the search history. get_metric_dict() is keyed by subset size, so
# after the transpose each row describes one subset size: its feature indices,
# per-fold CV scores, their average, and the spread statistics.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(20,)","[0.5, 0.5054945054945055, 0.5054945054945055, ...",0.504121,"(20,)",0.0038138,0.00237919,0.00137363
2,"(13, 20)","[0.6413043478260869, 0.7472527472527473, 0.659...",0.67681,"(13, 20)",0.0662537,0.0413315,0.0238628
3,"(13, 20, 32)","[0.75, 0.8791208791208791, 0.7802197802197802,...",0.794643,"(13, 20, 32)",0.0800822,0.0499583,0.0288434
4,"(13, 14, 20, 32)","[0.8804347826086957, 0.8901098901098901, 0.846...",0.871208,"(13, 14, 20, 32)",0.0263346,0.0164285,0.00948502
5,"(13, 14, 20, 29, 32)","[0.8913043478260869, 0.945054945054945, 0.9010...",0.915134,"(13, 14, 20, 29, 32)",0.0332709,0.0207557,0.0119833
6,"(13, 14, 20, 27, 29, 32)","[0.9239130434782609, 0.978021978021978, 0.9230...",0.937022,"(13, 14, 20, 27, 29, 32)",0.0379485,0.0236737,0.013668
7,"(4, 13, 14, 20, 27, 29, 32)","[0.9347826086956522, 0.989010989010989, 0.9560...",0.964465,"(4, 13, 14, 20, 27, 29, 32)",0.0334161,0.0208462,0.0120356
8,"(4, 13, 14, 20, 23, 27, 29, 32)","[0.9347826086956522, 0.989010989010989, 0.9560...",0.964465,"(4, 13, 14, 20, 23, 27, 29, 32)",0.0334161,0.0208462,0.0120356
9,"(4, 7, 13, 14, 20, 23, 27, 29, 32)","[0.9347826086956522, 0.989010989010989, 0.9560...",0.964465,"(4, 7, 13, 14, 20, 23, 27, 29, 32)",0.0334161,0.0208462,0.0120356


In [9]:
# Sequential forward selection with a size range: k_features=(1, 9) runs the
# search up to 9 features and keeps the best-scoring subset of any size from
# 1 to 9, instead of forcing exactly 9.
sfs = SFS(
    # The forest is kept single-threaded because SFS already distributes the
    # candidate subsets over every core (n_jobs=-1 below); requesting all cores
    # at both levels oversubscribes the CPU. random_state is fixed, so the
    # scores should not depend on the number of workers.
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1),
    k_features=(1, 9),
    forward=True,
    verbose=2,
    scoring='accuracy',
    cv=4,  # number of cross-validation folds
    n_jobs=-1,
).fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   22.6s finished

[2020-05-19 17:42:09] Features: 1/9 -- score: 0.5041208791208791[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:   22.5s finished

[2020-05-19 17:42:31] Features: 2/9 -- score: 0.6768096034400382[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   20.5s finished

[2020-05-19 17:42:52] Features: 3/9 -- score: 0.7946428571428571[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:   20.3s finished

[2020-05-19 17:43:13] Features: 4/9 -- score: 0.8712075967510751[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   20.2s finished

[202

In [10]:
# Cross-validation score of the subset chosen by the selector currently bound
# to `sfs`.
sfs.k_score_

0.9644648829431439

In [11]:
# Names of the features in the subset chosen by the selector currently bound
# to `sfs`.
sfs.k_feature_names_

('4', '13', '14', '20', '27', '29', '32')

In [12]:
### Sequential backward selection

In [13]:
# Sequential backward selection: forward=False starts from the full feature
# set and removes one feature per step until 9 remain, scoring each candidate
# subset by cross-validated accuracy.
sfs = SFS(
    # The forest is kept single-threaded because SFS already distributes the
    # candidate subsets over every core (n_jobs=-1 below); requesting all cores
    # at both levels oversubscribes the CPU. random_state is fixed, so the
    # scores should not depend on the number of workers.
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1),
    k_features=9,
    forward=False,
    verbose=2,
    scoring='accuracy',
    cv=4,  # number of cross-validation folds
    n_jobs=-1,
).fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   24.3s finished

[2020-05-19 17:45:15] Features: 33/9 -- score: 0.9780817009077879[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:   22.8s finished

[2020-05-19 17:45:38] Features: 32/9 -- score: 0.9780518394648829[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   20.8s finished

[2020-05-19 17:45:59] Features: 31/9 -- score: 0.9780817009077879[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:   21.3s finished

[2020-05-19 17:46:21] Features: 30/9 -- score: 0.9808289536550406[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.9s finished



In [14]:
# Give the most recently fitted selector a second name. `sbs` and `sfs` refer
# to the same object, so `sbs` holds the backward-selection result only if the
# backward-selection cell was the last one to assign `sfs`.
sbs = sfs

# Cross-validation score of the subset selected by that selector.
sbs.k_score_

0.9617176301958911

In [15]:
# Names of the features retained by the selector bound to `sbs`.
sbs.k_feature_names_


('0', '4', '11', '14', '15', '19', '21', '25', '29')