# Wrapper Methods

In [1]:
!pip install mlxtend



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [4]:
from sklearn.datasets import load_wine

In [5]:
data = load_wine() ## Multiclass Classification data

In [6]:
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [7]:
X = pd.DataFrame(data.data, columns=data.feature_names)

In [8]:
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [9]:
y = data.target

In [10]:
print(y[:5])

[0 0 0 0 0]


In [11]:
print(X.shape)
print(y.shape)

(178, 13)
(178,)


In [12]:
X.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [13]:
X.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [14]:
X.dtypes

alcohol                         float64
malic_acid                      float64
ash                             float64
alcalinity_of_ash               float64
magnesium                       float64
total_phenols                   float64
flavanoids                      float64
nonflavanoid_phenols            float64
proanthocyanins                 float64
color_intensity                 float64
hue                             float64
od280/od315_of_diluted_wines    float64
proline                         float64
dtype: object

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Step Forward Selection

In [16]:
### Selecting from all features manually

In [17]:
sfs = SFS(estimator=RandomForestClassifier(n_estimators=100, max_depth=4), k_features=13, forward=True, scoring='accuracy')
sfs.fit(X_train, y_train)
val = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
val

Unnamed: 0,avg_score,ci_bound,cv_scores,feature_idx,feature_names,std_dev,std_err
1,0.743508,0.113079,"[0.8571428571428571, 0.6296296296296297, 0.653...","(6,)","(flavanoids,)",0.0879792,0.0439896
2,0.932743,0.0757985,"[0.9642857142857143, 0.8148148148148148, 0.961...","(6, 9)","(flavanoids, color_intensity)",0.0589738,0.0294869
3,0.962658,0.0301363,"[0.9642857142857143, 0.9259259259259259, 0.961...","(6, 9, 12)","(flavanoids, color_intensity, proline)",0.0234471,0.0117235
4,0.970065,0.0192823,"[0.9642857142857143, 0.9629629629629629, 0.961...","(1, 6, 9, 12)","(malic_acid, flavanoids, color_intensity, prol...",0.0150023,0.00750116
5,0.977208,0.0239281,"[1.0, 0.9629629629629629, 0.9615384615384616, ...","(0, 1, 6, 9, 12)","(alcohol, malic_acid, flavanoids, color_intens...",0.0186169,0.00930844
6,0.969801,0.0358295,"[1.0, 0.9259259259259259, 0.9615384615384616, ...","(0, 1, 4, 6, 9, 12)","(alcohol, malic_acid, magnesium, flavanoids, c...",0.0278766,0.0139383
7,0.977493,0.0382724,"[1.0, 0.9259259259259259, 0.9615384615384616, ...","(0, 1, 2, 4, 6, 9, 12)","(alcohol, malic_acid, ash, magnesium, flavanoi...",0.0297772,0.0148886
8,0.978042,0.0379184,"[0.9642857142857143, 0.9259259259259259, 1.0, ...","(0, 1, 2, 3, 4, 6, 9, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0295018,0.0147509
9,0.977493,0.0382724,"[1.0, 0.9259259259259259, 0.9615384615384616, ...","(0, 1, 2, 3, 4, 6, 8, 9, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0297772,0.0148886
10,0.977493,0.0382724,"[1.0, 0.9259259259259259, 0.9615384615384616, ...","(0, 1, 2, 3, 4, 6, 8, 9, 10, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0297772,0.0148886


In [18]:
### Selecting from all features not manually

In [19]:
sfs = SFS(estimator=RandomForestClassifier(n_estimators=100, max_depth=4), k_features=(1, 13), forward=True, scoring='accuracy')
sfs.fit(X_train, y_train)

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             floating=False, forward=True, k_features=(1, 13), n_jobs=1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=0)

In [20]:
sfs.k_score_

0.9851851851851852

In [21]:
sfs.k_feature_names_

('alcohol',
 'malic_acid',
 'alcalinity_of_ash',
 'magnesium',
 'flavanoids',
 'color_intensity',
 'hue',
 'proline')

In [22]:
selected_features = list(sfs.k_feature_names_)
selected_features

['alcohol',
 'malic_acid',
 'alcalinity_of_ash',
 'magnesium',
 'flavanoids',
 'color_intensity',
 'hue',
 'proline']

In [23]:
X_train_new = X_train[selected_features]
X_test_new = X_test[selected_features]

In [24]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train_new, y_train)
y_pred = model.predict(X_test_new)
print(accuracy_score(y_test, y_pred))

1.0
Wall time: 116 ms


In [25]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0
Wall time: 115 ms


### Step Backward Selection

In [26]:
### Selecting from all features manually

In [27]:
sbs = SFS(estimator=RandomForestClassifier(n_estimators=100, max_depth=4), k_features=1, forward=False, scoring='accuracy')
sbs.fit(X_train, y_train)
val = pd.DataFrame.from_dict(sbs.get_metric_dict()).T
val

Unnamed: 0,avg_score,ci_bound,cv_scores,feature_idx,feature_names,std_dev,std_err
13,0.977493,0.0382724,"[1.0, 0.9259259259259259, 0.9615384615384616, ...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0297772,0.0148886
12,0.985185,0.0380827,"[1.0, 0.9259259259259259, 1.0, 1.0, 1.0]","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0296296,0.0148148
11,0.985185,0.0380827,"[1.0, 0.9259259259259259, 1.0, 1.0, 1.0]","(0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0296296,0.0148148
10,0.985185,0.0380827,"[1.0, 0.9259259259259259, 1.0, 1.0, 1.0]","(0, 1, 2, 3, 4, 5, 6, 9, 10, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0296296,0.0148148
9,0.98545,0.0229107,"[0.9642857142857143, 0.9629629629629629, 1.0, ...","(0, 1, 2, 3, 4, 5, 6, 10, 12)","(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.0178253,0.00891264
8,0.9849,0.0237763,"[1.0, 0.9629629629629629, 1.0, 0.9615384615384...","(0, 2, 3, 4, 5, 6, 10, 12)","(alcohol, ash, alcalinity_of_ash, magnesium, t...",0.0184988,0.00924939
7,0.98545,0.0229107,"[0.9642857142857143, 0.9629629629629629, 1.0, ...","(0, 2, 3, 4, 5, 6, 10)","(alcohol, ash, alcalinity_of_ash, magnesium, t...",0.0178253,0.00891264
6,0.98545,0.0229107,"[0.9642857142857143, 0.9629629629629629, 1.0, ...","(0, 2, 4, 5, 6, 10)","(alcohol, ash, magnesium, total_phenols, flava...",0.0178253,0.00891264
5,0.977757,0.0233689,"[0.9642857142857143, 0.9629629629629629, 1.0, ...","(0, 4, 5, 6, 10)","(alcohol, magnesium, total_phenols, flavanoids...",0.0181818,0.00909088
4,0.962658,0.0434246,"[0.9642857142857143, 0.9259259259259259, 1.0, ...","(0, 4, 6, 10)","(alcohol, magnesium, flavanoids, hue)",0.0337858,0.0168929


In [28]:
###### Selecting from all features not manually

In [29]:
sbs = SFS(estimator=RandomForestClassifier(n_estimators=100, max_depth=4), k_features=(1, 13), forward=False, scoring='accuracy')
sbs.fit(X_train, y_train)

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             floating=False, forward=False, k_features=(1, 13), n_jobs=1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=0)

In [30]:
sbs.k_score_

0.9925925925925926

In [31]:
sbs.k_feature_names_

('alcohol',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'hue',
 'proline')

In [32]:
X_train_new = X_train[list(sbs.k_feature_names_)]
X_test_new = X_test[list(sbs.k_feature_names_)]

In [33]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train_new, y_train)
y_pred = model.predict(X_test_new)
print(accuracy_score(y_test, y_pred))

0.9777777777777777
Wall time: 160 ms


In [34]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0
Wall time: 112 ms


### Exhaustive Feature Selection

In [35]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [36]:
efs = EFS(RandomForestClassifier(n_estimators=100, max_depth=4),min_features=4, max_features=10, scoring='accuracy')
efs.fit(X_train, y_train)

Features: 7722/7722

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             max_features=10, min_features=4, n_jobs=1,
             pre_dispatch='2*n_jobs', print_progress=True,
             scoring='accuracy')

In [37]:
efs.best_score_

0.9925925925925926

In [38]:
efs.best_feature_names_

('ash',
 'alcalinity_of_ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'proline')

In [39]:
X_train_new = X_train[list(efs.best_feature_names_)]
X_test_new = X_test[list(efs.best_feature_names_)]

In [40]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train_new, y_train)
y_pred = model.predict(X_test_new)
print(accuracy_score(y_test, y_pred))

0.9777777777777777
Wall time: 127 ms


In [41]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0
Wall time: 130 ms
