## Recursive Feature Elimination (RFE)

### RFE works by recursively removing attributes
### and building a model on the attributes that remain
### It uses model accuracy to identify which attributes (and combinations of attributes) contribute the most to predicting the target

In [1]:

# Feature selection with Recursive Feature Elimination (RFE) around a logistic regression model
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the Pima Indians diabetes data set from the UCI repository.
# Column labels are supplied here and passed to read_csv via `names`.
# NOTE(review): confirm this remote path is still being served before relying on it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',  # response variable; the other eight columns are predictors
]
dataframe = pd.read_csv(url, names=names)  # pandas DataFrame

In [3]:
# Preview the first six rows to confirm the columns were labelled as expected.
dataframe.head(6)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [4]:
# Convert the DataFrame to a plain NumPy array for scikit-learn.
# to_numpy() is the accessor pandas recommends in place of the older .values attribute.
array = dataframe.to_numpy()

In [5]:
# Sanity-check the conversion; NumPy abbreviates the printed array with "..." rather than dumping every row.
print(array)

[[   6.     148.      72.    ...,    0.627   50.       1.   ]
 [   1.      85.      66.    ...,    0.351   31.       0.   ]
 [   8.     183.      64.    ...,    0.672   32.       1.   ]
 ..., 
 [   5.     121.      72.    ...,    0.245   30.       0.   ]
 [   1.     126.      60.    ...,    0.349   47.       1.   ]
 [   1.      93.      70.    ...,    0.315   23.       0.   ]]


In [6]:
# Split column-wise: the first eight columns are the predictors,
# and the column at index 8 ('class') is the response.
X, Y = array[:, :8], array[:, 8]

### select the most important predictors to help classify the response variable

In [7]:
# Rank the predictors with Recursive Feature Elimination wrapped around a
# logistic-regression classifier, keeping the three strongest features.
#
# solver="liblinear" pins the solver that was the LogisticRegression default
# before scikit-learn 0.22, so results stay comparable with runs made on older
# versions instead of silently switching to the newer default solver.
model = LogisticRegression(solver="liblinear")
# n_features_to_select is passed by keyword: newer scikit-learn releases no
# longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)  # fit() returns the fitted selector, so `fit` and `rfe` are the same object

In [10]:
# Number of features the fitted selector kept; expected to equal the 3 requested when RFE was built.
print(fit.n_features_)


3


In [11]:
# Rank of each predictor, in the same column order as X; rank 1 marks a selected feature,
# higher numbers belong to the features that were eliminated.
print(fit.ranking_)

[1 2 3 5 6 1 1 4]


In [12]:
# Boolean mask in the same column order as X: True marks a feature RFE selected.
print(fit.support_) # keep the features marked True

[ True False False False False  True  True False]
