In [6]:
# Statistical tests can be used to select those features that have the strongest relationship with the output variable.

## Univariate Selection

In [4]:
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Read the Pima Indians diabetes CSV, supplying the column names explicitly.
url = "data/pima-indians-diabetes1.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values

# The first eight columns are the input attributes; the ninth is the class label.
X = array[:, :8]
Y = array[:, 8]

# Score every attribute with the chi-squared test and keep the best four.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)

# Show the per-attribute scores, printed with three digits of precision.
numpy.set_printoptions(precision=3)
print(fit.scores_)

# Reduce X to the selected attributes and show the first five rows.
features = fit.transform(X)
print(features[:5])

[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]


In [5]:
# You can see the scores for each attribute and the 4 attributes chosen (those with the highest scores):
# plas, test, mass and age.

## Recursive Feature Elimination

### The Recursive Feature Elimination (or RFE) works by recursively removing attributes and building a model on those attributes that remain.

### It uses the fitted model's feature weights (coefficients or feature importances) to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

In [7]:
## The example below uses RFE with the logistic regression algorithm to select the top 3 features. 
## The choice of algorithm does not matter too much as long as it is skillful and consistent.

In [12]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "data/pima-indians-diabetes1.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# the first eight columns are the input attributes; the ninth is the class label
X = array[:,0:8]
Y = array[:,8]
# feature extraction
# Pin the solver explicitly: scikit-learn's default changed from liblinear to
# lbfgs in 0.22, and lbfgs may fail to converge on these unscaled features
# within the default iteration limit.
model = LogisticRegression(solver="liblinear")
# n_features_to_select must be passed by keyword; recent scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# boolean mask of the selected attributes, their ranking (1 = selected),
# and the number of attributes kept
print(fit.support_)
print(fit.ranking_)
print(fit.n_features_)

[ True False False False False  True  True False]
[1 2 3 5 6 1 1 4]
3


In [13]:
#  RFE chose the top 3 features as preg, mass and pedi.

# These are marked True in the support_ array and marked with a choice “1” in the ranking_ array.

#### The Recursive Feature Elimination (RFE) method is a feature selection approach. It works by recursively removing attributes and building a model on those attributes that remain. It uses the fitted model's feature weights (coefficients or feature importances) to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

#### This recipe shows the use of RFE on the Iris flowers dataset to select 3 attributes.

In [22]:


# Recursive Feature Elimination
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load the iris dataset
dataset = datasets.load_iris()
# create the base classifier that RFE refits on each candidate subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
# (n_features_to_select must be passed by keyword; recent scikit-learn
# releases reject it as a positional argument)
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(dataset.data, dataset.target)
# summarize the selection of the attributes:
# support_ is the boolean mask of kept attributes, ranking_ is 1 for kept ones
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]


### Feature Importance
#### Methods that use ensembles of decision trees (like Random Forest or Extra Trees) can also compute the relative importance of each attribute. These importance values can be used to inform a feature selection process.

#### This recipe shows the construction of an Extra Trees ensemble of the iris flowers dataset and the display of the relative feature importance.

In [23]:
# Feature Importance
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# load the iris dataset
dataset = datasets.load_iris()
# fit an Extra Trees model to the data
# Extra Trees is a randomized ensemble; fixing random_state makes the reported
# importances identical on every Restart & Run All.
model = ExtraTreesClassifier(random_state=42)
model.fit(dataset.data, dataset.target)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.109  0.023  0.526  0.341]


### Summary
#### Feature selection methods can give you useful information on the relative importance or relevance of features for a given problem. You can use this information to create filtered versions of your dataset and increase the accuracy of your models.



## Principal Component Analysis

In [14]:
## Principal Component Analysis (or PCA) uses linear algebra to transform the dataset into a compressed form.
## Generally this is called a data reduction technique. A property of PCA is that you can choose the number of dimensions or principal components in the transformed result.
## In the example below, we use PCA and select 3 principal components.

In [18]:
# Feature extraction with PCA

from pandas import read_csv
from sklearn.decomposition import PCA

# Read the Pima Indians diabetes CSV with explicit column names.
url = "data/pima-indians-diabetes1.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values

# The first eight columns are the inputs; the ninth is the class label.
X = array[:, :8]
Y = array[:, 8]

# Fit a PCA model that keeps three principal components.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Report the share of variance explained by each component,
# followed by the fitted component vectors.
print(fit.explained_variance_ratio_)
print(fit.components_)

[ 0.889  0.062  0.026]
[[ -2.022e-03   9.781e-02   1.609e-02   6.076e-02   9.931e-01   1.401e-02
    5.372e-04  -3.565e-03]
 [ -2.265e-02  -9.722e-01  -1.419e-01   5.786e-02   9.463e-02  -4.697e-02
   -8.168e-04  -1.402e-01]
 [ -2.246e-02   1.434e-01  -9.225e-01  -3.070e-01   2.098e-02  -1.324e-01
   -6.400e-04  -1.255e-01]]


In [19]:
# The transformed dataset (3 principal components) bears little resemblance to the source data.

In [20]:
### Feature importance

In [21]:
##Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance of features.
##In the example below we construct an ExtraTreesClassifier for the Pima Indians onset of diabetes dataset.
## You can learn more about the ExtraTreesClassifier class in the scikit-learn API.

# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
url = "data/pima-indians-diabetes1.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# the first eight columns are the input attributes; the ninth is the class label
X = array[:,0:8]
Y = array[:,8]
# feature extraction
# Extra Trees is a randomized ensemble; fixing random_state keeps the
# importance scores (and any conclusion drawn from them) stable across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
# one importance score per input attribute, in the same order as `names`
print(model.feature_importances_)


[ 0.107  0.234  0.094  0.074  0.079  0.132  0.128  0.151]


In [None]:
## You can see that we are given an importance score for each attribute, where the larger the score, the more important the attribute.
## The scores suggest the importance of plas, age and mass.