In [2]:
# Univariate feature selection: keep the features that have the strongest statistical relationship with the output variable.


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [4]:
# Load the Pima Indians diabetes CSV. Column labels are supplied explicitly
# through `names`; the last one ('class') is the column later used as target.
filename = "../datasets/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(filename, names=names)


In [5]:
# Convert the frame to a plain NumPy array and split it column-wise:
# the first 8 columns are the input features, column 8 is the target.
array = dataframe.to_numpy()
X, y = array[:, :8], array[:, 8]

In [6]:
# Univariate feature selection: score every column of X against y with
# f_classif and configure the selector to keep the k=4 best-scoring ones.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, y)
fit.scores_  # one score per input column, in column order

array([ 39.67022739, 213.16175218,   3.2569504 ,   4.30438091,
        13.28110753,  71.7720721 ,  23.8713002 ,  46.14061124])

In [7]:
# Reduce X to the 4 columns with the highest scores computed by the
# selector fitted above. With the scores shown in the previous output these
# are preg, plas, mass and age (columns keep their original order).
features = fit.transform(X)
features

array([[  6. , 148. ,  33.6,  50. ],
       [  1. ,  85. ,  26.6,  31. ],
       [  8. , 183. ,  23.3,  32. ],
       ...,
       [  5. , 121. ,  26.2,  30. ],
       [  1. , 126. ,  30.1,  47. ],
       [  1. ,  93. ,  30.4,  23. ]])

### Recursive Feature Elimination

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [9]:
# Recursive Feature Elimination driven by a logistic-regression estimator,
# configured to end up with 3 selected features.
model = LogisticRegression(solver="liblinear")
rfe = RFE(estimator=model, n_features_to_select=3)
fit = rfe.fit(X, y)

# support_ is a boolean mask over the input columns; in ranking_ the
# selected columns carry rank 1.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


### Principal Component Analysis

In [10]:
# Principal Component Analysis (or PCA) uses linear algebra to transform 
# the dataset into a compressed form.

from sklearn.decomposition import PCA

In [11]:
# Fit a 3-component PCA on the input features. PCA is unsupervised, so the
# target vector is not passed to fit().
# NOTE(review): X is not standardized before the fit; in the recorded output
# the first component loads almost entirely on a single column - consider
# scaling the features first and confirm this is intended.
pca = PCA(n_components=3)
fit = pca.fit(X)
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)  # one row per component, one column per input feature

Explained Variance: [0.88854663 0.06159078 0.02579012]
[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [-2.26488861e-02 -9.72210040e-01 -1.41909330e-01  5.78614699e-02
   9.46266913e-02 -4.69729766e-02 -8.16804621e-04 -1.40168181e-01]
 [-2.24649003e-02  1.43428710e-01 -9.22467192e-01 -3.07013055e-01
   2.09773019e-02 -1.32444542e-01 -6.39983017e-04 -1.25454310e-01]]


### Feature Importance

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

In [13]:
# Extra-Trees ensemble used below to rank the input features by importance.
# The trees are built with randomness, so random_state is pinned to make the
# reported importances reproducible across kernel restarts / Run All.
model = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [14]:
# Fit the ensemble on the full data and display the per-feature importances
# (fit() returns the estimator itself, so the attribute access can be chained).
model.fit(X, y).feature_importances_

array([0.1085884 , 0.23936448, 0.09963006, 0.07943133, 0.07450183,
       0.13519215, 0.12010354, 0.14318821])