In [7]:
### Feature Extraction with RFE
# Recursive Feature Elimination: repeatedly fit the estimator and drop the
# weakest feature until only the requested number of features remains.
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
# previous short link, kept disabled: "https://goo.gl/vhm1eU"
url = 'http://bit.ly/2fYDKU7'
names=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]  # first 8 columns: the candidate features
Y = array[:,8]    # 9th column ('class'): the target
# feature extraction
model = LogisticRegression()
# n_features_to_select is passed by keyword: recent scikit-learn releases
# no longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# Format inside the print() call. `print("...") % value` applies % to the
# return value of print(), which is None on Python 3 and raises TypeError.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


In [9]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
# NOTE: `url` is not assigned in this cell; it is reused from the RFE cell at
# the top of the notebook, so that cell has to run first.
names=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]  # first 8 columns: the candidate features
Y = array[:,8]    # 9th column ('class'): the target
# feature extraction
# The ensemble is randomized; fixing random_state makes the importances
# identical on every Restart & Run All instead of drifting between runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
#the larger the score the more important the feature
print(model.feature_importances_)

[ 0.11447869  0.2074857   0.09769229  0.07846414  0.07135084  0.15584704
  0.12365832  0.15102299]


In [22]:
# Feature Importance with Random Forest Classifier
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
# load data
# NOTE: `url` is not assigned in this cell; it is reused from the RFE cell at
# the top of the notebook, so that cell has to run first.
names=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]  # first 8 columns: the candidate features
Y = array[:,8]    # 9th column ('class'): the target
# feature extraction
# The forest is randomized; fixing random_state keeps the importances (and the
# rankings derived from `model` in the following cells) stable across re-runs.
model = RandomForestClassifier(random_state=42)
model.fit(X, Y)
#the larger the score the more important the feature
print(model.feature_importances_)

[ 0.08   0.247  0.101  0.062  0.073  0.165  0.134  0.137]


In [23]:
# Pair each feature's column index with its importance score.
# `model` is whichever estimator was fitted last; in notebook order that is
# the random forest. The former `if 1 == 1` filter was always true, so it is dropped.
list2 = list(enumerate(model.feature_importances_))
print(list2)

[(0, 0.080168783799567128), (1, 0.24658748234198052), (2, 0.10102036041174242), (3, 0.062366018095479939), (4, 0.073111416319969438), (5, 0.16521556671989338), (6, 0.13426983850434518), (7, 0.13726053380702202)]


In [24]:
# Peek at the importance score held in the first (index, score) pair.
list2[0][-1]

0.080168783799567128

In [25]:
# Rank the (index, score) pairs from most to least important.
sorted(
    list2,
    key=lambda pair: pair[-1],  # the score is the last element of each pair
    reverse=True,
)

[(1, 0.24658748234198052),
 (5, 0.16521556671989338),
 (7, 0.13726053380702202),
 (6, 0.13426983850434518),
 (2, 0.10102036041174242),
 (0, 0.080168783799567128),
 (4, 0.073111416319969438),
 (3, 0.062366018095479939)]

In [26]:
# Build (column index, column name, importance) triples so the ranking is
# readable by feature name. The former `if 1 == 1` filter was always true,
# so it is dropped.
list3 = [(ind, names[ind], x) for ind, x in enumerate(model.feature_importances_)]
# Last expression of the cell: the triples ordered by importance, highest first.
sorted(list3, key=lambda item: item[2], reverse=True)

[(1, 'plas', 0.24658748234198052),
 (5, 'mass', 0.16521556671989338),
 (7, 'age', 0.13726053380702202),
 (6, 'pedi', 0.13426983850434518),
 (2, 'pres', 0.10102036041174242),
 (0, 'preg', 0.080168783799567128),
 (4, 'test', 0.073111416319969438),
 (3, 'skin', 0.062366018095479939)]

In [27]:
# Univariate feature selection: score every feature against the class label
# with a chi-squared test and keep the best-scoring ones.
import pandas
import numpy
from sklearn.feature_selection import SelectKBest, chi2
# This cell does not load the data itself; it reuses `dataframe` built by an
# earlier cell, so one of the loading cells above has to run first.
array = dataframe.values
X = array[:, :8]  # first 8 columns: the candidate features
Y = array[:, 8]   # 9th column: the target
# keep the 4 features with the highest chi-squared score
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)
# per-feature scores, printed with 3 digits of precision
# (set_printoptions changes numpy's global print settings for the whole kernel)
numpy.set_printoptions(precision=3)
print(fit.scores_)
# reduce X to the selected columns and preview its first five rows
features = fit.transform(X)
print(features[:5])

[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]
