# Feature Selection For Machine Learning in Python
### This is a playground for the post on feature selection. Link below:
https://machinelearningmastery.com/feature-selection-machine-learning-python/

# 1. Univariate Selection

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [3]:
# Pima Indians diabetes data: the file has no header row, so the column
# names are supplied explicitly (the last one, 'class', is the label).
filename = 'pima-indians-diabetes.data.txt'
colnames = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(filename, names=colnames, sep=',')
# Preview the first four rows as a sanity check on the parse.
dataframe.head(4)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [4]:
# Drop the pandas labels and work on the plain 2-D NumPy matrix from here on.
array = dataframe.to_numpy()
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [5]:
# The first eight columns are the candidate features; the ninth column
# (index 8, named 'class' in colnames) is the target.
X, Y = array[:, :8], array[:, 8]

In [7]:
# Univariate selection: score every feature against the label with
# f_classif and keep the four highest-scoring ones.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, Y)

In [11]:
# Show floats with three decimals so the score vector is easy to scan.
np.set_printoptions(precision=3)

# One score per input column, in the same order as the columns of X.
print(fit.scores_)

# Reduce X to the selected columns and preview the first five rows.
features = fit.transform(X)
print(features[:5])

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]
[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


You can see the scores for each attribute and the 4 attributes chosen (those with the highest scores). 

Specifically, the features with indexes 0 (preg), 1 (plas), 5 (mass), and 7 (age).

# 2. Recursive Feature Elimination

In [12]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [13]:
# Reload the Pima Indians diabetes data so this section runs on its own.
# The file has no header row, so the column names are supplied explicitly
# (the last one, 'class', is the label).
filename = 'pima-indians-diabetes.data.txt'
colnames = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(filename, names=colnames, sep=',')
# Preview the first four rows as a sanity check on the parse.
dataframe.head(4)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [14]:
# Convert to a plain NumPy matrix, then split it: the first eight columns
# are the inputs and the ninth (index 8) is the label.
array = dataframe.to_numpy()
X, Y = array[:, :8], array[:, 8]

In [15]:
# Recursive feature elimination: RFE wraps a logistic-regression estimator
# and is asked to retain three features.
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)



In [16]:
# Report how many features were kept, the boolean keep-mask, and the
# per-feature ranking (rank 1 marks a selected feature).
print(
    "Num Features: %d" % fit.n_features_,
    "Selected Features: %s" % fit.support_,
    "Feature Ranking: %s" % fit.ranking_,
    sep="\n",
)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 4 5 6 1 1 3]


In [28]:
# Label each rank with its column name. colnames has one extra trailing
# entry ('class'); zip stops at the shorter sequence, so it is dropped.
{name: rank for name, rank in zip(colnames, fit.ranking_)}

{'preg': 1,
 'plas': 2,
 'pres': 4,
 'skin': 5,
 'test': 6,
 'mass': 1,
 'pedi': 1,
 'age': 3}

# 4. Feature Importance

In [29]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
# Reload the Pima Indians diabetes data so this section runs on its own.
# The file has no header row, so the column names are supplied explicitly
# (the last one, 'class', is the label).
filename = 'pima-indians-diabetes.data.txt'
colnames = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(filename, names=colnames, sep=',')
# Preview the first four rows as a sanity check on the parse.
dataframe.head(4)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [31]:
# Convert to a plain NumPy matrix, then split it: the first eight columns
# are the inputs and the ninth (index 8) is the label.
array = dataframe.to_numpy()
X, Y = array[:, :8], array[:, 8]

In [33]:
# feature extraction
# Extra-trees is a randomized ensemble, so without a seed the importances
# change on every run. Pinning random_state makes the printed scores
# reproducible under Restart & Run All.
model = ExtraTreesClassifier(n_estimators=10, random_state=7)
model.fit(X, Y)
# One importance value per input column, in the same order as the columns of X.
print(model.feature_importances_)

[0.117 0.256 0.09  0.081 0.071 0.133 0.114 0.137]


In [44]:
# Label each importance with its column name. colnames has one extra trailing
# entry ('class'); zip stops at the shorter sequence, so it is dropped.
{name: score for name, score in zip(colnames, model.feature_importances_)}

{'preg': 0.11712024109168176,
 'plas': 0.255531962101021,
 'pres': 0.08986247094662726,
 'skin': 0.081396750607147,
 'test': 0.07109706598719855,
 'mass': 0.13326343925388043,
 'pedi': 0.11441976897767825,
 'age': 0.13730830103476577}

You can see that we are given an importance score for each attribute, where the larger the score, the more important the attribute.

The scores point to the importance of plas, age and mass.