In [6]:
# Make every top-level expression in a cell display its value, not only the last one.
# Cells below rely on this (e.g. bare `X.shape` lines in the middle of a cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
#https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/

# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)

Statistical tests can be used to select those features that have the strongest relationships with the output variable.

The scikit-learn library provides the SelectKBest class, which can be used with a suite of different statistical tests to select a specific number of features.

The following example uses the chi-squared ($\chi^2$) statistical test, which requires non-negative features, to select the four best features from the Pima Indians onset of diabetes dataset.

In [7]:
# SelectKBest keeps the k best-scoring features; chi2 is the scoring function used below.
from sklearn.feature_selection import SelectKBest, chi2
import pandas

# Column labels handed to read_csv via `names`; 'class' is the last column.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# Path of the Pima Indians diabetes CSV, relative to the working directory.
url = "pima-indians-diabetes.data.csv"
# Read the CSV into a data frame labelled with the names above.
dataframe = pandas.read_csv(url, names=names)
# Work on the underlying array of values from here on.
array = dataframe.values
# The first eight columns are the inputs, the ninth is the target.
X, Y = array[:, :8], array[:, 8]

In [8]:
# Configure a selector that scores each input column against the target
# with the chi-squared test and keeps the 4 highest-scoring columns.
test = SelectKBest(k=4, score_func=chi2)
# Compute the scores on the full data set.
fit = test.fit(X, Y)
# One score per input column, in column order.
print(fit.scores_)

[  111.51969064  1411.88704064    17.60537322    53.10803984  2175.56527292
   127.66934333     5.39268155   181.30368904]


In [5]:
# First five rows and the shape of the full 8-column input matrix.
print(X[:5])
X.shape
# Reduce the inputs to the columns chosen by the fitted selector.
features = fit.transform(X)
features.shape
# The four retained columns are the ones with the highest chi-squared
# scores printed earlier: plas, test, mass, and age.
print(features[:5])

[[  6.00000000e+00   1.48000000e+02   7.20000000e+01   3.50000000e+01
    0.00000000e+00   3.36000000e+01   6.27000000e-01   5.00000000e+01]
 [  1.00000000e+00   8.50000000e+01   6.60000000e+01   2.90000000e+01
    0.00000000e+00   2.66000000e+01   3.51000000e-01   3.10000000e+01]
 [  8.00000000e+00   1.83000000e+02   6.40000000e+01   0.00000000e+00
    0.00000000e+00   2.33000000e+01   6.72000000e-01   3.20000000e+01]
 [  1.00000000e+00   8.90000000e+01   6.60000000e+01   2.30000000e+01
    9.40000000e+01   2.81000000e+01   1.67000000e-01   2.10000000e+01]
 [  0.00000000e+00   1.37000000e+02   4.00000000e+01   3.50000000e+01
    1.68000000e+02   4.31000000e+01   2.28800000e+00   3.30000000e+01]]


(768, 8)

(768, 4)

[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]


# Recursive Feature Elimination -- wrapper

RFE works by recursively removing attributes and building a model on the attributes that remain. At each step it uses the fitted model's feature weights (coefficients or feature importances) to identify which attributes (and combinations of attributes) contribute the most to predicting the target attribute, and drops the weakest ones.

The following example uses RFE with the logistic regression algorithm to select the top four features. The choice of algorithm does not matter too much as long as it is skillful and consistent:

In [11]:
import pandas
import numpy
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Column labels handed to read_csv via `names`; 'class' is the last column.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# Path of the Pima Indians diabetes CSV, relative to the working directory.
url = "pima-indians-diabetes.data.csv"
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the inputs, the ninth is the target.
X, Y = array[:, :8], array[:, 8]

In [12]:
#Feature selection with Recursive Feature Elimination (RFE)
model = LogisticRegression()
# Pass the number of features by keyword: recent scikit-learn releases accept
# n_features_to_select as a keyword argument only, and older ones accept the
# keyword as well.
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, Y)
# In the recorded run RFE kept four features: preg, plas, mass, and pedi
# (the positions marked True in support_ and ranked 1 in ranking_).
print("Num Features: %d"% fit.n_features_)
print("Selected Features: %s"% fit.support_)
print("Feature Ranking: %s"% fit.ranking_)

Num Features: 4
Selected Features: [ True  True False False False  True  True False]
Feature Ranking: [1 1 2 4 5 1 1 3]


------------------------------------------------
Choosing important features (feature importance)
--------------------------------------------------

Feature importance is the technique used to select features using a trained supervised classifier. When we train a classifier such as a decision tree, we evaluate each attribute to create splits; we can use this measure as a feature selector. Let’s understand it in detail.

Random forests are among the most popular machine learning methods thanks to their relatively good accuracy, robustness, and ease of use. They also provide two straightforward methods for feature selection—mean decrease impurity and mean decrease accuracy.

Otto Train data

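You can download the training dataset, train.csv.zip, from https://www.kaggle.com/c/otto-group-product-classification-challenge/data. Unzip it and place the CSV file in your working directory. Note that the code below reads the file as ottoTrain.csv, so either rename train.csv accordingly or adjust the path passed to read_csv.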

This dataset describes 93 obfuscated details of more than 61,000 products grouped into 10 product categories (for example, fashion, electronics, and so on). Input attributes are the counts of different events of some kind.

The goal is to make predictions for new products as an array of probabilities for each of the 10 categories, and models are evaluated using multiclass logarithmic loss (also called cross entropy).

In [13]:
from pandas import read_csv
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Seed NumPy's global random state so the np.random.shuffle call on the loaded
# data set is repeatable. getTrainTestData re-seeds with 0 before its own shuffle.
np.random.seed(1)

In [14]:
#Function to create Train and Test set from the original dataset 
def getTrainTestData(dataset,split):
    """Shuffle ``dataset`` and split its rows into a training and a testing part.

    Parameters
    ----------
    dataset : array-like of rows
        Shuffled IN PLACE along its first axis with ``np.random.shuffle``.
    split : float
        Fraction of rows, between 0 and 1, that goes into the training part.
        The training size is ``floor(split * number_of_rows)``.

    Returns
    -------
    (training, testing) : tuple of numpy.ndarray
        The first ``floor(split * n)`` shuffled rows and the remaining rows.

    Raises
    ------
    ValueError
        If ``split`` is outside the range [0, 1].

    Notes
    -----
    The global NumPy random state is re-seeded with 0 on every call, so the
    same input always produces the same split.
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))
    np.random.seed(0)
    np.random.shuffle(dataset)
    n_rows = np.shape(dataset)[0]
    # Use a plain Python int: the previous np.uint16 cast wrapped around
    # silently once the training part exceeded 65535 rows.
    trainlength = int(np.floor(split * n_rows))
    # np.array(...) copies the slices, so the results do not alias `dataset`.
    training = np.array(dataset[:trainlength])
    testing = np.array(dataset[trainlength:])
    return training,testing

In [15]:
#Function to evaluate model performance
def getAccuracy(pre,ytest):
    """Return the fraction of positions at which ``pre`` equals ``ytest``.

    ``pre`` holds the predicted labels and ``ytest`` the expected ones; both
    are indexed by position over ``len(ytest)`` entries. The result is a
    float between 0.0 and 1.0.
    """
    total = len(ytest)
    # Count one hit for every index where prediction and expected label agree.
    hits = sum(1 for idx in range(total) if ytest[idx] == pre[idx])
    return float(hits) / total

In [16]:
#Load dataset as pandas data frame
data = read_csv('ottoTrain.csv')
#Extract attribute names from the data frame
feat = data.keys()
# Index.get_values() was removed in pandas 1.0; .values yields the same array
# of column labels and works on old and new pandas alike.
feat_labels = feat.values
#Extract data values from the data frame
dataset = data.values
#Shuffle the dataset
np.random.shuffle(dataset)
#We will select 50000 instances to train the classifier
inst = 50000
#Extract 50000 instances from the dataset
dataset = dataset[0:inst,:]
#Create Training and Testing data for performance evaluation
train,test = getTrainTestData(dataset, 0.7)
#Split data into input and output variable with selected features
# NOTE(review): columns 0..93 include the leading 'id' column, and the feature
# importances printed further down rank 'id' far above every other column.
# That suggests the row id leaks the class label -- confirm, and if so drop it
# (every cell that slices 0:94 and uses feat_labels has to change together).
Xtrain = train[:,0:94]
ytrain = train[:,94]
shape = np.shape(Xtrain)
print("Shape of the dataset ",shape)
#Print the size of Data in MBs
print("Size of Data set before feature selection:",(Xtrain.nbytes/1e6),"MB")

Shape of the dataset  (35000, 94)
Size of Data set before feature selection: 26.32 MB


In [17]:
import time

# Held-out rows for evaluation, sliced with the same column layout as the
# training arrays: columns 0..93 are inputs, column 94 is the label.
Xtest = test[:,0:94]
ytest = test[:,94]

# Hyperparameters of the random forest.
trees= 250
max_feat= 7
max_depth = 30
min_sample = 2
clf = RandomForestClassifier(
    n_estimators=trees,
    max_features=max_feat,
    max_depth=max_depth,
    min_samples_split=min_sample,
    random_state=0,
    n_jobs=-1,
)

# Fit on the full feature set and measure the wall-clock training time.
start = time.time()
clf.fit(Xtrain, ytrain)
end = time.time()
print("Execution time for building the Tree is: %f"%(end - start))

# Predict the held-out rows and report the share of correct predictions.
pre = clf.predict(Xtest)
acc = getAccuracy(pre, ytest)
print("Accuracy of model before feature selection is",(100*acc))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Execution time for building the Tree is: 14.783088
Accuracy of model before feature selection is 98.82


In [18]:
# Pair each column label with the importance the fitted forest assigned to it
# and print one (label, importance) tuple per line.
for name, importance in zip(feat_labels, clf.feature_importances_):
    print((name, importance))

('id', 0.33346650420175183)
('feat_1', 0.0036186958628801214)
('feat_2', 0.0037243050888530957)
('feat_3', 0.011579217472062748)
('feat_4', 0.010297382675187445)
('feat_5', 0.0010359139416194116)
('feat_6', 0.00038171336038056165)
('feat_7', 0.0024867672489765021)
('feat_8', 0.0096689721610546085)
('feat_9', 0.007906150362995093)
('feat_10', 0.0022342480802130366)
('feat_11', 0.030321202266427427)
('feat_12', 0.001120862950070666)
('feat_13', 0.0039919844660730253)
('feat_14', 0.019408706880663498)
('feat_15', 0.015398634496632809)
('feat_16', 0.0055203970543115446)
('feat_17', 0.0071982339042675871)
('feat_18', 0.0036309310056707512)
('feat_19', 0.0038008858005607127)
('feat_20', 0.0046001001637091758)
('feat_21', 0.0012839572570891803)
('feat_22', 0.003458048185607362)
('feat_23', 0.0019414256864660538)
('feat_24', 0.009502403878816023)
('feat_25', 0.018382070498456828)
('feat_26', 0.022011162365845233)
('feat_27', 0.0082921478476573572)
('feat_28', 0.0031557384078345616)
('feat_29',

In [19]:
# Build a selector around the forest that keeps only the features whose
# importance reaches the 0.01 threshold, then fit it on the training data.
sfm = SelectFromModel(estimator=clf, threshold=0.01)
sfm.fit(Xtrain, ytrain)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.01)

In [21]:
#Transform input dataset: keep only the columns chosen by the fitted selector
Xtrain_1 = sfm.transform(Xtrain)
Xtest_1= sfm.transform(Xtest)
#Let's see the size and shape of new dataset
# The label now says "after": these are the reduced arrays, and the earlier
# cell already prints the "before" figure for the full feature set.
print("Size of Data set after feature selection: ",(Xtrain_1.nbytes/1e6)," MB")
shape = np.shape(Xtrain_1)
print("Shape of the dataset ",shape)

Size of Data set before feature selection:  5.6  MB
Shape of the dataset  (35000, 20)


In [23]:
#Model training time
# NOTE: this refits `clf` in place on the reduced inputs, so from here on
# clf.feature_importances_ describes the selected columns only.
start = time.time()
clf.fit(Xtrain_1, ytrain)
end = time.time()
print("Execution time for building the Random Forest is: ",(float(end)- float(start)))
#Let's evaluate the model on test data
pre = clf.predict(Xtest_1)
# (The unused `count = 0` assignment was dropped; getAccuracy keeps its own counter.)
acc2 = getAccuracy(pre, ytest)
print("Accuracy after feature selection ",(100*acc2))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Execution time for building the Random Forest is:  6.987008094787598
Accuracy after feature selection  99.97333333333333


# Feature Selection using Variance Threshold

In [24]:
#http://scikit-learn.org/stable/modules/feature_selection.html
# Removing features with low variance
# VarianceThreshold is a simple baseline for feature selection: it drops every
# feature whose variance does not reach a given threshold. With the default
# threshold it drops only zero-variance features, i.e. features that have the
# same value in all samples.
from sklearn.feature_selection import VarianceThreshold

# Toy data set with three boolean features. The goal is to drop any feature
# that is one or zero (on or off) in more than 80% of the samples, hence the
# threshold 0.8 * (1 - 0.8).
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
sel.fit_transform(X)
# The first column is removed: it is zero in 5 of the 6 samples (5/6 > 0.8).

[[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])