10.1 Thresholding Numerical Feature Variance

In [2]:
# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Unpack the feature matrix and the target vector in one step
features, target = iris.data, iris.target

# Variance-based selector configured with a 0.5 threshold
thresholder = VarianceThreshold(threshold=0.5)

# Fit the selector and keep only the columns that pass the threshold
features_high_variance = thresholder.fit_transform(features)

# Preview the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [3]:
# Finally, if the features have been standardized (to mean zero and unit variance), every feature has a variance
# of exactly 1, so variance thresholding cannot tell the features apart and will not work correctly:

# Load library
from sklearn.preprocessing import StandardScaler

# Rescale the feature matrix with StandardScaler
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Fit a default-threshold selector, then show the variance it computed
# for each standardized feature
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

10.2 Thresholding Binary Feature Variance

In [4]:
# Load library
from sklearn.feature_selection import VarianceThreshold

# Binary feature matrix, one row per observation:
#   column 0 -> 80% class 0
#   column 1 -> 80% class 1
#   column 2 -> 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# Threshold is the Bernoulli variance p * (1 - p) evaluated at p = 0.75
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [5]:
# Just like with numerical features, one strategy for selecting highly informative categorical features is to 
# examine their variances. In binary features (i.e., Bernoulli random variables), variance is calculated as:

# Var(x)=p(1−p)
# where p is the proportion of observations of class 1. Therefore, by setting p, we can remove features 
# where the vast majority of observations are one class.

10.3 Handling Highly Correlated Features

In [6]:
# Load libraries
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features
# (columns 0 and 1 rise together; column 2 alternates between 1 and 0)
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)

# Create correlation matrix; the absolute value treats strong negative
# correlation the same as strong positive correlation
corr_matrix = dataframe.corr().abs()

# Select the strict upper triangle of the correlation matrix (k=1 skips the
# diagonal) so every pair of features is examined exactly once.
# The builtin `bool` is used for the mask dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

# Collect the labels of feature columns whose correlation with an earlier
# column is greater than 0.95 (NaN entries compare as False)
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop features by label, so this also works when the DataFrame's column
# labels are not the default 0..n-1 positions
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


10.4 Removing Irrelevant Features for Classification

In [7]:
# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Pull the iris feature matrix and target vector
iris = load_iris()
features, target = iris.data, iris.target

# Cast the measurements to integers so they can be treated as categories
# (this rebinds `features`, so later cells see the integer matrix)
features = features.astype(int)

# Keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [8]:
# If the features are quantitative, compute the ANOVA F-value between each feature and the target vector:

# Score each feature by its ANOVA F-value against the target and keep the
# top two (`features` and `target` are the ones left in scope by the
# preceding cell)
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

# Report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [9]:
# Instead of selecting a specific number of features, we can also use SelectPercentile to select the top n percent of features:

# Load library
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features, ranked by ANOVA F-value against the target
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

# Report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


In [10]:
# The chi-square statistic examines the independence of two categorical vectors. That is, the statistic is the difference 
# between the observed number of observations in each class of a categorical feature and what we would expect if that 
# feature was independent of (i.e., had no relationship with) the target vector:

# $\chi^2 = \sum_{i=1}^{n} \frac{(O_i - E_i)^2}{E_i}$
# where $O_i$ is the number of observations in class $i$ and $E_i$ is the number of observations in class $i$ we would expect 
# if there is no relationship between the feature and target vector.

# A chi-squared statistic is a single number that tells you how much difference exists between your observed counts 
# and the counts you would expect if there were no relationship at all in the population. By calculating the 
# chi-squared statistic between a feature and the target vector, we obtain a measurement of the independence between 
# the two. If the target is independent of the feature variable, then that feature is irrelevant for our purposes because it 
# contains no information we can use for classification. On the other hand, if the feature and the target are highly dependent, 
# the feature is likely very informative for training our model.

In [11]:
# To use chi-squared in feature selection, we calculate the chi-squared statistic between each feature and the target vector, 
# then select the features with the best chi-square statistics. In scikit-learn, we can use SelectKBest to select the 
# features with the best statistics. The parameter k determines the number of features we want to keep.

# It is important to note that chi-square statistics can only be calculated between two categorical vectors. 
# For this reason, chi-squared for feature selection requires that both the target vector and the features are 
# categorical. However, if we have a numerical feature we can use the chi-squared technique by first transforming 
# the quantitative feature into a categorical feature. Finally, to use our chi-squared approach, all values need 
# to be non-negative.

10.5 Recursively Eliminating Features

In [12]:
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Silence scipy's "internal gelsd" warning, which is noisy but harmless here
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

# Generate a synthetic features matrix and target vector: 100 features, of
# which 2 are informative (coefficients are not requested, so only two
# values are returned)
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Base estimator: ordinary least squares linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features one at a time (step=1), scoring each
# candidate subset with cross-validated negative mean squared error, then
# show the feature matrix reduced to the selected columns
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target).transform(features)

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [13]:
# Summarize the fitted RFECV selector, one item per output line
print(
    rfecv.n_features_,  # number of features selected
    rfecv.support_,     # boolean mask marking which features were selected
    rfecv.ranking_,     # feature ranking, best (1) to worst
    sep="\n",
)

2
[False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]
[71 47 66 26 14  1 18  3 45 72 74 29  5 10 37 97 16 83 73 11 99  7 57 63
 89 49 96 43 55 46  9 75 30 94 70 23 44 68 39  1 80 95 60 64 61 58 42 91
 77 51 34 62 69 90 53 38 88 93 36 20 85 21 15 79 52 54 92 84 76 98  4 82
 41  2 65 81 48 40 13 56  8 32 25 19  6 35 31 78 59 12 33 17 28 27 50 22
 67 24 86 87]
