In [18]:
import pandas as pd
import numpy as np
import json
import sys
import warnings
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [19]:

#9.1 reducing features using Principal Components
digits = datasets.load_digits()

print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [20]:
print(digits.images[1])

[[ 0.  0.  0. 12. 13.  5.  0.  0.]
 [ 0.  0.  0. 11. 16.  9.  0.  0.]
 [ 0.  0.  3. 15. 16.  6.  0.  0.]
 [ 0.  7. 15. 16. 16.  2.  0.  0.]
 [ 0.  0.  1. 16. 16.  3.  0.  0.]
 [ 0.  0.  1. 16. 16.  6.  0.  0.]
 [ 0.  0.  1. 16. 16.  6.  0.  0.]
 [ 0.  0.  0. 11. 16. 10.  0.  0.]]


In [21]:

features= StandardScaler().fit_transform(digits.data)
pca=PCA(n_components=0.99, whiten=True)
features_pca = pca.fit_transform(features)
print("original number of features:", features.shape[1])
print("reduced number of features:", features_pca.shape[1])
print("output from 9.1 done!")

original number of features: 64
reduced number of features: 54
output from 9.1 done!


In [22]:
#9.4 Reducing Features Using Matrix Factorization
features = digits.data
nmf=NMF(n_components=10, random_state=1)
features_nmf=nmf.fit_transform(features)
print("Original number of features:", features.shape[1])
print("reduced number of features:", features_nmf.shape[1])
print("output from 9.4 done!")



Original number of features: 64
reduced number of features: 10
output from 9.4 done!


Variance thresholding (VT) is one of the most basic approaches to feature selection. It
is motivated by the idea that features with low variance are likely less interesting (and
useful) than features with high variance.
First, the variance is not centered; that is, it is in the squared unit of the feature itself.So the VT will not work when feature sets contain different units (e.g., one feature is in years while a different feature is in dollars).
Second, the variance threshold is selected manually, so we have to use our own judgment for a good value to select

In [23]:
#10.1 - Thresholding Numerical Feature Variance

#import data
iris= datasets.load_iris()

In [24]:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [35]:
# Showing the first 5 rows from the array 
print(iris.data[0:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [26]:
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [27]:
#create features and target
features=iris.data
target=iris.target

In [28]:

#create thresholder
thresholder = VarianceThreshold(threshold=.5)

#create high variance feature matrix and print
features_high_variance=thresholder.fit_transform(features)
print(features_high_variance[0:3])

[[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]]


In [29]:
# View the variances
thresholder.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [36]:
# The below process shows how to standardise the feature variances
# Standardize feature matrix
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
print(features_std[0:5])

[[-0.5         0.5        -0.81649658]
 [-0.5         0.5         1.22474487]
 [-0.5         0.5        -0.81649658]
 [-0.5         0.5         1.22474487]
 [ 2.         -2.         -0.81649658]]


In [31]:
# Calculate variance of each feature
selector = VarianceThreshold()
selector.fit(features_std).variances_

array([1., 1., 1., 1.])

In [32]:
#10.2 - Thresholding Binary Feature Variance
# Binary categorical features

features = [[0,1,0],
            [0,1,1],
            [0,1,0],
            [0,1,1],
            [1,0,0]]
# Run thresholder by variance
thresholder=VarianceThreshold(threshold = (.75*(1-.75)))
print(thresholder.fit_transform(features))


[[0]
 [1]
 [0]
 [1]
 [0]]
