## Dimensionality Reduction Using Feature Selection

#### Thresholding Numerical Feature Variance

##### You have a set of numerical features and want to remove those with low variance (i.e., likely containing little information).

In [1]:
# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
# Load the iris dataset and split it into a feature matrix and a target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Variance-based selector: only features with variance above 0.5 survive
thresholder = VarianceThreshold(threshold=0.5)

# Fit the selector and reduce the feature matrix in one step
features_high_variance = thresholder.fit_transform(features)

# Preview the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [2]:
 # View variances
# The thresholder was already fitted on `features` by fit_transform in the
# previous cell, so the per-feature variances can be read directly instead of
# refitting on the same data.
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [3]:
# Load library
from sklearn.preprocessing import StandardScaler
# Standardize every feature (zero mean, unit variance)
scaler = StandardScaler()
features_std = scaler.fit(features).transform(features)

# Calculate the variance of each standardized feature. Standardization makes
# all of them equal to 1, so variance thresholding cannot tell features apart
# once the matrix has been standardized.
selector = VarianceThreshold().fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

#### Thresholding Binary Feature Variance

In [4]:
# Load library
from sklearn.feature_selection import VarianceThreshold
# Binary feature matrix (one row per observation):
#   feature 0 -> 80% class 0, 20% class 1
#   feature 1 -> 80% class 1, 20% class 0
#   feature 2 -> 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# A binary feature with class proportion p has variance p * (1 - p).
# Use the variance of a 75/25 split as the cut-off, so features that are more
# lopsided than that are removed.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [5]:
# Threshold by variance: the cut-off is the variance p * (1 - p) of a binary
# feature with a 75/25 class split
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

#### Handling Highly Correlated Features

##### Use a correlation matrix to check for highly correlated features. If highly correlated features exist, consider dropping one of the correlated features:

In [8]:
import pandas as pd
import numpy as np

# Feature matrix whose first two columns are almost perfectly correlated
features = np.array([
    [1, 1, 1],
    [2, 2, 0],
    [3, 3, 1],
    [4, 4, 0],
    [5, 5, 1],
    [6, 6, 0],
    [7, 7, 1],
    [8, 7, 0],
    [9, 7, 1]
])

# Convert the feature matrix into a DataFrame (default column labels 0, 1, 2)
dataframe = pd.DataFrame(features)

# Absolute pairwise correlations: a strong negative correlation is just as
# redundant as a strong positive one
corr_matrix = dataframe.corr().abs()

# Keep only the entries strictly above the diagonal so that every feature pair
# is examined once and self-correlations are ignored; masked entries become NaN
upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle)

# Labels of the columns that correlate above 0.95 with an earlier column
# (comparisons against NaN are False, so masked entries never match)
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label. `to_drop` holds column labels, not positions, so using it to
# index `dataframe.columns` would only work with default integer labels.
dataframe_dropped = dataframe.drop(columns=to_drop)

# Display the first 3 rows of the reduced DataFrame
print(dataframe_dropped.head(3))


   0  2
0  1  1
1  2  0
2  3  1


In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Select the k best features for a classification target using the
# chi-squared statistic. Small illustrative data is defined inline.

# Non-negative integer feature matrix: 9 samples, 3 features
features = np.array([
    [1, 1, 1],
    [2, 2, 0],
    [3, 3, 1],
    [4, 4, 0],
    [5, 5, 1],
    [6, 6, 0],
    [7, 7, 1],
    [8, 7, 0],
    [9, 7, 1],
])

# Illustrative binary target, one label per sample
target = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0])

# Guard: features and target must describe the same number of samples
n_feature_rows = features.shape[0]
n_target_rows = target.shape[0]
if n_feature_rows != n_target_rows:
    raise ValueError("Number of samples in features and target are inconsistent")

# Selector that keeps the 2 features with the highest chi-squared scores
chi2_selector = SelectKBest(chi2, k=2)

# Score the features against the target and keep the best two
features_kbest = chi2_selector.fit_transform(features, target)

# Report how many features were kept and which column indices they are
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])
print("Selected features indices:", chi2_selector.get_support(indices=True))

Original number of features: 3
Reduced number of features: 2
Selected features indices: [1 2]


#### Recursively Eliminating Features

In [13]:
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
# Silence a harmless SciPy warning whose message starts with "internal gelsd"
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# Simulate a regression problem: 10,000 samples and 100 features, of which only
# 2 are informative. Only the feature matrix and target vector are returned.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Ordinary least squares is the estimator whose features get pruned
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation: remove one feature per
# step and score each candidate subset by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")

# Fit the selector, then reduce the matrix to the selected features
rfecv.fit(features, target).transform(features)

array([[ 0.00850799,  0.7031277 , -0.84959297],
       [-1.07500204,  2.56148527, -0.5924624 ],
       [ 1.37940721, -1.77039484,  1.61514448],
       ...,
       [-0.80331656, -1.60648007, -0.08279018],
       [ 0.39508844, -1.34564911,  1.20679839],
       [-0.55383035,  0.82880112,  0.38295212]])

In [14]:
# Number of features kept by the fitted RFECV selector
rfecv.n_features_

3

In [15]:
# Boolean mask over the original features: True marks a feature that was kept
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])