In [1]:
# Remove Features with Very Low Variance

In [10]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np
from sklearn.datasets import fetch_california_housing

from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile

In [4]:
# toy numeric matrix used only to demonstrate VarianceThreshold:
#   column 0 is constant (variance 0),
#   column 1 barely changes (variance far below 0.1),
#   column 2 changes a lot (variance far above 0.1)
data = np.array([
    [1.0, 0.10, 10.0],
    [1.0, 0.12, 20.0],
    [1.0, 0.11, 30.0],
    [1.0, 0.13, 40.0],
])
# drop features whose variance is lower than 0.1
var_thresh = VarianceThreshold(threshold=0.1)
transformed_data = var_thresh.fit_transform(data)
# transformed_data keeps only the columns whose variance is above the threshold
# (here: just the last column)

In [6]:
# Remove Features with High Correlation

In [8]:
# load the California housing regression dataset
data = fetch_california_housing()
x, y = data["data"], data["target"]
col_names = data["feature_names"]

# wrap the feature matrix in a DataFrame with named columns
df = pd.DataFrame(x, columns=col_names)
# add a column that is highly correlated with MedInc by construction
df["MedInc_Sqrt"] = np.sqrt(df["MedInc"])

# pairwise correlation matrix (pearson is the pandas default)
df.corr()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/ramesh/scikit_learn_data


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_Sqrt
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,0.018415
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,0.015266
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.084303
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.015569
MedInc_Sqrt,0.984329,-0.132797,0.326688,-0.06691,0.018415,0.015266,-0.084303,-0.015569,1.0


In [9]:
#### We can see that the features `MedInc` and `MedInc_Sqrt` are highly correlated (0.98), so either one of them can be removed.

In [11]:
# Univariate Feature Selection
# It is a wrapper for univariate feature selection that you can use for almost any new problem

class UnivariateFeatureSelection:
    """
    Thin wrapper that builds either a SelectKBest or a SelectPercentile
    selector (depending on the type of ``n_features``) and forwards
    fit / transform / fit_transform to it.
    """

    def __init__(self, n_features, problem_type, scoring):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from
        sklearn

        :param n_features: int -> SelectKBest with k=n_features;
            float -> SelectPercentile, where the float is a fraction
            (e.g. 0.3 keeps the top 30 percent)
        :param problem_type: "classification" enables the classification
            scorers; any other value falls back to the regression scorers
        :param scoring: name of the scoring function, string
        :raises ValueError: if ``scoring`` is not valid for the problem type
        :raises TypeError: if ``n_features`` is neither int nor float
        """

        # for a given problem type, there are only
        # few valid scoring methods
        # you can extend this with your own custom
        # methods if you wish
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }

        # reject scoring names that do not belong to this problem type
        # (ValueError is still an Exception, so existing handlers keep working)
        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        # if n_features is int, we use selectkbest
        # if n_features is float, we use selectpercentile
        # please note that it is int in both cases in sklearn
        if isinstance(n_features, int):
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features
            )
        elif isinstance(n_features, float):
            # round before converting: a bare int() truncates binary
            # floating point artefacts, e.g. 0.29 * 100 == 28.999999999999996
            # would silently become 28 instead of 29
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=int(round(n_features * 100))
            )
        else:
            raise TypeError("Invalid type of feature")

    def fit(self, X, y):
        """Fit the wrapped selector; returns whatever its ``fit`` returns."""
        return self.selection.fit(X, y)

    def transform(self, X):
        """Reduce ``X`` to the selected features using the fitted selector."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the wrapped selector on ``X``, ``y`` and return the reduced ``X``."""
        return self.selection.fit_transform(X, y)


In [13]:
# SelectKBest path: an integer n_features asks for exactly that many features
ufs = UnivariateFeatureSelection(2, "regression", "f_regression")
X_transformed = ufs.fit_transform(x, y)

# it chooses the two features that score highest against the target

In [14]:
# display the array returned by the SelectKBest-based transform
X_transformed

array([[8.3252    , 6.98412698],
       [8.3014    , 6.23813708],
       [7.2574    , 8.28813559],
       ...,
       [1.7       , 5.20554273],
       [1.8672    , 5.32951289],
       [2.3886    , 5.25471698]])

In [16]:
# `SelectPercentile` path: a float n_features is read as a fraction of features
ufs = UnivariateFeatureSelection(0.3, "regression", "f_regression")
X_transformed = ufs.fit_transform(x, y)

In [17]:
# display the array returned by the SelectPercentile-based transform
X_transformed

array([[ 8.3252    ,  6.98412698, 37.88      ],
       [ 8.3014    ,  6.23813708, 37.86      ],
       [ 7.2574    ,  8.28813559, 37.85      ],
       ...,
       [ 1.7       ,  5.20554273, 39.43      ],
       [ 1.8672    ,  5.32951289, 39.43      ],
       [ 2.3886    ,  5.25471698, 39.37      ]])

In [18]:
## Univariate feature selection may not always perform well. Most of the time, people prefer doing feature selection using a machine learning model



In [None]:
## Greedy Feature Selection
