In [1]:
from sklearn import feature_selection as fs
from sklearn import datasets

# Removing features with low variance
## 設定變異數門檻值，剔除變異過低的特徵
## $$Var(X)=E[(X-\mu)^2]$$
## Bernoulli distribution: $$Var(X)=p(1-p)$$

In [2]:
# Toy boolean dataset: 6 samples (rows) x 3 binary features (columns).
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

In [3]:
# Drop features whose variance falls below that of a Bernoulli variable
# with p = 0.8, i.e. Var = p * (1 - p) ~= 0.16.
sel = fs.VarianceThreshold(
    threshold=0.8 * (1 - 0.8),
)
# Fit on the toy data and display the columns that survive the threshold.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

# Univariate feature selection
## 透過單獨計算每個特徵的統計值來決定重要特徵
## 1. SelectKBest: 選取排名前K個重要特徵
## 2. SelectPercentile: 選取排名前K%的重要特徵
## For regression:
### f_regression, mutual_info_regression
## For classification:
### chi2, f_classif, mutual_info_classif

In [4]:
# Load the iris dataset and split it into the feature matrix and class labels.
# NOTE: this rebinds X, replacing the toy list used in the variance example.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [6]:
# Dimensions of the iris feature matrix before any feature selection.
X.shape

(150, 4)

In [7]:
# Univariate selection: score each feature against the labels with the
# chi-squared statistic and keep only the 3 top-ranked feature columns.
X_new = (
    fs.SelectKBest(score_func=fs.chi2, k=3)
    .fit_transform(X, y)
)

In [8]:
# Dimensions after SelectKBest with k=3.
X_new.shape

(150, 3)

In [9]:
# Univariate selection: rank features by estimated mutual information with
# the class labels and keep the top 50% of them.
# mutual_info_classif adds small random noise to continuous features, so its
# scores vary between runs unless random_state is fixed; pin the seed so the
# selected columns are reproducible under Restart & Run All.
X_new = fs.SelectPercentile(
    lambda features, labels: fs.mutual_info_classif(
        features, labels, random_state=0
    ),
    percentile=50,
).fit_transform(X, y)

In [11]:
# Dimensions after SelectPercentile with percentile=50.
X_new.shape

(150, 2)