## 4.2 过滤器法

**基础知识**

In [1]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest    # ①
from sklearn.feature_selection import chi2    
# Load the iris dataset; its feature matrix and class labels feed the selector.
iris = load_iris()
X = iris.data
y = iris.target
# ② Score every feature with the chi-squared test and keep the 2 best.
skb = SelectKBest(score_func=chi2, k=2)
# ③ Fitting computes one chi-squared statistic and one p-value per feature.
result = skb.fit(X, y)
print("X^2 is: ", result.scores_)
print("P-values is: ", result.pvalues_)

X^2 is:  [ 10.81782088   3.7107283  116.31261309  67.0483602 ]
P-values is:  [4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]


In [2]:
X_new = skb.transform(X)    # ④ keep only the columns chosen by the already-fitted selector
X_new.shape

(150, 2)

In [3]:
# fit_transform() fits the selector and reduces X in a single call.
X_new = skb.fit_transform(X, y)
X_new[:5]    # first five rows of the reduced matrix

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]])

In [5]:
import numpy as np
# Map the selector's own support indices back to feature names. Looking the
# names up by matching values of the first sample returns the wrong column
# whenever two features happen to share a value in that row.
[iris.feature_names[idx] for idx in skb.get_support(indices=True)]

['petal length (cm)', 'petal width (cm)']

In [6]:
# All feature names shipped with the iris dataset.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
# Toy matrix of 0/1 features: 6 samples (rows) by 3 features (columns).
X = np.array([
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
])
X

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 1, 1]])

In [8]:
from sklearn.feature_selection import VarianceThreshold 
# ⑤ A 0/1 feature that equals 1 with probability p has variance p * (1 - p);
# with p = 0.8 the cut-off is about 0.16.
vt = VarianceThreshold(threshold=0.8 * (1 - 0.8))
vt.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

**项目案例**

In [9]:
import pandas as pd
import os

# Dataset root directory. Set the DATASET_DIR environment variable to point at
# your own copy of the data; the original author's location is the fallback.
path = os.environ.get("DATASET_DIR", "/Users/qiwsir/Documents/Codes/DataSet")
data = pd.read_csv(os.path.join(path, "santandar", "santandar.csv"))
data.shape

(76020, 371)

In [22]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import VarianceThreshold
# Hold out 20% of the rows for testing; 'TARGET' is the label column.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.2,
    random_state=41,
)
# ⑥ Low-variance filter with threshold 0.01, fitted on the training features only.
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(train_features)

In [24]:
# Apply the fitted filter to both splits so they keep the same columns.
# Note: this overwrites the input names, so the cell cannot be re-run as is.
train_features, test_features = (
    qconstant_filter.transform(train_features),
    qconstant_filter.transform(test_features),
)

train_features.shape, test_features.shape

((60816, 269), (15204, 269))

**动手练习**

In [25]:
# Exercise 1
import os
import pandas as pd
# Dataset root directory. Set the DATASET_DIR environment variable to point at
# your own copy of the data; the original author's location is the fallback.
path = os.environ.get("DATASET_DIR", "/Users/qiwsir/Documents/Codes/DataSet")
data = pd.read_csv(os.path.join(path, "santandar", "santandar.csv"))

from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import VarianceThreshold
# Hold out 20% of the rows for testing; 'TARGET' is the label column.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.2,
    random_state=41,
)
# Variance threshold of 0, fitted on the training features only.
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(train_features)

VarianceThreshold(threshold=0)

In [26]:
# Number of non-constant features, i.e. columns kept by the fitted filter
train_features.columns[constant_filter.get_support()].size

332

In [27]:
# Apply the fitted filter to both splits so they keep the same columns.
# Note: this overwrites the input names, so the cell cannot be re-run as is.
train_features, test_features = (
    constant_filter.transform(train_features),
    constant_filter.transform(test_features),
)

train_features.shape, test_features.shape

((60816, 332), (15204, 332))

In [28]:
# Exercise 2
import os
import pandas as pd
# Dataset root directory. Set the DATASET_DIR environment variable to point at
# your own copy of the data; the original author's location is the fallback.
path = os.environ.get("DATASET_DIR", "/Users/qiwsir/Documents/Codes/DataSet")
data = pd.read_csv(os.path.join(path, "santandar", "santandar.csv"))

from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import VarianceThreshold
# Hold out 20% of the rows for testing; 'TARGET' is the label column.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.2,
    random_state=41,
)

In [29]:
# Transpose so that every original feature becomes one row.
train_features_T = train_features.transpose()
train_features_T.shape

(370, 60816)

In [30]:
print(train_features_T.duplicated().sum())    # number of duplicated features

65


In [31]:
# Remove duplicated features: drop repeated rows of the transposed frame
# (the first occurrence is kept by default), then transpose back.
unique_features = train_features_T.drop_duplicates().transpose()

In [32]:
# Shape of the training features after duplicated features were dropped.
unique_features.shape

(60816, 305)

In [33]:
# Show the duplicated features: training columns that are absent from unique_features
duplicated_features = train_features.columns[
    ~train_features.columns.isin(unique_features.columns)
].tolist()
duplicated_features

['ind_var2',
 'ind_var13_medio',
 'ind_var18',
 'ind_var26',
 'ind_var25',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var29_0',
 'ind_var29',
 'ind_var32',
 'ind_var34',
 'ind_var37',
 'ind_var41',
 'ind_var39',
 'ind_var46_0',
 'ind_var46',
 'num_var13_medio',
 'num_var18',
 'num_var26',
 'num_var25',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var29_0',
 'num_var29',
 'num_var32',
 'num_var34',
 'num_var37',
 'num_var41',
 'num_var39',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var29',
 'saldo_var41',
 'saldo_var46',
 'delta_imp_trasp_var33_out_1y3',
 'delta_num_reemb_var13_1y3',
 'delta_num_reemb_var17_1y3',
 'delta_num_reemb_var33_1y3',
 'delta_num_trasp_var17_in_1y3',
 'delta_num_trasp_var17_out_1y3',
 'delta_num_trasp_var33_in_1y3',
 'delta_num_trasp_var33_out_1y3',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var33_hace3',
 'imp_trasp_var17_out_hace3',
 'imp_trasp