In [3]:
import numpy as np
import pandas as pd

# 特征变换

### 对指化

In [20]:
np.log([1,2,3,4])
np.exp([1,2,3,4])

array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])

### 离散化

In [23]:
lst = [6, 8, 10, 15, 16, 24, 25, 40, 67]
# 等深分箱 平均处理数据长度
pd.qcut(lst, q=3, labels=['low', 'medium', 'high'])

[low, low, low, medium, medium, medium, high, high, high]
Categories (3, object): [low < medium < high]

In [24]:
# 等宽分箱 平均处理数据大小
pd.cut(lst, bins=3, labels=['low', 'medium', 'high'])

[low, low, low, low, low, low, low, medium, high]
Categories (3, object): [low < medium < high]

### 数值化

In [25]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# LabelEncoder 默认按文本排序编码
lb_encoder = LabelEncoder()
lb_encoder.fit_transform(np.array(['low', 'medium', 'high', 'low', 'high']).reshape(-1, 1))

  y = column_or_1d(y, warn=True)


array([1, 2, 0, 1, 0])

In [27]:
# OneHotEncoder可以特征扩维
oh_encoder = OneHotEncoder()
oh_encoder.fit_transform(np.array(['Red', 'Blue', 'Green']).reshape(-1, 1)).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

### 正规化

分为 L1 和 L2 两种，将特征间差距转化成特征长度的一个相对值

In [28]:
from sklearn.preprocessing import Normalizer

In [29]:
Normalizer(norm='l1').fit_transform(np.array([[1, 1, 3, -1, 2]]))

array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])

### 归一化

数据按比例统一成0-1之间的形式

In [6]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1, 1))



array([[0.  ],
       [0.15],
       [0.45],
       [0.7 ],
       [1.  ]])

### 标准化

把数据转化为标准差为1, 用与展现一个数据同特征下与其他数据的相对大小差距关系

In [7]:
StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1, 1))



array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.]])

# 特征选择

In [10]:
import scipy.stats as ss
df = pd.DataFrame({'A':ss.norm.rvs(size=10), 
                   'B':ss.norm.rvs(size=10), 
                   'C':ss.norm.rvs(size=10), 
                   'D':np.random.randint(0, 2, size=10)}) # D为标注
df

Unnamed: 0,A,B,C,D
0,0.856963,-1.018157,0.786064,1
1,-1.425053,-1.580649,-0.802966,0
2,2.184314,-0.064868,-1.763257,0
3,-0.576157,-0.781799,0.057625,1
4,3.107851,0.588208,0.310231,1
5,0.908251,0.586399,0.740963,1
6,1.218341,-0.132951,-1.54631,1
7,-1.206157,0.076506,0.892009,1
8,-0.499073,-1.662156,-0.970675,1
9,1.572013,0.197386,1.675919,0


In [11]:
from sklearn.svm import SVR  # 回归器
from sklearn.tree import DecisionTreeRegressor # 决策树回归器
# 分别为过滤思想， 包裹思想， 嵌入思想
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel
X = df.loc[:, ['A', 'B', 'C']]
Y = df.loc[:, 'D']

In [12]:
# 过滤思想处理
skb = SelectKBest(k=2) # 最终留下2个特征
skb.fit(X, Y)
skb.transform(X)

array([[-1.01815684,  0.78606404],
       [-1.58064936, -0.80296574],
       [-0.06486789, -1.76325705],
       [-0.78179874,  0.05762494],
       [ 0.58820762,  0.31023128],
       [ 0.58639871,  0.74096339],
       [-0.13295112, -1.54631015],
       [ 0.07650625,  0.89200924],
       [-1.66215577, -0.97067477],
       [ 0.19738566,  1.67591949]])

此步骤过滤掉A列

In [13]:
# 包裹思想处理
rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=2, step=1)
rfe.fit_transform(X, Y)

array([[ 0.85696295, -1.01815684],
       [-1.4250526 , -1.58064936],
       [ 2.18431358, -0.06486789],
       [-0.57615749, -0.78179874],
       [ 3.10785065,  0.58820762],
       [ 0.90825144,  0.58639871],
       [ 1.21834142, -0.13295112],
       [-1.20615703,  0.07650625],
       [-0.4990735 , -1.66215577],
       [ 1.57201295,  0.19738566]])

此步骤过滤掉C列

In [17]:
# 嵌入思想处理
sfm = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.01) # 重要因子
sfm.fit_transform(X, Y)

array([[ 0.85696295, -1.01815684],
       [-1.4250526 , -1.58064936],
       [ 2.18431358, -0.06486789],
       [-0.57615749, -0.78179874],
       [ 3.10785065,  0.58820762],
       [ 0.90825144,  0.58639871],
       [ 1.21834142, -0.13295112],
       [-1.20615703,  0.07650625],
       [-0.4990735 , -1.66215577],
       [ 1.57201295,  0.19738566]])

重要因子大小影响留下的列数

# 特征降维

常用 PCA 和 LDA 降维

In [30]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA降维
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
LinearDiscriminantAnalysis(n_components=1).fit_transform(X, Y)

array([[-1.73205081],
       [-1.73205081],
       [-3.46410162],
       [ 1.73205081],
       [ 1.73205081],
       [ 3.46410162]])

In [31]:
# 也可以当作分类器
fisher_classifier = LinearDiscriminantAnalysis(n_components=1).fit(X, Y)
fisher_classifier.predict([[0.8, 1]])

array([2])

In [32]:
from sklearn.decomposition import PCA
# PCA降维
lower_dim = PCA(n_components=1)
lower_dim.fit_transform(X)

array([[ 1.38340578],
       [ 2.22189802],
       [ 3.6053038 ],
       [-1.38340578],
       [-2.22189802],
       [-3.6053038 ]])

# 基于HR.csv做下练习

具体HR.csv数据全部的操作出门左拐看kaggle目录

In [41]:
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False, 
                     pl=False, dep=False, sal=False,lower_d=False, ld_n=1):
    def map_salary(s):
        d = dict([('low', 0), ('medium', 1), ('high', 2)])
        return d.get(s, 0)
    
    df = pd.read_csv("HR.csv")
    
    # 清洗数据
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    df = df[df['satisfaction_level']<=1][df['salary']!='nme']
    # 标注
    label = df['left']
    df=df.drop('left', axis=1)
    # 特征选择
    # 特征处理
    scaler_lst = [sl, le, npr, amh, tsc, wa, pl]
    column_lst = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 
                  'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]
        else:
            df[column_lst[i]] = StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]
    # 对离散值处理
    scaler_lst2 = [sal, dep]
    column_lst2 = ['salary', 'department']
    for i in range(len(scaler_lst2)):
        if not scaler_lst2[i]:
            if column_lst2[i] == 'salary':
                df[column_lst2[i]] = [map_salary(s) for s in df['salary'].values]
            else:
                df[column_lst2[i]] = LabelEncoder().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))
            # 统一归一化处理
            df[column_lst2[i]] = MinMaxScaler().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))
        else:
            df = pd.get_dummies(df, columns=[column_lst2[i]])
    if lower_d:
        # 如果为True，使用PCA降维
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

In [45]:
df_res = hr_preprocessing(sl=True, le=True, npr=True, amh=True)
print(df_res)

(       satisfaction_level  last_evaluation  number_project  \
0               -0.936495        -1.087275       -1.462863   
1                0.752814         0.840707        0.971113   
2               -2.022479         0.957554        2.593763   
3                0.431041         0.899131        0.971113   
4               -0.976716        -1.145699       -1.462863   
5               -0.815830        -1.262546       -1.462863   
6               -2.062701         0.314894        1.782438   
7                1.235474         0.782283        0.971113   
8                1.114809         1.658639        0.971113   
9               -0.775608        -1.087275       -1.462863   
10              -0.654943        -1.028852       -1.462863   
11              -2.022479         0.548588        1.782438   
12               0.913701         1.191249        0.159788   
13              -0.815830        -0.970428       -1.462863   
14              -1.016938        -0.912004       -1.462863   
15     

  y = column_or_1d(y, warn=True)
