In [27]:
# https://www.zybuluo.com/heavysheep/note/658800?fbclid=IwAR081DAm45KY5QQ4Y0VGWO25N5L0_6sSYdGFefdnlTad7TsQ-k-zIq9ZNhw

In [2]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

## method 1

In [3]:
X = np.array([[ 1., -1.,  2.],  
              [ 2.,  0.,  0.],  
              [ 0.,  1., -1.]])  

In [7]:
# calculate mean  
X_mean = X.mean(axis=0) 
X_mean

array([ 1.        ,  0.        ,  0.33333333])

In [8]:
# calculate variance   
X_std = X.std(axis=0)
X_std

array([ 0.81649658,  0.81649658,  1.24721913])

In [9]:
# standardize X  
X1 = (X-X_mean)/X_std 
X1

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [10]:
# use function preprocessing.scale to standardize X  
X_scale = preprocessing.scale(X) 
X_scale

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

## method 2

In [13]:
scaler = preprocessing.StandardScaler()  
X_scaled = scaler.fit_transform(X) 
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

### Transform features between 0 and 1

In [14]:
min_max_scaler = preprocessing.MinMaxScaler()  
X_minMax = min_max_scaler.fit_transform(X) 
X_minMax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [15]:
X_std=(X-X.min(axis=0))/(X.max(axis=0)-X.min(axis=0))  
X_std

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [17]:
X_minmax = X_std/((X.max(axis=0)-X.min(axis=0))+X.min(axis=0))
X_minmax

array([[ 0.25      ,  0.        ,  0.5       ],
       [ 0.5       ,  0.5       ,  0.16666667],
       [ 0.        ,  1.        ,  0.        ]])

## Normalization

In [18]:
X = [[ 1., -1.,  2.],  
     [ 2.,  0.,  0.],  
     [ 0.,  1., -1.]]

In [19]:
# 方法1：使用sklearn.preprocessing.normalize()函数 (p-范数(l1-norm,l2-norm)等于1)
X_normalized = preprocessing.normalize(X, norm='l2')  
X_normalized 

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [20]:
# 方法2：sklearn.preprocessing.StandardScaler类
normalizer = preprocessing.Normalizer().fit(X)
normalizer 

Normalizer(copy=True, norm='l2')

In [21]:
normalizer.transform(X) 

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [22]:
normalizer.transform([[-1.,  1., 0.]])          

array([[-0.70710678,  0.70710678,  0.        ]])

## Binarization

In [25]:
binarizer = preprocessing.Binarizer().fit(X)  # fit does nothing
binarizer 

Binarizer(copy=True, threshold=0.0)

In [26]:
binarizer.transform(X) 

array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [28]:
#Binarizer函数也可以设定一个阈值，结果数据值大于阈值的为1，小于阈值的为0
binarizer1 = preprocessing.Binarizer(threshold=1.1)  
binarizer1.transform(X)  

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])

## 缺失值處理

In [29]:
from sklearn.preprocessing import Imputer  

In [35]:
# 使用均值来处理

In [31]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [32]:
imp.fit([[1, 2], [np.nan, 3], [7, 6]]) 

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [33]:
X = [[np.nan, 2], [6, np.nan], [7, 6]]

In [34]:
imp.transform(X)

array([[ 4.        ,  2.        ],
       [ 6.        ,  3.66666667],
       [ 7.        ,  6.        ]])

In [36]:
# 稀疏矩陣
import scipy.sparse as sp  

In [40]:
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])  
imp = Imputer(missing_values=0, strategy='mean', axis=0)  
imp

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [41]:
imp.fit(X)

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [42]:
X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])  

In [43]:
imp.transform(X_test)

array([[ 4.        ,  2.        ],
       [ 6.        ,  3.66666667],
       [ 7.        ,  6.        ]])