# 数据预处理

In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [3]:
# Toy sample: four observations (rows) with two features (columns).
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]
# Render the nested list as a table for inspection.
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


## 归一化处理

In [4]:
# Create the scaler and fit it in one expression. fit() essentially computes
# min(x) and max(x) per column and hands back the fitted scaler.
scalar = MinMaxScaler().fit(data)
# Apply the fitted scaling to export the normalized data.
result = scalar.transform(data)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [5]:
# fit and transform can also be combined and executed in a single call
result_ = scalar.fit_transform(data)
result_

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

## 归一化结果还原

In [6]:
# Map the normalized array back to the original value scale
scalar.inverse_transform(result)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

## 归一化到指定区间范围

默认归一化的区间为[0,1],还可以手动设定区间范围，例如[5,10]

In [48]:
# Instantiate with feature_range to choose the target interval.
# feature_range is documented as a (min, max) tuple; scikit-learn releases that
# validate constructor parameters reject a list here, so a tuple is passed.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler(feature_range=(5, 10))
# Fit on the sample and rescale it into [5, 10] in one call.
temp = scaler.fit_transform(data)
temp

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

## 使用 numpy 进行归一化处理

### 归一化处理
归一化的公式为：$y = \dfrac{x - \min(x)}{\max(x) - \min(x)}$，其中 $\max(x) - \min(x)$ 为极差（最大值与最小值之差）。

In [18]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
X = np.array(data)
# Min-max normalization works column by column (axis=0):
# subtract each column's minimum, then divide by that column's range.
col_min = X.min(axis=0)
col_span = X.max(axis=0) - col_min
X_normalization = (X - col_min) / col_span
X_normalization

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

### 还原归一化后数据

In [19]:
# Undo the normalization: multiply by each column's range, then add back
# that column's minimum.
col_min = X.min(axis=0)
col_span = X.max(axis=0) - col_min
X_result = X_normalization * col_span + col_min
X_result

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

# 数据标准化

标准化公式为：$z = \dfrac{x - \overline{x}}{\sigma}$，其中 $\overline{x}$ 为均值，$\sigma$ 为标准差。

In [41]:
from sklearn.preprocessing import StandardScaler

In [43]:
# Standardize the sample with a StandardScaler (z-score per column).
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler_std = StandardScaler()
# Fit the StandardScaler itself. Fitting `scaler` here would instead refit the
# MinMaxScaler defined earlier in the notebook and produce min-max output.
scaler_std = scaler_std.fit(data)
# Cells further down read the fitted standardizer through the name `scaler`
# (inverse_transform, var_, mean_), so bind that name to the same object.
scaler = scaler_std
x_std = scaler_std.transform(data)
x_std

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [45]:
# Reverse operation: map x_std back through the fitted scaler's inverse_transform
scaler.inverse_transform(x_std)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [46]:
# Per-column variance stored on the fitted scaler
scaler.var_

array([ 0.546875, 35.      ])

In [47]:
# Per-column mean stored on the fitted scaler
scaler.mean_

array([-0.125,  9.   ])

# 缺失值处理

In [1]:
import pandas as pd

In [3]:
# Narrativedata is a dataset extracted from the Titanic data.
# index_col=0 uses the first CSV column as the row index.

data = pd.read_csv(r"Narrativedata.csv",index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [4]:
# Show dtypes and non-null counts per column to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [13]:
# Select every row of the 'Age' column.
age_column = data.loc[:, "Age"]
# .values exposes the column as a NumPy array; reshape(-1, 1) turns it into
# a single-column 2-D array.
Age = age_column.values.reshape(-1, 1)
# Preview the first ten entries.
Age[:10]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.]])

## SimpleImputer填补缺失值

### 使用均值、中位数、0填补

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
# Instantiate one imputer per filling strategy.
# Mean fill ("mean" is also the default strategy).
imp_mean = SimpleImputer(strategy="mean")
# Median fill.
imp_median = SimpleImputer(strategy="median")
# Constant fill with 0.
imp_zero = SimpleImputer(strategy="constant", fill_value=0)

In [16]:
# Use fit_transform to fit each imputer and fill the missing values in one step.
# NOTE(review): each name is rebound from the imputer object to the filled array,
# so this cell cannot be re-run without first re-running the instantiation cell.
imp_mean = imp_mean.fit_transform(Age)
imp_median = imp_median.fit_transform(Age)
imp_zero = imp_zero.fit_transform(Age)

In [17]:
# First ten rows after mean filling
imp_mean[:10]

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ]])

In [18]:
# First ten rows after median filling
imp_median[:10]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [28.],
       [54.],
       [ 2.],
       [27.],
       [14.]])

In [19]:
# First ten rows after filling with the constant 0
imp_zero[:10]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.]])

In [21]:
# Final choice: fill Age with the median-imputed values
data.loc[:,'Age'] = imp_median
# Re-check the non-null counts after the write-back
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 74.8+ KB


### 使用众数填补

In [22]:
# Fill the missing Embarked entries with the mode (most frequent value).
embarked_column = data.loc[:, 'Embarked']
Embarked = embarked_column.values.reshape(-1, 1)
# Build the imputer and apply it in one expression; imp_mode ends up holding
# the filled array.
imp_mode = SimpleImputer(strategy='most_frequent').fit_transform(Embarked)

# Write the filled values back into the DataFrame.
data.loc[:, 'Embarked'] = imp_mode

In [24]:
# Re-check the non-null counts after filling Embarked
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 74.8+ KB


## 使用pandas和numpy填补

In [25]:
import pandas as pd
import numpy as np

In [27]:
# Reload the raw CSV into a separate frame for the pandas-based filling below;
# index_col=0 uses the first CSV column as the row index.
data1 = pd.read_csv(r"Narrativedata.csv",index_col=0)
data1.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [30]:
# Fill the missing Age values with the column median using pandas alone.
age_median = data1.loc[:, 'Age'].median()
data1.loc[:, 'Age'] = data1.loc[:, 'Age'].fillna(age_median)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [31]:
# Drop every row that still contains a missing value (axis=0 selects rows);
# inplace=True modifies data1 itself instead of returning a new frame.
data1.dropna(axis=0,inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       889 non-null    float64
 1   Sex       889 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  889 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.7+ KB


## 编码和哑变量