In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.impute import  SimpleImputer #用来填补缺失值
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [47]:
boston = load_boston()

In [48]:
xfull, yfull = boston.data, boston.target#无缺失值

In [49]:
n_samples, n_features = xfull.shape

In [50]:
#确定放入缺失值的比例，在这里，我们选取缺失的比例为50％
rng = np.random.RandomState(0)
missing_rate = 0.5
n_missing_samples = int(np.floor(n_samples * n_features * missing_rate))

In [51]:
#随机选择特征值将其赋空值，选取missing_samples个缺失值
#在上下限之间选取n个数值
missing_features = rng.randint(0, n_features, n_missing_samples)

missing_samples = rng.randint(0, n_samples, n_missing_samples)

In [52]:
#在这里我们选择的缺失数量为3289，但是若是缺失值少于506样本数量时候，我们会
#希望在样本中的随机抽取的值不要重复，而使用 .choice
rng.choice(n_samples, n_missing_samples#在这里缺失值少于上限
           , replace=False
          )

ValueError: Cannot take a larger sample than population when 'replace=False'

In [93]:
xmissing = xfull.copy()
ymissing = yfull.copy()

In [94]:
xmissing[missing_samples, missing_features] = np.nan

In [95]:
xmissing = pd.DataFrame(xmissing)

In [103]:
# 利用均值填补,实例化再训练导出
imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
X_missing_mean = imp_mean.fit_transform(xmissing)

In [88]:
#用来查看是否还存在空值
pd.DataFrame(X_missing_mean).isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

In [102]:
#使用固定值填充
imp_zero = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
X_missing_zero = imp_zero.fit_transform(xmissing)

In [96]:
#使用随机森林填补缺失值,从缺失值最少开始
xmissingreg = xmissing.copy()

In [97]:
#找出缺失值从小到大排列，用argsort是对索引排序
sortindex = np.argsort(xmissingreg.isnull().sum(axis=0)).values

In [98]:
s = []
for i in sortindex:
    #构建新特征矩阵和标签
    df = xmissingreg
    fillc = df.iloc[:, i]
    df = pd.concat([df.iloc[:, df.columns != i], pd.DataFrame(yfull)], axis=1)
    #注意到fit_transform否则会报错
    df_0 = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0).fit_transform(df)
    ytrain = fillc[fillc.notnull()]
    ytest = xfull[fillc.isnull(), i]
    xtrain = df_0[fillc.notnull(), :]
    xtest = df_0[fillc.isnull(), :]
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(xtrain, ytrain)
    ypredict = rfc.predict(xtest)
    score = rfc.score(xtest, ytest).mean()
    s.append(score)
    xmissingreg.loc[xmissingreg.iloc[:,i].isnull(), i] = ypredict

In [99]:
s

[0.7170656767164376,
 0.7486613304728691,
 0.9083422554122969,
 0.6672213237804563,
 0.8560749891513382,
 0.60546999147995,
 0.8151701582968794,
 0.7002742869537676,
 0.5148492238249522,
 0.7833475429348631,
 -0.08239312056737581,
 0.6834643503299616,
 0.18182942457524154]

In [101]:
xmissingreg.notnull().sum()

0     506
1     506
2     506
3     506
4     506
5     506
6     506
7     506
8     506
9     506
10    506
11    506
12    506
dtype: int64

In [110]:
#对使用均值，常数0， 随机森林填充完之后进行交叉验证
X = [xfull, X_missing_mean, X_missing_zero, xmissingreg]
mse=[]
for x in X:
    rfc = RandomForestRegressor(n_estimators=100, random_state=10)
    scores = cross_val_score(rfc, x, yfull, scoring="neg_mean_squared_error", cv=10).mean()
    mse.append(scores * -1)


In [111]:
mse

[22.068120025176466, 41.65513931886275, 45.18132287070587, 19.253397014196082]

In [113]:
[*zip(["xfull", "x_missing_mean", "x_missing_zeror", "xmissingreg"], mse)]

[('xfull', 22.068120025176466),
 ('x_missing_mean', 41.65513931886275),
 ('x_missing_zeror', 45.18132287070587),
 ('xmissingreg', 19.253397014196082)]