In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=load_boston()

In [3]:
dataset.data.shape

(506, 13)

In [4]:
dataset.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [5]:
X_full,y_full=dataset.data,dataset.target

In [6]:
n_samples=X_full.shape[0]#样本数量
n_features=X_full.shape[1]#标签数量

In [7]:
n_samples

506

In [8]:
n_features

13

In [9]:
#首先确定我们希望放入的缺失数据的比例，在这里我们假设是50%，那总共就要有3289个数据缺失
rng=np.random.RandomState(0)
missing_state=0.5
n_missing_samples=int(np.floor(n_samples*n_features*missing_state))
#np.floor向下取整，返回.0格式的浮点数
n_missing_samples

3289

In [10]:
#所有数据要随机遍布在数据集的各行各列当中，而一个缺失的数据会需要一个行索引和一个列索引
#如果能够创造一个数组，包含3289个分布在0~506中间的行索引，和3289个分布在0~13之间的列索引
#，那我们就可以利用索引来为数据中的任意3289个位置赋空值
#然后我们用0，均值和随机森林来填写这些缺失值，然后查看回归的结果如何

In [11]:
missing_features=rng.randint(0,n_features,n_missing_samples)
#randint(上限，下限，n)请在下限和上限之间取出n个整数
missing_samples=rng.randint(0,n_samples,n_missing_samples)
#我们现在采样了3289个数据，远远超过我们的样本量506，所以我们使用随机抽取的函数randint。但如果我们需要
#的数据量小于我们的样本量506，那我们可以采用np.random.choice来抽样，choice会随机抽取不重复的随机数，
#因此可以帮助我们让数据更加分散，确保数据不会集中在一些行中

In [12]:
missing_features

array([12,  5,  0, ..., 11,  0,  2])

In [13]:
missing_samples

array([150, 125,  28, ..., 132, 456, 402])

In [14]:
X_missing=X_full.copy()
X_missing

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [15]:
y_missing=y_full.copy()

In [16]:
X_missing[missing_samples,missing_features]=np.nan

In [17]:
X_missing

array([[       nan, 1.8000e+01,        nan, ...,        nan,        nan,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00,        nan, ...,        nan, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02,        nan, 7.0700e+00, ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan, 1.1930e+01, ..., 2.1000e+01,        nan,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ...,        nan, 3.9690e+02,
        7.8800e+00]])

In [18]:
X_missing=pd.DataFrame(X_missing)

In [19]:
#填补缺失值

In [20]:
from sklearn.impute import SimpleImputer
#使用均值填补
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')#实例化
X_missing_mean = imp_mean.fit_transform(X_missing)#训练加导出

In [21]:
#使用0填补
imp_0= SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
X_missing_0 = imp_mean.fit_transform(X_missing)

In [22]:
#使用随机森林

In [23]:
X_missing_reg=X_missing.copy()

In [24]:
#缺失值从少到多的顺序
sortindex=np.argsort(X_missing_reg.isnull().sum(axis=0)).values#argsort 返回的是缺失值数量对应的索引

In [25]:
for i in sortindex:
    
    #构建我们的新特征矩阵（没有被选中去填充的特征 + 原始的标签）和新标签（被选中去填充的特征）
    df = X_missing_reg
    fillc = df.iloc[:,i]#新标签
    df = pd.concat([df.iloc[:,df.columns != i],pd.DataFrame(y_full)],axis=1)#新特征矩阵
    
    #在新特征矩阵中，对含有缺失值的列，进行0的填补
    df_0 =SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0).fit_transform(df)
                        
    #找出我们的训练集和测试集
    Ytrain = fillc[fillc.notnull()]# Ytrain是被选中要填充的特征中（现在是我们的标签），存在的那些值：非空值
    Ytest = fillc[fillc.isnull()]#Ytest 是被选中要填充的特征中（现在是我们的标签），不存在的那些值：空值。注意我们需要的不是Ytest的值，需要的是Ytest所带的索引
    Xtrain = df_0[Ytrain.index,:]#在新特征矩阵上，被选出来的要填充的特征的非空值所对应的记录
    Xtest = df_0[Ytest.index,:]#在新特征矩阵上，被选出来的要填充的特征的空值所对应的记录
    
    #用随机森林回归来填补缺失值
    rfc = RandomForestRegressor(n_estimators=100)#实例化
    rfc = rfc.fit(Xtrain, Ytrain)#导入训练集进行训练
    Ypredict = rfc.predict(Xtest)#用predict接口将Xtest导入，得到我们的预测结果（回归结果），就是我们要用来填补空值的这些值
    
    #将填补好的特征返回到我们的原始的特征矩阵中
    X_missing_reg.loc[X_missing_reg.iloc[:,i].isnull(),i] = Ypredict

In [26]:
X_missing_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.309451,18.00,6.4172,0.12,0.538000,6.71413,65.200,4.090000,1.00,296.00,18.084,390.2334,4.9800
1,0.027310,0.00,5.5450,0.00,0.469000,6.14344,78.900,4.967100,2.00,304.47,18.279,396.9000,9.1400
2,0.027290,16.02,7.0700,0.00,0.461598,7.18500,61.100,4.162324,2.00,242.00,17.921,384.8937,4.7704
3,0.081036,17.91,2.8607,0.00,0.458000,6.93289,45.800,4.959707,3.33,222.00,18.700,394.5553,5.6689
4,0.069100,0.00,2.1800,0.00,0.467460,7.14700,59.171,5.055527,3.44,228.01,18.700,394.1984,5.3300
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.463504,2.30,10.0906,0.00,0.573000,6.17791,69.100,2.973695,1.00,289.52,21.000,391.1198,9.6700
502,0.045270,0.00,11.9300,0.00,0.573000,6.12000,76.700,2.287500,1.00,273.00,19.162,396.9000,9.0800
503,0.788188,1.32,11.9300,0.15,0.573000,6.97600,91.000,2.753048,4.39,364.15,21.000,381.1755,5.6400
504,0.109590,0.00,11.9300,0.06,0.573000,6.27787,89.300,2.796667,1.00,267.66,21.000,393.4500,6.4800


In [27]:
X = [X_full,X_missing_mean,X_missing_0,X_missing_reg]
 
mse = []
std = []
for x in X:
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)#实例化
    scores = cross_val_score(estimator,x,y_full,scoring='neg_mean_squared_error', cv=5).mean()
    mse.append(scores * -1)

In [28]:
mse

[21.571667100368845, 40.848037216676374, 40.848037216676374, 17.67586856682197]

In [29]:
[*zip(['Full data','Zero Imputation','Mean Imputation','Regressor Imputation'],mse)]

[('Full data', 21.571667100368845),
 ('Zero Imputation', 40.848037216676374),
 ('Mean Imputation', 40.848037216676374),
 ('Regressor Imputation', 17.67586856682197)]