In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.impute import SimpleImputer #填补缺失值的类
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [2]:
dataset = load_boston()
dataset


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [3]:
#总共506*13=6578个数据
X_full,y_full = dataset.data,dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

In [54]:
n_samples

506

In [4]:
#首先确定我们希望放入的缺失数据的比例，假设为50%，共3289个数据缺失
rng = np.random.RandomState(0)
missing_rate = 0.5
n_missing_samples = int(np.floor(n_samples*n_features*missing_rate)) #np.floor向下取整，返回.0格式的浮点数
n_missing_samples

3289

思路：

    所有数据要随机遍布在数据集的各行各列中；
    一个缺失数据需要一个行索引和一个列索引；
    如果能创造一个数组，包含3289个分布在0-506之间的行索引和0-13之间的列索引，就可以利用索引为数据中的任意3289个位置赋空值
    然后用0，均值或随机森林来填补这些缺失值，查看回归结果

In [5]:
missing_features = rng.randint(0,n_features,n_missing_samples)
missing_samples = rng.randint(0,n_samples,n_missing_samples)

In [56]:
missing_features

array([12,  5,  0, ..., 11,  0,  2])

In [7]:
len(missing_samples)

3289

In [8]:
X_missing = X_full.copy()
y_missing = y_full.copy()

In [9]:
X_missing[missing_samples,missing_features] = np.nan
X_missing = pd.DataFrame(X_missing) #为了后续操作方便
X_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.1,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.0,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.3,,1.0,,21.0,393.45,6.48


In [10]:
#使用均值填补缺失值
imp_mean = SimpleImputer(missing_values=np.nan,strategy='mean')
X_missing_mean = imp_mean.fit_transform(X_missing)

In [11]:
X_missing_mean

array([[3.62757895e+00, 1.80000000e+01, 1.11634641e+01, ...,
        1.85211921e+01, 3.52741952e+02, 4.98000000e+00],
       [2.73100000e-02, 0.00000000e+00, 1.11634641e+01, ...,
        1.85211921e+01, 3.96900000e+02, 9.14000000e+00],
       [2.72900000e-02, 1.07229508e+01, 7.07000000e+00, ...,
        1.85211921e+01, 3.52741952e+02, 1.29917666e+01],
       ...,
       [3.62757895e+00, 1.07229508e+01, 1.19300000e+01, ...,
        2.10000000e+01, 3.52741952e+02, 5.64000000e+00],
       [1.09590000e-01, 0.00000000e+00, 1.19300000e+01, ...,
        2.10000000e+01, 3.93450000e+02, 6.48000000e+00],
       [4.74100000e-02, 0.00000000e+00, 1.19300000e+01, ...,
        1.85211921e+01, 3.96900000e+02, 7.88000000e+00]])

In [12]:
pd.DataFrame(X_missing_mean)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3.627579,18.000000,11.163464,0.066007,0.538000,6.305921,65.2,4.090000,1.000000,296.000000,18.521192,352.741952,4.980000
1,0.027310,0.000000,11.163464,0.000000,0.469000,6.305921,78.9,4.967100,2.000000,405.935275,18.521192,396.900000,9.140000
2,0.027290,10.722951,7.070000,0.000000,0.564128,7.185000,61.1,3.856371,2.000000,242.000000,18.521192,352.741952,12.991767
3,3.627579,10.722951,11.163464,0.000000,0.458000,6.305921,45.8,3.856371,9.383871,222.000000,18.700000,352.741952,12.991767
4,3.627579,0.000000,2.180000,0.000000,0.564128,7.147000,67.4,3.856371,9.383871,405.935275,18.700000,352.741952,5.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,3.627579,10.722951,11.163464,0.000000,0.573000,6.305921,69.1,3.856371,1.000000,405.935275,21.000000,352.741952,9.670000
502,0.045270,0.000000,11.930000,0.000000,0.573000,6.120000,76.7,2.287500,1.000000,273.000000,18.521192,396.900000,9.080000
503,3.627579,10.722951,11.930000,0.066007,0.573000,6.976000,91.0,3.856371,9.383871,405.935275,21.000000,352.741952,5.640000
504,0.109590,0.000000,11.930000,0.066007,0.573000,6.305921,89.3,3.856371,1.000000,405.935275,21.000000,393.450000,6.480000


In [13]:
pd.DataFrame(X_missing_mean).isnull().sum() #判断是否存在缺失值

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

In [14]:
#使用0进行填补
imp_0 = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0)
X_missing_0 = imp_0.fit_transform(X_missing)

In [15]:
pd.DataFrame(X_missing_0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00000,18.0,0.00,0.0,0.538,0.000,65.2,4.0900,1.0,296.0,0.0,0.00,4.98
1,0.02731,0.0,0.00,0.0,0.469,0.000,78.9,4.9671,2.0,0.0,0.0,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.000,7.185,61.1,0.0000,2.0,242.0,0.0,0.00,0.00
3,0.00000,0.0,0.00,0.0,0.458,0.000,45.8,0.0000,0.0,222.0,18.7,0.00,0.00
4,0.00000,0.0,2.18,0.0,0.000,7.147,0.0,0.0000,0.0,0.0,18.7,0.00,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.00000,0.0,0.00,0.0,0.573,0.000,69.1,0.0000,1.0,0.0,21.0,0.00,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,0.0,396.90,9.08
503,0.00000,0.0,11.93,0.0,0.573,6.976,91.0,0.0000,0.0,0.0,21.0,0.00,5.64
504,0.10959,0.0,11.93,0.0,0.573,0.000,89.3,0.0000,1.0,0.0,21.0,393.45,6.48


使用随机森林填补缺失值：对某一列缺失值特别多的情况非常适用

In [16]:
X_missing_reg = X_missing.copy()

In [17]:
#找出数据集中缺失值从小到大排列的特征的顺序
sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values

In [18]:
np.sort(X_missing_reg.isnull().sum(axis=0))

array([185, 189, 196, 197, 197, 200, 200, 201, 201, 202, 203, 204, 214],
      dtype=int64)

In [19]:
 np.argsort(X_missing_reg.isnull().sum(axis=0)) #返回从小到大排序的顺序所对应的索引 

0      6
1     12
2      8
3      7
4      9
5      0
6      2
7      1
8      5
9      4
10     3
11    10
12    11
dtype: int64

In [20]:
X_missing_reg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.09,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.9,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33


In [21]:
 #构建新特征矩阵和新标签(没有被选中去填充的特征+原始的标签)和新标签（被选取填充的特征）
df = X_missing_reg

In [22]:
#新标签
fillc = df.iloc[:,6]
fillc

0      65.2
1      78.9
2      61.1
3      45.8
4       NaN
       ... 
501    69.1
502    76.7
503    91.0
504    89.3
505     NaN
Name: 6, Length: 506, dtype: float64

In [23]:
#新特征矩阵
df.iloc[:,df.columns !=6]

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,11,12
0,,18.0,,,0.538,,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,,2.0,242.0,,,
3,,,,0.0,0.458,,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,,1.0,,21.0,393.45,6.48


In [24]:
df.columns 

RangeIndex(start=0, stop=13, step=1)

In [25]:
df.columns !=6

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True])

In [26]:
df = pd.concat([df.iloc[:,df.columns !=6],pd.DataFrame(y_full)],axis=1)

In [27]:
df

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,11,12,0.1
0,,18.0,,,0.538,,4.0900,1.0,296.0,,,4.98,24.0
1,0.02731,0.0,,0.0,0.469,,4.9671,2.0,,,396.90,9.14,21.6
2,0.02729,,7.07,0.0,,7.185,,2.0,242.0,,,,34.7
3,,,,0.0,0.458,,,,222.0,18.7,,,33.4
4,,0.0,2.18,0.0,,7.147,,,,18.7,,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,,1.0,,21.0,,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,2.2875,1.0,273.0,,396.90,9.08,20.6
503,,,11.93,,0.573,6.976,,,,21.0,,5.64,23.9
504,0.10959,0.0,11.93,,0.573,,,1.0,,21.0,393.45,6.48,22.0


In [28]:
y_full

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [29]:
#在新特征矩阵中，对含有缺失值的列进行0填补
df_0 = SimpleImputer(missing_values=np.nan,
                       strategy='constant',fill_value=0).fit_transform(df)
df_0

array([[0.0000e+00, 1.8000e+01, 0.0000e+00, ..., 0.0000e+00, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 0.0000e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 0.0000e+00, 0.0000e+00,
        3.4700e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 1.1930e+01, ..., 0.0000e+00, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

In [30]:
pd.DataFrame(df_0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00000,18.0,0.00,0.0,0.538,0.000,4.0900,1.0,296.0,0.0,0.00,4.98,24.0
1,0.02731,0.0,0.00,0.0,0.469,0.000,4.9671,2.0,0.0,0.0,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.000,7.185,0.0000,2.0,242.0,0.0,0.00,0.00,34.7
3,0.00000,0.0,0.00,0.0,0.458,0.000,0.0000,0.0,222.0,18.7,0.00,0.00,33.4
4,0.00000,0.0,2.18,0.0,0.000,7.147,0.0000,0.0,0.0,18.7,0.00,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.00000,0.0,0.00,0.0,0.573,0.000,0.0000,1.0,0.0,21.0,0.00,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,2.2875,1.0,273.0,0.0,396.90,9.08,20.6
503,0.00000,0.0,11.93,0.0,0.573,6.976,0.0000,0.0,0.0,21.0,0.00,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,0.000,0.0000,1.0,0.0,21.0,393.45,6.48,22.0


In [31]:
pd.DataFrame(df_0).isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

In [32]:
#找出训练集和测试集
#是被选中要填充的特征中（现在是标签）存在的值，非空值
Ytrain = fillc[fillc.notnull()]

In [33]:
fillc.notnull()

0       True
1       True
2       True
3       True
4      False
       ...  
501     True
502     True
503     True
504     True
505    False
Name: 6, Length: 506, dtype: bool

In [34]:
fillc[fillc.notnull()]

0      65.2
1      78.9
2      61.1
3      45.8
5      58.7
       ... 
500    79.7
501    69.1
502    76.7
503    91.0
504    89.3
Name: 6, Length: 321, dtype: float64

In [35]:
#是被选中要填充的特征中（现在是标签）不存在的值，空值
#需要的是Ytest所带的索引
Ytest = fillc[fillc.isnull()]

In [36]:
Ytest

4     NaN
8     NaN
9     NaN
10    NaN
14    NaN
       ..
482   NaN
488   NaN
493   NaN
494   NaN
505   NaN
Name: 6, Length: 185, dtype: float64

In [37]:
Ytrain.index

Int64Index([  0,   1,   2,   3,   5,   6,   7,  11,  12,  13,
            ...
            495, 496, 497, 498, 499, 500, 501, 502, 503, 504],
           dtype='int64', length=321)

In [38]:
#在新特征矩阵中，被选出来的要填充的特征的非空值所对应的记录
Xtrain = df_0[Ytrain.index,:]

In [39]:
#在新特征矩阵中，被选出来的要填充的特征的空值所对应的记录
Xtest = df_0[Ytest.index,:]

In [40]:
#用随机森林回归来填补缺失值
rfc = RandomForestRegressor(n_estimators=100) #实例化
rfc = rfc.fit(Xtrain,Ytrain) #训练
Ypred = rfc.predict(Xtest) #得到预测结果，就是用来填补空值的那些值

In [41]:
Ypred

array([61.096, 79.218, 65.028, 64.96 , 77.169, 63.543, 80.24 , 75.69 ,
       95.115, 88.373, 57.212, 58.393, 80.299, 69.866, 52.779, 41.641,
       29.402, 30.022, 56.711, 46.696, 36.264, 44.287, 27.616, 45.998,
       37.714, 53.329, 42.884, 59.449, 46.257, 39.031, 59.532, 58.296,
       62.13 , 75.611, 74.927, 73.494, 80.917, 76.448, 85.775, 73.075,
       61.945, 90.507, 78.589, 87.827, 85.758, 85.494, 86.759, 95.083,
       90.114, 95.473, 90.435, 96.523, 88.918, 94.553, 86.007, 90.104,
       90.201, 94.613, 91.115, 95.372, 81.791, 72.702, 64.833, 67.072,
       55.995, 66.427, 44.397, 66.894, 59.649, 61.845, 35.629, 29.15 ,
       26.863, 57.966, 71.524, 76.982, 78.826, 65.526, 66.921, 76.498,
       66.869, 66.722, 60.157, 48.11 , 51.977, 31.517, 26.663, 58.281,
       56.054, 32.911, 23.576, 30.194, 53.301, 25.329, 42.941, 77.07 ,
       89.53 , 88.374, 89.936, 88.736, 77.08 , 51.233, 26.207, 48.535,
       42.861, 36.855, 28.202, 25.993, 31.768, 35.166, 44.88 , 33.41 ,
      

In [42]:
len(Ypred)

185

In [43]:
pd.DataFrame(df_0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00000,18.0,0.00,0.0,0.538,0.000,4.0900,1.0,296.0,0.0,0.00,4.98,24.0
1,0.02731,0.0,0.00,0.0,0.469,0.000,4.9671,2.0,0.0,0.0,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.000,7.185,0.0000,2.0,242.0,0.0,0.00,0.00,34.7
3,0.00000,0.0,0.00,0.0,0.458,0.000,0.0000,0.0,222.0,18.7,0.00,0.00,33.4
4,0.00000,0.0,2.18,0.0,0.000,7.147,0.0000,0.0,0.0,18.7,0.00,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.00000,0.0,0.00,0.0,0.573,0.000,0.0000,1.0,0.0,21.0,0.00,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,2.2875,1.0,273.0,0.0,396.90,9.08,20.6
503,0.00000,0.0,11.93,0.0,0.573,6.976,0.0000,0.0,0.0,21.0,0.00,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,0.000,0.0000,1.0,0.0,21.0,393.45,6.48,22.0


In [44]:
X_missing_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.1,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.0,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.3,,1.0,,21.0,393.45,6.48


In [45]:
#将填补好的特征返回到原始的特征矩阵中
X_missing_reg.loc[X_missing_reg.iloc[:,6].isnull(),6] = Ypred

In [46]:
X_missing_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.200,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.900,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.100,,2.0,242.0,,,
3,,,,0.0,0.458,,45.800,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,61.096,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.100,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.700,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.000,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.300,,1.0,,21.0,393.45,6.48


In [47]:
X_missing_reg.isnull().sum()

0     200
1     201
2     200
3     203
4     202
5     201
6       0
7     197
8     196
9     197
10    204
11    214
12    189
dtype: int64

In [57]:
for i in sortindex:
    
    #构建新特征矩阵和新标签
    df = X_missing_reg
    fillc = df.iloc[:,i]  #新标签
    df = pd.concat([df.iloc[:,df.columns !=i],pd.DataFrame(y_full)],axis=1)
    
    #在新特征矩阵中，对含有缺失值的列进行0填补
    df_0 = SimpleImputer(missing_values=np.nan,
                        strategy='constant',fill_value=0).fit_transform(df)
    
    #找出训练集和测试集
    Ytrain = fillc[fillc.notnull()]
    Ytest = fillc[fillc.isnull()]
    Xtrain = df_0[Ytrain.index,:]
    Xtest = df_0[Ytest.index,:]
    
    #用随机森林回归来填补缺失值
    rfc = RandomForestRegressor(n_estimators=100) #实例化
    rfc = rfc.fit(Xtrain,Ytrain) #训练
    Ypred = rfc.predict(Xtest)
    
    #将填补好的特征返回到原始的特征矩阵中
    X_missing_reg.loc[X_missing_reg.iloc[:,i].isnull(),i] = Ypred


ValueError: Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.

对填充好的数据进行建模

In [49]:
x = [X_full,X_missing_mean,X_missing_0,X_missing_reg]

mse = []
for x in x:
    estimator = RandomForestRegressor(random_state=0,n_estimators = 100)
    scores = cross_val_score(estimator,x,y_full,scoring='neg_mean_squared_error',cv=10).mean()
    mse.append(scores*-1)


"\nx = [X_full,X_missing_mean,X_missing_0,X_missing_reg]\n\nmse = []\nfor x in x:\n    estimator = RandomForestRegressor(random_state=0,n_estimators = 100)\n    scores = cross_val_score(estimator,x,y_full,scoring='neg_mean_squared_error',cv=10).mean()\n    mse.append(scores*-1)\n"

In [50]:
mse

In [51]:
[*zip(['X_full','X_missing_mean','X_missing_0','X_missing_reg'],mse)]

In [52]:
x_labels = ['Full data','Mean Imputation','Zero Imputation','Regressor Imputation']
colors = ['r','g','b','orange']
plt.figure(figsize=(12,6))
ax = plt.subplot(111) #添加子图

for i in np.arange(len(mse)):
    ax.barh(i,mse[i],color=colors[i],alpha=0.6,align='center')

ax.set_title('Imputation Techniques with Boston Data')
ax.set_xlim(left=np.min(mse)*0.9,right=np.max(mse)*1.1)
ax.set_yticks(np.arange(len(mse)))
ax.invert_yaxis()
ax.set_yticklabels(x_labels)
plt.show()
'''

"\nx_labels = ['Full data','Mean Imputation','Zero Imputation','Regressor Imputation']\ncolors = ['r','g','b','orange']\nplt.figure(figsize=(12,6))\nax = plt.subplot(111) #添加子图\n\nfor i in np.arange(len(mse)):\n    ax.barh(i,mse[i],color=colors[i],alpha=0.6,align='center')\n\nax.set_title('Imputation Techniques with Boston Data')\nax.set_xlim(left=np.min(mse)*0.9,right=np.max(mse)*1.1)\nax.set_yticks(np.arange(len(mse)))\nax.invert_yaxis()\nax.set_yticklabels(x_labels)\nplt.show()\n"