In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer  #填补缺失值
from sklearn.model_selection import cross_val_score

In [53]:
dataset = load_boston()
dataset.data.shape
dataset.target[:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

In [54]:
X_full,y_full = dataset.data,dataset.target
n_sample = X_full.shape[0]
n_features = X_full.shape[1]

In [55]:
rng = np.random.RandomState(0) #确定随机种子
missing_rate = 0.5             #缺失率为50%

n_missing_samples = int(np.floor(n_sample*n_features*missing_rate))
n_missing_samples

3289

In [56]:
#randint(上限，下限，n)：在上限和下限之间随机取出n个整数
missing_features = rng.randint(0,n_features,n_missing_samples)
print(missing_features)
missing_samples = rng.randint(0,n_sample,n_missing_samples)
print(missing_samples)

[12  5  0 ... 11  0  2]
[150 125  28 ... 132 456 402]


In [57]:
X_missing = X_full.copy()
y_missing = y_full.copy()

In [58]:
X_missing[missing_samples,missing_features] = np.nan
X_missing = pd.DataFrame(X_missing)
print(X_missing)

           0     1      2    3      4      5     6       7    8      9    10  \
0        NaN  18.0    NaN  NaN  0.538    NaN  65.2  4.0900  1.0  296.0   NaN   
1    0.02731   0.0    NaN  0.0  0.469    NaN  78.9  4.9671  2.0    NaN   NaN   
2    0.02729   NaN   7.07  0.0    NaN  7.185  61.1     NaN  2.0  242.0   NaN   
3        NaN   NaN    NaN  0.0  0.458    NaN  45.8     NaN  NaN  222.0  18.7   
4        NaN   0.0   2.18  0.0    NaN  7.147   NaN     NaN  NaN    NaN  18.7   
..       ...   ...    ...  ...    ...    ...   ...     ...  ...    ...   ...   
501      NaN   NaN    NaN  0.0  0.573    NaN  69.1     NaN  1.0    NaN  21.0   
502  0.04527   0.0  11.93  0.0  0.573  6.120  76.7  2.2875  1.0  273.0   NaN   
503      NaN   NaN  11.93  NaN  0.573  6.976  91.0     NaN  NaN    NaN  21.0   
504  0.10959   0.0  11.93  NaN  0.573    NaN  89.3     NaN  1.0    NaN  21.0   
505  0.04741   0.0  11.93  0.0  0.573  6.030   NaN     NaN  1.0    NaN   NaN   

         11    12  
0       NaN  4.98  

In [20]:
#用均值进行填充
imp_mean = SimpleImputer(missing_values = np.nan,strategy = 'mean')  #训练器
x_missing_mean = imp_mean.fit_transform(X_missing)
print(x_missing_mean)

[[6.32000000e-03 1.17877814e+01 1.08613861e+01 ... 1.84477966e+01
  3.96900000e+02 4.98000000e+00]
 [2.73100000e-02 1.17877814e+01 7.07000000e+00 ... 1.84477966e+01
  3.57321406e+02 9.14000000e+00]
 [2.72900000e-02 0.00000000e+00 7.07000000e+00 ... 1.78000000e+01
  3.57321406e+02 4.03000000e+00]
 ...
 [6.07600000e-02 1.17877814e+01 1.08613861e+01 ... 2.10000000e+01
  3.96900000e+02 1.24388136e+01]
 [1.09590000e-01 1.17877814e+01 1.19300000e+01 ... 1.84477966e+01
  3.93450000e+02 6.48000000e+00]
 [4.74100000e-02 0.00000000e+00 1.19300000e+01 ... 1.84477966e+01
  3.96900000e+02 7.88000000e+00]]


In [23]:
#用零值填充
imp_0 = SimpleImputer(missing_values = np.nan,strategy = 'constant',fill_value = 0)
x_missing_0 = imp_0.fit_transform(X_missing)
print(x_missing_0)

[[6.3200e-03 0.0000e+00 0.0000e+00 ... 0.0000e+00 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 0.0000e+00 0.0000e+00 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 0.0000e+00 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 0.0000e+00 ... 2.1000e+01 3.9690e+02 0.0000e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 0.0000e+00 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 0.0000e+00 3.9690e+02 7.8800e+00]]


In [59]:
x_missing_reg = X_missing.copy()
sortindex = np.argsort(x_missing_reg.isnull().sum(axis = 0)).values
#np.argsort用来排序，返回从小到大的顺序对对应的索引，然后取出其中的values
print(sortindex)

[ 6 12  8  7  9  0  2  1  5  4  3 10 11]


In [60]:
for i in sortindex:
    #构造新特征矩阵和新标签
    df = x_missing_reg
    fillc = df.iloc[:,i]
    df = pd.concat([df.iloc[:,df.columns!=i],pd.DataFrame(y_full)],axis = 1)
    
    #在新特征矩阵中，对含有缺失值的列，进行0的填补
    df_0 = SimpleImputer(missing_values = np.nan,strategy = 'constant',fill_value = 0).fit_transform(df)
    
    #划分训练集以及测试集
    y_train = fillc[fillc.notnull()]   #不为空值的
    y_test = fillc[fillc.isnull()]     #为空值的
    x_train = df_0[y_train.index,:]
    x_test = df_0[y_test.index,:]
    
    #用随机森林填补缺失值
    rfc = RandomForestRegressor()
    rfc.fit(x_train,y_train)
    y_predict = rfc.predict(x_test)
    
    x_missing_reg.loc[x_missing_reg.iloc[:,i].isnull(),i] = y_predict   #第六列全为空值的

In [63]:
print(pd.DataFrame(x_missing_reg))
print(pd.DataFrame(X_full))

            0      1        2     3         4        5       6         7  \
0    0.320633  18.00   6.4273  0.17  0.538000  6.79731  65.200  4.090000   
1    0.027310   0.00   5.6431  0.00  0.469000  6.12845  78.900  4.967100   
2    0.027290  13.29   7.0700  0.00  0.462228  7.18500  61.100  4.213639   
3    0.108232  19.32   3.1765  0.00  0.458000  6.79860  45.800  4.776611   
4    0.109887   0.00   2.1800  0.00  0.462237  7.14700  60.873  4.505270   
..        ...    ...      ...   ...       ...      ...     ...       ...   
501  0.522264   1.98   8.2308  0.00  0.573000  6.28678  69.100  3.207110   
502  0.045270   0.00  11.9300  0.00  0.573000  6.12000  76.700  2.287500   
503  0.664796   0.66  11.9300  0.23  0.573000  6.97600  91.000  2.374716   
504  0.109590   0.00  11.9300  0.04  0.573000  6.30903  89.300  2.846462   
505  0.047410   0.00  11.9300  0.00  0.573000  6.03000  89.036  2.914466   

        8       9      10        11      12  
0    1.00  296.00  18.191  389.2672  4.98

In [68]:
X = [X_full,x_missing_mean,x_missing_0,x_missing_reg]
mse = []

In [71]:
for x in X:
    estimator = RandomForestRegressor(random_state=0)
    scores = cross_val_score(estimator,x,y_full,
                            scoring = 'neg_mean_squared_error',cv = 5).mean()
    mse.append(scores*-1)
print([*zip(['X_full','x_missing_mean','x_missing_0','x_missing_reg'],mse)])

[('X_full', 21.571667100368845), ('x_missing_mean', 34.78707694678704), ('x_missing_0', 40.03646636688022), ('x_missing_reg', 18.633250566783136)]


ValueError: Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.