In [1]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score



In [2]:
# Shared, seeded random generator: later cells draw from it to pick which
# samples/features lose a value, so a fixed seed keeps those draws repeatable.
rng = np.random.RandomState(seed=0)

In [3]:
# Load the Boston data and split it into the feature matrix and the target.
dataset = load_boston()
X_full = dataset.data
y_full = dataset.target

# Sizes of the first two axes of the feature matrix (rows, then columns).
n_samples, n_features = X_full.shape[:2]

In [20]:
# Baseline: cross-validate a random forest on the complete data, before any
# value has been removed.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)

# First pass: the estimator's default scorer.
scores = cross_val_score(estimator, X_full, y_full)

# Second pass: the 'neg_mean_squared_error' scorer. Its sign is flipped so that
# `losses` holds the un-negated values.
neg_mse = cross_val_score(
    estimator, X_full, y_full, scoring='neg_mean_squared_error'
)
losses = -neg_mse

print(scores)
mean_score = scores.mean()
print("Score with the entire dataset = %.2f" % mean_score)

print(losses)
mean_loss = losses.mean()
print("Loss with the entire dataset = %.2f" % mean_loss)

[ 0.81091858  0.56628405  0.30857975]
Score with the entire dataset = 0.56
[  9.37578355  31.98996467  45.48843112]
Loss with the entire dataset = 28.95


In [21]:
# Add missing values in 75% of the lines
missing_rate = 0.75
# np.floor returns a float, but array shapes and `size` arguments must be
# integers, so cast explicitly.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean mask over the samples with exactly n_missing_samples True entries;
# it is built as [False..., True...] and then shuffled in place.
# The builtin `bool` is used because the `np.bool` alias is deprecated.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
# For every affected sample, the index of the feature that will be blanked.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)

scores = cross_val_score(estimator, X_filtered, y_filtered)
# The 'neg_mean_squared_error' scorer is negated here so that `losses` holds
# the un-negated values.
losses = -cross_val_score(estimator, X_filtered, y_filtered,
                          scoring='neg_mean_squared_error')
# `score` is kept for any later cell that reads it; it is derived from
# `scores` instead of running an extra, identical cross-validation.
score = scores.mean()

print(scores)
print("Score without the samples containing missing values = %.2f"
      % scores.mean())

print(losses)
print("Loss without the samples containing missing values = %.2f"
      % losses.mean())

  """
  a = empty(shape, dtype, order)
  if __name__ == '__main__':


[ 0.82323676  0.58687537  0.37972318]
Score with the entire dataset = 0.60
[  8.34470095  32.02712598  51.2502424 ]
Loss with the entire dataset = 30.54


In [22]:
# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
scores = cross_val_score(estimator, X_missing, y_missing)
losses = -cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error')

print(scores)
print("Score with the entire dataset = %.2f" % scores.mean())

print(losses)
print("Loss with the entire dataset = %.2f" % losses.mean())

[ 0.80239     0.58573787  0.31407662]
Score with the entire dataset = 0.57
[  9.79868153  30.55509199  45.12679325]
Loss with the entire dataset = 28.49
