# In [None]:
"""
======================================================
Imputing missing values before building an estimator
======================================================

This example shows techniques to handle missing values instead of discarding
the samples containing any missing value.

The first technique is imputation: each missing value is replaced by a
feature-wise statistic computed from the observed values. Note that imputing
does not always improve the predictions, so please check via cross-validation.
Sometimes dropping rows or using marker values is more effective.

In this example, we artificially mark some of the elements in the complete
dataset as missing. Then we estimate performance on the complete dataset, on
the dataset without the samples containing missing values, after imputation
without the indicator matrix, and after imputation with the indicator matrix
for the missing values.

The imputation can be done by replacing the missing values by the mean, the
median or the most frequent value using the ``strategy`` hyper-parameter.
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

Script output::

  Score with the complete dataset = 0.56
  Score without the samples containing missing values = 0.48
  Score after imputation of the missing values = 0.55
  Score after imputation with indicator features = 0.57

In this case, imputing helps the regressor get close to the original score.
"""

# Author:  Nicolas Trésegnie <nicolas.tresegnie@gmail.com>
#          David Fletcher <madder_dan@yahoo.co.uk>
#          Lars Buitinck <l.j.buitinck@uva.nl>
#          Gilles Louppe <g.louppe@gmail.com>
#          Maniteja Nandana <manitejanmt@gmail.com>
#          Raghav R V <rvraghav93@gmail.com>
#
# License: BSD 3 clause




import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

# Fixed seed so the randomly placed missing values are reproducible.
rng = np.random.RandomState(0)

# Load the complete data; X_full holds the features and y_full the target.
dataset = load_boston()
X_full = dataset.data
y_full = dataset.target
n_samples, n_features = X_full.shape[0], X_full.shape[1]

# Baseline: cross-validated score on the full data, before any value is
# removed.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
score = np.mean(cross_val_score(estimator, X_full, y_full))
print("Score with the complete dataset = %.2f" % score)

# Select which samples will receive a missing value: 75% of the rows are
# flagged, and one feature index is drawn for each flagged row.
missing_rate = 0.75
n_missing_samples = int(n_samples * missing_rate)
# Use the builtin ``bool`` as dtype: the ``np.bool`` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# The layout (all False first, then all True) is kept as-is so the shuffle
# below yields the same mask for a given seed.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
# Shuffle in place so the flagged rows are spread randomly over the dataset.
rng.shuffle(missing_samples)
# One feature (column) index per flagged row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Alternative 1: simply discard every sample flagged as containing a missing
# value and score the estimator on what is left.
X_filtered = X_full[np.logical_not(missing_samples)]
y_filtered = y_full[np.logical_not(missing_samples)]
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
score = np.mean(cross_val_score(estimator, X_filtered, y_filtered))
print("Score without the samples containing missing values = %.2f" % score)

# Alternative 2: keep all samples and impute the missing entries.
# Work on copies so the complete data stays untouched.
X_missing = X_full.copy()
# For each flagged row, overwrite the selected feature with the marker 0.
# NOTE(review): 0 doubles as the "missing" marker, so any feature value that
# is genuinely 0 would presumably be imputed as well -- confirm this is
# intended.
X_missing[np.flatnonzero(missing_samples), missing_features] = 0
y_missing = y_full.copy()
# The imputer is part of the pipeline, so it is fitted together with the
# forest on whatever data cross_val_score passes to the pipeline.
estimator = Pipeline(steps=[
    ("imputer", Imputer(missing_values=0, strategy="mean", axis=0)),
    ("forest", RandomForestRegressor(n_estimators=100, random_state=0)),
])
score = np.mean(cross_val_score(estimator, X_missing, y_missing))
print("Score after imputation of the missing values = %.2f" % score)

# Alternative 3: same mean imputation as before, but with
# ``add_indicator_features=True`` (presumably this appends indicator columns
# marking which entries were imputed -- confirm against the Imputer in use).
estimator = Pipeline(steps=[
    ("imputer", Imputer(missing_values=0, strategy="mean", axis=0,
                        add_indicator_features=True)),
    ("forest", RandomForestRegressor(n_estimators=100, random_state=0)),
])
score = np.mean(cross_val_score(estimator, X_missing, y_missing))
print("Score after imputation with indicator features = %.2f" % score)