[MRG] Adds KNNImputer #12852

Merged on Sep 3, 2019 (269 commits; changes shown from 218 commits)
a31c43a
Addressed review comments #5
ashimb9 Jul 31, 2017
eacb19d
Edited comments
ashimb9 Jul 31, 2017
d4049e2
Merge branch 'naneuclid' into knnimpute
ashimb9 Jul 31, 2017
cfb7c97
KNN Imputation with masked_euclidean and sklearn.neighbors
ashimb9 Aug 3, 2017
aa8547a
fixed array base check
ashimb9 Aug 3, 2017
009efa9
Fix column mean to nanmean
ashimb9 Aug 3, 2017
70f294a
Added weight support and cleaned the code
ashimb9 Aug 6, 2017
a54c162
Added inf check
ashimb9 Aug 6, 2017
c412e3b
Changed error message
ashimb9 Aug 6, 2017
ffe6774
Added test suite and example. Expanded docstring description
ashimb9 Aug 8, 2017
c2d6a6c
Changes to preprocessing __init__
ashimb9 Aug 8, 2017
9a19677
Added KNNImputer exception for NaN and inf in estimator_checks
ashimb9 Aug 8, 2017
a6a0a2f
Moved _check_weights() to fit()
ashimb9 Aug 9, 2017
4fbbe40
Addressed review comments - 1
ashimb9 Aug 18, 2017
29bdccb
Make NearestNeighbor import local to fit
ashimb9 Aug 18, 2017
6bb5471
Updated doc/modules/preprocessing.rst
ashimb9 Aug 18, 2017
e393cb0
More circular import fixes
ashimb9 Aug 18, 2017
6e5ec30
pep8 fixes
ashimb9 Aug 18, 2017
dd027f9
Minor comment updates
ashimb9 Aug 18, 2017
f33bff4
Addressed review comments (part 2)
ashimb9 Aug 20, 2017
2e1ea48
Fixed pyflex issues
ashimb9 Aug 20, 2017
1098499
Added test for callable weights and updated comments.
ashimb9 Sep 3, 2017
a698120
Pep8 fixes
ashimb9 Sep 3, 2017
95e0f56
Comment, doc, and pep8 fixes
ashimb9 Sep 15, 2017
215c8c9
Docstring changes
ashimb9 Sep 15, 2017
fab313b
Changes to unit tests as per review comments
ashimb9 Sep 15, 2017
b2d5640
Tests moved to test_imputation
ashimb9 Sep 15, 2017
cd90614
Addressed review comments
ashimb9 Sep 19, 2017
2c9993a
test changes
ashimb9 Sep 19, 2017
473b191
Test changes part 2
ashimb9 Sep 19, 2017
de587b3
Fixed weight matrix shape issue
ashimb9 Sep 21, 2017
3d58616
Minor changes
ashimb9 Sep 21, 2017
5873d17
Fixed degenerate donor issue. Added tests
ashimb9 Sep 22, 2017
fd11002
Further test updates
ashimb9 Sep 22, 2017
2f41aa2
minor test fix
ashimb9 Sep 23, 2017
135056c
more minor changes
ashimb9 Sep 24, 2017
8c7190e
Moved weight_matrix inside if-weighted block
ashimb9 Sep 24, 2017
9616c2b
Addressed Review Comments
ashimb9 Dec 12, 2017
7e8f900
Fixed plot_missing example
ashimb9 Dec 12, 2017
df9dba7
Fixed Error Msg
ashimb9 Dec 12, 2017
d26724a
Modified missing check for sparse matrix
ashimb9 Dec 12, 2017
2b327da
Test update
ashimb9 Dec 12, 2017
1704672
Fixed nan check on sparse
ashimb9 Dec 17, 2017
a1cc41d
Review Comments Addressed (partial)
ashimb9 Dec 17, 2017
1417f3e
Fix merge conflit
ashimb9 Dec 19, 2017
34f68a5
Updated doc module
ashimb9 Dec 19, 2017
508270c
Added support for using only neighbors with non-missing features
ashimb9 Jan 26, 2018
0562054
Test update
ashimb9 Jan 26, 2018
24943ec
Import Numpy code for np.unique for older versions
ashimb9 Jan 26, 2018
a449c5b
Remove version check
ashimb9 Jan 26, 2018
a485db9
Minor fix
ashimb9 Jan 26, 2018
6058548
Added strategy to only use neighbors with non-nan value
ashimb9 Mar 28, 2018
1abbce8
Sync with upstream and merge with master
ashimb9 Mar 31, 2018
0b67233
Edit import path in test file
ashimb9 Mar 31, 2018
3e08209
Error fixes with imports and examples
ashimb9 Mar 31, 2018
851ab3c
Added use_complete docstring
ashimb9 Mar 31, 2018
7a0647f
Changed comments and fixed docstring
ashimb9 Mar 31, 2018
b17906f
Added more doctest fix and min neighbor check
ashimb9 Mar 31, 2018
bd6eb69
fix docs
ashimb9 Mar 31, 2018
2ea131b
Increase col_max_missing threshold for example plot
ashimb9 Mar 31, 2018
b1d9397
Lower missing rate in demo since tests are failing
ashimb9 Mar 31, 2018
d7cbdfb
Remove redundant check and changes in plot
ashimb9 Mar 31, 2018
1c9d858
Handling insufficient neighbors scenario
ashimb9 Mar 31, 2018
01722f1
Removed k actual neighbors algo
ashimb9 Apr 7, 2018
36d1d72
Addressed Comments
ashimb9 Apr 22, 2018
95f15ff
Merge branch 'master' into knnimpute
ashimb9 Apr 22, 2018
8e82d0d
Sync with upstream and merge
ashimb9 Apr 22, 2018
f463b15
Sync and merge
ashimb9 Apr 22, 2018
8a16e28
Minor bug fixes
ashimb9 Apr 28, 2018
a93827c
Removing flotsam
ashimb9 Apr 28, 2018
5de5b60
Minor bug fixes
ashimb9 Apr 29, 2018
eddf18f
Merge to upstream
ashimb9 May 26, 2018
2058186
Revert changes to sklearn/neighbors
jnothman Sep 30, 2018
69f2b7f
Merge branch 'master' into knnimpute
jnothman Sep 30, 2018
202cd37
Revert changes to deprecated file
jnothman Sep 30, 2018
6414081
COSMIT _MASKED_METRICS -> _NAN_METRICS
jnothman Sep 30, 2018
2825fcc
'NaN' no longer stands for NaN
jnothman Sep 30, 2018
745fa2d
Fix missing_values validation
jnothman Oct 3, 2018
44f0210
Attempt to reinstate neighbors changes
jnothman Oct 3, 2018
82d5d20
Fix up test failures
jnothman Oct 3, 2018
d8b23e6
Fix flake8 issues in example
jnothman Oct 3, 2018
c682361
Default force_all_finite to True rather than False
jnothman Oct 4, 2018
1912611
Fix example usage
jnothman Oct 4, 2018
607ff7f
Fix masked_euclidean testing in nearest neighbors
jnothman Oct 4, 2018
87677e7
Fix missing_values in masked_euclidean_distances
jnothman Oct 4, 2018
39e1da8
Can't subtract list and set in Py2
jnothman Oct 4, 2018
e1afa12
Merge branch 'master' into knnimpute
jnothman Jan 17, 2019
1ded8c0
RFC: Reduce diffs
thomasjpfan Dec 20, 2018
367c115
WIP
thomasjpfan Dec 20, 2018
eb702ef
MRG: Reduce diffs
thomasjpfan Dec 20, 2018
ec16839
TST: Fix
thomasjpfan Dec 20, 2018
ac07331
DOC: Refactor
thomasjpfan Dec 20, 2018
b13714b
WIP
thomasjpfan Dec 20, 2018
01e2a09
RFC: Adjustments
thomasjpfan Dec 21, 2018
eba229b
ENH: Completes implementation
thomasjpfan Dec 21, 2018
73068eb
ENH: Adds to __init__
thomasjpfan Dec 21, 2018
0b66143
DOC: Adds whats_new
thomasjpfan Dec 21, 2018
7d7960e
DOC: Adds autosummary
thomasjpfan Dec 21, 2018
dda6028
RFC: Minor
thomasjpfan Dec 21, 2018
d4e4d91
DOC: Grammer
thomasjpfan Dec 21, 2018
315ea9f
TST: Increases coverage
thomasjpfan Dec 21, 2018
1f5334a
TST: Increases coverage
thomasjpfan Dec 21, 2018
6e47533
RFC: Minor
thomasjpfan Dec 21, 2018
0c8215f
DOC: Adjust order
thomasjpfan Dec 21, 2018
0768e04
RFC: Removes euclidean metric from neighbors
thomasjpfan Jan 4, 2019
e3b2a74
RFC
thomasjpfan Jan 4, 2019
6cc237f
ENH: Adds support for sparse
thomasjpfan Jan 10, 2019
c10a171
RFC: Removes unused
thomasjpfan Jan 10, 2019
df4ce74
DOC Fix warnings in examples (#12654)
adrinjalali Jan 17, 2019
bd396bf
DOC Add an example of inductive clustering (#10852)
chkoar Jan 17, 2019
a4fce96
DOC credit multiple authors of new example
jnothman Jan 17, 2019
2336f9d
DOC fix plot_iris references after files renamed
jnothman Jan 17, 2019
31cf06c
MNT Remove accidentally added example
jnothman Jan 17, 2019
114dcc5
docstring fix X in predict/predict_proba (#13004)
agamemnonc Jan 17, 2019
f450104
MNT more informative warning in estimator_checks (#13002)
jnothman Jan 17, 2019
fa3cbc8
MAINT Pin numpy version 1.5.* for pypy (#13011)
rth Jan 18, 2019
73393c6
DOC Label Spreading clumping factor must be in (0, 1) (#13015)
zjpoh Jan 19, 2019
7527c25
FIX Parallelisation of decomposition/sparse_encode (#13005)
nixphix Jan 20, 2019
9b724a6
FIX Convert the negative indices to positive ones in ColumnTransforme…
pierretallotte Jan 22, 2019
f8067d0
MAINT Fix PyPy CI with numpy 1.15 (#13018)
rth Jan 22, 2019
48e4563
DOC Move datasets.mldata_filename to deprecated section in classes.rst
qinhanmin2014 Jan 25, 2019
2626cb7
FIX Fix shuffle not passed in MLP (#12582)
samwaterbury Jan 25, 2019
eadc983
[MRG] Configure lgtm.yml for CPP (#13044)
thomasjpfan Jan 26, 2019
8ebf67d
FIX float16 overflow on accumulator operations in StandardScaler (#13…
baluyotraf Jan 26, 2019
c702658
TST Use random state to initialize MLPClassifier. (#12892)
xhan7279 Jan 27, 2019
e335067
API Deprecate externals.six (#12916)
qinhanmin2014 Jan 27, 2019
e449cda
DOC Remove outdated doc in KBinsDiscretizer (#13047)
qinhanmin2014 Jan 27, 2019
6375202
DOC Remove outdated doc in KBinsDiscretizer
qinhanmin2014 Jan 27, 2019
824e75e
EXA Improve example plot_svm_anova.py (#11731)
qinhanmin2014 Jan 28, 2019
8270c8d
DOC Correct TF-IDF formula in TfidfTransformer comments. (#13054)
vishaalkapoor Jan 29, 2019
73200d4
FIX an issue w/ large sparse matrix indices in CountVectorizer (#11295)
gvacaliuc Jan 30, 2019
45c1841
DOC More details about the attributes in MinMaxScaler (#13029)
qinhanmin2014 Jan 30, 2019
530a184
DOC Clean up the advanced installation doc to remove python < 3.5 par…
jeremiedbb Jan 30, 2019
999d8fa
API NMF and non_negative_factorization have inconsistent default init…
zjpoh Jan 30, 2019
bd8a252
MAINT: pin flake8 to stable version (#13066)
glemaitre Jan 30, 2019
e7c82a3
EXA: fix xlabel and ylabel in plot_cv_digits.py (#13067)
qinhanmin2014 Jan 30, 2019
fdd457f
MAINT: remove flake8 pinning in circle ci (#13071)
glemaitre Jan 30, 2019
16a07bf
DOC Adds an example to PatchExtractor (#12819)
CatChenal Jan 31, 2019
5014599
[MRG] Use Scipy cython BLAS API instead of bundled CBLAS (#12732)
jeremiedbb Feb 1, 2019
14fe318
MNT Ignore PendingDeprecationWarnings of matrix subclass with pytest …
NicolasHug Feb 2, 2019
24c8c92
MNT More clean up after we remove python < 3.5 (#13078)
qinhanmin2014 Feb 2, 2019
5727c74
MNT remove __future__ imports (#12791)
surgan12 Feb 2, 2019
0e1ccc5
MNT do not call fit twice in TransformedTargetetRegressor (#11641)
glemaitre Feb 3, 2019
ee3246d
FIX add support for non numeric values in MissingIndicator (#13046)
glemaitre Feb 3, 2019
92e93d7
MNT redundant from __future__ import (#13079)
qinhanmin2014 Feb 3, 2019
447d5a9
CI install pillow in pypy job (#13081)
qinhanmin2014 Feb 4, 2019
f8db656
MNT Remove utils.validation._shape_repr (#13083)
qinhanmin2014 Feb 4, 2019
9cff097
CI install pillow in Travis cron job (#13080)
qinhanmin2014 Feb 4, 2019
d131a8d
MNT Remove utils.fixes.euler_gamma (#13082)
qinhanmin2014 Feb 4, 2019
1b95ea6
MNT Update setup and travis to support OpenMP (#13053)
jeremiedbb Feb 4, 2019
3a6a33b
DOC change from 'means_prior' to 'mean_prior' in BayesianGaussianMix…
walk-to-work Feb 4, 2019
c71493e
FIX added assertion for ValueError when cv iterator is empty (#12961)
esvhd Feb 5, 2019
d35f131
[MRG] DOC Adds explicit reference to clang (#13093)
thomasjpfan Feb 5, 2019
a7bfc47
DOC Refer to ONNX (#13095)
jnothman Feb 6, 2019
513d09d
[MRG] DOC Adds _pairwise property to dev docs (#13094)
thomasjpfan Feb 7, 2019
cbff0a9
DOC Correct code example in doc/developers/contributing.rst (#13098)
bharatr21 Feb 7, 2019
867bb54
RFC Better variable names
thomasjpfan Feb 7, 2019
31c3df0
DOC Update version to 0.21
thomasjpfan Feb 7, 2019
cb9207a
RFC Address comments
thomasjpfan Feb 7, 2019
c72c7eb
RFC: Updates metric name to nan_euclidean
thomasjpfan Feb 7, 2019
161bdce
RFC Address comments
thomasjpfan Feb 7, 2019
5a032e7
RFC Lowers the number of variables
thomasjpfan Feb 7, 2019
3c1b608
Merge remote-tracking branch 'upstream/master' into masked_euclidean_…
thomasjpfan Feb 7, 2019
811b6f8
RFC Address comments
thomasjpfan Feb 7, 2019
59e6d30
RFC Uses pytest
thomasjpfan Feb 7, 2019
1873a87
STY Flake8
thomasjpfan Feb 7, 2019
a739923
RFC Lowers LOC
thomasjpfan Feb 8, 2019
984e019
RFC
thomasjpfan Feb 8, 2019
5f89240
RFC Rename to statistic
thomasjpfan Feb 8, 2019
5387fcf
RFC Uses less memory for distance matrix
thomasjpfan Feb 8, 2019
5a318b1
STY Spelling
thomasjpfan Feb 8, 2019
dcd8098
REV take_along_axis not in numpy 1.11.0
thomasjpfan Feb 8, 2019
20c8576
TST Checks nan and normal euclidean distance
thomasjpfan Feb 8, 2019
126fe56
ENH Updates whats_new
thomasjpfan Feb 8, 2019
7d16493
DOC English
thomasjpfan Feb 8, 2019
81663c6
DOC Rewords _compute_impute doc
thomasjpfan Feb 11, 2019
69d2947
RFC _impute directly imputes X
thomasjpfan Feb 11, 2019
306e98d
RF
thomasjpfan Feb 18, 2019
b0f2e63
Merge remote-tracking branch 'upstream/master' into masked_euclidean_…
thomasjpfan Feb 18, 2019
e9e97c9
RF Address comments
thomasjpfan Feb 19, 2019
1454d8e
RFC Uses to matrix
thomasjpfan Feb 19, 2019
3e8db3d
DOC Changes variable name to reference samples and features
thomasjpfan Feb 19, 2019
29405cc
DOC Grammer
thomasjpfan Feb 19, 2019
25ae4ba
RFC Moves _get_mask to utils
thomasjpfan Feb 19, 2019
29fc09c
BUG Fix
thomasjpfan Feb 19, 2019
4c8cc7d
RFC Minimizes diff
thomasjpfan Feb 19, 2019
1ef0bf2
RFC Moves _get_mask to utils
thomasjpfan Feb 19, 2019
0246a52
DOC Updates whats new
thomasjpfan Feb 25, 2019
4e101fa
DOC Updates name
thomasjpfan Feb 25, 2019
5e32b5d
CLN Address comments
thomasjpfan Feb 25, 2019
c838da5
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Feb 27, 2019
a90ee5e
STY flake8
thomasjpfan Feb 27, 2019
bbc774b
BUG Fix
thomasjpfan Feb 27, 2019
f79f24c
BUG Add allow_nan tag
thomasjpfan Feb 28, 2019
ff2c697
TST: Improves test coverage
thomasjpfan Feb 28, 2019
8f33d4a
CLN Address comments
thomasjpfan May 8, 2019
3b0803c
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan May 8, 2019
0f5f436
DOC Moves whats new
thomasjpfan May 8, 2019
4f4d4ae
CLN Address diffs
thomasjpfan May 8, 2019
e1f622d
DOC Update version
thomasjpfan May 8, 2019
324223a
DOC Rewords
thomasjpfan Jun 17, 2019
8d8f17a
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jun 17, 2019
91fa0f1
CLN Move to _knn
thomasjpfan Jun 17, 2019
6d877b5
DOC Moves whats_new
thomasjpfan Jun 17, 2019
bbf75f5
CLN Less diffs
thomasjpfan Jun 17, 2019
2467c07
DOC Rewords user guide
thomasjpfan Jun 17, 2019
e4da293
ENH Simplifies knn imputer
thomasjpfan Jun 17, 2019
be0b1b4
ENH Address comments
thomasjpfan Jun 18, 2019
5786f30
CLN Rename variable
thomasjpfan Jun 18, 2019
cf600d2
ENH Improves float32 handling
thomasjpfan Jun 19, 2019
65a59b4
DOC Adds comments
thomasjpfan Jun 19, 2019
922bc70
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jun 19, 2019
272bc47
BUG Makes fit_X private
thomasjpfan Jun 19, 2019
c5e0e75
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jul 17, 2019
52d48e7
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jul 18, 2019
fa51ae6
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jul 26, 2019
b3b3a53
CLN Moves force_all_finite up
thomasjpfan Jul 26, 2019
7d9e2ce
CLN Address comments
thomasjpfan Jul 26, 2019
58e8037
REV Remove missing test
thomasjpfan Jul 26, 2019
2d6beff
ENH Removes sparse support
thomasjpfan Jul 26, 2019
08409e0
STY Flake8
thomasjpfan Jul 26, 2019
93223ab
ENH Moves all _get_mask to utils
thomasjpfan Jul 29, 2019
3766e2b
Merge branch 'master' into masked_euclidean
amueller Jul 29, 2019
c4bf0ec
CLN Completely remove sparse support
thomasjpfan Jul 29, 2019
7e8d6ca
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jul 30, 2019
0cee529
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Jul 31, 2019
4a934fe
CLN Address comments
thomasjpfan Jul 31, 2019
6dcef51
Remove statistics_ from KNNImputer (#8)
jnothman Aug 1, 2019
2be7ac9
Revert "Remove statistics_ from KNNImputer (#8)"
thomasjpfan Aug 1, 2019
b789b05
TST Adds more tests
thomasjpfan Aug 1, 2019
ae5bcbe
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Aug 1, 2019
2607235
TST Adds test on dropping features
thomasjpfan Aug 1, 2019
e1783ac
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Aug 6, 2019
83bca4a
WIP
thomasjpfan Aug 7, 2019
68441a1
ENH Adjusts handling when there is not enough neighbors
thomasjpfan Aug 7, 2019
118eef2
CLN Address comments
thomasjpfan Aug 14, 2019
f808d20
CLN Address comments
thomasjpfan Aug 14, 2019
655f51c
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Aug 14, 2019
606bb48
ENH Updates check_is_fitted
thomasjpfan Aug 14, 2019
62bd37b
CLN Removes squared
thomasjpfan Aug 14, 2019
8201cfb
CLN Address jnothman's comment
thomasjpfan Aug 15, 2019
a9eefbd
CLN Refactor and improve docstring
thomasjpfan Aug 18, 2019
1ea4456
CLN Address joels comments
thomasjpfan Aug 18, 2019
795adbd
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Aug 22, 2019
9a3a01a
Merge remote-tracking branch 'upstream/master' into masked_euclidean
thomasjpfan Aug 31, 2019
b75c376
STY Fix
thomasjpfan Aug 31, 2019
e533575
CLN Combines weighted tests
thomasjpfan Sep 3, 2019
6672bb2
BUG Fixes bug with test without missing values
thomasjpfan Sep 3, 2019
30972ae
STY Fix
thomasjpfan Sep 3, 2019
d9dc8b9
ENH Stores fit_X mask during fit
thomasjpfan Sep 3, 2019
2 changes: 2 additions & 0 deletions doc/modules/classes.rst
@@ -662,6 +662,7 @@ Kernels:
impute.SimpleImputer
impute.IterativeImputer
impute.MissingIndicator
impute.KNNImputer

.. _kernel_approximation_ref:

@@ -964,6 +965,7 @@ See the :ref:`metrics` section of the user guide for further details.
metrics.pairwise.laplacian_kernel
metrics.pairwise.linear_kernel
metrics.pairwise.manhattan_distances
metrics.pairwise.nan_euclidean_distances
metrics.pairwise.pairwise_kernels
metrics.pairwise.polynomial_kernel
metrics.pairwise.rbf_kernel
51 changes: 51 additions & 0 deletions doc/modules/impute.rst
@@ -32,6 +32,14 @@ missing values (e.g. :class:`impute.IterativeImputer`).
Univariate feature imputation
=============================

Imputer transformers can be used in a Pipeline as a way to build a composite
estimator that supports imputation. See
:ref:`sphx_glr_auto_examples_plot_missing_values.py`.

Reviewer comments:
- Contributor: Is 'composite estimator' really necessary? ...or can the sentence be usefully simplified?
- Member: Sounds fine to me. How would you phrase it? "Imputer transformers can be used to create pipelines that support data with missing values"?


Simple univariate imputation
============================
Reviewer comment:
- Member: This seems to be a duplication of the preceding heading??

The :class:`SimpleImputer` class provides basic strategies for imputing missing
values. Missing values can be imputed with a provided constant value, or using
the statistics (mean, median or most frequent) of each column in which the
@@ -178,6 +186,49 @@ References
.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis
with Missing Data". John Wiley & Sons, Inc., New York, NY, USA.

.. _knnimpute:

Nearest neighbors imputation
============================

The :class:`KNNImputer` class provides imputation for completing missing
values using the k-Nearest Neighbors approach. Each sample's missing values
are imputed using values from ``n_neighbors`` nearest neighbors found in the
training set. In this context, a donor is defined to be a neighbor that
contributes to the imputation of a given sample. For each missing feature in a
sample, the donors are selected such that they have the feature present and
they are one of the ``n_neighbors`` nearest neighbors.

Reviewer comments:
- Contributor: 'completing missing values' appears to be somewhat of a misnomer... is this referring to Rubin's terminology of 'missing at random' vs 'missing completely at random'? If yes, then let's call a cat a cat.
- Member: No, it just means "for replacing missing values" or something. It's unnecessarily verbose, I suppose.
- Member: I misread it as "completely missing"; maybe @banilo did as well? Maybe "filling in" instead of "completing"?
- Member (on the donor definition): "in the training set, also called donors" and remove the next sentence?
- Member (on donor selection): Is this last sentence correct? I thought that's not what we're doing any more? Shouldn't it be "one of the n_neighbors nearest neighbors with the feature present"? A logical "and" would mean that you compute the n_neighbors nearest neighbors and compute the samples with the feature present and then intersect the sets, which is not what's happening, right?

Each sample can potentially have multiple sets of ``n_neighbors`` donors
depending on the particular feature being imputed.

Reviewer comment:
- Member: "Note that the set of donors can be different for different features of the same sample"?

Each missing feature is then imputed as the average, either weighted or
unweighted, of these neighbors. When the number of donor neighbors is less
than ``n_neighbors``, the training set average for that feature is
used for imputation. When a sample has more than a ``feature_max_missing``
fraction of its features missing, then it is excluded from being a donor for
imputation. For more information on the methodology, see ref. [OLGA]_.

Reviewer comments:
- Member: I don't understand the sentence about there being fewer donor neighbors than n_neighbors. Isn't "the training set average" the same as the average of these donors?
- Member Author: When there are not enough donors, currently this imputer considers the whole training set as its donors. Thinking about it now, this seems a little strange.
- Member: Well, we don't need to handle the insufficient-donors case specially if there's no weighting. But whether we are right to disregard weighting when there are insufficient neighbours, I don't know.
- Member Author: @amueller To clarify, what I meant by "everyone is a donor" is that this implementation uses the feature mean for the entire training set (ignoring the missing values) for imputation. This is done when there are not enough donors for a given sample.
- Member: I think it's clear, but if there aren't enough donors then at most n_neighbors available donors = everyone.
- Member: @jnothman summarizes my point well, i.e. without weighting there's no special case.

The following snippet demonstrates how to replace missing values,
encoded as ``np.nan``, using the mean feature value of the two nearest
neighbors of samples with missing values::

>>> import numpy as np
>>> from sklearn.impute import KNNImputer
>>> nan = np.nan
>>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
>>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
>>> imputer.fit_transform(X)
array([[1. , 2. , 4. ],
[3. , 4. , 3. ],
[5.5, 6. , 5. ],
[8. , 8. , 7. ]])

.. [OLGA] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value
estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001
Pages 520-525.
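The snippet in the documentation above uses uniform weights. As a sketch of how the ``weights`` option interacts with the nan-aware distances (assuming the ``weights="distance"`` option of the released ``KNNImputer`` API, which applies inverse-distance weighting as in ``sklearn.neighbors``), the same data can be imputed with closer donors counting more:

```python
import numpy as np
from sklearn.impute import KNNImputer

nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]

# Inverse-distance weighting: closer donors contribute more to the average.
imputer = KNNImputer(n_neighbors=2, weights="distance")
X_imputed = imputer.fit_transform(X)

# For X[0], the two donors for feature 2 are rows 1 and 2, at
# nan-Euclidean distances sqrt(12) and sqrt(48). The inverse-distance
# weighted mean of their feature-2 values (3 and 5) is
# (3/sqrt(12) + 5/sqrt(48)) / (1/sqrt(12) + 1/sqrt(48)) = 11/3.
print(X_imputed)
```

With uniform weights the same cell would be the plain mean, 4, as in the doctest above; with equidistant donors (row 2, feature 0) the two weightings agree.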

.. _missing_indicator:

Marking imputed values
5 changes: 5 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -592,6 +592,11 @@ Support for Python 3.4 and below has been officially dropped.
``fit.predict`` were not equivalent. :pr:`13142` by
:user:`Jérémie du Boisberranger <jeremiedbb>`.

- |Feature| Added the :func:`metrics.nan_euclidean_distances` metric, which
calculates euclidean distances in the presence of missing values.
:issue:`12852` by :user:`Ashim Bhattarai <ashimb9>` and
:user:`Thomas Fan <thomasjpfan>`.
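
The metric named in this entry skips coordinates where either sample is missing and rescales the remaining squared distance by ``n_features / n_present_coordinates``. A minimal sketch (assuming the public name ``sklearn.metrics.pairwise.nan_euclidean_distances``, as the entry states):

```python
import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[0.0, np.nan], [3.0, 4.0], [0.0, 0.0]])

# Pairwise distances; missing coordinates are excluded from each pair's
# squared distance, which is then scaled up by n_features / n_present.
D = nan_euclidean_distances(X)

# d(X[0], X[1]): only feature 0 is shared -> sqrt(2/1 * 3**2) = sqrt(18)
# d(X[1], X[2]): both features present  -> sqrt(3**2 + 4**2) = 5
print(D)
```

Note that d(X[0], X[2]) is 0 even though the rows differ in the missing coordinate, since only shared present coordinates enter the distance.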

:mod:`sklearn.model_selection`
..............................

7 changes: 7 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -39,6 +39,13 @@ Changelog
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
where 123456 is the *pull request* number, not the issue number.

:mod:`sklearn.impute`
.....................

- |MajorFeature| Added :class:`impute.KNNImputer`, to impute missing values using
k-Nearest Neighbors. :issue:`12852` by :user:`Ashim Bhattarai <ashimb9>` and
:user:`Thomas Fan <thomasjpfan>`.

:mod:`sklearn.svm`
..................

17 changes: 15 additions & 2 deletions examples/impute/plot_missing_values.py
@@ -8,6 +8,9 @@
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

With ``KNNImputer``, missing values can be imputed using the weighted
or unweighted mean of the desired number of nearest neighbors.

Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
round-robin linear regression, treating every variable as an output in
turn. The version implemented assumes Gaussian (output) variables. If your
@@ -27,7 +30,8 @@
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator
from sklearn.impute import (
SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator)
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
amueller marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -79,6 +83,13 @@ def get_results(dataset):
imputer = SimpleImputer(missing_values=0, strategy="mean")
mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

# Estimate the score after kNN-imputation of the missing values
knn_rf_estimator = make_pipeline(
KNNImputer(missing_values=0, sample_max_missing=0.99),
Reviewer comments:
- Contributor: General comment on the whole example: How about indicating missingness as NaN in the entire example, to make it more educational? In some places, having 0 to indicate missingness and replacing by 0 may be confusing to some people.
- Member: Agreed that missing_values=0 in this example is a bit obscure... but it is a separate issue.

RandomForestRegressor(random_state=0, n_estimators=100))
knn_impute_scores = cross_val_score(knn_rf_estimator, X_missing, y_missing,
scoring='neg_mean_squared_error')

# Estimate the score after iterative imputation of the missing values
imputer = IterativeImputer(missing_values=0,
random_state=0,
@@ -90,6 +101,7 @@ def get_results(dataset):
return ((full_scores.mean(), full_scores.std()),
(zero_impute_scores.mean(), zero_impute_scores.std()),
(mean_impute_scores.mean(), mean_impute_scores.std()),
(knn_impute_scores.mean(), knn_impute_scores.std()),
(iterative_impute_scores.mean(), iterative_impute_scores.std()))


@@ -107,8 +119,9 @@ def get_results(dataset):
x_labels = ['Full data',
'Zero imputation',
'Mean Imputation',
'KNN Imputation',
'Multivariate Imputation']
Reviewer comments:
- Contributor: Bigger conceptual-statistical point: I would personally not call the 5th analysis (= iterative imputation) "multivariate", because the 4th analysis (= kNN) uses the same supervised estimator as the 5th analysis, just in a nested/repeated fashion. As such, either analyses 4 and 5 are both multivariate or neither is. As an alternative, how about "Iterative imputation", which is a common term in the statistical and machine learning literature in my experience?
- Member: I agree that should be changed in this PR.
- Member: Not addressed

colors = ['r', 'g', 'b', 'orange']
colors = ['r', 'g', 'b', 'orange', 'black']

# plot diabetes results
plt.figure(figsize=(12, 6))
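The pattern this example adds, imputation as a pipeline step scored with cross-validation, can be sketched on synthetic data. The dataset and downstream estimator here are placeholders, not the ones from the example:

```python
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = X @ np.arange(1.0, 6.0) + 0.1 * rng.randn(100)

# Knock out roughly 10% of the entries so the imputer has work to do.
X[rng.rand(*X.shape) < 0.1] = np.nan

# Putting the imputer inside the pipeline keeps the fit/transform split
# honest: donor neighbors come only from each training fold.
model = make_pipeline(KNNImputer(n_neighbors=3), Ridge())
scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
print(scores.mean())
```

Fitting the imputer on the full data before cross-validating would leak information from the test folds into the imputed values, which is exactly what the pipeline form avoids.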