[MRG+2] Implement two non-uniform strategies for KBinsDiscretizer (discrete branch) #11272

Merged · 13 commits · Jul 9, 2018
16 changes: 13 additions & 3 deletions doc/modules/preprocessing.rst
@@ -548,8 +548,6 @@ K-bins discretization
... [ 0., 6., 14 ],
... [ 6., 3., 11 ]])
>>> est = preprocessing.KBinsDiscretizer(n_bins=[3, 3, 2], encode='ordinal').fit(X)
>>> est.bin_width_
array([3., 1., 2.])

By default the output is one-hot encoded into a sparse matrix
(See :ref:`preprocessing_categorical_features`)
@@ -565,7 +563,7 @@ example, these intervals are defined as:
Based on these bin intervals, ``X`` is transformed as follows::

>>> est.transform(X) # doctest: +SKIP
array([[ 0., 2., 1.],
array([[ 0., 1., 1.],
Member:
If I understand correctly, the result changes because we're now using strategy='quantile', right?
If so, I think we need to modify the intervals above, otherwise they're not consistent:

  - feature 1: :math:`{[-\infty, 0), [0, 3), [3, \infty)}`
  - feature 2: :math:`{[-\infty, 4), [4, 5), [5, \infty)}`
  - feature 3: :math:`{[-\infty, 13), [13, \infty)}`
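For illustration, a minimal sketch (assuming scikit-learn with `KBinsDiscretizer` available, as in this branch) showing the per-feature quantile edges that replace the old uniform intervals:

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# The same toy data as in the documentation example above.
X = np.array([[-3., 5., 15.],
              [0., 6., 14.],
              [6., 3., 11.]])

est = KBinsDiscretizer(n_bins=[3, 3, 2], encode='ordinal',
                       strategy='quantile').fit(X)

# With strategy='quantile' the fitted edges are per-feature quantiles,
# not evenly spaced cut points.
for edges in est.bin_edges_:
    print(np.round(edges, 2))
```

The printed edges make the inconsistency concrete: feature 1 is cut at its quantiles (-3, -1, 2, 6) rather than at the uniform points listed above.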

Member (author):

Good catch!

[ 1., 2., 1.],
[ 2., 0., 0.]])
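As a sanity check on the corrected output, a short sketch (assuming scikit-learn's `KBinsDiscretizer`, whose default strategy is 'quantile') reproducing the transform shown in the docs:

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[-3., 5., 15.],
              [0., 6., 14.],
              [6., 3., 11.]])

# strategy defaults to 'quantile', matching the documentation example.
est = KBinsDiscretizer(n_bins=[3, 3, 2], encode='ordinal').fit(X)
Xt = est.transform(X)
print(Xt)
```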

@@ -576,6 +574,18 @@ Discretization is similar to constructing histograms for continuous data.
However, histograms focus on counting features which fall into particular
bins, whereas discretization focuses on assigning feature values to these bins.

:class:`KBinsDiscretizer` implements different binning strategies, which can be
selected with the ``strategy`` parameter. The 'uniform' strategy uses
constant-width bins. The 'quantile' strategy uses the quantile values to
produce equally populated bins in each feature. The 'kmeans' strategy defines
bins based on a k-means clustering procedure performed on each feature
independently.
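The contrast between the strategies can be sketched on a small skewed 1D sample (hypothetical data, chosen here only so the three strategies give visibly different edges):

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# A skewed 1D sample: four points clustered low, one far outlier.
X = np.array([[0.], [0.5], [1.], [1.5], [9.]])

edges = {}
for strategy in ('uniform', 'quantile', 'kmeans'):
    enc = KBinsDiscretizer(n_bins=3, encode='ordinal',
                           strategy=strategy).fit(X)
    edges[strategy] = enc.bin_edges_[0]
    # 'uniform' splits the range evenly; 'quantile' balances the sample
    # counts; 'kmeans' places edges between cluster centers.
    print(strategy, np.round(edges[strategy], 2))
```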

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_plot_discretization.py`
* :ref:`sphx_glr_auto_examples_plot_discretization_classification.py`
* :ref:`sphx_glr_auto_examples_plot_discretization_strategies.py`

.. _preprocessing_binarization:

Feature binarization
3 changes: 1 addition & 2 deletions examples/preprocessing/plot_discretization.py
@@ -77,8 +77,7 @@
ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red',
linestyle=':', label='decision tree')
ax2.plot(X[:, 0], y, 'o', c='k')
bins = enc.offset_[0] + enc.bin_width_[0] * np.arange(1, enc.n_bins_[0])
ax2.vlines(bins, *plt.gca().get_ylim(), linewidth=1, alpha=.2)
ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=.2)
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")
4 changes: 2 additions & 2 deletions examples/preprocessing/plot_discretization_classification.py
@@ -28,8 +28,6 @@
semi-transparent. The lower right shows the classification accuracy on the test
set.
"""
print(__doc__)

# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
@@ -48,6 +46,8 @@
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

print(__doc__)

h = .02 # step size in the mesh


94 changes: 94 additions & 0 deletions examples/preprocessing/plot_discretization_strategies.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
"""
==========================================================
Demonstrating the different strategies of KBinsDiscretizer
==========================================================

This example presents the different strategies implemented in KBinsDiscretizer:
- 'uniform': The discretization is uniform in each feature, which means that
Member: Formatting issue according to Circle. I guess a blank line will solve the problem.
the bin widths are constant in each dimension.
- 'quantile': The discretization is done on the quantile values, which means
that each bin has approximately the same number of samples.
- 'kmeans': The discretization is based on the centroids of a KMeans clustering
procedure.

The plot shows the regions where the discretized encoding is constant.
"""

# Author: Tom Dupré la Tour
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.datasets import make_blobs

print(__doc__)

strategies = ['uniform', 'quantile', 'kmeans']

n_samples = 200
centers_0 = np.array([[0, 0], [0, 5], [2, 4], [8, 8]])
centers_1 = np.array([[0, 0], [3, 1]])

# construct the datasets
random_state = 42
X_list = [
np.random.RandomState(random_state).uniform(-3, 3, size=(n_samples, 2)),
make_blobs(n_samples=[n_samples // 10, n_samples * 4 // 10,
n_samples // 10, n_samples * 4 // 10],
cluster_std=0.5, centers=centers_0,
random_state=random_state)[0],
make_blobs(n_samples=[n_samples // 5, n_samples * 4 // 5],
cluster_std=0.5, centers=centers_1,
random_state=random_state)[0],
]

figure = plt.figure(figsize=(14, 9))
i = 1
for ds_cnt, X in enumerate(X_list):

ax = plt.subplot(len(X_list), len(strategies) + 1, i)
ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
if ds_cnt == 0:
ax.set_title("Input data", size=14)

xx, yy = np.meshgrid(
np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
grid = np.c_[xx.ravel(), yy.ravel()]

ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

i += 1
# transform the dataset with KBinsDiscretizer
for strategy in strategies:
enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
enc.fit(X)
grid_encoded = enc.transform(grid)

ax = plt.subplot(len(X_list), len(strategies) + 1, i)

# horizontal stripes
horizontal = grid_encoded[:, 0].reshape(xx.shape)
ax.contourf(xx, yy, horizontal, alpha=.5)
# vertical stripes
vertical = grid_encoded[:, 1].reshape(xx.shape)
ax.contourf(xx, yy, vertical, alpha=.5)

ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
if ds_cnt == 0:
ax.set_title("strategy='%s'" % (strategy, ), size=14)

i += 1

plt.tight_layout()
plt.show()
3 changes: 2 additions & 1 deletion sklearn/preprocessing/__init__.py
@@ -26,13 +26,14 @@
from .data import PowerTransformer
from .data import CategoricalEncoder
from .data import PolynomialFeatures
from .discretization import KBinsDiscretizer

from .label import label_binarize
from .label import LabelBinarizer
from .label import LabelEncoder
from .label import MultiLabelBinarizer

from .discretization import KBinsDiscretizer

from .imputation import Imputer

