Skip to content

Commit

Permalink
FIX Remove bins whose width <= 1e-8 with a warning in KBinsDiscretizer (
Browse files Browse the repository at this point in the history
#13165)

* remove redundant bins

* tests

* what's new

* issue number

* numeric issue

* move what's new

* Joel's comment

* forget something

* flake8

* more doc update

* Joel's comment

* redundant bins

* new message

* comment
  • Loading branch information
qinhanmin2014 authored and adrinjalali committed Feb 20, 2019
1 parent 3fbcb2c commit b40868d
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 16 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ Changelog
combination with ``handle_unknown='ignore'``.
:issue:`12881` by `Joris Van den Bossche`_.

- |Fix| Bins whose width is too small (i.e., <= 1e-8) are removed
with a warning in :class:`preprocessing.KBinsDiscretizer`.
:issue:`13165` by :user:`Hanmin Qin <qinhanmin2014>`.

:mod:`sklearn.feature_extraction.text`
......................................

Expand Down
18 changes: 17 additions & 1 deletion sklearn/preprocessing/_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
Attributes
----------
n_bins_ : int array, shape (n_features,)
Number of bins per feature.
Number of bins per feature. Bins whose width are too small
(i.e., <= 1e-8) are removed with a warning.
bin_edges_ : array of arrays, shape (n_features, )
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
Expand Down Expand Up @@ -102,6 +103,11 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
:class:`sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`sklearn.feature_selection.VarianceThreshold`).
See also
--------
sklearn.preprocessing.Binarizer : class used to bin values as ``0`` or
Expand Down Expand Up @@ -177,6 +183,16 @@ def fit(self, X, y=None):
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

# Remove bins whose width are too small (i.e., <= 1e-8)
if self.strategy in ('quantile', 'kmeans'):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn('Bins whose width are too small (i.e., <= '
'1e-8) in feature %d are removed. Consider '
'decreasing the number of bins.' % jj)
n_bins[jj] = len(bin_edges[jj]) - 1

self.bin_edges_ = bin_edges
self.n_bins_ = n_bins

Expand Down
54 changes: 39 additions & 15 deletions sklearn/preprocessing/tests/test_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.testing import (
assert_array_almost_equal,
assert_array_equal,
assert_raises,
assert_raise_message,
Expand Down Expand Up @@ -209,24 +210,22 @@ def test_nonuniform_strategies(
assert_array_equal(expected_5bins, Xt.ravel())


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
@pytest.mark.parametrize(
'strategy, expected_inv',
[('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
[0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625]]),
('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
[0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode):
X = np.random.RandomState(0).randn(100, 3)
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
X2 = kbd.inverse_transform(Xt)
X2t = kbd.fit_transform(X2)
if encode == 'onehot':
assert_array_equal(Xt.todense(), X2t.todense())
else:
assert_array_equal(Xt, X2t)
if 'onehot' in encode:
Xt = kbd._encoder.inverse_transform(Xt)
X2t = kbd._encoder.inverse_transform(X2t)

assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
Expand All @@ -253,3 +252,28 @@ def test_overwrite():
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))


@pytest.mark.parametrize(
    'strategy, expected_bin_edges',
    [('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
    """Bins of near-zero width are dropped and a UserWarning is raised."""
    # Only two distinct values, so 3 requested bins cannot all be non-empty.
    data = [[0], [0], [0], [0], [3], [3]]
    expected_msg = ("Bins whose width are too small (i.e., <= 1e-8) in "
                    "feature 0 are removed. Consider decreasing the number "
                    "of bins.")
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy)
    assert_warns_message(UserWarning, expected_msg, discretizer.fit, data)
    # The surviving edges are exactly the strategy-specific expectation.
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_bin_edges)


def test_percentile_numeric_stability():
    """Quantile binning stays numerically stable on duplicated percentiles."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_Xt = np.array([0, 0, 4]).reshape(-1, 1)
    expected_msg = ("Bins whose width are too small (i.e., <= 1e-8) in "
                    "feature 0 are removed. Consider decreasing the number "
                    "of bins.")
    discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                                   strategy='quantile')
    # Duplicate values collapse several quantile edges, triggering removal.
    assert_warns_message(UserWarning, expected_msg, discretizer.fit, X)
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_edges)
    assert_array_almost_equal(discretizer.transform(X), expected_Xt)

0 comments on commit b40868d

Please sign in to comment.