Skip to content

Commit

Permalink
FIX Remove bins whose width <= 1e-8 with a warning in KBinsDiscretizer (
Browse files Browse the repository at this point in the history
#13165)

* remove redundant bins

* tests

* what's new

* issue number

* numeric issue

* move what's new

* Joel's comment

* forget something

* flake8

* more doc update

* Joel's comment

* redundant bins

* new message

* comment
  • Loading branch information
qinhanmin2014 authored and adrinjalali committed Feb 20, 2019
1 parent 3fbcb2c commit b40868d
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 16 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ Changelog
combination with ``handle_unknown='ignore'``.
:issue:`12881` by `Joris Van den Bossche`_.

- |Fix| Bins whose width is too small (i.e., <= 1e-8) are removed
with a warning in :class:`preprocessing.KBinsDiscretizer`.
:issue:`13165` by :user:`Hanmin Qin <qinhanmin2014>`.

:mod:`sklearn.feature_extraction.text`
......................................

Expand Down
18 changes: 17 additions & 1 deletion sklearn/preprocessing/_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
Attributes
----------
n_bins_ : int array, shape (n_features,)
Number of bins per feature.
Number of bins per feature. Bins whose width are too small
(i.e., <= 1e-8) are removed with a warning.
bin_edges_ : array of arrays, shape (n_features, )
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
Expand Down Expand Up @@ -102,6 +103,11 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
:class:`sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`sklearn.feature_selection.VarianceThreshold`).
See also
--------
sklearn.preprocessing.Binarizer : class used to bin values as ``0`` or
Expand Down Expand Up @@ -177,6 +183,16 @@ def fit(self, X, y=None):
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

# Remove bins whose width are too small (i.e., <= 1e-8)
if self.strategy in ('quantile', 'kmeans'):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn('Bins whose width are too small (i.e., <= '
'1e-8) in feature %d are removed. Consider '
'decreasing the number of bins.' % jj)
n_bins[jj] = len(bin_edges[jj]) - 1

self.bin_edges_ = bin_edges
self.n_bins_ = n_bins

Expand Down
54 changes: 39 additions & 15 deletions sklearn/preprocessing/tests/test_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.testing import (
assert_array_almost_equal,
assert_array_equal,
assert_raises,
assert_raise_message,
Expand Down Expand Up @@ -209,24 +210,22 @@ def test_nonuniform_strategies(
assert_array_equal(expected_5bins, Xt.ravel())


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
@pytest.mark.parametrize(
'strategy, expected_inv',
[('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
[0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625]]),
('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
[0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode):
X = np.random.RandomState(0).randn(100, 3)
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
X2 = kbd.inverse_transform(Xt)
X2t = kbd.fit_transform(X2)
if encode == 'onehot':
assert_array_equal(Xt.todense(), X2t.todense())
else:
assert_array_equal(Xt, X2t)
if 'onehot' in encode:
Xt = kbd._encoder.inverse_transform(Xt)
X2t = kbd._encoder.inverse_transform(X2t)

assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
Expand All @@ -253,3 +252,28 @@ def test_overwrite():
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))


@pytest.mark.parametrize(
    'strategy, expected_bin_edges',
    [('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
    """Bins of near-zero width are dropped and a UserWarning is raised."""
    # Only two distinct values, so 3 requested bins cannot all be non-empty.
    data = [[0], [0], [0], [0], [3], [3]]
    expected_msg = ("Bins whose width are too small (i.e., <= 1e-8) in "
                    "feature 0 are removed. Consider decreasing the number "
                    "of bins.")
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy)
    assert_warns_message(UserWarning, expected_msg, discretizer.fit, data)
    # The surviving edges are exactly the strategy-specific expectation.
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_bin_edges)


def test_percentile_numeric_stability():
    """Quantile binning stays numerically stable on duplicated percentiles."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_Xt = np.array([0, 0, 4]).reshape(-1, 1)
    expected_msg = ("Bins whose width are too small (i.e., <= 1e-8) in "
                    "feature 0 are removed. Consider decreasing the number "
                    "of bins.")
    discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                                   strategy='quantile')
    # Duplicate values collapse several quantile edges, triggering removal.
    assert_warns_message(UserWarning, expected_msg, discretizer.fit, X)
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_edges)
    assert_array_almost_equal(discretizer.transform(X), expected_Xt)

0 comments on commit b40868d

Please sign in to comment.