From a43d5348d5f9173ee8deda3d5ec1d8aee6b3dc28 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 13 Feb 2019 22:31:23 +0800 Subject: [PATCH 01/14] remove redundant bins --- sklearn/preprocessing/_discretization.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 35d654399dc27..a5405c5f1fbb1 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -142,6 +142,7 @@ def fit(self, X, y=None): n_features = X.shape[1] n_bins = self._validate_n_bins(n_features) + actual_n_bins = n_bins.copy() bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): @@ -176,9 +177,11 @@ def fit(self, X, y=None): centers.sort() bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] + bin_edges[jj] = np.unique(bin_edges[jj]) + actual_n_bins[jj] = len(bin_edges[jj]) self.bin_edges_ = bin_edges - self.n_bins_ = n_bins + self.n_bins_ = actual_n_bins if 'onehot' in self.encode: self._encoder = OneHotEncoder( From b7acc86183786d82c9e224b934083dca388684a3 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 14 Feb 2019 22:42:11 +0800 Subject: [PATCH 02/14] tests --- sklearn/preprocessing/_discretization.py | 21 +++++++--- .../tests/test_discretization.py | 41 ++++++++++++------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a5405c5f1fbb1..2d46f03e7b5f2 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -56,7 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): Attributes ---------- n_bins_ : int array, shape (n_features,) - Number of bins per feature. + Number of bins per feature. Redundant bins (i.e., bins whose width = 0) + are removed with a warning. 
bin_edges_ : array of arrays, shape (n_features, ) The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` @@ -102,6 +103,11 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): :class:`sklearn.compose.ColumnTransformer` if you only want to preprocess part of the features. + ``KBinsDiscretizer`` might produce constant features (e.g., when + ``encode = 'onehot'`` and certain bins do not contain any data). + These features can be removed with feature selection algorithms + (e.g., :class:`sklearn.compose.VarianceThreshold`). + See also -------- sklearn.preprocessing.Binarizer : class used to bin values as ``0`` or @@ -142,7 +148,6 @@ def fit(self, X, y=None): n_features = X.shape[1] n_bins = self._validate_n_bins(n_features) - actual_n_bins = n_bins.copy() bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): @@ -177,11 +182,17 @@ def fit(self, X, y=None): centers.sort() bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] - bin_edges[jj] = np.unique(bin_edges[jj]) - actual_n_bins[jj] = len(bin_edges[jj]) + + # Remove redundant bins (i.e., bins whose width = 0) + if self.strategy in ('quantile', 'kmeans'): + bin_edges[jj] = np.unique(bin_edges[jj]) + if len(bin_edges[jj]) - 1 != n_bins[jj]: + warnings.warn('Redundant bins (i.e., bins whose width = 0)' + ' in feature %d are removed.' 
% jj) + n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges - self.n_bins_ = actual_n_bins + self.n_bins_ = n_bins if 'onehot' in self.encode: self._encoder = OneHotEncoder( diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 81dc6fa5668a5..005bdb0e8b018 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -7,6 +7,7 @@ from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import OneHotEncoder from sklearn.utils.testing import ( + assert_array_almost_equal, assert_array_equal, assert_raises, assert_raise_message, @@ -209,24 +210,22 @@ def test_nonuniform_strategies( assert_array_equal(expected_5bins, Xt.ravel()) -@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile']) +@pytest.mark.parametrize( + 'strategy, expected_inv', + [('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5], + [0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]), + ('kmeans', [[-1.375, 2.125, -3.375, -0.5625], + [-1.375, 2.125, -3.375, -0.5625], + [-0.125, 3.375, -2.125, 0.5625], + [0.75, 4.25, -1.25, 1.625 ]]), + ('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.], + [0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])]) @pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense']) -def test_inverse_transform(strategy, encode): - X = np.random.RandomState(0).randn(100, 3) +def test_inverse_transform(strategy, encode, expected_inv): kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode) Xt = kbd.fit_transform(X) - X2 = kbd.inverse_transform(Xt) - X2t = kbd.fit_transform(X2) - if encode == 'onehot': - assert_array_equal(Xt.todense(), X2t.todense()) - else: - assert_array_equal(Xt, X2t) - if 'onehot' in encode: - Xt = kbd._encoder.inverse_transform(Xt) - X2t = kbd._encoder.inverse_transform(X2t) - - assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_) - 
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_) + Xinv = kbd.inverse_transform(Xt) + assert_array_almost_equal(expected_inv, Xinv) @pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile']) @@ -253,3 +252,15 @@ def test_overwrite(): Xinv = est.inverse_transform(Xt) assert_array_equal(Xt, Xt_before) assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]])) + + +@pytest.mark.parametrize( + 'strategy, expected_bin_edges', + [('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])]) +def test_redundant_bins(strategy, expected_bin_edges): + X = [[0], [0], [0], [0], [3], [3]] + kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) + msg = ("Redundant bins (i.e., bins whose width = 0) " + "in feature 0 are removed.") + assert_warns_message(UserWarning, msg, kbd.fit, X) + assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) From 54f9ff46fc7aeafdd05b0afa5af7abb10842fd51 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 14 Feb 2019 22:47:13 +0800 Subject: [PATCH 03/14] what's new --- doc/whats_new/v0.21.rst | 4 ++++ sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/tests/test_discretization.py | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index fc545741454f2..075dbbd346dc9 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -260,6 +260,10 @@ Support for Python 3.4 and below has been officially dropped. in the dense case. Also added a new parameter ``order`` which controls output order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_. +- |Fix| Redundant bins (i.e., bins whose width = 0) are removed with a warning + in :class:`preprocessing.KBinsDiscretizer`. + :issue:`13164` by :user:`Hanmin Qin `. + - |Fix| Fixed the calculation overflow when using a float16 dtype with :class:`preprocessing.StandardScaler`. 
:issue:`13007` by :user:`Raffaello Baluyot ` diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2d46f03e7b5f2..4f0a779dd1e85 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -182,7 +182,7 @@ def fit(self, X, y=None): centers.sort() bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] - + # Remove redundant bins (i.e., bins whose width = 0) if self.strategy in ('quantile', 'kmeans'): bin_edges[jj] = np.unique(bin_edges[jj]) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 005bdb0e8b018..929719bb7ff10 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -217,9 +217,9 @@ def test_nonuniform_strategies( ('kmeans', [[-1.375, 2.125, -3.375, -0.5625], [-1.375, 2.125, -3.375, -0.5625], [-0.125, 3.375, -2.125, 0.5625], - [0.75, 4.25, -1.25, 1.625 ]]), + [0.75, 4.25, -1.25, 1.625]]), ('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.], - [0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])]) + [0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])]) @pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense']) def test_inverse_transform(strategy, encode, expected_inv): kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode) From 389d674b66ffe1a44b6e19fb52666db7092be6f8 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 14 Feb 2019 22:54:37 +0800 Subject: [PATCH 04/14] issue number --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 075dbbd346dc9..609a270d784aa 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -262,7 +262,7 @@ Support for Python 3.4 and below has been officially dropped. 
- |Fix| Redundant bins (i.e., bins whose width = 0) are removed with a warning in :class:`preprocessing.KBinsDiscretizer`. - :issue:`13164` by :user:`Hanmin Qin `. + :issue:`13165` by :user:`Hanmin Qin `. - |Fix| Fixed the calculation overflow when using a float16 dtype with :class:`preprocessing.StandardScaler`. :issue:`13007` by From 8660ff33b2401e588311100b85f0591a8b4c6739 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 10:54:21 +0800 Subject: [PATCH 05/14] numeric issue --- sklearn/preprocessing/_discretization.py | 5 ++++- sklearn/preprocessing/tests/test_discretization.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 4f0a779dd1e85..333203330862d 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -185,7 +185,10 @@ def fit(self, X, y=None): # Remove redundant bins (i.e., bins whose width = 0) if self.strategy in ('quantile', 'kmeans'): - bin_edges[jj] = np.unique(bin_edges[jj]) + bin_edges[jj] = np.array( + [bin_edges[jj][0]] + + [bin_edges[jj][i] for i in range(1, len(bin_edges[jj])) + if bin_edges[jj][i] - bin_edges[jj][i - 1] > 1e-8]) if len(bin_edges[jj]) - 1 != n_bins[jj]: warnings.warn('Redundant bins (i.e., bins whose width = 0)' ' in feature %d are removed.' 
% jj) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 929719bb7ff10..02d2c3a4bdcb7 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -264,3 +264,16 @@ def test_redundant_bins(strategy, expected_bin_edges): "in feature 0 are removed.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) + + +def test_percentile_numeric_stability(): + X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1) + bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95]) + Xt = np.array([0, 0, 4]).reshape(-1, 1) + kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', + strategy='quantile') + msg = ("Redundant bins (i.e., bins whose width = 0) " + "in feature 0 are removed.") + assert_warns_message(UserWarning, msg, kbd.fit, X) + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) + assert_array_almost_equal(kbd.transform(X), Xt) From 72831cd4e872718fab8a633ac9173a5f1b40627e Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 11:24:28 +0800 Subject: [PATCH 06/14] move what's new --- doc/whats_new/v0.20.rst | 4 ++++ doc/whats_new/v0.21.rst | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 4daae88572b48..a34ecbfba7159 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -64,6 +64,10 @@ Changelog combination with ``handle_unknown='ignore'``. :issue:`12881` by `Joris Van den Bossche`_. +- |Fix| Redundant bins (i.e., bins whose width = 0) are removed with a warning + in :class:`preprocessing.KBinsDiscretizer`. + :issue:`13165` by :user:`Hanmin Qin `. + :mod:`sklearn.feature_extraction.text` ...................................... 
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 609a270d784aa..fc545741454f2 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -260,10 +260,6 @@ Support for Python 3.4 and below has been officially dropped. in the dense case. Also added a new parameter ``order`` which controls output order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_. -- |Fix| Redundant bins (i.e., bins whose width = 0) are removed with a warning - in :class:`preprocessing.KBinsDiscretizer`. - :issue:`13165` by :user:`Hanmin Qin `. - - |Fix| Fixed the calculation overflow when using a float16 dtype with :class:`preprocessing.StandardScaler`. :issue:`13007` by :user:`Raffaello Baluyot ` From 17089109be038f39837cd12818733ed100d8dd32 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 11:38:51 +0800 Subject: [PATCH 07/14] Joel's comment --- sklearn/preprocessing/_discretization.py | 10 ++++------ sklearn/preprocessing/tests/test_discretization.py | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 333203330862d..c082272426cec 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -185,13 +185,11 @@ def fit(self, X, y=None): # Remove redundant bins (i.e., bins whose width = 0) if self.strategy in ('quantile', 'kmeans'): - bin_edges[jj] = np.array( - [bin_edges[jj][0]] + - [bin_edges[jj][i] for i in range(1, len(bin_edges[jj])) - if bin_edges[jj][i] - bin_edges[jj][i - 1] > 1e-8]) + mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 + bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn('Redundant bins (i.e., bins whose width = 0)' - ' in feature %d are removed.' % jj) + warnings.warn('Redundant bins (i.e., bins whose width ' + '<= 0) in feature %d are removed.' 
% jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 02d2c3a4bdcb7..7776e512d01e9 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -260,7 +260,7 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Redundant bins (i.e., bins whose width = 0) " + msg = ("Redundant bins (i.e., bins whose width <= 0) " "in feature 0 are removed.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -272,7 +272,7 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Redundant bins (i.e., bins whose width = 0) " + msg = ("Redundant bins (i.e., bins whose width <= 0) " "in feature 0 are removed.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) From 38c7b23daa50d8d8cacbc406a4568cf4c9e0f943 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 11:39:46 +0800 Subject: [PATCH 08/14] forget something --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index c082272426cec..9ec5efc02d3dc 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -183,7 +183,7 @@ def fit(self, X, y=None): bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] - # Remove redundant bins (i.e., bins whose width = 0) + # Remove redundant bins (i.e., bins whose width <= 0) if self.strategy in 
('quantile', 'kmeans'): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] From 4414982e85b54cd5e171f59d2d234342b75e850f Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 11:41:26 +0800 Subject: [PATCH 09/14] flake8 --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 9ec5efc02d3dc..70ad323e46d66 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -188,7 +188,7 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn('Redundant bins (i.e., bins whose width ' + warnings.warn('Redundant bins (i.e., bins whose width ' '<= 0) in feature %d are removed.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 From 0c09988a2376dba579e00228be98f45f59b5247f Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 11:49:45 +0800 Subject: [PATCH 10/14] more doc update --- doc/whats_new/v0.20.rst | 2 +- sklearn/preprocessing/_discretization.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index a34ecbfba7159..30c38efc0f841 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -64,7 +64,7 @@ Changelog combination with ``handle_unknown='ignore'``. :issue:`12881` by `Joris Van den Bossche`_. -- |Fix| Redundant bins (i.e., bins whose width = 0) are removed with a warning +- |Fix| Redundant bins (i.e., bins whose width <= 0) are removed with a warning in :class:`preprocessing.KBinsDiscretizer`. :issue:`13165` by :user:`Hanmin Qin `. 
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 70ad323e46d66..532f64ea6109f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -56,8 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): Attributes ---------- n_bins_ : int array, shape (n_features,) - Number of bins per feature. Redundant bins (i.e., bins whose width = 0) - are removed with a warning. + Number of bins per feature. Redundant bins (i.e., bins whose + width <= 0) are removed with a warning. bin_edges_ : array of arrays, shape (n_features, ) The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` From 481267ec5eaf4a5ce9eb772ffe4fd9671bbab469 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 16:25:10 +0800 Subject: [PATCH 11/14] Joel's comment --- sklearn/preprocessing/_discretization.py | 5 +++-- sklearn/preprocessing/tests/test_discretization.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 532f64ea6109f..2c9d07015c930 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -106,7 +106,7 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): ``KBinsDiscretizer`` might produce constant features (e.g., when ``encode = 'onehot'`` and certain bins do not contain any data). These features can be removed with feature selection algorithms - (e.g., :class:`sklearn.compose.VarianceThreshold`). + (e.g., :class:`sklearn.feature_selection.VarianceThreshold`). See also -------- @@ -189,7 +189,8 @@ def fit(self, X, y=None): bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: warnings.warn('Redundant bins (i.e., bins whose width ' - '<= 0) in feature %d are removed.' % jj) + '<= 0) in feature %d are removed. Consider ' + 'decreasing the number of bins.' 
% jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 7776e512d01e9..d63ad7fdd039e 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -260,8 +260,8 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Redundant bins (i.e., bins whose width <= 0) " - "in feature 0 are removed.") + msg = ("Redundant bins (i.e., bins whose width <= 0) in feature 0 " + "are removed. Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -272,8 +272,8 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Redundant bins (i.e., bins whose width <= 0) " - "in feature 0 are removed.") + msg = ("Redundant bins (i.e., bins whose width <= 0) in feature 0 " + "are removed. 
Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) assert_array_almost_equal(kbd.transform(X), Xt) From a40838048655701722f38f3c50414e0734feda29 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 18:15:54 +0800 Subject: [PATCH 12/14] redundant bins --- doc/whats_new/v0.20.rst | 4 ++-- sklearn/preprocessing/_discretization.py | 10 +++++----- sklearn/preprocessing/tests/test_discretization.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 30c38efc0f841..0d84aaf240cea 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -64,8 +64,8 @@ Changelog combination with ``handle_unknown='ignore'``. :issue:`12881` by `Joris Van den Bossche`_. -- |Fix| Redundant bins (i.e., bins whose width <= 0) are removed with a warning - in :class:`preprocessing.KBinsDiscretizer`. +- |Fix| Bins whose width <= 0 are removed with a warning in + :class:`preprocessing.KBinsDiscretizer`. :issue:`13165` by :user:`Hanmin Qin `. :mod:`sklearn.feature_extraction.text` diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2c9d07015c930..0df0b9a4a1947 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -56,8 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): Attributes ---------- n_bins_ : int array, shape (n_features,) - Number of bins per feature. Redundant bins (i.e., bins whose - width <= 0) are removed with a warning. + Number of bins per feature. Bins whose width <= 0 are removed with a + warning. bin_edges_ : array of arrays, shape (n_features, ) The edges of each bin. 
Contain arrays of varying shapes ``(n_bins_, )`` @@ -188,9 +188,9 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn('Redundant bins (i.e., bins whose width ' - '<= 0) in feature %d are removed. Consider ' - 'decreasing the number of bins.' % jj) + warnings.warn('Bins whose width <= 0 in feature %d are ' + 'removed. Consider decreasing the number of ' + 'bins.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index d63ad7fdd039e..afc3bb1ef32a3 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -260,8 +260,8 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Redundant bins (i.e., bins whose width <= 0) in feature 0 " - "are removed. Consider decreasing the number of bins.") + msg = ("Bins whose width <= 0 in feature 0 are removed. " + "Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -272,8 +272,8 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Redundant bins (i.e., bins whose width <= 0) in feature 0 " - "are removed. Consider decreasing the number of bins.") + msg = ("Bins whose width <= 0 in feature 0 are removed. 
" + "Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) assert_array_almost_equal(kbd.transform(X), Xt) From 65696701439e31767d165ca7e50698ab952a7d53 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 20:24:29 +0800 Subject: [PATCH 13/14] new message --- doc/whats_new/v0.20.rst | 4 ++-- sklearn/preprocessing/_discretization.py | 10 +++++----- sklearn/preprocessing/tests/test_discretization.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 0d84aaf240cea..c7eaa53a2dc2d 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -64,8 +64,8 @@ Changelog combination with ``handle_unknown='ignore'``. :issue:`12881` by `Joris Van den Bossche`_. -- |Fix| Bins whose width <= 0 are removed with a warning in - :class:`preprocessing.KBinsDiscretizer`. +- |Fix| Bins whose width are too small (i.e., <= 1e-8) are removed + with a warning in :class:`preprocessing.KBinsDiscretizer`. :issue:`13165` by :user:`Hanmin Qin `. :mod:`sklearn.feature_extraction.text` diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0df0b9a4a1947..cb26ab4179dcc 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -56,8 +56,8 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin): Attributes ---------- n_bins_ : int array, shape (n_features,) - Number of bins per feature. Bins whose width <= 0 are removed with a - warning. + Number of bins per feature. Bins whose width are too small + (i.e., <= 1e-8) are removed with a warning. bin_edges_ : array of arrays, shape (n_features, ) The edges of each bin. 
Contain arrays of varying shapes ``(n_bins_, )`` @@ -188,9 +188,9 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn('Bins whose width <= 0 in feature %d are ' - 'removed. Consider decreasing the number of ' - 'bins.' % jj) + warnings.warn('Bins whose width are too small (i.e., <= ' + '1e-8) in feature %d are removed. Consider ' + 'decreasing the number of bins.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index afc3bb1ef32a3..102b789eb093d 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -260,8 +260,8 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Bins whose width <= 0 in feature 0 are removed. " - "Consider decreasing the number of bins.") + msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " + "are removed. Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -272,8 +272,8 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Bins whose width <= 0 in feature 0 are removed. " - "Consider decreasing the number of bins.") + msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " + "are removed. 
Consider decreasing the number of bins.") assert_warns_message(UserWarning, msg, kbd.fit, X) assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) assert_array_almost_equal(kbd.transform(X), Xt) From 77922341574539434cc9ebc5b9c96747cf9d8d55 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Feb 2019 20:27:02 +0800 Subject: [PATCH 14/14] comment --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index cb26ab4179dcc..ebde8e07a0ad4 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -183,7 +183,7 @@ def fit(self, X, y=None): bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] - # Remove redundant bins (i.e., bins whose width <= 0) + # Remove bins whose width is too small (i.e., <= 1e-8) if self.strategy in ('quantile', 'kmeans'): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask]