From 38fda13e97f91941cba2f0733894a3de276755c2 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 15:54:18 +0100 Subject: [PATCH 1/9] dynamically set n_quantiles to min(n_quantiles, n_samples) --- sklearn/preprocessing/data.py | 23 +++++++++++++++++++++-- sklearn/preprocessing/tests/test_data.py | 6 ++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7069a5ba874e7..c5139276c92e0 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -424,7 +424,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): X_scaled = X_std * (max - min) + min where min, max = feature_range. - + The transformation is calculated as (when ``axis=0``):: X_scaled = scale * X + min - X.min(axis=0) * scale @@ -592,7 +592,7 @@ class StandardScaler(BaseEstimator, TransformerMixin): ----- NaNs are treated as missing values: disregarded in fit, and maintained in transform. - + We use a biased estimator for the standard deviation, equivalent to `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to affect model performance. @@ -2044,6 +2044,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): n_quantiles : int, optional (default=1000) Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. output_distribution : str, optional (default='uniform') Marginal distribution for the transformed data. 
The choices are @@ -2218,6 +2222,17 @@ def fit(self, X, y=None): self.subsample)) X = self._check_inputs(X) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + self.n_quantiles = n_samples + warnings.warn("n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles will be set to " + "n_samples as more quantiles do not lead to a " + "better approximation of the used cumulative " + "distribution function estimator." + % (self.n_quantiles, n_samples)) + rng = check_random_state(self.random_state) # Create the quantiles of reference @@ -2446,6 +2461,10 @@ def quantile_transform(X, axis=0, n_quantiles=1000, n_quantiles : int, optional (default=1000) Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. output_distribution : str, optional (default='uniform') Marginal distribution for the transformed data. 
The choices are diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 836877e67be62..2286ba5cb1b3c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1260,6 +1260,12 @@ def test_quantile_transform_check_error(): assert_raise_message(ValueError, 'Expected 2D array, got scalar array instead', transformer.transform, 10) + # check that a warning is raised is n_quantiles > n_samples + transformer = QuantileTransformer(n_quantiles=100) + assert_warns_message(UserWarning, + "n_quantiles will be set to n_samples", + transformer.fit, X) + assert transformer.n_quantiles == X.shape[0] def test_quantile_transform_sparse_ignore_zeros(): From a24dd8e1fcad883157b07c49f7808e4508247e7d Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 17:12:32 +0100 Subject: [PATCH 2/9] fix overwriting parameters --- sklearn/preprocessing/data.py | 8 ++++++-- sklearn/preprocessing/tests/test_data.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c5139276c92e0..9c9eee49eeb08 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2076,6 +2076,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): Attributes ---------- + n_quantiles_ : integer + The actual number of quantiles used to discretize the cumulative + distribution function. + quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. @@ -2225,18 +2229,18 @@ def fit(self, X, y=None): n_samples = X.shape[0] if self.n_quantiles > n_samples: - self.n_quantiles = n_samples warnings.warn("n_quantiles (%s) is greater than the total number " "of samples (%s). n_quantiles will be set to " "n_samples as more quantiles do not lead to a " "better approximation of the used cumulative " "distribution function estimator." 
% (self.n_quantiles, n_samples)) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) rng = check_random_state(self.random_state) # Create the quantiles of reference - self.references_ = np.linspace(0, 1, self.n_quantiles, + self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) if sparse.issparse(X): self._sparse_fit(X, rng) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2286ba5cb1b3c..cccb0a3c304f6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1265,7 +1265,7 @@ def test_quantile_transform_check_error(): assert_warns_message(UserWarning, "n_quantiles will be set to n_samples", transformer.fit, X) - assert transformer.n_quantiles == X.shape[0] + assert transformer.n_quantiles_ == X.shape[0] def test_quantile_transform_sparse_ignore_zeros(): From 841abaaad8311f5863c40c2b053c5a8eb7ecff32 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Feb 2019 17:19:22 +0100 Subject: [PATCH 3/9] add n_samples to default value Co-Authored-By: albertcthomas --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9c9eee49eeb08..c7ced19a2e2c4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2041,7 +2041,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): Parameters ---------- - n_quantiles : int, optional (default=1000) + n_quantiles : int, optional (default=1000 or n_samples) Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function. 
If n_quantiles is larger than the number of samples, n_quantiles is set From def84ec4f0d3fd1d6e283b375ea612bad3dc137e Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 17:32:17 +0100 Subject: [PATCH 4/9] use pytest to catch warning --- sklearn/preprocessing/tests/test_data.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index cccb0a3c304f6..1c07da664d30a 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1262,9 +1262,10 @@ def test_quantile_transform_check_error(): transformer.transform, 10) # check that a warning is raised is n_quantiles > n_samples transformer = QuantileTransformer(n_quantiles=100) - assert_warns_message(UserWarning, - "n_quantiles will be set to n_samples", - transformer.fit, X) + warn_msg = "n_quantiles will be set to n_samples" + with pytest.warns(UserWarning, match=warn_msg) as record: + transformer.fit(X) + assert len(record) == 1 assert transformer.n_quantiles_ == X.shape[0] From 37bcdd938134bef5fc57314cb72a874c584957a0 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 17:36:11 +0100 Subject: [PATCH 5/9] add default info in quantile_transform function --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c7ced19a2e2c4..01bce8e24c532 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2462,7 +2462,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, Axis used to compute the means and standard deviations along. If 0, transform each feature, otherwise (if 1) transform each sample. - n_quantiles : int, optional (default=1000) + n_quantiles : int, optional (default=1000 or n_samples) Number of quantiles to be computed. 
It corresponds to the number of landmarks used to discretize the cumulative distribution function. If n_quantiles is larger than the number of samples, n_quantiles is set From 168af38886ea149f447a99c12e9549c03e6cac81 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 17:42:54 +0100 Subject: [PATCH 6/9] whatsnew entry --- doc/whats_new/v0.21.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index f185491ded469..0211a3d6d1daf 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -376,6 +376,12 @@ Support for Python 3.4 and below has been officially dropped. :class:`preprocessing.StandardScaler`. :issue:`13007` by :user:`Raffaello Baluyot ` +- |Fix| Fixed a bug in :class:`preprocessing.QuantileTransformer` and + :func:`preprocessing.quantile_transform` to force n_quantiles to be at most + equal to n_samples. Values of n_quantiles larger than n_samples were either + useless or resulted in a wrong approximation of the cumulative distribution + function estimator. :issue:`13333` by :user:`Albert Thomas <albertcthomas>`. + :mod:`sklearn.svm` .................. From 03e948c6b41587075dfa80ba41ea7397da07de08 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 28 Feb 2019 18:25:43 +0100 Subject: [PATCH 7/9] fix doctest --- doc/modules/preprocessing.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b7906e828de47..4c68f9e635498 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -387,13 +387,13 @@ Using the earlier example with the iris dataset:: ... output_distribution='normal', random_state=0) >>> X_trans = quantile_transformer.fit_transform(X) >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - array([[4.3..., 2..., 1..., 0.1...], - [4.31..., 2.02..., 1.01..., 0.1...], - [4.32..., 2.05..., 1.02..., 0.1...], + array([[4.3, 2. , 1. 
, 0.1], + [4.4, 2.2, 1.1, 0.1], + [4.4, 2.2, 1.2, 0.1], ..., - [7.84..., 4.34..., 6.84..., 2.5...], - [7.87..., 4.37..., 6.87..., 2.5...], - [7.9..., 4.4..., 6.9..., 2.5...]]) + [7.7, 4.1, 6.7, 2.5], + [7.7, 4.2, 6.7, 2.5], + [7.9, 4.4, 6.9, 2.5]]) Thus the median of the input becomes the mean of the output, centered at 0. The normal output is clipped so that the input's minimum and maximum --- From cfb2b19d7b987fb81c943c6d59dee11e289d8adb Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Fri, 1 Mar 2019 09:48:04 +0100 Subject: [PATCH 8/9] simplify warning --- sklearn/preprocessing/data.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 01bce8e24c532..1c32df11e1602 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2230,10 +2230,8 @@ def fit(self, X, y=None): if self.n_quantiles > n_samples: warnings.warn("n_quantiles (%s) is greater than the total number " - "of samples (%s). n_quantiles will be set to " - "n_samples as more quantiles do not lead to a " - "better approximation of the used cumulative " - "distribution function estimator." + "of samples (%s). n_quantiles is set to " + "n_samples." 
% (self.n_quantiles, n_samples)) self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) From 6778a8f39c5fad7fc021c5dc42895933e446e01a Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Fri, 1 Mar 2019 10:21:01 +0100 Subject: [PATCH 9/9] fix test --- sklearn/preprocessing/tests/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 1c07da664d30a..12db099bd7f87 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1262,7 +1262,7 @@ def test_quantile_transform_check_error(): transformer.transform, 10) # check that a warning is raised is n_quantiles > n_samples transformer = QuantileTransformer(n_quantiles=100) - warn_msg = "n_quantiles will be set to n_samples" + warn_msg = "n_quantiles is set to n_samples" with pytest.warns(UserWarning, match=warn_msg) as record: transformer.fit(X) assert len(record) == 1