From 38fda13e97f91941cba2f0733894a3de276755c2 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 15:54:18 +0100
Subject: [PATCH 1/9] dynamically set n_quantiles to min(n_quantiles,
 n_samples)

---
 sklearn/preprocessing/data.py            | 23 +++++++++++++++++++++--
 sklearn/preprocessing/tests/test_data.py |  6 ++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 7069a5ba874e7..c5139276c92e0 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -424,7 +424,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
         X_scaled = X_std * (max - min) + min
 
     where min, max = feature_range.
- 
+
     The transformation is calculated as (when ``axis=0``)::
 
        X_scaled = scale * X + min - X.min(axis=0) * scale
@@ -592,7 +592,7 @@ class StandardScaler(BaseEstimator, TransformerMixin):
     -----
     NaNs are treated as missing values: disregarded in fit, and maintained in
     transform.
-    
+
     We use a biased estimator for the standard deviation, equivalent to
     `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
     affect model performance.
@@ -2044,6 +2044,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin):
     n_quantiles : int, optional (default=1000)
         Number of quantiles to be computed. It corresponds to the number
         of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
 
     output_distribution : str, optional (default='uniform')
         Marginal distribution for the transformed data. The choices are
@@ -2218,6 +2222,17 @@ def fit(self, X, y=None):
                                                        self.subsample))
 
         X = self._check_inputs(X)
+        n_samples = X.shape[0]
+
+        if self.n_quantiles > n_samples:
+            self.n_quantiles = n_samples
+            warnings.warn("n_quantiles (%s) is greater than the total number "
+                          "of samples (%s). n_quantiles will be set to "
+                          "n_samples as more quantiles do not lead to a "
+                          "better approximation of the used cumulative "
+                          "distribution function estimator."
+                          % (self.n_quantiles, n_samples))
+
         rng = check_random_state(self.random_state)
 
         # Create the quantiles of reference
@@ -2446,6 +2461,10 @@ def quantile_transform(X, axis=0, n_quantiles=1000,
     n_quantiles : int, optional (default=1000)
         Number of quantiles to be computed. It corresponds to the number
         of landmarks used to discretize the cumulative distribution function.
+        If n_quantiles is larger than the number of samples, n_quantiles is set
+        to the number of samples as a larger number of quantiles does not give
+        a better approximation of the cumulative distribution function
+        estimator.
 
     output_distribution : str, optional (default='uniform')
         Marginal distribution for the transformed data. The choices are
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 836877e67be62..2286ba5cb1b3c 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1260,6 +1260,12 @@ def test_quantile_transform_check_error():
     assert_raise_message(ValueError,
                          'Expected 2D array, got scalar array instead',
                          transformer.transform, 10)
+    # check that a warning is raised is n_quantiles > n_samples
+    transformer = QuantileTransformer(n_quantiles=100)
+    assert_warns_message(UserWarning,
+                         "n_quantiles will be set to n_samples",
+                         transformer.fit, X)
+    assert transformer.n_quantiles == X.shape[0]
 
 
 def test_quantile_transform_sparse_ignore_zeros():

From a24dd8e1fcad883157b07c49f7808e4508247e7d Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 17:12:32 +0100
Subject: [PATCH 2/9] fix overwriting parameters

---
 sklearn/preprocessing/data.py            | 8 ++++++--
 sklearn/preprocessing/tests/test_data.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index c5139276c92e0..9c9eee49eeb08 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2076,6 +2076,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
+    n_quantiles_ : integer
+        The actual number of quantiles used to discretize the cumulative
+        distribution function.
+
     quantiles_ : ndarray, shape (n_quantiles, n_features)
         The values corresponding the quantiles of reference.
 
@@ -2225,18 +2229,18 @@ def fit(self, X, y=None):
         n_samples = X.shape[0]
 
         if self.n_quantiles > n_samples:
-            self.n_quantiles = n_samples
             warnings.warn("n_quantiles (%s) is greater than the total number "
                           "of samples (%s). n_quantiles will be set to "
                           "n_samples as more quantiles do not lead to a "
                           "better approximation of the used cumulative "
                           "distribution function estimator."
                           % (self.n_quantiles, n_samples))
+        self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))
 
         rng = check_random_state(self.random_state)
 
         # Create the quantiles of reference
-        self.references_ = np.linspace(0, 1, self.n_quantiles,
+        self.references_ = np.linspace(0, 1, self.n_quantiles_,
                                        endpoint=True)
         if sparse.issparse(X):
             self._sparse_fit(X, rng)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 2286ba5cb1b3c..cccb0a3c304f6 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1265,7 +1265,7 @@ def test_quantile_transform_check_error():
     assert_warns_message(UserWarning,
                          "n_quantiles will be set to n_samples",
                          transformer.fit, X)
-    assert transformer.n_quantiles == X.shape[0]
+    assert transformer.n_quantiles_ == X.shape[0]
 
 
 def test_quantile_transform_sparse_ignore_zeros():

From 841abaaad8311f5863c40c2b053c5a8eb7ecff32 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 28 Feb 2019 17:19:22 +0100
Subject: [PATCH 3/9] add n_samples to default value

Co-Authored-By: albertcthomas <albertthomas88@gmail.com>
---
 sklearn/preprocessing/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 9c9eee49eeb08..c7ced19a2e2c4 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2041,7 +2041,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    n_quantiles : int, optional (default=1000)
+    n_quantiles : int, optional (default=1000 or n_samples)
         Number of quantiles to be computed. It corresponds to the number
         of landmarks used to discretize the cumulative distribution function.
         If n_quantiles is larger than the number of samples, n_quantiles is set

From def84ec4f0d3fd1d6e283b375ea612bad3dc137e Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 17:32:17 +0100
Subject: [PATCH 4/9] use pytest to catch warning

---
 sklearn/preprocessing/tests/test_data.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index cccb0a3c304f6..1c07da664d30a 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1262,9 +1262,10 @@ def test_quantile_transform_check_error():
                          transformer.transform, 10)
     # check that a warning is raised is n_quantiles > n_samples
     transformer = QuantileTransformer(n_quantiles=100)
-    assert_warns_message(UserWarning,
-                         "n_quantiles will be set to n_samples",
-                         transformer.fit, X)
+    warn_msg = "n_quantiles will be set to n_samples"
+    with pytest.warns(UserWarning, match=warn_msg) as record:
+        transformer.fit(X)
+    assert len(record) == 1
     assert transformer.n_quantiles_ == X.shape[0]
 
 

From 37bcdd938134bef5fc57314cb72a874c584957a0 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 17:36:11 +0100
Subject: [PATCH 5/9] add default info in quantile_transform function

---
 sklearn/preprocessing/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index c7ced19a2e2c4..01bce8e24c532 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2462,7 +2462,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000,
         Axis used to compute the means and standard deviations along. If 0,
         transform each feature, otherwise (if 1) transform each sample.
 
-    n_quantiles : int, optional (default=1000)
+    n_quantiles : int, optional (default=1000 or n_samples)
         Number of quantiles to be computed. It corresponds to the number
         of landmarks used to discretize the cumulative distribution function.
         If n_quantiles is larger than the number of samples, n_quantiles is set

From 168af38886ea149f447a99c12e9549c03e6cac81 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 17:42:54 +0100
Subject: [PATCH 6/9] whatsnew entry

---
 doc/whats_new/v0.21.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index f185491ded469..0211a3d6d1daf 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -376,6 +376,12 @@ Support for Python 3.4 and below has been officially dropped.
   :class:`preprocessing.StandardScaler`. :issue:`13007` by
   :user:`Raffaello Baluyot <baluyotraf>`
 
+- |Fix| Fixed a bug in :class:`preprocessing.QuantileTransformer` and
+  :func:`preprocessing.quantile_transform` to force n_quantiles to be at most
+  equal to n_samples. Values of n_quantiles larger than n_samples were either
+  useless or resulted in a wrong approximation of the cumulative distribution
+  function estimator. :issue:`13333` by :user:`Albert Thomas <albertcthomas>`.
+
 :mod:`sklearn.svm`
 ..................
 

From 03e948c6b41587075dfa80ba41ea7397da07de08 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 28 Feb 2019 18:25:43 +0100
Subject: [PATCH 7/9] fix doctest

---
 doc/modules/preprocessing.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index b7906e828de47..4c68f9e635498 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -387,13 +387,13 @@ Using the earlier example with the iris dataset::
   ...     output_distribution='normal', random_state=0)
   >>> X_trans = quantile_transformer.fit_transform(X)
   >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-  array([[4.3...,   2...,     1...,     0.1...],
-         [4.31...,  2.02...,  1.01...,  0.1...],
-         [4.32...,  2.05...,  1.02...,  0.1...],
+  array([[4.3, 2. , 1. , 0.1],
+         [4.4, 2.2, 1.1, 0.1],
+         [4.4, 2.2, 1.2, 0.1],
          ...,
-         [7.84...,  4.34...,  6.84...,  2.5...],
-         [7.87...,  4.37...,  6.87...,  2.5...],
-         [7.9...,   4.4...,   6.9...,   2.5...]])
+         [7.7, 4.1, 6.7, 2.5],
+         [7.7, 4.2, 6.7, 2.5],
+         [7.9, 4.4, 6.9, 2.5]])
 
 Thus the median of the input becomes the mean of the output, centered at 0. The
 normal output is clipped so that the input's minimum and maximum ---

From cfb2b19d7b987fb81c943c6d59dee11e289d8adb Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Fri, 1 Mar 2019 09:48:04 +0100
Subject: [PATCH 8/9] simplify warning

---
 sklearn/preprocessing/data.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 01bce8e24c532..1c32df11e1602 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2230,10 +2230,8 @@ def fit(self, X, y=None):
 
         if self.n_quantiles > n_samples:
             warnings.warn("n_quantiles (%s) is greater than the total number "
-                          "of samples (%s). n_quantiles will be set to "
-                          "n_samples as more quantiles do not lead to a "
-                          "better approximation of the used cumulative "
-                          "distribution function estimator."
+                          "of samples (%s). n_quantiles is set to "
+                          "n_samples."
                           % (self.n_quantiles, n_samples))
         self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))
 

From 6778a8f39c5fad7fc021c5dc42895933e446e01a Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Fri, 1 Mar 2019 10:21:01 +0100
Subject: [PATCH 9/9] fix test

---
 sklearn/preprocessing/tests/test_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 1c07da664d30a..12db099bd7f87 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1262,7 +1262,7 @@ def test_quantile_transform_check_error():
                          transformer.transform, 10)
     # check that a warning is raised is n_quantiles > n_samples
     transformer = QuantileTransformer(n_quantiles=100)
-    warn_msg = "n_quantiles will be set to n_samples"
+    warn_msg = "n_quantiles is set to n_samples"
     with pytest.warns(UserWarning, match=warn_msg) as record:
         transformer.fit(X)
     assert len(record) == 1