REL backport fixes for 0.23.2 (#18068)

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com> Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> Co-authored-by: Lucy Liu <jliu176@gmail.com> Co-authored-by: Kevin Markham <justmarkham@users.noreply.github.com> Co-authored-by: Juan Carlos Alfaro Jiménez <JuanCarlos.Alfaro@uclm.es> Co-authored-by: Forrest Koch <forrest.c.koch@gmail.com> Co-authored-by: Chiara Marmo <cmarmo@users.noreply.github.com> Co-authored-by: Swier <swierh@users.noreply.github.com> Co-authored-by: t-kusanagi2 <61999178+t-kusanagi2@users.noreply.github.com> Co-authored-by: Markus Rempfler <markus.rempfler@tum.de> Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org> Co-authored-by: Bruno Charron <bcharron@gmx.com> Co-authored-by: Bruno Charron <bruno@charron.email> Co-authored-by: amy12xx <meezamanda@yahoo.com> Co-authored-by: Allan <allanbutler9@gmail.com> Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com> Co-authored-by: Hirofumi Suzuki <hs-nazuna@users.noreply.github.com> Co-authored-by: Loïc Estève <loic.esteve@ymail.com> Co-authored-by: Joel Nothman <joel.nothman@gmail.com> Co-authored-by: Charles Patel <charlespatel07@gmail.com>
scikit-learn · Aug 4, 2020 · 947f542 · 947f542
1 parent 24e7be0
commit 947f542
Show file tree

Hide file tree

Showing 60 changed files with 895 additions and 528 deletions.
diff --git a/.binder/requirements.txt b/.binder/requirements.txt
@@ -1,8 +1,5 @@
---extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn
---pre
 matplotlib
 scikit-image
 pandas
 sphinx-gallery
-scikit-learn
-
+scikit-learn>=0.23,<0.24
diff --git a/build_tools/azure/test_script.cmd b/build_tools/azure/test_script.cmd
@@ -10,7 +10,8 @@ mkdir %TMP_FOLDER%
 cd %TMP_FOLDER%
 
 if "%CHECK_WARNINGS%" == "true" (
-    set PYTEST_ARGS=%PYTEST_ARGS% -Werror::DeprecationWarning -Werror::FutureWarning
+    REM numpy 1.19.0's tostring() deprecation is ignored until scipy and joblib remove its usage
+    set PYTEST_ARGS=%PYTEST_ARGS% -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning
 )
 
 if "%COVERAGE%" == "true" (

diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
@@ -29,7 +29,8 @@ if [[ "$COVERAGE" == "true" ]]; then
 fi
 
 if [[ -n "$CHECK_WARNINGS" ]]; then
-    TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning"
+    # numpy 1.19.0's tostring() deprecation is ignored until scipy and joblib remove its usage
+    TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning"
 fi
 
 if [[ "$PYTHON_VERSION" == "*" ]]; then

diff --git a/conftest.py b/conftest.py
@@ -7,7 +7,6 @@
 
 import platform
 import sys
-from distutils.version import LooseVersion
 import os
 
 import pytest
@@ -17,10 +16,11 @@
 from sklearn.utils import _IS_32BIT
 from sklearn.externals import _pilutil
 from sklearn._build_utils.deprecated_modules import _DEPRECATED_MODULES
+from sklearn.utils.fixes import np_version, parse_version
 
 PYTEST_MIN_VERSION = '3.3.0'
 
-if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION:
+if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION):
     raise ImportError('Your version of pytest is too old, you should have '
                       'at least pytest >= {} installed.'
                       .format(PYTEST_MIN_VERSION))
@@ -54,8 +54,7 @@ def pytest_collection_modifyitems(config, items):
     # run doctests only for numpy >= 1.14.
     skip_doctests = False
     try:
-        import numpy as np
-        if LooseVersion(np.__version__) < LooseVersion('1.14'):
+        if np_version < parse_version('1.14'):
             reason = 'doctests are only run for numpy >= 1.14'
             skip_doctests = True
         elif _IS_32BIT:

diff --git a/doc/conf.py b/doc/conf.py
@@ -80,7 +80,7 @@
 
 # General information about the project.
 project = 'scikit-learn'
-copyright = '2007 - 2019, scikit-learn developers (BSD License)'
+copyright = '2007 - 2020, scikit-learn developers (BSD License)'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the

diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
@@ -997,8 +997,8 @@ Examples of use cases include:
 * Risk modeling / insurance policy pricing:  number of claim events /
   policyholder per year (Poisson), cost per event (Gamma), total cost per
   policyholder per year (Tweedie / Compound Poisson Gamma).
-* Predictive maintenance: number of production interruption events per year:
-  Poisson, duration of interruption: Gamma, total interruption time per year
+* Predictive maintenance: number of production interruption events per year
+  (Poisson), duration of interruption (Gamma), total interruption time per year
   (Tweedie / Compound Poisson Gamma).
 
 

diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst
@@ -20,7 +20,7 @@ Persistence example
 -------------------
 
 It is possible to save a model in scikit-learn by using Python's built-in
-persistence model, namely `pickle <https://docs.python.org/2/library/pickle.html>`_::
+persistence model, namely `pickle <https://docs.python.org/3/library/pickle.html>`_::
 
   >>> from sklearn import svm
   >>> from sklearn import datasets

diff --git a/doc/templates/index.html b/doc/templates/index.html
@@ -155,6 +155,10 @@ <h4 class="sk-landing-call-header">News</h4>
         <ul class="sk-landing-call-list list-unstyled">
         <li><strong>On-going development:</strong>
         <a href="https://scikit-learn.org/dev/whats_new.html"><strong>What's new</strong> (Changelog)</a>
+        <li><strong>August 2020.</strong> scikit-learn 0.23.2 is available for download (<a href="whats_new/v0.23.html#version-0-23-2">Changelog</a>).
+        </li>
+        <li><strong>May 2020.</strong> scikit-learn 0.23.1 is available for download (<a href="whats_new/v0.23.html#version-0-23-1">Changelog</a>).
+        </li>
         <li><strong>May 2020.</strong> scikit-learn 0.23.0 is available for download (<a href="whats_new/v0.23.html#version-0-23-0">Changelog</a>).
         </li>
         <li><strong>Scikit-learn from 0.23 requires Python 3.6 or greater.</strong>

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -83,12 +83,12 @@ span.highlighted {
 }
 
 div.highlight {
-  padding: 0.2rem 0.5rem;
   border: 1px solid #ddd;
   margin-bottom: 1rem;
 }
 
 div.highlight pre {
+  padding: 0.2rem 0.5rem;
   margin-bottom: 0;
   line-height: 1.2rem;
 }

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
@@ -2,6 +2,122 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_0_23_2:
+
+Version 0.23.2
+==============
+
+**August 3 2020**
+
+Changed models
+--------------
+
+The following estimators and functions, when fit with the same data and
+parameters, may produce different models from the previous version. This often
+occurs due to changes in the modelling logic (bug fixes or enhancements), or in
+random sampling procedures.
+
+- |Fix| ``inertia_`` attribute of :class:`cluster.KMeans` and
+  :class:`cluster.MiniBatchKMeans`.
+
+Details are listed in the changelog below.
+
+(While we are trying to better inform users by providing this information, we
+cannot assure that this list is complete.)
+
+Changelog
+---------
+
+:mod:`sklearn.cluster`
+......................
+
+- |Fix| Fixed a bug in :class:`cluster.KMeans` where rounding errors could
+  prevent convergence from being declared when `tol=0`. :pr:`17959` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
+- |Fix| Fixed a bug in :class:`cluster.KMeans` and
+  :class:`cluster.MiniBatchKMeans` where the reported inertia was incorrectly
+  weighted by the sample weights. :pr:`17848` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
+- |Fix| Fixed a bug in :class:`cluster.MeanShift` with `bin_seeding=True`. When
+  the estimated bandwidth is 0, the behavior is equivalent to
+  `bin_seeding=False`.
+  :pr:`17742` by :user:`Jeremie du Boisberranger <jeremiedbb>`.
+
+- |Fix| Fixed a bug in :class:`cluster.AffinityPropagation` that
+  gave incorrect clusters when the array dtype is float32.
+  :pr:`17995` by :user:`Thomaz Santana <Wikilicious>` and
+  :user:`Amanda Dsouza <amy12xx>`.
+
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed a bug in
+  :func:`decomposition.MiniBatchDictionaryLearning.partial_fit` which should
+  update the dictionary by iterating only once over a mini-batch.
+  :pr:`17433` by :user:`Chiara Marmo <cmarmo>`.
+
+- |Fix| Avoid overflows on Windows in
+  :func:`decomposition.IncrementalPCA.partial_fit` for large ``batch_size`` and
+  ``n_samples`` values.
+  :pr:`17985` by :user:`Alan Butler <aldee153>` and
+  :user:`Amanda Dsouza <amy12xx>`.
+
+:mod:`sklearn.ensemble`
+.......................
+
+- |Fix| Fixed bug in :class:`ensemble.MultinomialDeviance` where the
+  average of logloss was incorrectly calculated as sum of logloss.
+  :pr:`17694` by :user:`Markus Rempfler <rempfler>` and
+  :user:`Tsutomu Kusanagi <t-kusanagi2>`.
+
+- |Fix| Fixes :class:`ensemble.StackingClassifier` and
+  :class:`ensemble.StackingRegressor` compatibility with estimators that
+  do not define `n_features_in_`. :pr:`17357` by `Thomas Fan`_.
+
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Fix| Fixes bug in :class:`feature_extraction.text.CountVectorizer` where
+  sample order invariance was broken when `max_features` was set and features
+  had the same count. :pr:`18016` by `Thomas Fan`_, `Roman Yurchak`_, and
+  `Joel Nothman`_.
+
+:mod:`sklearn.linear_model`
+...........................
+
+- |Fix| :func:`linear_model.lars_path` does not overwrite `X` when
+  `X_copy=True` and `Gram='auto'`. :pr:`17914` by `Thomas Fan`_.
+
+:mod:`sklearn.manifold`
+.......................
+
+- |Fix| Fixed a bug where :func:`metrics.pairwise_distances` would raise an
+  error if ``metric='seuclidean'`` and ``X`` is not type ``np.float64``.
+  :pr:`15730` by :user:`Forrest Koch <ForrestCKoch>`.
+
+:mod:`sklearn.metrics`
+......................
+
+- |Fix| Fixed a bug in :func:`metrics.mean_squared_error` where the
+  average of multiple RMSE values was incorrectly calculated as the root of the
+  average of multiple MSE values.
+  :pr:`17309` by :user:`Swier Heeres <swierh>`.
+
+:mod:`sklearn.pipeline`
+.......................
+
+- |Fix| :class:`pipeline.FeatureUnion` raises a deprecation warning when
+  `None` is included in `transformer_list`. :pr:`17360` by `Thomas Fan`_.
+
+:mod:`sklearn.utils`
+....................
+
+- |Fix| Fix :func:`utils.estimator_checks.check_estimator` so that all test
+  cases support the `binary_only` estimator tag.
+  :pr:`17812` by :user:`Bruno Charron <brcharron>`.
+
 .. _changes_0_23_1:
 
 Version 0.23.1
@@ -23,6 +139,7 @@ Changelog
   provided by the user were modified in place. :pr:`17204` by
   :user:`Jeremie du Boisberranger <jeremiedbb>`.
 
+
 Miscellaneous
 .............
 
@@ -44,8 +161,6 @@ refer to
 
 .. include:: changelog_legend.inc
 
-Put the changes in their relevant module.
-
 Enforcing keyword-only arguments
 --------------------------------
 
@@ -171,7 +286,7 @@ Changelog
   deprecated. It has no effect. :pr:`11950` by
   :user:`Jeremie du Boisberranger <jeremiedbb>`.
 
-- |API| The ``random_state`` parameter has been added to 
+- |API| The ``random_state`` parameter has been added to
   :class:`cluster.AffinityPropagation`. :pr:`16801` by :user:`rcwoolston`
   and :user:`Chiara Marmo <cmarmo>`.
 
@@ -364,7 +479,7 @@ Changelog
   for each feature. :pr:`16403` by :user:`Narendra Mukherjee <narendramukherjee>`.
 
 - |Enhancement| :class:`impute.SimpleImputer`, :class:`impute.KNNImputer`, and
-  :class:`impute.SimpleImputer` accepts pandas' nullable integer dtype with
+  :class:`impute.IterativeImputer` accept pandas' nullable integer dtype with
   missing values. :pr:`16508` by `Thomas Fan`_.
 
 :mod:`sklearn.inspection`
@@ -467,7 +582,7 @@ Changelog
   an error when `y_true` and `y_pred` were length zero and `labels` was
   not `None`. In addition, we raise an error when an empty list is given to
   the `labels` parameter.
-  :pr:`16442` by `Kyle Parsons <parsons-kyle-89>`.
+  :pr:`16442` by :user:`Kyle Parsons <parsons-kyle-89>`.
 
 - |API| Changed the formatting of values in
   :meth:`metrics.ConfusionMatrixDisplay.plot` and
@@ -491,7 +606,7 @@ Changelog
   :pr:`15622` by :user:`Gregory Morse <GregoryMorse>`.
 
 - |Fix| :func:`model_selection.cross_val_predict` supports
-  `method="predict_proba"` when `y=None`.:pr:`15918` by
+  `method="predict_proba"` when `y=None`. :pr:`15918` by
   :user:`Luca Kubin <lkubin>`.
 
 - |Fix| :func:`model_selection.fit_grid_point` is deprecated in 0.23 and will

diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py
@@ -25,7 +25,6 @@
 import time
 
 import numpy as np
-from distutils.version import LooseVersion
 from scipy.ndimage.filters import gaussian_filter
 import matplotlib.pyplot as plt
 import skimage
@@ -34,9 +33,10 @@
 
 from sklearn.feature_extraction import image
 from sklearn.cluster import spectral_clustering
+from sklearn.utils.fixes import parse_version
 
 # these were introduced in skimage-0.14
-if LooseVersion(skimage.__version__) >= '0.14':
+if parse_version(skimage.__version__) >= parse_version('0.14'):
     rescale_params = {'anti_aliasing': False, 'multichannel': False}
 else:
     rescale_params = {}

diff --git a/examples/cluster/plot_coin_ward_segmentation.py b/examples/cluster/plot_coin_ward_segmentation.py
@@ -17,7 +17,6 @@
 import time as time
 
 import numpy as np
-from distutils.version import LooseVersion
 from scipy.ndimage.filters import gaussian_filter
 
 import matplotlib.pyplot as plt
@@ -28,9 +27,10 @@
 
 from sklearn.feature_extraction.image import grid_to_graph
 from sklearn.cluster import AgglomerativeClustering
+from sklearn.utils.fixes import parse_version
 
 # these were introduced in skimage-0.14
-if LooseVersion(skimage.__version__) >= '0.14':
+if parse_version(skimage.__version__) >= parse_version('0.14'):
     rescale_params = {'anti_aliasing': False, 'multichannel': False}
 else:
     rescale_params = {}

diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py
@@ -21,7 +21,6 @@
 import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
-from distutils.version import LooseVersion
 
 print(__doc__)
 
@@ -34,10 +33,11 @@
 from sklearn.linear_model import RidgeCV
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.metrics import median_absolute_error, r2_score
+from sklearn.utils.fixes import parse_version
 
 
 # `normed` is being deprecated in favor of `density` in histograms
-if LooseVersion(matplotlib.__version__) >= '2.1':
+if parse_version(matplotlib.__version__) >= parse_version('2.1'):
     density_param = {'density': True}
 else:
     density_param = {'normed': True}

diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py
@@ -16,12 +16,11 @@
 """
 print(__doc__)
 
-from distutils.version import LooseVersion
-
 import numpy as np
 import matplotlib.pyplot as plt
 
 from sklearn.decomposition import SparseCoder
+from sklearn.utils.fixes import np_version, parse_version
 
 
 def ricker_function(resolution, center, width):
@@ -68,7 +67,7 @@ def ricker_matrix(width, resolution, n_components):
               ('Lasso', 'lasso_lars', 2, None, 'turquoise'), ]
 lw = 2
 # Avoid FutureWarning about default value change when numpy >= 1.14
-lstsq_rcond = None if LooseVersion(np.__version__) >= '1.14' else -1
+lstsq_rcond = None if np_version >= parse_version('1.14') else -1
 
 plt.figure(figsize=(13, 6))
 for subplot, (D, title) in enumerate(zip((D_fixed, D_multi),

diff --git a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
@@ -19,15 +19,15 @@
 import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
-from distutils.version import LooseVersion
 from sklearn.random_projection import johnson_lindenstrauss_min_dim
 from sklearn.random_projection import SparseRandomProjection
 from sklearn.datasets import fetch_20newsgroups_vectorized
 from sklearn.datasets import load_digits
 from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.utils.fixes import parse_version
 
 # `normed` is being deprecated in favor of `density` in histograms
-if LooseVersion(matplotlib.__version__) >= '2.1':
+if parse_version(matplotlib.__version__) >= parse_version('2.1'):
     density_param = {'density': True}
 else:
     density_param = {'normed': True}