From 903d250b1348de025f00df6d48ca38b45e50527e Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 18 Aug 2023 16:59:57 +0200 Subject: [PATCH 1/8] added first draft of MaxAbsScaler.partial_fit with Array API --- sklearn/preprocessing/_data.py | 18 ++++++++++------ sklearn/preprocessing/tests/test_data.py | 27 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index b51f2af2c3d17..ebb43f7cca641 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -22,7 +22,8 @@ TransformerMixin, _fit_context, ) -from ..utils import check_array +from ..utils import _array_api, check_array +from ..utils._array_api import get_namespace from ..utils._param_validation import Interval, Options, StrOptions, validate_params from ..utils.extmath import _incremental_mean_and_var, row_norms from ..utils.sparsefuncs import ( @@ -103,16 +104,17 @@ def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): if scale == 0.0: scale = 1.0 return scale - elif isinstance(scale, np.ndarray): + else: + xp, _ = get_namespace(scale) if constant_mask is None: # Detect near constant values to avoid dividing by a very small # value that could lead to surprising results and numerical # stability issues. - constant_mask = scale < 10 * np.finfo(scale.dtype).eps + constant_mask = scale < 10 * xp.finfo(scale.dtype).eps if copy: # New array to avoid side-effects - scale = scale.copy() + scale = xp.asarray(scale, copy=True) scale[constant_mask] = 1.0 return scale @@ -1203,12 +1205,14 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. 
""" + xp, _ = get_namespace(X) + first_pass = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, reset=first_pass, accept_sparse=("csr", "csc"), - dtype=FLOAT_DTYPES, + dtype=_array_api.supported_float_dtypes(xp), force_all_finite="allow-nan", ) @@ -1216,12 +1220,12 @@ def partial_fit(self, X, y=None): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) max_abs = np.maximum(np.abs(mins), np.abs(maxs)) else: - max_abs = np.nanmax(np.abs(X), axis=0) + max_abs = _array_api._nanmax(xp.abs(X), axis=0) if first_pass: self.n_samples_seen_ = X.shape[0] else: - max_abs = np.maximum(self.max_abs_, max_abs) + max_abs = xp.maximum(self.max_abs_, max_abs) self.n_samples_seen_ += X.shape[0] self.max_abs_ = max_abs diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 189d8875bc8f2..ecee9ff3788c7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -41,6 +41,9 @@ from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale from sklearn.svm import SVR from sklearn.utils import gen_batches, shuffle +from sklearn.utils._array_api import ( + yield_namespace_device_dtype_combinations, +) from sklearn.utils._testing import ( _convert_container, assert_allclose, @@ -51,6 +54,10 @@ assert_array_less, skip_if_32bit, ) +from sklearn.utils.estimator_checks import ( + _get_check_estimator_ids, + check_array_api_input_and_values, +) from sklearn.utils.sparsefuncs import mean_variance_axis iris = datasets.load_iris() @@ -1640,6 +1647,26 @@ def test_robust_scaler_unit_variance(): assert X_trans.std() == pytest.approx(1, abs=1e-2) +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [MinMaxScaler()], + ids=_get_check_estimator_ids, +) +def 
test_maxabscaler_array_api_compliance( + estimator, check, array_namespace, device, dtype +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype=dtype) + + def test_maxabs_scaler_zero_variance_features(): # Check MaxAbsScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] From a8f45588e67753d1a80bb1eb11fc62f58860322b Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Aug 2023 11:14:03 +0200 Subject: [PATCH 2/8] fixed test estimator --- sklearn/preprocessing/tests/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 90b03c5b860e2..e76ff7156a9ed 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1677,7 +1677,7 @@ def test_robust_scaler_unit_variance(): ) @pytest.mark.parametrize( "estimator", - [MinMaxScaler()], + [MaxAbsScaler()], ids=_get_check_estimator_ids, ) def test_maxabscaler_array_api_compliance( From 406f332c22f95caade0757e86d9b342af5c11ca3 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Aug 2023 11:16:14 +0200 Subject: [PATCH 3/8] fixed nanmin and nanmax so that +/-inf arrays are sent to the correct device --- sklearn/utils/_array_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 2bc49117792c9..0fb5a0e264aa8 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -495,7 +495,7 @@ def _nanmin(X, axis=None): else: mask = xp.isnan(X) - X = xp.min(xp.where(mask, xp.asarray(+xp.inf), X), axis=axis) + X = xp.min(xp.where(mask, xp.asarray(+xp.inf, device=device(X)), X), axis=axis) # Replace Infs from all NaN slices with NaN again mask = xp.all(mask, axis=axis) if 
xp.any(mask): @@ -512,7 +512,7 @@ def _nanmax(X, axis=None): else: mask = xp.isnan(X) - X = xp.max(xp.where(mask, xp.asarray(-xp.inf), X), axis=axis) + X = xp.max(xp.where(mask, xp.asarray(-xp.inf, device=device(X)), X), axis=axis) # Replace Infs from all NaN slices with NaN again mask = xp.all(mask, axis=axis) if xp.any(mask): From 20bf65046e8747ec10ad937bde38133f16d61c1d Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Aug 2023 11:16:43 +0200 Subject: [PATCH 4/8] implemented transform and inverse_transform with the array api --- sklearn/preprocessing/_data.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index d4ef1df569972..ba8627935f28d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1256,12 +1256,15 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) + + xp, _ = get_namespace(X) + X = self._validate_data( X, accept_sparse=("csr", "csc"), copy=self.copy, reset=False, - dtype=FLOAT_DTYPES, + dtype=_array_api.supported_float_dtypes(xp), force_all_finite="allow-nan", ) @@ -1285,11 +1288,14 @@ def inverse_transform(self, X): Transformed array. 
""" check_is_fitted(self) + + xp, _ = get_namespace(X) + X = check_array( X, accept_sparse=("csr", "csc"), copy=self.copy, - dtype=FLOAT_DTYPES, + dtype=_array_api.supported_float_dtypes(xp), force_all_finite="allow-nan", ) From 9daaa49f938b682b515fb30b7855a34acfac4124 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Aug 2023 11:43:53 +0200 Subject: [PATCH 5/8] updated docs --- doc/modules/array_api.rst | 1 + doc/whats_new/v1.4.rst | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 7effd3ee6eb43..b1f1f332fb057 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -97,6 +97,7 @@ Estimators `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) - :class:`preprocessing.MinMaxScaler` +- :class:`preprocessing.MaxAbsScaler` Tools ----- diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index bfa490ff5e821..02eb1bfbb1db4 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -200,11 +200,11 @@ Changelog when `sparse_output=True` and the output is configured to be pandas. :pr:`26931` by `Thomas Fan`_. -- |MajorFeature| :class:`preprocessing.MinMaxScaler` now +- |MajorFeature| :class:`preprocessing.MinMaxScaler` and :class:`preprocessing.MaxAbsScaler` now supports the `Array API <https://data-apis.org/array-api/latest/>`_. Array API support is considered experimental and might evolve without being subject to our usual rolling deprecation cycle policy. See - :ref:`array_api` for more details. :pr:`26243` by `Tim Head`_. + :ref:`array_api` for more details. :pr:`26243` by `Tim Head`_ and :pr:`27110` by :user:`Edoardo Abati <EdAbati>`. :mod:`sklearn.tree` ................... 
From 6ea0834e128e34cf60c07f84a95444c1d37a181a Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:32:57 +0200 Subject: [PATCH 6/8] merged the array_api test functions --- sklearn/preprocessing/tests/test_data.py | 26 ++---------------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e76ff7156a9ed..c395d4d21b378 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -701,12 +701,10 @@ def test_standard_check_array_of_inverse_transform(): ) @pytest.mark.parametrize( "estimator", - [MinMaxScaler()], + [MinMaxScaler(), MaxAbsScaler()], ids=_get_check_estimator_ids, ) -def test_minmaxscaler_array_api_compliance( - estimator, check, array_namespace, device, dtype -): +def test_scaler_array_api_compliance(estimator, check, array_namespace, device, dtype): name = estimator.__class__.__name__ check(name, estimator, array_namespace, device=device, dtype=dtype) @@ -1667,26 +1665,6 @@ def test_robust_scaler_unit_variance(): assert X_trans.std() == pytest.approx(1, abs=1e-2) -@pytest.mark.parametrize( - "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() -) -@pytest.mark.parametrize( - "check", - [check_array_api_input_and_values], - ids=_get_check_estimator_ids, -) -@pytest.mark.parametrize( - "estimator", - [MaxAbsScaler()], - ids=_get_check_estimator_ids, -) -def test_maxabscaler_array_api_compliance( - estimator, check, array_namespace, device, dtype -): - name = estimator.__class__.__name__ - check(name, estimator, array_namespace, device=device, dtype=dtype) - - def test_maxabs_scaler_zero_variance_features(): # Check MaxAbsScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] From 81fa4f24aee89053ea5d2a6579f42a6f6e28c9bd Mon Sep 17 00:00:00 2001 From: Edoardo 
Abati <29585319+EdAbati@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:34:03 +0200 Subject: [PATCH 7/8] removed extra space --- doc/whats_new/v1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 0fc598c65f62d..351e155672098 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -108,7 +108,7 @@ Changelog `full` and `randomized` solvers (with QR power iterations). See :ref:`array_api` for more details. :pr:`26315` and :pr:`27098` by :user:`Mateusz Sokół <mtsokol>`, - :user:`Olivier Grisel <ogrisel>` and :user:` Edoardo Abati <EdAbati>`. + :user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`. - |Enhancement| :func:`decomposition.non_negative_factorization`, :class:`decomposition.NMF`, and :class:`decomposition.MiniBatchNMF` now support :class:`scipy.sparse.sparray` From 3fb8d5a5fc60c2abdfef07963dc6c76ad6ceaf60 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:55:57 +0200 Subject: [PATCH 8/8] scalers in alphabetical order --- doc/modules/array_api.rst | 2 +- sklearn/preprocessing/tests/test_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index b1f1f332fb057..161307dfddcf4 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -96,8 +96,8 @@ Estimators - :class:`decomposition.PCA` (with `svd_solver="full"`, `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) -- :class:`preprocessing.MinMaxScaler` - :class:`preprocessing.MaxAbsScaler` +- :class:`preprocessing.MinMaxScaler` Tools ----- diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index c395d4d21b378..d61996b76a25c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -701,7 +701,7 @@ def 
test_standard_check_array_of_inverse_transform(): ) @pytest.mark.parametrize( "estimator", - [MinMaxScaler(), MaxAbsScaler()], + [MaxAbsScaler(), MinMaxScaler()], ids=_get_check_estimator_ids, ) def test_scaler_array_api_compliance(estimator, check, array_namespace, device, dtype):