From 903d250b1348de025f00df6d48ca38b45e50527e Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Fri, 18 Aug 2023 16:59:57 +0200
Subject: [PATCH 1/8] added first draft of MaxAbsScaler.partial_fit with Array
API
---
sklearn/preprocessing/_data.py | 18 ++++++++++------
sklearn/preprocessing/tests/test_data.py | 27 ++++++++++++++++++++++++
2 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index b51f2af2c3d17..ebb43f7cca641 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -22,7 +22,8 @@
TransformerMixin,
_fit_context,
)
-from ..utils import check_array
+from ..utils import _array_api, check_array
+from ..utils._array_api import get_namespace
from ..utils._param_validation import Interval, Options, StrOptions, validate_params
from ..utils.extmath import _incremental_mean_and_var, row_norms
from ..utils.sparsefuncs import (
@@ -103,16 +104,17 @@ def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
if scale == 0.0:
scale = 1.0
return scale
- elif isinstance(scale, np.ndarray):
+ else:
+ xp, _ = get_namespace(scale)
if constant_mask is None:
# Detect near constant values to avoid dividing by a very small
# value that could lead to surprising results and numerical
# stability issues.
- constant_mask = scale < 10 * np.finfo(scale.dtype).eps
+ constant_mask = scale < 10 * xp.finfo(scale.dtype).eps
if copy:
# New array to avoid side-effects
- scale = scale.copy()
+ scale = xp.asarray(scale, copy=True)
scale[constant_mask] = 1.0
return scale
@@ -1203,12 +1205,14 @@ def partial_fit(self, X, y=None):
self : object
Fitted scaler.
"""
+ xp, _ = get_namespace(X)
+
first_pass = not hasattr(self, "n_samples_seen_")
X = self._validate_data(
X,
reset=first_pass,
accept_sparse=("csr", "csc"),
- dtype=FLOAT_DTYPES,
+ dtype=_array_api.supported_float_dtypes(xp),
force_all_finite="allow-nan",
)
@@ -1216,12 +1220,12 @@ def partial_fit(self, X, y=None):
mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
max_abs = np.maximum(np.abs(mins), np.abs(maxs))
else:
- max_abs = np.nanmax(np.abs(X), axis=0)
+ max_abs = _array_api._nanmax(xp.abs(X), axis=0)
if first_pass:
self.n_samples_seen_ = X.shape[0]
else:
- max_abs = np.maximum(self.max_abs_, max_abs)
+ max_abs = xp.maximum(self.max_abs_, max_abs)
self.n_samples_seen_ += X.shape[0]
self.max_abs_ = max_abs
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 189d8875bc8f2..ecee9ff3788c7 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -41,6 +41,9 @@
from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
from sklearn.svm import SVR
from sklearn.utils import gen_batches, shuffle
+from sklearn.utils._array_api import (
+ yield_namespace_device_dtype_combinations,
+)
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
@@ -51,6 +54,10 @@
assert_array_less,
skip_if_32bit,
)
+from sklearn.utils.estimator_checks import (
+ _get_check_estimator_ids,
+ check_array_api_input_and_values,
+)
from sklearn.utils.sparsefuncs import mean_variance_axis
iris = datasets.load_iris()
@@ -1640,6 +1647,26 @@ def test_robust_scaler_unit_variance():
assert X_trans.std() == pytest.approx(1, abs=1e-2)
+@pytest.mark.parametrize(
+ "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
+)
+@pytest.mark.parametrize(
+ "check",
+ [check_array_api_input_and_values],
+ ids=_get_check_estimator_ids,
+)
+@pytest.mark.parametrize(
+ "estimator",
+ [MinMaxScaler()],
+ ids=_get_check_estimator_ids,
+)
+def test_maxabscaler_array_api_compliance(
+ estimator, check, array_namespace, device, dtype
+):
+ name = estimator.__class__.__name__
+ check(name, estimator, array_namespace, device=device, dtype=dtype)
+
+
def test_maxabs_scaler_zero_variance_features():
# Check MaxAbsScaler on toy data with zero variance features
X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]]
From a8f45588e67753d1a80bb1eb11fc62f58860322b Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Sat, 19 Aug 2023 11:14:03 +0200
Subject: [PATCH 2/8] fixed test estimator
---
sklearn/preprocessing/tests/test_data.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 90b03c5b860e2..e76ff7156a9ed 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1677,7 +1677,7 @@ def test_robust_scaler_unit_variance():
)
@pytest.mark.parametrize(
"estimator",
- [MinMaxScaler()],
+ [MaxAbsScaler()],
ids=_get_check_estimator_ids,
)
def test_maxabscaler_array_api_compliance(
From 406f332c22f95caade0757e86d9b342af5c11ca3 Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Sat, 19 Aug 2023 11:16:14 +0200
Subject: [PATCH 3/8] fixed nanmin and nanmax so that +/-inf arrays are sent to
the correct device
---
sklearn/utils/_array_api.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py
index 2bc49117792c9..0fb5a0e264aa8 100644
--- a/sklearn/utils/_array_api.py
+++ b/sklearn/utils/_array_api.py
@@ -495,7 +495,7 @@ def _nanmin(X, axis=None):
else:
mask = xp.isnan(X)
- X = xp.min(xp.where(mask, xp.asarray(+xp.inf), X), axis=axis)
+ X = xp.min(xp.where(mask, xp.asarray(+xp.inf, device=device(X)), X), axis=axis)
# Replace Infs from all NaN slices with NaN again
mask = xp.all(mask, axis=axis)
if xp.any(mask):
@@ -512,7 +512,7 @@ def _nanmax(X, axis=None):
else:
mask = xp.isnan(X)
- X = xp.max(xp.where(mask, xp.asarray(-xp.inf), X), axis=axis)
+ X = xp.max(xp.where(mask, xp.asarray(-xp.inf, device=device(X)), X), axis=axis)
# Replace Infs from all NaN slices with NaN again
mask = xp.all(mask, axis=axis)
if xp.any(mask):
From 20bf65046e8747ec10ad937bde38133f16d61c1d Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Sat, 19 Aug 2023 11:16:43 +0200
Subject: [PATCH 4/8] implemented transform and inverse_transform with the
array api
---
sklearn/preprocessing/_data.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index d4ef1df569972..ba8627935f28d 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -1256,12 +1256,15 @@ def transform(self, X):
Transformed array.
"""
check_is_fitted(self)
+
+ xp, _ = get_namespace(X)
+
X = self._validate_data(
X,
accept_sparse=("csr", "csc"),
copy=self.copy,
reset=False,
- dtype=FLOAT_DTYPES,
+ dtype=_array_api.supported_float_dtypes(xp),
force_all_finite="allow-nan",
)
@@ -1285,11 +1288,14 @@ def inverse_transform(self, X):
Transformed array.
"""
check_is_fitted(self)
+
+ xp, _ = get_namespace(X)
+
X = check_array(
X,
accept_sparse=("csr", "csc"),
copy=self.copy,
- dtype=FLOAT_DTYPES,
+ dtype=_array_api.supported_float_dtypes(xp),
force_all_finite="allow-nan",
)
From 9daaa49f938b682b515fb30b7855a34acfac4124 Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Sat, 19 Aug 2023 11:43:53 +0200
Subject: [PATCH 5/8] updated docs
---
doc/modules/array_api.rst | 1 +
doc/whats_new/v1.4.rst | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
index 7effd3ee6eb43..b1f1f332fb057 100644
--- a/doc/modules/array_api.rst
+++ b/doc/modules/array_api.rst
@@ -97,6 +97,7 @@ Estimators
`svd_solver="randomized"` and `power_iteration_normalizer="QR"`)
- :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`)
- :class:`preprocessing.MinMaxScaler`
+- :class:`preprocessing.MaxAbsScaler`
Tools
-----
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index bfa490ff5e821..02eb1bfbb1db4 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -200,11 +200,11 @@ Changelog
when `sparse_output=True` and the output is configured to be pandas.
:pr:`26931` by `Thomas Fan`_.
-- |MajorFeature| :class:`preprocessing.MinMaxScaler` now
+- |MajorFeature| :class:`preprocessing.MinMaxScaler` and :class:`preprocessing.MaxAbsScaler` now
supports the `Array API <https://data-apis.org/array-api/latest/>`_. Array API
support is considered experimental and might evolve without being subject to
our usual rolling deprecation cycle policy. See
- :ref:`array_api` for more details. :pr:`26243` by `Tim Head`_.
+ :ref:`array_api` for more details. :pr:`26243` by `Tim Head`_ and :pr:`27110` by :user:`Edoardo Abati <EdAbati>`.
:mod:`sklearn.tree`
...................
From 6ea0834e128e34cf60c07f84a95444c1d37a181a Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Mon, 21 Aug 2023 12:32:57 +0200
Subject: [PATCH 6/8] merged the array_api test functions
---
sklearn/preprocessing/tests/test_data.py | 26 ++----------------------
1 file changed, 2 insertions(+), 24 deletions(-)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index e76ff7156a9ed..c395d4d21b378 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -701,12 +701,10 @@ def test_standard_check_array_of_inverse_transform():
)
@pytest.mark.parametrize(
"estimator",
- [MinMaxScaler()],
+ [MinMaxScaler(), MaxAbsScaler()],
ids=_get_check_estimator_ids,
)
-def test_minmaxscaler_array_api_compliance(
- estimator, check, array_namespace, device, dtype
-):
+def test_scaler_array_api_compliance(estimator, check, array_namespace, device, dtype):
name = estimator.__class__.__name__
check(name, estimator, array_namespace, device=device, dtype=dtype)
@@ -1667,26 +1665,6 @@ def test_robust_scaler_unit_variance():
assert X_trans.std() == pytest.approx(1, abs=1e-2)
-@pytest.mark.parametrize(
- "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
-)
-@pytest.mark.parametrize(
- "check",
- [check_array_api_input_and_values],
- ids=_get_check_estimator_ids,
-)
-@pytest.mark.parametrize(
- "estimator",
- [MaxAbsScaler()],
- ids=_get_check_estimator_ids,
-)
-def test_maxabscaler_array_api_compliance(
- estimator, check, array_namespace, device, dtype
-):
- name = estimator.__class__.__name__
- check(name, estimator, array_namespace, device=device, dtype=dtype)
-
-
def test_maxabs_scaler_zero_variance_features():
# Check MaxAbsScaler on toy data with zero variance features
X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]]
From 81fa4f24aee89053ea5d2a6579f42a6f6e28c9bd Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Mon, 21 Aug 2023 12:34:03 +0200
Subject: [PATCH 7/8] removed extra space
---
doc/whats_new/v1.4.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 0fc598c65f62d..351e155672098 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -108,7 +108,7 @@ Changelog
`full` and `randomized` solvers (with QR power iterations). See
:ref:`array_api` for more details.
:pr:`26315` and :pr:`27098` by :user:`Mateusz Sokół <mtsokol>`,
- :user:`Olivier Grisel <ogrisel>` and :user:` Edoardo Abati <EdAbati>`.
+ :user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`.
- |Enhancement| :func:`decomposition.non_negative_factorization`, :class:`decomposition.NMF`,
and :class:`decomposition.MiniBatchNMF` now support :class:`scipy.sparse.sparray`
From 3fb8d5a5fc60c2abdfef07963dc6c76ad6ceaf60 Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Tue, 22 Aug 2023 18:55:57 +0200
Subject: [PATCH 8/8] scalers in alphabetical order
---
doc/modules/array_api.rst | 2 +-
sklearn/preprocessing/tests/test_data.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
index b1f1f332fb057..161307dfddcf4 100644
--- a/doc/modules/array_api.rst
+++ b/doc/modules/array_api.rst
@@ -96,8 +96,8 @@ Estimators
- :class:`decomposition.PCA` (with `svd_solver="full"`,
`svd_solver="randomized"` and `power_iteration_normalizer="QR"`)
- :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`)
-- :class:`preprocessing.MinMaxScaler`
- :class:`preprocessing.MaxAbsScaler`
+- :class:`preprocessing.MinMaxScaler`
Tools
-----
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index c395d4d21b378..d61996b76a25c 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -701,7 +701,7 @@ def test_standard_check_array_of_inverse_transform():
)
@pytest.mark.parametrize(
"estimator",
- [MinMaxScaler(), MaxAbsScaler()],
+ [MaxAbsScaler(), MinMaxScaler()],
ids=_get_check_estimator_ids,
)
def test_scaler_array_api_compliance(estimator, check, array_namespace, device, dtype):