Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH Array API support for LabelEncoder #27381

Merged
merged 31 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4869c0d
ENH Array API support for LabelEncoder
OmarManzoor Sep 15, 2023
7fbd458
Add changelog
OmarManzoor Sep 15, 2023
ec6ccc6
Add tests for array api functions
OmarManzoor Sep 15, 2023
a9d94ea
Merge branch 'main' into label_encoder_array_api
OmarManzoor Sep 23, 2023
43b039d
Updates: PR suggestions
OmarManzoor Sep 23, 2023
6198558
Merge branch 'main' into label_encoder_array_api
OmarManzoor Apr 4, 2024
cfdabeb
Fix dtype_name parameter
OmarManzoor Apr 4, 2024
cfcabd2
Merge branch 'main' into label_encoder_array_api
Apr 12, 2024
23ee510
Updates as suggested in review
Apr 12, 2024
fa0e27c
Merge branch 'main' into label_encoder_array_api
May 3, 2024
6177475
Revert changes in estimator_checks
May 3, 2024
a21a490
Improve the tests and handle device in _in1d
OmarManzoor May 3, 2024
b09b57b
Fix missing device specification and explicit conversion to numpy
ogrisel May 3, 2024
0544c32
Fix _isin to work with Array API inputs
ogrisel May 3, 2024
7cbbc20
Merge branch 'main' into label_encoder_array_api
ogrisel May 3, 2024
a34138b
Fix the errors, make searchsorted a helper function
OmarManzoor May 6, 2024
58c5aa0
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 6, 2024
beb036a
Add array_api_support tag
OmarManzoor May 6, 2024
34c2d92
Updates: according to some pr suggestions
OmarManzoor May 7, 2024
bdb2d7e
Merge branch 'main' into label_encoder_array_api
ogrisel May 7, 2024
db32acf
Use xp.isdtype(values.dtype, "numeric") directly
ogrisel May 7, 2024
a593478
Update changelog
OmarManzoor May 7, 2024
22fa611
Update docstring for inverse transform
OmarManzoor May 7, 2024
f814441
Change array-like to array
OmarManzoor May 7, 2024
b5350ea
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 8, 2024
8ce860d
Update the changelog definition to make it consistent
OmarManzoor May 8, 2024
fae25aa
Revert and update parameter and return type names
OmarManzoor May 10, 2024
e1bca48
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 15, 2024
30f026b
Merge remote-tracking branch 'upstream/main' into label_encoder_array…
OmarManzoor May 16, 2024
dbf233a
Updates: Address further PR suggestions
OmarManzoor May 16, 2024
7500c2f
Minor adjustment
OmarManzoor May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,10 @@ Changelog
our usual rolling deprecation cycle policy. See
:ref:`array_api` for more details. :pr:`26243` by `Tim Head`_ and :pr:`27110` by :user:`Edoardo Abati <EdAbati>`.

- |Enhancement| :class:`preprocessing.LabelEncoder` now supports the
`Array API <https://data-apis.org/array-api/latest/>`_. See :ref:`array_api`
for more details. :pr:`27381` by :user:`Omar Salman <OmarManzoor>`.
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved

:mod:`sklearn.tree`
...................

Expand Down
15 changes: 9 additions & 6 deletions sklearn/preprocessing/_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import column_or_1d
from ..utils._array_api import _setdiff1d, get_namespace
from ..utils._encode import _encode, _unique
from ..utils._param_validation import Interval, validate_params
from ..utils.multiclass import type_of_target, unique_labels
Expand Down Expand Up @@ -129,10 +130,11 @@ def transform(self, y):
Labels as normalized encodings.
"""
check_is_fitted(self)
xp, _ = get_namespace(y)
y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
# transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
return xp.asarray([])

return _encode(y, uniques=self.classes_)

Expand All @@ -150,16 +152,17 @@ def inverse_transform(self, y):
Original encoding.
"""
check_is_fitted(self)
xp, _ = get_namespace(y)
y = column_or_1d(y, warn=True)
# inverse transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
return xp.asarray([])

diff = np.setdiff1d(y, np.arange(len(self.classes_)))
if len(diff):
diff = _setdiff1d(ar1=y, ar2=xp.arange(self.classes_.shape[0]), xp=xp)
if diff.shape[0]:
raise ValueError("y contains previously unseen labels: %s" % str(diff))
y = np.asarray(y)
return self.classes_[y]
y = xp.asarray(y)
return xp.take(self.classes_, y, axis=0)

def _more_tags(self):
    # Tag declaring that this estimator consumes 1-d label arrays (y)
    # rather than a 2-d feature matrix X.
    return {"X_types": ["1dlabels"]}
Expand Down
25 changes: 25 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
label_binarize,
)
from sklearn.utils import _to_object_array
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._testing import assert_array_equal, ignore_warnings
from sklearn.utils.estimator_checks import (
_get_check_estimator_ids,
check_array_api_input_and_values,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
Expand Down Expand Up @@ -697,3 +702,23 @@ def test_label_encoders_do_not_have_set_output(encoder):
y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)


@pytest.mark.parametrize(
    "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "check",
    [check_array_api_input_and_values],
    ids=_get_check_estimator_ids,
)
@pytest.mark.parametrize(
    "estimator",
    [LabelEncoder()],
    ids=_get_check_estimator_ids,
)
def test_label_encoder_array_api_compliance(
    estimator, check, array_namespace, device, dtype
):
    # Run the common Array API estimator check against LabelEncoder for
    # every available namespace/device/dtype combination.
    estimator_name = type(estimator).__name__
    check(estimator_name, estimator, array_namespace, device=device, dtype=dtype)
109 changes: 109 additions & 0 deletions sklearn/utils/_array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def take(self, X, indices, *, axis=0):
def isdtype(self, dtype, kind):
return isdtype(dtype, kind, xp=self._namespace)

def searchsorted(self, a, v, *, side="left", sorter=None):
    """Emulate ``searchsorted`` for namespaces that do not provide it.

    Both inputs are converted to NumPy arrays, searched with
    ``numpy.searchsorted``, and the resulting indices are converted back
    to an array of the wrapped namespace.
    """
    xp = self._namespace
    a_np = _convert_to_numpy(a, xp=xp)
    v_np = _convert_to_numpy(v, xp=xp)
    indices = numpy.searchsorted(a_np, v_np, side=side, sorter=sorter)
    return xp.asarray(indices)


def _check_device_cpu(device): # noqa
if device not in {"cpu", None}:
Expand Down Expand Up @@ -330,6 +336,11 @@ def unique_counts(self, x):
def unique_values(self, x):
    # Array API ``unique_values`` for the NumPy wrapper: sorted unique
    # elements of ``x`` via ``numpy.unique``.
    return numpy.unique(x)

def unique_all(self, x):
    """Mimic the Array API ``unique_all`` with a single ``numpy.unique`` call.

    Returns a 4-tuple ``(values, indices, inverse_indices, counts)``,
    exactly what ``numpy.unique`` produces when all three ``return_*``
    flags are enabled.
    """
    return numpy.unique(
        x,
        return_index=True,
        return_inverse=True,
        return_counts=True,
    )

def concat(self, arrays, *, axis=None):
return numpy.concatenate(arrays, axis=axis)

Expand Down Expand Up @@ -595,3 +606,101 @@ def _estimator_with_converted_arrays(estimator, converter):
def _atol_for_type(dtype):
"""Return the absolute tolerance for a given dtype."""
return numpy.finfo(dtype).eps * 100


def _setdiff1d(ar1, ar2, xp, assume_unique=False):
    """Find the set difference of two arrays.

    Return the unique values in `ar1` that are not in `ar2`. Array API
    compatible counterpart of ``numpy.setdiff1d``; the genuine NumPy
    function is used whenever `xp` is a NumPy namespace.
    """
    if _is_numpy_namespace(xp):
        result = numpy.setdiff1d(
            ar1=ar1,
            ar2=ar2,
            assume_unique=assume_unique,
        )
        return xp.asarray(result)

    if not assume_unique:
        # Deduplicate both sides first so the membership helper below can
        # safely be told the inputs are unique.
        ar1 = xp.unique_values(ar1)
        ar2 = xp.unique_values(ar2)
    else:
        ar1 = xp.reshape(xp.asarray(ar1), (-1,))
    keep = __in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)
    return ar1[keep]


def _isin(element, test_elements, xp, assume_unique=False, invert=False):
    """Calculate ``element in test_elements``, broadcasting over `element` only.

    Returns a boolean array with the same shape as `element` that is True
    where the corresponding entry occurs in `test_elements` and False
    otherwise (negated when `invert=True`). Array API compatible
    counterpart of ``numpy.isin``.
    """
    if _is_numpy_namespace(xp):
        mask = numpy.isin(
            element=element,
            test_elements=test_elements,
            assume_unique=assume_unique,
            invert=invert,
        )
        return xp.asarray(mask)

    # Flatten both inputs, run the 1-d membership helper, then restore the
    # original shape of `element`.
    element_shape = element.shape
    flat_element = xp.reshape(xp.asarray(element), (-1,))
    flat_test = xp.reshape(xp.asarray(test_elements), (-1,))
    mask = __in1d(
        ar1=flat_element,
        ar2=flat_test,
        xp=xp,
        assume_unique=assume_unique,
        invert=invert,
    )
    return xp.reshape(mask, element_shape)


# Note: This is a helper for the functions `_isin` and
# `_setdiff1d`. It is not meant to be called directly.
def __in1d(ar1, ar2, xp, assume_unique=False, invert=False):
    """Checks whether each element of an array is also present in a
    second array.

    Returns a boolean array the same length as `ar1` that is True
    where an element of `ar1` is in `ar2` and False otherwise
    (element-wise negated when `invert=True`).
    """

    # For small `ar2` a direct comparison loop is significantly faster than
    # the sort-based algorithm below (same heuristic as numpy.isin).
    if ar2.shape[0] < 10 * ar1.shape[0] ** 0.145:
        if invert:
            mask = xp.ones(ar1.shape[0], dtype=xp.bool)
            for a in ar2:
                mask &= ar1 != a
        else:
            mask = xp.zeros(ar1.shape[0], dtype=xp.bool)
            for a in ar2:
                mask |= ar1 == a
        return mask

    if not assume_unique:
        ar1, rev_idx = xp.unique_inverse(ar1)
        ar2 = xp.unique_values(ar2)

    ar = xp.concat((ar1, ar2))
    # We need a *stable* sort so that equal values keep their ar1-then-ar2
    # order. Use the namespace function `xp.argsort(..., stable=True)`:
    # the Array API standard does not guarantee an `argsort` method (with a
    # `stable` keyword) on array objects.
    order = xp.argsort(ar, stable=True)
    sar = xp.take(ar, order, axis=0)
    if invert:
        bool_ar = sar[1:] != sar[:-1]
    else:
        bool_ar = sar[1:] == sar[:-1]
    flag = xp.concat((bool_ar, xp.asarray([invert])))
    # Scatter `flag` back to pre-sort positions without relying on
    # integer-array __setitem__ (not guaranteed by the Array API):
    # ret[order] = flag  <=>  ret = flag[argsort(order)].
    ret = xp.take(flag, xp.argsort(order, stable=True), axis=0)

    if assume_unique:
        return ret[: ar1.shape[0]]
    else:
        return xp.take(ret, rev_idx, axis=0)
65 changes: 38 additions & 27 deletions sklearn/utils/_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from . import is_scalar_nan
from ._array_api import _convert_to_numpy, _isin, _setdiff1d, get_namespace


def _unique(values, *, return_inverse=False, return_counts=False):
Expand Down Expand Up @@ -51,31 +52,29 @@ def _unique(values, *, return_inverse=False, return_counts=False):
def _unique_np(values, return_inverse=False, return_counts=False):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
uniques = np.unique(
values, return_inverse=return_inverse, return_counts=return_counts
)
xp, _ = get_namespace(values)

inverse, counts = None, None

if return_counts:
*uniques, counts = uniques

if return_inverse:
*uniques, inverse = uniques

if return_counts or return_inverse:
uniques = uniques[0]
if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
else:
uniques = xp.unique_values(values)

# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
if uniques.size and is_scalar_nan(uniques[-1]):
nan_idx = np.searchsorted(uniques, np.nan)
nan_idx = xp.searchsorted(uniques, xp.nan)
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved
uniques = uniques[: nan_idx + 1]
if return_inverse:
inverse[inverse > nan_idx] = nan_idx

if return_counts:
counts[nan_idx] = np.sum(counts[nan_idx:])
counts[nan_idx] = xp.sum(counts[nan_idx:])
counts = counts[: nan_idx + 1]

ret = (uniques,)
Expand Down Expand Up @@ -161,8 +160,9 @@ def __missing__(self, key):

def _map_to_integer(values, uniques):
"""Map values based on its position in uniques."""
xp, _ = get_namespace(values, uniques)
table = _nandict({val: i for i, val in enumerate(uniques)})
return np.array([table[v] for v in values])
return xp.asarray([table[v] for v in values])
ogrisel marked this conversation as resolved.
Show resolved Hide resolved


def _unique_python(values, *, return_inverse, return_counts):
Expand Down Expand Up @@ -220,7 +220,13 @@ def _encode(values, *, uniques, check_unknown=True):
encoded : ndarray
Encoded values
"""
if values.dtype.kind in "OUS":
xp, is_array_api_compliant = get_namespace(values, uniques)
if is_array_api_compliant:
dtype_kind = _convert_to_numpy(values, xp).dtype.kind
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved
else:
dtype_kind = values.dtype.kind

if dtype_kind in "OUS":
try:
return _map_to_integer(values, uniques)
except KeyError as e:
Expand All @@ -230,7 +236,7 @@ def _encode(values, *, uniques, check_unknown=True):
diff = _check_unknown(values, uniques)
if diff:
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
return np.searchsorted(uniques, values)
return xp.searchsorted(uniques, values)


def _check_unknown(values, known_values, return_mask=False):
Expand Down Expand Up @@ -258,9 +264,14 @@ def _check_unknown(values, known_values, return_mask=False):
Additionally returned if ``return_mask=True``.

"""
xp, is_array_api_compliant = get_namespace(values, known_values)
valid_mask = None
if is_array_api_compliant:
dtype_kind = _convert_to_numpy(values, xp).dtype.kind
else:
dtype_kind = values.dtype.kind

if values.dtype.kind in "OUS":
if dtype_kind in "OUS":
values_set = set(values)
values_set, missing_in_values = _extract_missing(values_set)

Expand All @@ -282,31 +293,31 @@ def is_valid(value):

if return_mask:
if diff or nan_in_diff or none_in_diff:
valid_mask = np.array([is_valid(value) for value in values])
valid_mask = xp.array([is_valid(value) for value in values])
else:
valid_mask = np.ones(len(values), dtype=bool)
valid_mask = xp.ones(len(values), dtype=xp.bool)

diff = list(diff)
if none_in_diff:
diff.append(None)
if nan_in_diff:
diff.append(np.nan)
else:
unique_values = np.unique(values)
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
unique_values = xp.unique_values(values)
diff = _setdiff1d(unique_values, known_values, xp, assume_unique=True)
if return_mask:
if diff.size:
valid_mask = np.isin(values, known_values)
valid_mask = _isin(values, known_values, xp)
else:
valid_mask = np.ones(len(values), dtype=bool)
valid_mask = xp.ones(len(values), dtype=xp.bool)

# check for nans in the known_values
if np.isnan(known_values).any():
diff_is_nan = np.isnan(diff)
if diff_is_nan.any():
if xp.any(xp.isnan(known_values)):
diff_is_nan = xp.isnan(diff)
if xp.any(diff_is_nan):
# removes nan from valid_mask
if diff.size and return_mask:
is_nan = np.isnan(values)
is_nan = xp.isnan(values)
valid_mask[is_nan] = 1

# remove nan from diff
Expand Down
Loading