Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH Array API support for LabelEncoder #27381

Merged
merged 31 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4869c0d
ENH Array API support for LabelEncoder
OmarManzoor Sep 15, 2023
7fbd458
Add changelog
OmarManzoor Sep 15, 2023
ec6ccc6
Add tests for array api functions
OmarManzoor Sep 15, 2023
a9d94ea
Merge branch 'main' into label_encoder_array_api
OmarManzoor Sep 23, 2023
43b039d
Updates: PR suggestions
OmarManzoor Sep 23, 2023
6198558
Merge branch 'main' into label_encoder_array_api
OmarManzoor Apr 4, 2024
cfdabeb
Fix dtype_name parameter
OmarManzoor Apr 4, 2024
cfcabd2
Merge branch 'main' into label_encoder_array_api
Apr 12, 2024
23ee510
Updates as suggested in review
Apr 12, 2024
fa0e27c
Merge branch 'main' into label_encoder_array_api
May 3, 2024
6177475
Revert changes in estimator_checks
May 3, 2024
a21a490
Improve the tests and handle device in _in1d
OmarManzoor May 3, 2024
b09b57b
Fix missing device specification and explicit conversion to numpy
ogrisel May 3, 2024
0544c32
Fix _isin to work with Array API inputs
ogrisel May 3, 2024
7cbbc20
Merge branch 'main' into label_encoder_array_api
ogrisel May 3, 2024
a34138b
Fix the errors, make searchsorted a helper function
OmarManzoor May 6, 2024
58c5aa0
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 6, 2024
beb036a
Add array_api_support tag
OmarManzoor May 6, 2024
34c2d92
Updates: according to some pr suggestions
OmarManzoor May 7, 2024
bdb2d7e
Merge branch 'main' into label_encoder_array_api
ogrisel May 7, 2024
db32acf
Use xp.isdtype(values.dtype, "numeric") directly
ogrisel May 7, 2024
a593478
Update changelog
OmarManzoor May 7, 2024
22fa611
Update docstring for inverse transform
OmarManzoor May 7, 2024
f814441
Change array-like to array
OmarManzoor May 7, 2024
b5350ea
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 8, 2024
8ce860d
Update the changelog definition to make it consistent
OmarManzoor May 8, 2024
fae25aa
Revert and update parameter and return type names
OmarManzoor May 10, 2024
e1bca48
Merge branch 'main' into label_encoder_array_api
OmarManzoor May 15, 2024
30f026b
Merge remote-tracking branch 'upstream/main' into label_encoder_array…
OmarManzoor May 16, 2024
dbf233a
Updates: Address further PR suggestions
OmarManzoor May 16, 2024
7500c2f
Minor adjustment
OmarManzoor May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,10 @@ Changelog
our usual rolling deprecation cycle policy. See
:ref:`array_api` for more details. :pr:`26243` by `Tim Head`_ and :pr:`27110` by :user:`Edoardo Abati <EdAbati>`.

- |Enhancement| :class:`preprocessing.LabelEncoder` now supports the
`Array API <https://data-apis.org/array-api/latest/>`_. See :ref:`array_api`
for more details. :pr:`27381` by :user:`Omar Salman <OmarManzoor>`.
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved

:mod:`sklearn.tree`
...................

Expand Down
15 changes: 9 additions & 6 deletions sklearn/preprocessing/_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import column_or_1d
from ..utils._array_api import _setdiff1d, get_namespace
from ..utils._encode import _encode, _unique
from ..utils._param_validation import Interval, validate_params
from ..utils.multiclass import type_of_target, unique_labels
Expand Down Expand Up @@ -129,10 +130,11 @@ def transform(self, y):
Labels as normalized encodings.
"""
check_is_fitted(self)
xp, _ = get_namespace(y)
y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
# transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
return xp.asarray([])

return _encode(y, uniques=self.classes_)

Expand All @@ -150,16 +152,17 @@ def inverse_transform(self, y):
Original encoding.
"""
check_is_fitted(self)
xp, _ = get_namespace(y)
y = column_or_1d(y, warn=True)
# inverse transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
return xp.asarray([])

diff = np.setdiff1d(y, np.arange(len(self.classes_)))
if len(diff):
diff = _setdiff1d(ar1=y, ar2=xp.arange(self.classes_.shape[0]), xp=xp)
if diff.shape[0]:
raise ValueError("y contains previously unseen labels: %s" % str(diff))
y = np.asarray(y)
return self.classes_[y]
y = xp.asarray(y)
return xp.take(self.classes_, y, axis=0)

def _more_tags(self):
    # Tag declaring that this estimator consumes 1-d label arrays (y)
    # rather than a 2-d feature matrix X.
    return {"X_types": ["1dlabels"]}
Expand Down
25 changes: 25 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
label_binarize,
)
from sklearn.utils import _to_object_array
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._testing import assert_array_equal, ignore_warnings
from sklearn.utils.estimator_checks import (
_get_check_estimator_ids,
check_array_api_input_and_values,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
Expand Down Expand Up @@ -697,3 +702,23 @@ def test_label_encoders_do_not_have_set_output(encoder):
y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)


@pytest.mark.parametrize(
    "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "check",
    [check_array_api_input_and_values],
    ids=_get_check_estimator_ids,
)
@pytest.mark.parametrize(
    "estimator",
    [LabelEncoder()],
    ids=_get_check_estimator_ids,
)
def test_label_encoder_array_api_compliance(
    estimator, check, array_namespace, device, dtype
):
    # Run the common Array API estimator check against LabelEncoder for
    # every available namespace/device/dtype combination.
    estimator_name = type(estimator).__name__
    check(estimator_name, estimator, array_namespace, device=device, dtype=dtype)
109 changes: 109 additions & 0 deletions sklearn/utils/_array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def take(self, X, indices, *, axis=0):
def isdtype(self, dtype, kind):
return isdtype(dtype, kind, xp=self._namespace)

def searchsorted(self, a, v, *, side="left", sorter=None):
    """Emulate ``searchsorted`` for namespaces that do not provide it.

    Both inputs are converted to NumPy arrays, searched with
    ``numpy.searchsorted``, and the resulting indices are converted back
    to an array of the wrapped namespace.
    """
    xp = self._namespace
    a_np = _convert_to_numpy(a, xp=xp)
    v_np = _convert_to_numpy(v, xp=xp)
    indices = numpy.searchsorted(a_np, v_np, side=side, sorter=sorter)
    return xp.asarray(indices)


def _check_device_cpu(device): # noqa
if device not in {"cpu", None}:
Expand Down Expand Up @@ -330,6 +336,11 @@ def unique_counts(self, x):
def unique_values(self, x):
    # Array API ``unique_values`` for the NumPy wrapper: sorted unique
    # elements of ``x`` via ``numpy.unique``.
    return numpy.unique(x)

def unique_all(self, x):
    """Mimic the Array API ``unique_all`` with a single ``numpy.unique`` call.

    Returns a 4-tuple ``(values, indices, inverse_indices, counts)``,
    exactly what ``numpy.unique`` produces when all three ``return_*``
    flags are enabled.
    """
    return numpy.unique(
        x,
        return_index=True,
        return_inverse=True,
        return_counts=True,
    )

def concat(self, arrays, *, axis=None):
return numpy.concatenate(arrays, axis=axis)

Expand Down Expand Up @@ -595,3 +606,101 @@ def _estimator_with_converted_arrays(estimator, converter):
def _atol_for_type(dtype):
"""Return the absolute tolerance for a given dtype."""
return numpy.finfo(dtype).eps * 100


def _setdiff1d(ar1, ar2, xp, assume_unique=False):
    """Find the set difference of two arrays.

    Return the unique values in `ar1` that are not in `ar2`. Array API
    compatible counterpart of ``numpy.setdiff1d``; the genuine NumPy
    function is used whenever `xp` is a NumPy namespace.
    """
    if _is_numpy_namespace(xp):
        result = numpy.setdiff1d(
            ar1=ar1,
            ar2=ar2,
            assume_unique=assume_unique,
        )
        return xp.asarray(result)

    if not assume_unique:
        # Deduplicate both sides first so the membership helper below can
        # safely be told the inputs are unique.
        ar1 = xp.unique_values(ar1)
        ar2 = xp.unique_values(ar2)
    else:
        ar1 = xp.reshape(xp.asarray(ar1), (-1,))
    keep = __in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)
    return ar1[keep]


def _isin(element, test_elements, xp, assume_unique=False, invert=False):
    """Calculate ``element in test_elements``, broadcasting over `element` only.

    Returns a boolean array with the same shape as `element` that is True
    where the corresponding entry occurs in `test_elements` and False
    otherwise (negated when `invert=True`). Array API compatible
    counterpart of ``numpy.isin``.
    """
    if _is_numpy_namespace(xp):
        mask = numpy.isin(
            element=element,
            test_elements=test_elements,
            assume_unique=assume_unique,
            invert=invert,
        )
        return xp.asarray(mask)

    # Flatten both inputs, run the 1-d membership helper, then restore the
    # original shape of `element`.
    element_shape = element.shape
    flat_element = xp.reshape(xp.asarray(element), (-1,))
    flat_test = xp.reshape(xp.asarray(test_elements), (-1,))
    mask = __in1d(
        ar1=flat_element,
        ar2=flat_test,
        xp=xp,
        assume_unique=assume_unique,
        invert=invert,
    )
    return xp.reshape(mask, element_shape)


# Note: This is a helper for the functions `_isin` and
# `_setdiff1d`. It is not meant to be called directly.
def __in1d(ar1, ar2, xp, assume_unique=False, invert=False):
    """Checks whether each element of an array is also present in a
    second array.

    Returns a boolean array the same length as `ar1` that is True
    where an element of `ar1` is in `ar2` and False otherwise
    (element-wise negated when `invert=True`).
    """

    # For small `ar2` a direct comparison loop is significantly faster than
    # the sort-based algorithm below (same heuristic as numpy.isin).
    if ar2.shape[0] < 10 * ar1.shape[0] ** 0.145:
        if invert:
            mask = xp.ones(ar1.shape[0], dtype=xp.bool)
            for a in ar2:
                mask &= ar1 != a
        else:
            mask = xp.zeros(ar1.shape[0], dtype=xp.bool)
            for a in ar2:
                mask |= ar1 == a
        return mask

    if not assume_unique:
        ar1, rev_idx = xp.unique_inverse(ar1)
        ar2 = xp.unique_values(ar2)

    ar = xp.concat((ar1, ar2))
    # We need a *stable* sort so that equal values keep their ar1-then-ar2
    # order. Use the namespace function `xp.argsort(..., stable=True)`:
    # the Array API standard does not guarantee an `argsort` method (with a
    # `stable` keyword) on array objects.
    order = xp.argsort(ar, stable=True)
    sar = xp.take(ar, order, axis=0)
    if invert:
        bool_ar = sar[1:] != sar[:-1]
    else:
        bool_ar = sar[1:] == sar[:-1]
    flag = xp.concat((bool_ar, xp.asarray([invert])))
    # Scatter `flag` back to pre-sort positions without relying on
    # integer-array __setitem__ (not guaranteed by the Array API):
    # ret[order] = flag  <=>  ret = flag[argsort(order)].
    ret = xp.take(flag, xp.argsort(order, stable=True), axis=0)

    if assume_unique:
        return ret[: ar1.shape[0]]
    else:
        return xp.take(ret, rev_idx, axis=0)
65 changes: 38 additions & 27 deletions sklearn/utils/_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from . import is_scalar_nan
from ._array_api import _convert_to_numpy, _isin, _setdiff1d, get_namespace


def _unique(values, *, return_inverse=False, return_counts=False):
Expand Down Expand Up @@ -51,31 +52,29 @@ def _unique(values, *, return_inverse=False, return_counts=False):
def _unique_np(values, return_inverse=False, return_counts=False):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
uniques = np.unique(
values, return_inverse=return_inverse, return_counts=return_counts
)
xp, _ = get_namespace(values)

inverse, counts = None, None

if return_counts:
*uniques, counts = uniques

if return_inverse:
*uniques, inverse = uniques

if return_counts or return_inverse:
uniques = uniques[0]
if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
else:
uniques = xp.unique_values(values)

# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
if uniques.size and is_scalar_nan(uniques[-1]):
nan_idx = np.searchsorted(uniques, np.nan)
nan_idx = xp.searchsorted(uniques, xp.nan)
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved
uniques = uniques[: nan_idx + 1]
if return_inverse:
inverse[inverse > nan_idx] = nan_idx

if return_counts:
counts[nan_idx] = np.sum(counts[nan_idx:])
counts[nan_idx] = xp.sum(counts[nan_idx:])
counts = counts[: nan_idx + 1]

ret = (uniques,)
Expand Down Expand Up @@ -161,8 +160,9 @@ def __missing__(self, key):

def _map_to_integer(values, uniques):
"""Map values based on its position in uniques."""
xp, _ = get_namespace(values, uniques)
table = _nandict({val: i for i, val in enumerate(uniques)})
return np.array([table[v] for v in values])
return xp.asarray([table[v] for v in values])
ogrisel marked this conversation as resolved.
Show resolved Hide resolved


def _unique_python(values, *, return_inverse, return_counts):
Expand Down Expand Up @@ -220,7 +220,13 @@ def _encode(values, *, uniques, check_unknown=True):
encoded : ndarray
Encoded values
"""
if values.dtype.kind in "OUS":
xp, is_array_api_compliant = get_namespace(values, uniques)
if is_array_api_compliant:
dtype_kind = _convert_to_numpy(values, xp).dtype.kind
OmarManzoor marked this conversation as resolved.
Show resolved Hide resolved
else:
dtype_kind = values.dtype.kind

if dtype_kind in "OUS":
try:
return _map_to_integer(values, uniques)
except KeyError as e:
Expand All @@ -230,7 +236,7 @@ def _encode(values, *, uniques, check_unknown=True):
diff = _check_unknown(values, uniques)
if diff:
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
return np.searchsorted(uniques, values)
return xp.searchsorted(uniques, values)


def _check_unknown(values, known_values, return_mask=False):
Expand Down Expand Up @@ -258,9 +264,14 @@ def _check_unknown(values, known_values, return_mask=False):
Additionally returned if ``return_mask=True``.

"""
xp, is_array_api_compliant = get_namespace(values, known_values)
valid_mask = None
if is_array_api_compliant:
dtype_kind = _convert_to_numpy(values, xp).dtype.kind
else:
dtype_kind = values.dtype.kind

if values.dtype.kind in "OUS":
if dtype_kind in "OUS":
values_set = set(values)
values_set, missing_in_values = _extract_missing(values_set)

Expand All @@ -282,31 +293,31 @@ def is_valid(value):

if return_mask:
if diff or nan_in_diff or none_in_diff:
valid_mask = np.array([is_valid(value) for value in values])
valid_mask = xp.array([is_valid(value) for value in values])
else:
valid_mask = np.ones(len(values), dtype=bool)
valid_mask = xp.ones(len(values), dtype=xp.bool)

diff = list(diff)
if none_in_diff:
diff.append(None)
if nan_in_diff:
diff.append(np.nan)
else:
unique_values = np.unique(values)
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
unique_values = xp.unique_values(values)
diff = _setdiff1d(unique_values, known_values, xp, assume_unique=True)
if return_mask:
if diff.size:
valid_mask = np.isin(values, known_values)
valid_mask = _isin(values, known_values, xp)
else:
valid_mask = np.ones(len(values), dtype=bool)
valid_mask = xp.ones(len(values), dtype=xp.bool)

# check for nans in the known_values
if np.isnan(known_values).any():
diff_is_nan = np.isnan(diff)
if diff_is_nan.any():
if xp.any(xp.isnan(known_values)):
diff_is_nan = xp.isnan(diff)
if xp.any(diff_is_nan):
# removes nan from valid_mask
if diff.size and return_mask:
is_nan = np.isnan(values)
is_nan = xp.isnan(values)
valid_mask[is_nan] = 1

# remove nan from diff
Expand Down
Loading