From 80fe90a2beef2391333eee6d695a3f69e6d4a1be Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 23 Oct 2023 18:23:30 +0800 Subject: [PATCH 01/16] Implement basic support for ordinal encoder. - Implement `OrdinalEncoder`. - Implement dask version. - Fix dask transformers with DataFrame input by using `dask_cudf` to construct return df. --- .../sklearn/utils/skl_dependencies.py | 2 + python/cuml/dask/common/base.py | 3 +- python/cuml/dask/preprocessing/__init__.py | 3 +- python/cuml/dask/preprocessing/encoders.py | 176 ++++++-- python/cuml/preprocessing/__init__.py | 3 +- python/cuml/preprocessing/encoders.py | 378 ++++++++++++++---- .../cuml/preprocessing/ordinalencoder_mg.py | 36 ++ .../tests/dask/test_dask_ordinal_encoder.py | 69 ++++ python/cuml/tests/test_ordinal_encoder.py | 106 +++++ 9 files changed, 667 insertions(+), 109 deletions(-) create mode 100644 python/cuml/preprocessing/ordinalencoder_mg.py create mode 100644 python/cuml/tests/dask/test_dask_ordinal_encoder.py create mode 100644 python/cuml/tests/test_ordinal_encoder.py diff --git a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py index 3d47bf262b..9b095db31f 100644 --- a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py +++ b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py @@ -58,6 +58,8 @@ def _check_n_features(self, X, reset): if reset: self.n_features_in_ = n_features + if hasattr(X, "columns"): + self.feature_names_in_ = [str(c) for c in X.columns] else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( diff --git a/python/cuml/dask/common/base.py b/python/cuml/dask/common/base.py index 718056e01c..a9949310be 100644 --- a/python/cuml/dask/common/base.py +++ b/python/cuml/dask/common/base.py @@ -36,6 +36,7 @@ np = cpu_only_import("numpy") +dask_cudf = gpu_only_import("dask_cudf") dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") @@ -343,7 +344,7 @@ def _run_parallel_func( if output_futures: return self.client.compute(preds) else: - output = dask.dataframe.from_delayed(preds) + output = dask_cudf.from_delayed(preds) return output if delayed else output.persist() else: raise ValueError( diff --git a/python/cuml/dask/preprocessing/__init__.py b/python/cuml/dask/preprocessing/__init__.py index 17380238ef..f5959467ae 100644 --- a/python/cuml/dask/preprocessing/__init__.py +++ b/python/cuml/dask/preprocessing/__init__.py @@ -13,12 +13,13 @@ # limitations under the License. # +from cuml.dask.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.dask.preprocessing.label import LabelBinarizer -from cuml.dask.preprocessing.encoders import OneHotEncoder from cuml.dask.preprocessing.LabelEncoder import LabelEncoder __all__ = [ "LabelBinarizer", "OneHotEncoder", + "OrdinalEncoder", "LabelEncoder", ] diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 0033f89eca..490f31b447 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -12,23 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from dask_cudf.core import Series as daskSeries +from collections.abc import Sequence + from cuml.common import with_cupy_rmm +from cuml.dask.common.base import ( + BaseEstimator, + DelayedInverseTransformMixin, + DelayedTransformMixin, +) +from cuml.internals.safe_imports import gpu_only_import_from, gpu_only_import +from dask_cudf.core import Series as daskSeries +from toolz import first -from cuml.dask.common.base import BaseEstimator -from cuml.dask.common.base import DelayedTransformMixin -from cuml.dask.common.base import DelayedInverseTransformMixin +dask_cudf = gpu_only_import("dask_cudf") +dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") -from toolz import first -from collections.abc import Sequence -from cuml.internals.safe_imports import gpu_only_import_from +class DelayedFitTransformMixin: + def fit_transform(self, X, delayed=True): + """ + Fit the encoder to X, then transform X. + Equivalent to fit(X).transform(X). -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") + Parameters + ---------- + X : Dask cuDF DataFrame or CuPy backed Dask Array + The data to encode. + delayed : bool (default = True) + Whether to execute as a delayed task or eager. + + Returns + ------- + out : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the transformed data + """ + return self.fit(X).transform(X, delayed=delayed) class OneHotEncoder( - BaseEstimator, DelayedTransformMixin, DelayedInverseTransformMixin + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, ): """ Encode categorical features as a one-hot numeric array. @@ -111,10 +136,10 @@ def fit(self, X): return self - def fit_transform(self, X, delayed=True): + @with_cupy_rmm + def transform(self, X, delayed=True): """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). + Transform X using one-hot encoding. Parameters ---------- @@ -126,52 +151,149 @@ def fit_transform(self, X, delayed=True): Returns ------- out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed data + Distributed object containing the transformed input. """ - return self.fit(X).transform(X, delayed=delayed) + return self._transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=self._get_internal_model().dtype, + output_collection_type="cupy", + ) @with_cupy_rmm - def transform(self, X, delayed=True): + def inverse_transform(self, X, delayed=True): """ - Transform X using one-hot encoding. + Convert the data back to the original representation. + In case unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. Parameters ---------- - X : Dask cuDF DataFrame or CuPy backed Dask Array - The data to encode. + X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] + The transformed data. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed input. + X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the inverse transformed array. 
""" - return self._transform( + dtype = self._get_internal_model().dtype + return self._inverse_transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=dtype, + output_collection_type=self.datatype, + ) + + +class OrdinalEncoder( + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, +): + """Encode categorical features as an integer array. + + The input to this transformer should be an :py:class:`dask_cudf.DataFrame` or a + :py:class:`dask.array.Array` backed by cupy, denoting the unique values taken on by + categorical (discrete) features. The features are converted to ordinal + integers. This results in a single column of integers (0 to n_categories - 1) per + feature. + + Parameters + ---------- + categories : 'auto' an cupy.ndarray or a cudf.DataFrame, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + + - DataFrame/ndarray : ``categories[col]`` holds the categories expected + in the feature col. + handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature is + present during transform (default is to raise). When this parameter is set + to 'ignore' and an unknown category is encountered during transform, the + resulting encoded value would be null when output type is cudf + dataframe. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. See + :ref:`verbosity-levels` for more info. + + """ + def __init__(self, *, client=None, verbose=False, **kwargs) -> None: + super().__init__(client=client, verbose=verbose, **kwargs) + + @with_cupy_rmm + def fit(self, X): + """ + Fit Ordinal to X. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or a :py:class:`dask.array.Array` backed, + shape = (n_samples, n_features) The data to determine the categories of each + feature. + y : None + Ignored. This parameter exists for compatibility only. + + Returns + ------- + self + + """ + from cuml.preprocessing.ordinalencoder_mg import OrdinalEncoderMG + + el = first(X) if isinstance(X, Sequence) else X + self.datatype = ( + "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + ) + + self._set_internal_model(OrdinalEncoderMG(**self.kwargs).fit(X)) + + return self + + @with_cupy_rmm + def transform(self, X, delayed=True): + """ + Transform X using ordinal encoding. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. The data to + encode. + + Returns + ------- + X_out : + Transformed input. + """ + Xt = self._transform( X, n_dims=2, delayed=delayed, output_dtype=self._get_internal_model().dtype, - output_collection_type="cupy", + output_collection_type=self.datatype, ) + return Xt @with_cupy_rmm def inverse_transform(self, X, delayed=True): """ Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. Parameters ---------- - X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] - The transformed data. + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + X_tr : Distributed object containing the inverse transformed array. 
""" dtype = self._get_internal_model().dtype diff --git a/python/cuml/preprocessing/__init__.py b/python/cuml/preprocessing/__init__.py index 368c570b09..fc07aba50c 100644 --- a/python/cuml/preprocessing/__init__.py +++ b/python/cuml/preprocessing/__init__.py @@ -16,7 +16,7 @@ from cuml.model_selection import train_test_split from cuml.preprocessing.LabelEncoder import LabelEncoder from cuml.preprocessing.label import LabelBinarizer, label_binarize -from cuml.preprocessing.encoders import OneHotEncoder +from cuml.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.preprocessing.TargetEncoder import TargetEncoder from cuml.preprocessing import text @@ -63,6 +63,7 @@ "MissingIndicator", "Normalizer", "OneHotEncoder", + "OrdinalEncoder", "PolynomialFeatures", "PowerTransformer", "QuantileTransformer", diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 32a8defc69..c10c0df359 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -13,31 +13,120 @@ # limitations under the License. # import warnings +from typing import List, Optional, TypeVar + import cuml.internals.logger as logger -from cuml.internals.safe_imports import gpu_only_import_from from cudf import DataFrame, Series -from cuml.preprocessing import LabelEncoder from cuml import Base +from cuml._thirdparty.sklearn.utils.skl_dependencies import BaseEstimator +from cuml.common import input_to_cuml_array from cuml.common.exceptions import NotFittedError -from cuml.internals.safe_imports import gpu_only_import -from cuml.internals.safe_imports import cpu_only_import +from cuml.internals.safe_imports import ( + cpu_only_import, + gpu_only_import, + gpu_only_import_from, +) +from cuml.preprocessing import LabelEncoder np = cpu_only_import("numpy") +cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") cupyx = gpu_only_import("cupyx") GenericIndex = gpu_only_import_from("cudf", "GenericIndex") -class OneHotEncoder(Base): +class BaseEncoder(BaseEstimator): + """Base implementation for encoding categorical values, uses + :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + + """ + + def _set_input_type(self, value): + if self.input_type is None: + self.input_type = value + + def _check_input(self, X, is_categories=False): + """ + If input is cupy, convert it to a DataFrame with 0 copies + """ + if isinstance(X, cp.ndarray): + self._set_input_type("array") + if is_categories: + X = X.transpose() + return DataFrame(X) + else: + self._set_input_type("df") + return X + + def _check_input_fit(self, X, is_categories=False): + """Helper function used in fit. Can be overridden in subclasses.""" + self._check_n_features(X, reset=True) + return self._check_input(X, is_categories=is_categories) + + def _fit(self, X, need_drop: bool): + X = self._check_input_fit(X) + if type(self.categories) is str and self.categories == "auto": + self._features = X.columns + self._encoders = { + feature: LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ).fit(self._unique(X[feature])) + for feature in self._features + } + else: + self.categories = self._check_input_fit(self.categories, True) + self._features = self.categories.columns + if len(self._features) != X.shape[1]: + raise ValueError( + "Shape mismatch: if categories is not 'auto'," + " it has to be of shape (n_features, _)." 
+ ) + self._encoders = dict() + for feature in self._features: + le = LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ) + + self._encoders[feature] = le.fit(self.categories[feature]) + + if self.handle_unknown == "error": + if self._has_unknown( + X[feature], self._encoders[feature].classes_ + ): + msg = ( + "Found unknown categories in column {0}" + " during fit".format(feature) + ) + raise KeyError(msg) + + if need_drop: + self.drop_idx_ = self._compute_drop_idx() + self._fitted = True + + @property + def categories_(self): + """ + Returns categories used for the one hot encoding in the correct order. + """ + return [self._encoders[f].classes_ for f in self._features] + + +class OneHotEncoder(BaseEncoder): """ Encode categorical features as a one-hot numeric array. - The input to this estimator should be a cuDF.DataFrame or a cupy.ndarray, - denoting the unique values taken on by categorical (discrete) features. - The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') - encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array (depending on the ``sparse`` - parameter). + The input to this estimator should be a :py:class:`cuDF.DataFrame` or a + :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical + (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or + 'dummy') encoding scheme. This creates a binary column for each category and returns + a sparse matrix or dense array (depending on the ``sparse`` parameter). + By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. @@ -209,30 +298,6 @@ def _compute_drop_idx(self): ) raise ValueError(msg.format(type(self.drop))) - @property - def categories_(self): - """ - Returns categories used for the one hot encoding in the correct order. - """ - return [self._encoders[f].classes_ for f in self._features] - - def _set_input_type(self, value): - if self.input_type is None: - self.input_type = value - - def _check_input(self, X, is_categories=False): - """ - If input is cupy, convert it to a DataFrame with 0 copies - """ - if isinstance(X, cp.ndarray): - self._set_input_type("array") - if is_categories: - X = X.transpose() - return DataFrame(X) - else: - self._set_input_type("df") - return X - def _check_input_fit(self, X, is_categories=False): """Helper function used in fit. Can be overridden in subclasses.""" return self._check_input(X, is_categories=is_categories) @@ -265,50 +330,7 @@ def fit(self, X, y=None): """ self._validate_keywords() - X = self._check_input_fit(X) - if type(self.categories) is str and self.categories == "auto": - self._features = X.columns - self._encoders = { - feature: LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ).fit(self._unique(X[feature])) - for feature in self._features - } - else: - self.categories = self._check_input_fit(self.categories, True) - self._features = self.categories.columns - if len(self._features) != X.shape[1]: - raise ValueError( - "Shape mismatch: if categories is not 'auto'," - " it has to be of shape (n_features, _)." 
- ) - self._encoders = dict() - for feature in self._features: - - le = LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ) - - self._encoders[feature] = le.fit(self.categories[feature]) - - if self.handle_unknown == "error": - if self._has_unknown( - X[feature], self._encoders[feature].classes_ - ): - msg = ( - "Found unknown categories in column {0}" - " during fit".format(feature) - ) - raise KeyError(msg) - - self.drop_idx_ = self._compute_drop_idx() - self._fitted = True + self._fit(X, True) return self def fit_transform(self, X, y=None): @@ -544,3 +566,201 @@ def get_param_names(self): "dtype", "handle_unknown", ] + + +def _slice_feat(X, i): + if hasattr(X, "iloc"): + return X[i] + return X[:, i] + + +def _get_output( + output_type: Optional[str], + input_type: Optional[str], + out: DataFrame, + dtype, +): + if output_type == "input": + if input_type == "array": + output_type = "cupy" + elif input_type == "df": + output_type = "cudf" + + if output_type is None: + output_type = "cupy" + + if output_type == "cudf": + return out + elif output_type == "cupy": + return out.astype(dtype).to_cupy(na_value=np.nan) + elif output_type == "numpy": + return cp.asnumpy(out.to_cupy(na_value=np.nan, dtype=dtype)) + elif output_type == "pandas": + return out.to_pandas() + else: + raise ValueError("Unsupported output type.") + + +class OrdinalEncoder(BaseEncoder): + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + handle=None, + verbose=False, + output_type=None, + ) -> None: + """Encode categorical features as an integer array. + + The input to this transformer should be an :py:class:`cudf.DataFrame` or a + :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical + (discrete) features. The features are converted to ordinal integers. This + results in a single column of integers (0 to n_categories - 1) per feature. + + Parameters + ---------- + categories : 'auto' an cupy.ndarray or a cudf.DataFrame, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + + - DataFrame/ndarray : ``categories[col]`` holds the categories expected + in the feature col. + handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature is + present during transform (default is to raise). When this parameter is set + to 'ignore' and an unknown category is encountered during transform, the + resulting encoded value would be null when output type is cudf + dataframe. + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for computations in + this model. Most importantly, this specifies the CUDA stream that will be + used for the model's computations, so users can run different models + concurrently in different streams by creating handles in several streams. + + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. See + :ref:`verbosity-levels` for more info. + output_type : {'input', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + + Return results and set estimator attributes to the indicated output type. If + None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. 
+ """ + # This is called in `BaseEncoder` via a decorated __init__ method and the last + # three parameters are removed. The `output_type` in here is guaranteed to be + # `None`. Setting it again with a call to base init overrides the value with + # `None`. + + # super().__init__( + # handle=handle, verbose=verbose, output_type=output_type + # ) + + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + + self.input_type = None + + def _unique(self, inp): + """Helper function used in fit. Can be overridden in subclasses.""" + + # Default implementation passes input through directly since this is + # performed in `LabelEncoder.fit()` + return inp + + def fit(self, X, y=None) -> "OrdinalEncoder": + """ + Fit Ordinal to X. + + Parameters + ---------- + X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features) + The data to determine the categories of each feature. + y : None + Ignored. This parameter exists for compatibility only. + + Returns + ------- + self + + """ + self._fit(X, need_drop=False) + return self + + def transform(self, X): + """ + Transform X using ordinal encoding. + + Parameters + ---------- + X : cudf.DataFrame or cupy.ndarray + The data to encode. + + Returns + ------- + X_out : Type is specified by the `output_type` parameter. + Transformed input. + """ + self._check_n_features(X, reset=False) + + result = {} + for feature in self._features: + Xi = _slice_feat(X, feature) + col_idx = self._encoders[feature].transform(Xi) + result[feature] = col_idx + + r = DataFrame(result) + return _get_output(self.output_type, self.input_type, r, self.dtype) + + def fit_transform(self, X, y=None): + """Fit OrdinalEncoder to X, then transform X. Equivalent to + fit(X).transform(X). + + Parameters + ---------- + X : cudf.DataFrame or cupy.ndarray, shape = (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : Type is specified by the `output_type` parameter. + Transformed output. + """ + X = self._check_input(X) + return self.fit(X).transform(X) + + def inverse_transform(self, X): + """Convert the data back to the original representation. + + Parameters + ---------- + X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + The transformed data. + + Returns + ------- + X_tr : Type is specified by the `output_type` parameter. + Inverse transformed array. + """ + self._check_n_features(X, reset=False) + + result = {} + for feature in self._features: + Xi = X[feature] + inv = self._encoders[feature].inverse_transform(Xi) + result[feature] = inv + + r = DataFrame(result) + return _get_output(self.output_type, self.input_type, r, self.dtype) + + def get_param_names(self): + return super().get_param_names() + [ + "categories", + "dtype", + "handle_unknown", + ] diff --git a/python/cuml/preprocessing/ordinalencoder_mg.py b/python/cuml/preprocessing/ordinalencoder_mg.py new file mode 100644 index 0000000000..8ec8760e2d --- /dev/null +++ b/python/cuml/preprocessing/ordinalencoder_mg.py @@ -0,0 +1,36 @@ +import cupy as cp +import dask +from cuml.dask.common.dask_arr_utils import to_dask_cudf +from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from +from cuml.preprocessing.encoders import OrdinalEncoder + +cp = gpu_only_import("cupy") +DataFrame = gpu_only_import_from("cudf", "DataFrame") + + +class OrdinalEncoderMG(OrdinalEncoder): + def __init__(self, *, client=None, **kwargs): + # force cupy output type, otherwise, dask doesn't would construct the output as + # numpy array. 
+ super().__init__(**kwargs) + self.client = client + + def _check_input_fit(self, X, is_categories=False): + """Helper function to check input of fit within the multi-gpu model""" + if isinstance(X, (dask.array.core.Array, cp.ndarray)): + self._set_input_type("array") + if is_categories: + X = X.transpose() + if isinstance(X, cp.ndarray): + return DataFrame(X) + else: + return to_dask_cudf(X, client=self.client) + else: + self._set_input_type("df") + return X + + def _unique(self, inp): + return inp.unique().compute() + + def _has_unknown(self, X_cat, encoder_cat): + return not X_cat.isin(encoder_cat).all().compute() diff --git a/python/cuml/tests/dask/test_dask_ordinal_encoder.py b/python/cuml/tests/dask/test_dask_ordinal_encoder.py new file mode 100644 index 0000000000..905f872ee0 --- /dev/null +++ b/python/cuml/tests/dask/test_dask_ordinal_encoder.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import dask_cudf +import numpy as np +import pandas as pd +import pytest +from cudf import DataFrame +from cuml.dask.preprocessing import OrdinalEncoder +from distributed import Client + + +@pytest.mark.mg +def test_ordinal_encoder_df(client: Client) -> None: + X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + X = dask_cudf.from_cudf(X, npartitions=2) + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X).compute() + + X_1 = DataFrame({"gender": ["F", "F"], "int": [1, 2]}) + X_1 = dask_cudf.from_cudf(X_1, npartitions=2) + + enc = OrdinalEncoder(client=client) + enc.fit(X) + Xt_1 = enc.transform(X_1).compute() + + assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[1, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[0, 1] == Xt.iloc[0, 1] + assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] + + +@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) +def test_handle_unknown(client, as_array: bool) -> None: + X = DataFrame({"data": [0, 1]}) + Y = DataFrame({"data": [3, 1]}) + + X = dask_cudf.from_cudf(X, npartitions=2) + Y = dask_cudf.from_cudf(Y, npartitions=2) + + if as_array: + X = X.values + Y = Y.values + + enc = OrdinalEncoder(handle_unknown="error") + enc = enc.fit(X) + with pytest.raises(KeyError): + enc.transform(Y).compute() + + enc = OrdinalEncoder(handle_unknown="ignore") + enc = enc.fit(X) + encoded = enc.transform(Y).compute() + if as_array: + np.isnan(encoded[0, 0]) + else: + assert pd.isna(encoded.iloc[0, 0]) diff --git a/python/cuml/tests/test_ordinal_encoder.py b/python/cuml/tests/test_ordinal_encoder.py new file mode 100644 index 0000000000..9a9669049c --- /dev/null +++ b/python/cuml/tests/test_ordinal_encoder.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import numpy as np +import pandas as pd +import pytest +from cuml.internals.safe_imports import gpu_only_import_from +from cuml.preprocessing import OrdinalEncoder +from sklearn.preprocessing import OrdinalEncoder as skOrdinalEncoder + +DataFrame = gpu_only_import_from("cudf", "DataFrame") + + +def test_ordinal_df() -> None: + X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"gender": ["F", "F"], "int": [1, 2]}) + Xt_1 = enc.transform(X_1) + + assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[1, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[0, 1] == Xt.iloc[0, 1] + assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + + inv_Xt_1 = enc.inverse_transform(Xt_1) + + assert inv_Xt.equals(X) + assert inv_Xt_1.equals(X_1) + + assert enc.n_features_in_ == 2 + + +def test_ordinal_array() -> None: + X = cp.arange(32).reshape(16, 2) + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + Xh = cp.asnumpy(X) + skenc = skOrdinalEncoder() + skenc.fit(Xh) + Xt_sk = skenc.transform(Xh) + + cp.testing.assert_allclose(Xt, Xt_sk) + + +def test_output_type() -> None: + X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + enc = OrdinalEncoder(output_type="cupy").fit(X) + assert isinstance(enc.transform(X), cp.ndarray) + enc = OrdinalEncoder(output_type="cudf").fit(X) + assert isinstance(enc.transform(X), DataFrame) + enc = OrdinalEncoder(output_type="pandas").fit(X) + assert isinstance(enc.transform(X), pd.DataFrame) + enc = OrdinalEncoder(output_type="numpy").fit(X) + assert isinstance(enc.transform(X), np.ndarray) + # output_type == "input" + enc = OrdinalEncoder().fit(X) + assert isinstance(enc.transform(X), DataFrame) + + +def test_feature_names() -> None: + X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + enc = OrdinalEncoder().fit(X) + assert enc.feature_names_in_ == ["gender", "int"] + + +@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) +def test_handle_unknown(as_array: bool) -> None: + + X = DataFrame({"data": [0, 1]}) + Y = DataFrame({"data": [3, 1]}) + + if as_array: + X = X.values + Y = Y.values + + enc = OrdinalEncoder(handle_unknown="error") + enc = enc.fit(X) + with pytest.raises(KeyError): + enc.transform(Y) + + enc = OrdinalEncoder(handle_unknown="ignore") + enc = enc.fit(X) + encoded = enc.transform(Y) + if as_array: + np.isnan(encoded[0, 0]) + else: + assert pd.isna(encoded.iloc[0, 0]) From c26eca7d3ef4b113b6fe7b34af945edd338f3161 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 05:14:55 +0800 Subject: [PATCH 02/16] black. --- python/cuml/dask/preprocessing/encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 490f31b447..b4306ec330 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -224,6 +224,7 @@ class OrdinalEncoder( :ref:`verbosity-levels` for more info. 
""" + def __init__(self, *, client=None, verbose=False, **kwargs) -> None: super().__init__(client=client, verbose=verbose, **kwargs) From 5b92aa01cab1864c864abafac75e3dfa64a5b93f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 09:13:42 +0800 Subject: [PATCH 03/16] Fixes for doc. --- python/cuml/dask/preprocessing/encoders.py | 4 +--- python/cuml/preprocessing/ordinalencoder_mg.py | 15 +++++++++++++++ .../cuml/tests/dask/test_dask_ordinal_encoder.py | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index b4306ec330..d9bdd10b44 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -235,11 +235,9 @@ def fit(self, X): Parameters ---------- - X : :py:class:`dask_cudf.DataFrame` or a :py:class:`dask.array.Array` backed, + X : :py:class:`dask_cudf.DataFrame` or a CuPy backed :py:class:`dask.array.Array`. shape = (n_samples, n_features) The data to determine the categories of each feature. - y : None - Ignored. This parameter exists for compatibility only. Returns ------- diff --git a/python/cuml/preprocessing/ordinalencoder_mg.py b/python/cuml/preprocessing/ordinalencoder_mg.py index 8ec8760e2d..2cab3b56a7 100644 --- a/python/cuml/preprocessing/ordinalencoder_mg.py +++ b/python/cuml/preprocessing/ordinalencoder_mg.py @@ -1,3 +1,18 @@ +# +# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import cupy as cp import dask from cuml.dask.common.dask_arr_utils import to_dask_cudf diff --git a/python/cuml/tests/dask/test_dask_ordinal_encoder.py b/python/cuml/tests/dask/test_dask_ordinal_encoder.py index 905f872ee0..009d0e297a 100644 --- a/python/cuml/tests/dask/test_dask_ordinal_encoder.py +++ b/python/cuml/tests/dask/test_dask_ordinal_encoder.py @@ -43,6 +43,7 @@ def test_ordinal_encoder_df(client: Client) -> None: assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] +@pytest.mark.mg @pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) def test_handle_unknown(client, as_array: bool) -> None: X = DataFrame({"data": [0, 1]}) From c92115853062bf16ab4205444afe28c7268c7ed5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 09:19:52 +0800 Subject: [PATCH 04/16] Indirect class. 
--- python/cuml/preprocessing/encoders.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index c10c0df359..c470468dc4 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -42,6 +42,15 @@ class BaseEncoder(BaseEstimator): """ + def __init__( + self, + *, + handle=None, + verbose=False, + output_type=None, + ) -> None: + super().__init__(handle=handle, verbose=verbose, output_type=output_type) + def _set_input_type(self, value): if self.input_type is None: self.input_type = value @@ -651,14 +660,9 @@ def __init__( (`cuml.global_settings.output_type`) will be used. See :ref:`output-data-type-configuration` for more info. """ - # This is called in `BaseEncoder` via a decorated __init__ method and the last - # three parameters are removed. The `output_type` in here is guaranteed to be - # `None`. Setting it again with a call to base init overrides the value with - # `None`. - - # super().__init__( - # handle=handle, verbose=verbose, output_type=output_type - # ) + super().__init__( + handle=handle, verbose=verbose, output_type=output_type + ) self.categories = categories self.dtype = dtype From 9e217b5ef6a1568f19061a2745196e7be7347d57 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 09:46:42 +0800 Subject: [PATCH 05/16] Use Base instead. --- .../sklearn/utils/skl_dependencies.py | 2 -- python/cuml/preprocessing/encoders.py | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py index 9b095db31f..3d47bf262b 100644 --- a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py +++ b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py @@ -58,8 +58,6 @@ def _check_n_features(self, X, reset): if reset: self.n_features_in_ = n_features - if hasattr(X, "columns"): - self.feature_names_in_ = [str(c) for c in X.columns] else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index c470468dc4..e04ca995ac 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -18,8 +18,6 @@ import cuml.internals.logger as logger from cudf import DataFrame, Series from cuml import Base -from cuml._thirdparty.sklearn.utils.skl_dependencies import BaseEstimator -from cuml.common import input_to_cuml_array from cuml.common.exceptions import NotFittedError from cuml.internals.safe_imports import ( cpu_only_import, @@ -36,7 +34,28 @@ GenericIndex = gpu_only_import_from("cudf", "GenericIndex") -class BaseEncoder(BaseEstimator): +class CheckFeaturesMixIn: + def _check_n_features(self, X, reset: bool = False): + n_features = X.shape[1] + if reset: + self.n_features_in_ = n_features + if hasattr(X, "columns"): + self.feature_names_in_ = [str(c) for c in X.columns] + else: + if not hasattr(self, 'n_features_in_'): + raise RuntimeError( + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" 
+ ) + if n_features != self.n_features_in_: + raise ValueError( + 'X has {} features, but this {} is expecting {} features ' + 'as input.'.format(n_features, self.__class__.__name__, + self.n_features_in_) + ) + + +class BaseEncoder(Base, CheckFeaturesMixIn): """Base implementation for encoding categorical values, uses :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. From 098c0c06794b52a6bf77fce5356899fe1a8725a3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 09:57:20 +0800 Subject: [PATCH 06/16] lint. --- python/cuml/preprocessing/encoders.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index e04ca995ac..6386bb6850 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -42,16 +42,19 @@ def _check_n_features(self, X, reset: bool = False): if hasattr(X, "columns"): self.feature_names_in_ = [str(c) for c in X.columns] else: - if not hasattr(self, 'n_features_in_'): + if not hasattr(self, "n_features_in_"): raise RuntimeError( "The reset parameter is False but there is no " "n_features_in_ attribute. Is this estimator fitted?" ) if n_features != self.n_features_in_: raise ValueError( - 'X has {} features, but this {} is expecting {} features ' - 'as input.'.format(n_features, self.__class__.__name__, - self.n_features_in_) + "X has {} features, but this {} is expecting {} features " + "as input.".format( + n_features, + self.__class__.__name__, + self.n_features_in_, + ) ) @@ -68,7 +71,9 @@ def __init__( verbose=False, output_type=None, ) -> None: - super().__init__(handle=handle, verbose=verbose, output_type=output_type) + super().__init__( + handle=handle, verbose=verbose, output_type=output_type + ) def _set_input_type(self, value): if self.input_type is None: From 4855a68b7dc96f065945f9483c80fa0043b630a1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 19:19:06 +0800 Subject: [PATCH 07/16] Doc fix. --- python/cuml/preprocessing/encoders.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 6386bb6850..285be2ee9c 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -62,6 +62,24 @@ class BaseEncoder(Base, CheckFeaturesMixIn): """Base implementation for encoding categorical values, uses :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + Parameters + ---------- + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for + computations in this model. Most importantly, this specifies the CUDA + stream that will be used for the model's computations, so users can + run different models concurrently in different streams by creating + handles in several streams. + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. + See :ref:`verbosity-levels` for more info. + output_type : {'input', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output type. If + None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. 
+ """ def __init__( From 9eb3bc57791e825918a742418da5a9ab4b66b45f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 8 Nov 2023 04:48:15 +0800 Subject: [PATCH 08/16] doc test. --- python/cuml/dask/preprocessing/encoders.py | 5 +++-- python/cuml/preprocessing/encoders.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index d9bdd10b44..494145d9ff 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -206,8 +206,9 @@ class OrdinalEncoder( Parameters ---------- - categories : 'auto' an cupy.ndarray or a cudf.DataFrame, default='auto' - Categories (unique values) per feature: + categories : :py:class:`cupy.ndarray` or :py:class`cudf.DataFrameq, default='auto' + Categories (unique values) per feature. All categories are expected to + fit on one GPU. - 'auto' : Determine categories automatically from the training data. diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 285be2ee9c..4cfa0004cc 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -74,9 +74,10 @@ class BaseEncoder(Base, CheckFeaturesMixIn): verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. - output_type : {'input', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None - Return results and set estimator attributes to the indicated output type. If - None, the output type set at the module level + output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level (`cuml.global_settings.output_type`) will be used. See :ref:`output-data-type-configuration` for more info. @@ -695,12 +696,13 @@ def __init__( verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. - output_type : {'input', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None - - Return results and set estimator attributes to the indicated output type. If - None, the output type set at the module level + output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level (`cuml.global_settings.output_type`) will be used. See :ref:`output-data-type-configuration` for more info. + """ super().__init__( handle=handle, verbose=verbose, output_type=output_type From 983d4b5f99878f58f56a4fe70685ee1101cba3b2 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 14 Nov 2023 08:54:10 +0800 Subject: [PATCH 09/16] Cleanup doc strings in local. 
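Use `generate_docstring` for the single-GPU encoder methods and add a `None`
parameter entry to `doc_utils` (used here for the ignored `y` argument).

For reference, a minimal sketch of the API these docstrings describe (the data
and calls mirror the tests elsewhere in this series; nothing below is part of
this commit's diff):

    from cudf import DataFrame
    from cuml.preprocessing import OrdinalEncoder

    X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]})

    enc = OrdinalEncoder().fit(X)        # one LabelEncoder per column
    Xt = enc.transform(X)                # integer codes, 0..n_categories-1
    X_back = enc.inverse_transform(Xt)   # round-trips to the original frame

    # Unknown categories raise a KeyError by default; with
    # handle_unknown="ignore" they become nulls in the cudf output.
    enc = OrdinalEncoder(handle_unknown="ignore").fit(X)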
--- python/cuml/common/doc_utils.py | 2 + python/cuml/preprocessing/encoders.py | 110 +++++++++----------------- 2 files changed, 41 insertions(+), 71 deletions(-) diff --git a/python/cuml/common/doc_utils.py b/python/cuml/common/doc_utils.py index 5421bbb6d3..47dd91cc73 100644 --- a/python/cuml/common/doc_utils.py +++ b/python/cuml/common/doc_utils.py @@ -94,6 +94,8 @@ " Ignored when return_sparse=False.\n" " If True, values in the inverse transform below this parameter\n" " are clipped to 0.", + None: "{name} : None\n" + " Ignored. This parameter exists for compatibility only." } _parameter_possible_values = [ diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 4cfa0004cc..5df30bccd3 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -18,6 +18,7 @@ import cuml.internals.logger as logger from cudf import DataFrame, Series from cuml import Base +from cuml.common.doc_utils import generate_docstring from cuml.common.exceptions import NotFittedError from cuml.internals.safe_imports import ( cpu_only_import, @@ -365,58 +366,41 @@ def _has_unknown(self, X_cat, encoder_cat): """Check if X_cat has categories that are not present in encoder_cat""" return not X_cat.isin(encoder_cat).all() + @generate_docstring(y=None) def fit(self, X, y=None): - """ - Fit OneHotEncoder to X. - - Parameters - ---------- - X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to determine the categories of each feature. - y : None - Ignored. This parameter exists for compatibility only. - - Returns - ------- - self + """Fit OneHotEncoder to X. """ self._validate_keywords() self._fit(X, True) return self + @generate_docstring( + y=None, + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + } + ) def fit_transform(self, X, y=None): """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + Fit OneHotEncoder to X, then transform X. Equivalent to fit(X).transform(X). """ X = self._check_input(X) return self.fit(X).transform(X) + @generate_docstring( + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + } + ) def transform(self, X): - """ - Transform X using one-hot encoding. + """Transform X using one-hot encoding. - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. """ self._check_is_fitted() X = self._check_input(X) @@ -721,38 +705,24 @@ def _unique(self, inp): # performed in `LabelEncoder.fit()` return inp + @generate_docstring(y=None) def fit(self, X, y=None) -> "OrdinalEncoder": - """ - Fit Ordinal to X. - - Parameters - ---------- - X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to determine the categories of each feature. - y : None - Ignored. This parameter exists for compatibility only. - - Returns - ------- - self + """Fit Ordinal to X. 
""" self._fit(X, need_drop=False) return self + @generate_docstring( + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "Type is specified by the `output_type` parameter.", + } + ) def transform(self, X): - """ - Transform X using ordinal encoding. - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray - The data to encode. + """Transform X using ordinal encoding. - Returns - ------- - X_out : Type is specified by the `output_type` parameter. - Transformed input. """ self._check_n_features(X, reset=False) @@ -765,19 +735,17 @@ def transform(self, X): r = DataFrame(result) return _get_output(self.output_type, self.input_type, r, self.dtype) + @generate_docstring( + y=None, + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "Type is specified by the `output_type` parameter.", + } + ) def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. Equivalent to - fit(X).transform(X). + """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X). - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to encode. - - Returns - ------- - X_out : Type is specified by the `output_type` parameter. - Transformed output. """ X = self._check_input(X) return self.fit(X).transform(X) From 692e42687d916733c64e7e85b960c1773f1eb60c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 14 Nov 2023 09:02:25 +0800 Subject: [PATCH 10/16] black. --- python/cuml/preprocessing/encoders.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 5df30bccd3..58adb356e7 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -368,9 +368,7 @@ def _has_unknown(self, X_cat, encoder_cat): @generate_docstring(y=None) def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - """ + """Fit OneHotEncoder to X.""" self._validate_keywords() self._fit(X, True) return self @@ -381,7 +379,7 @@ def fit(self, X, y=None): "name": "X_out", "description": "Transformed input.", "type": "sparse matrix if sparse=True else a 2-d array", - } + }, ) def fit_transform(self, X, y=None): """ @@ -399,9 +397,7 @@ def fit_transform(self, X, y=None): } ) def transform(self, X): - """Transform X using one-hot encoding. - - """ + """Transform X using one-hot encoding.""" self._check_is_fitted() X = self._check_input(X) @@ -707,9 +703,7 @@ def _unique(self, inp): @generate_docstring(y=None) def fit(self, X, y=None) -> "OrdinalEncoder": - """Fit Ordinal to X. - - """ + """Fit Ordinal to X.""" self._fit(X, need_drop=False) return self @@ -721,9 +715,7 @@ def fit(self, X, y=None) -> "OrdinalEncoder": } ) def transform(self, X): - """Transform X using ordinal encoding. - - """ + """Transform X using ordinal encoding.""" self._check_n_features(X, reset=False) result = {} @@ -741,12 +733,10 @@ def transform(self, X): "name": "X_out", "description": "Transformed input.", "type": "Type is specified by the `output_type` parameter.", - } + }, ) def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X). - - """ + """Fit OrdinalEncoder to X, then transform X. 
Equivalent to fit(X).transform(X).""" X = self._check_input(X) return self.fit(X).transform(X) From 7c21a1a9d709580ff74a90390618572f8d097aa1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 14 Nov 2023 09:02:44 +0800 Subject: [PATCH 11/16] black. --- python/cuml/common/doc_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cuml/common/doc_utils.py b/python/cuml/common/doc_utils.py index 47dd91cc73..03054f0664 100644 --- a/python/cuml/common/doc_utils.py +++ b/python/cuml/common/doc_utils.py @@ -95,7 +95,7 @@ " If True, values in the inverse transform below this parameter\n" " are clipped to 0.", None: "{name} : None\n" - " Ignored. This parameter exists for compatibility only." + " Ignored. This parameter exists for compatibility only.", } _parameter_possible_values = [ @@ -224,7 +224,6 @@ def deco(func): if ( "X" in params or "y" in params or parameters ) and not skip_parameters_heading: - func.__doc__ += "\nParameters\n----------\n" # Check if we want to prepend the parameters From 4bf271b06f335e11f8a6e92713db503160810662 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 15 Nov 2023 11:10:07 +0800 Subject: [PATCH 12/16] Add tests for array. Cleanup. --- python/cuml/dask/preprocessing/encoders.py | 36 ++----- python/cuml/preprocessing/encoders.py | 94 +++++-------------- .../cuml/preprocessing/ordinalencoder_mg.py | 2 - .../tests/dask/test_dask_ordinal_encoder.py | 63 +++++++++++-- python/cuml/tests/test_ordinal_encoder.py | 40 ++++++-- 5 files changed, 122 insertions(+), 113 deletions(-) diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 494145d9ff..4ae29c4cdd 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -30,9 +30,7 @@ class DelayedFitTransformMixin: def fit_transform(self, X, delayed=True): - """ - Fit the encoder to X, then transform X. - Equivalent to fit(X).transform(X). + """Fit the encoder to X, then transform X. Equivalent to fit(X).transform(X). Parameters ---------- @@ -113,8 +111,7 @@ def __init__(self, *, client=None, verbose=False, **kwargs): @with_cupy_rmm def fit(self, X): - """ - Fit a multi-node multi-gpu OneHotEncoder to X. + """Fit a multi-node multi-gpu OneHotEncoder to X. Parameters ---------- @@ -138,8 +135,7 @@ def fit(self, X): @with_cupy_rmm def transform(self, X, delayed=True): - """ - Transform X using one-hot encoding. + """Transform X using one-hot encoding. Parameters ---------- @@ -163,10 +159,9 @@ def transform(self, X, delayed=True): @with_cupy_rmm def inverse_transform(self, X, delayed=True): - """ - Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. Parameters ---------- @@ -209,9 +204,7 @@ class OrdinalEncoder( categories : :py:class:`cupy.ndarray` or :py:class`cudf.DataFrameq, default='auto' Categories (unique values) per feature. All categories are expected to fit on one GPU. - - 'auto' : Determine categories automatically from the training data. - - DataFrame/ndarray : ``categories[col]`` holds the categories expected in the feature col. 
handle_unknown : {'error', 'ignore'}, default='error' @@ -223,16 +216,11 @@ class OrdinalEncoder( verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. - """ - def __init__(self, *, client=None, verbose=False, **kwargs) -> None: - super().__init__(client=client, verbose=verbose, **kwargs) - @with_cupy_rmm def fit(self, X): - """ - Fit Ordinal to X. + """Fit Ordinal to X. Parameters ---------- @@ -243,7 +231,6 @@ def fit(self, X): Returns ------- self - """ from cuml.preprocessing.ordinalencoder_mg import OrdinalEncoderMG @@ -258,8 +245,7 @@ def fit(self, X): @with_cupy_rmm def transform(self, X, delayed=True): - """ - Transform X using ordinal encoding. + """Transform X using ordinal encoding. Parameters ---------- @@ -271,19 +257,17 @@ def transform(self, X, delayed=True): X_out : Transformed input. """ - Xt = self._transform( + return self._transform( X, n_dims=2, delayed=delayed, output_dtype=self._get_internal_model().dtype, output_collection_type=self.datatype, ) - return Xt @with_cupy_rmm def inverse_transform(self, X, delayed=True): - """ - Convert the data back to the original representation. + """Convert the data back to the original representation. Parameters ---------- diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 58adb356e7..efd1f727d8 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -61,48 +61,14 @@ def _check_n_features(self, X, reset: bool = False): class BaseEncoder(Base, CheckFeaturesMixIn): """Base implementation for encoding categorical values, uses - :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. - - Parameters - ---------- - handle : cuml.Handle - Specifies the cuml.handle that holds internal CUDA state for - computations in this model. Most importantly, this specifies the CUDA - stream that will be used for the model's computations, so users can - run different models concurrently in different streams by creating - handles in several streams. - If it is None, a new one is created. - verbose : int or boolean, default=False - Sets logging level. It must be one of `cuml.common.logger.level_*`. - See :ref:`verbosity-levels` for more info. - output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ - 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None - Return results and set estimator attributes to the indicated output - type. If None, the output type set at the module level - (`cuml.global_settings.output_type`) will be used. See - :ref:`output-data-type-configuration` for more info. - - """ - - def __init__( - self, - *, - handle=None, - verbose=False, - output_type=None, - ) -> None: - super().__init__( - handle=handle, verbose=verbose, output_type=output_type - ) + :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values.""" def _set_input_type(self, value): if self.input_type is None: self.input_type = value def _check_input(self, X, is_categories=False): - """ - If input is cupy, convert it to a DataFrame with 0 copies - """ + """If input is cupy, convert it to a DataFrame with 0 copies.""" if isinstance(X, cp.ndarray): self._set_input_type("array") if is_categories: @@ -113,10 +79,17 @@ def _check_input(self, X, is_categories=False): return X def _check_input_fit(self, X, is_categories=False): - """Helper function used in fit. 
Can be overridden in subclasses.""" + """Helper function used in fit, can be overridden in subclasses.""" self._check_n_features(X, reset=True) return self._check_input(X, is_categories=is_categories) + def _unique(self, inp): + """Helper function used in fit. Can be overridden in subclasses.""" + + # Default implementation passes input through directly since this is + # performed in `LabelEncoder.fit()` + return inp + def _fit(self, X, need_drop: bool): X = self._check_input_fit(X) if type(self.categories) is str and self.categories == "auto": @@ -165,9 +138,7 @@ def _fit(self, X, need_drop: bool): @property def categories_(self): - """ - Returns categories used for the one hot encoding in the correct order. - """ + """Returns categories used for the one hot encoding in the correct order.""" return [self._encoders[f].classes_ for f in self._features] @@ -247,7 +218,6 @@ class OneHotEncoder(BaseEncoder): ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. - """ def __init__( @@ -307,7 +277,7 @@ def _check_is_fitted(self): raise NotFittedError(msg) def _compute_drop_idx(self): - """Helper to compute indices to drop from category to drop""" + """Helper to compute indices to drop from category to drop.""" if self.drop is None: return None elif isinstance(self.drop, str) and self.drop == "first": @@ -355,15 +325,8 @@ def _check_input_fit(self, X, is_categories=False): """Helper function used in fit. Can be overridden in subclasses.""" return self._check_input(X, is_categories=is_categories) - def _unique(self, inp): - """Helper function used in fit. Can be overridden in subclasses.""" - - # Default implementation passes input through directly since this is - # performed in `LabelEncoder.fit()` - return inp - def _has_unknown(self, X_cat, encoder_cat): - """Check if X_cat has categories that are not present in encoder_cat""" + """Check if X_cat has categories that are not present in encoder_cat.""" return not X_cat.isin(encoder_cat).all() @generate_docstring(y=None) @@ -479,10 +442,9 @@ def transform(self, X): ) def inverse_transform(self, X): - """ - Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. The return type is the same as the type of the input used by the first call to fit on this estimator instance. @@ -655,9 +617,7 @@ def __init__( ---------- categories : 'auto' an cupy.ndarray or a cudf.DataFrame, default='auto' Categories (unique values) per feature: - - 'auto' : Determine categories automatically from the training data. - - DataFrame/ndarray : ``categories[col]`` holds the categories expected in the feature col. handle_unknown : {'error', 'ignore'}, default='error' @@ -682,7 +642,6 @@ def __init__( type. If None, the output type set at the module level (`cuml.global_settings.output_type`) will be used. See :ref:`output-data-type-configuration` for more info. - """ super().__init__( handle=handle, verbose=verbose, output_type=output_type @@ -694,16 +653,11 @@ def __init__( self.input_type = None - def _unique(self, inp): - """Helper function used in fit. 
Can be overridden in subclasses.""" - - # Default implementation passes input through directly since this is - # performed in `LabelEncoder.fit()` - return inp - @generate_docstring(y=None) def fit(self, X, y=None) -> "OrdinalEncoder": - """Fit Ordinal to X.""" + """Fit Ordinal to X. + + """ self._fit(X, need_drop=False) return self @@ -715,7 +669,9 @@ def fit(self, X, y=None) -> "OrdinalEncoder": } ) def transform(self, X): - """Transform X using ordinal encoding.""" + """Transform X using ordinal encoding. + + """ self._check_n_features(X, reset=False) result = {} @@ -736,7 +692,9 @@ def transform(self, X): }, ) def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X).""" + """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X). + + """ X = self._check_input(X) return self.fit(X).transform(X) @@ -757,7 +715,7 @@ def inverse_transform(self, X): result = {} for feature in self._features: - Xi = X[feature] + Xi = _slice_feat(X, feature) inv = self._encoders[feature].inverse_transform(Xi) result[feature] = inv diff --git a/python/cuml/preprocessing/ordinalencoder_mg.py b/python/cuml/preprocessing/ordinalencoder_mg.py index 2cab3b56a7..8b47f67819 100644 --- a/python/cuml/preprocessing/ordinalencoder_mg.py +++ b/python/cuml/preprocessing/ordinalencoder_mg.py @@ -25,8 +25,6 @@ class OrdinalEncoderMG(OrdinalEncoder): def __init__(self, *, client=None, **kwargs): - # force cupy output type, otherwise, dask doesn't would construct the output as - # numpy array. super().__init__(**kwargs) self.client = client diff --git a/python/cuml/tests/dask/test_dask_ordinal_encoder.py b/python/cuml/tests/dask/test_dask_ordinal_encoder.py index 009d0e297a..36b5fa92d3 100644 --- a/python/cuml/tests/dask/test_dask_ordinal_encoder.py +++ b/python/cuml/tests/dask/test_dask_ordinal_encoder.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
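(For orientation, a minimal usage sketch of the distributed encoder that the new tests below exercise. The LocalCUDACluster/Client setup is an illustrative assumption, since in the test suite it is provided by the shared `client` fixture, and the column names and values are arbitrary.)

# Sketch only: distributed OrdinalEncoder round trip, assuming a local
# dask-cuda cluster is available.
import cudf
import dask_cudf
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from cuml.dask.preprocessing import OrdinalEncoder

if __name__ == "__main__":
    with LocalCUDACluster(n_workers=2) as cluster, Client(cluster) as client:
        df = cudf.DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]})
        ddf = dask_cudf.from_cudf(df, npartitions=2)

        enc = OrdinalEncoder(client=client)
        # transform() is lazy by default (delayed=True); compute() materializes
        # the encoded dask_cudf DataFrame.
        Xt = enc.fit(ddf).transform(ddf)
        print(Xt.compute())
        # inverse_transform() maps the ordinal codes back to the original
        # categories.
        print(enc.inverse_transform(Xt).compute())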
# +import cupy as cp import dask_cudf import numpy as np import pandas as pd @@ -23,24 +24,70 @@ @pytest.mark.mg def test_ordinal_encoder_df(client: Client) -> None: - X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + X = DataFrame({"cat": ["M", "F", "F"], "int": [1, 3, 2]}) X = dask_cudf.from_cudf(X, npartitions=2) enc = OrdinalEncoder() enc.fit(X) - Xt = enc.transform(X).compute() + Xt = enc.transform(X) - X_1 = DataFrame({"gender": ["F", "F"], "int": [1, 2]}) + X_1 = DataFrame({"cat": ["F", "F"], "int": [1, 2]}) X_1 = dask_cudf.from_cudf(X_1, npartitions=2) enc = OrdinalEncoder(client=client) enc.fit(X) - Xt_1 = enc.transform(X_1).compute() + Xt_1 = enc.transform(X_1) - assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0] - assert Xt_1.iloc[1, 0] == Xt.iloc[1, 0] - assert Xt_1.iloc[0, 1] == Xt.iloc[0, 1] - assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] + Xt_r = Xt.compute() + Xt_1_r = Xt_1.compute() + assert Xt_1_r.iloc[0, 0] == Xt_r.iloc[1, 0] + assert Xt_1_r.iloc[1, 0] == Xt_r.iloc[1, 0] + assert Xt_1_r.iloc[0, 1] == Xt_r.iloc[0, 1] + assert Xt_1_r.iloc[1, 1] == Xt_r.iloc[2, 1] + + # Turn Int64Index to RangeIndex for testing equality + inv_Xt = enc.inverse_transform(Xt).compute().reset_index(drop=True) + inv_Xt_1 = enc.inverse_transform(Xt_1).compute().reset_index(drop=True) + + X_r = X.compute() + X_1_r = X_1.compute() + + assert inv_Xt.equals(X_r) + assert inv_Xt_1.equals(X_1_r) + + assert enc.n_features_in_ == 2 + + +@pytest.mark.mg +def test_ordinal_encoder_array(client: Client) -> None: + X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]}) + X = dask_cudf.from_cudf(X, npartitions=2).values + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"A": [1, 1], "B": [1, 2]}) + X_1 = dask_cudf.from_cudf(X_1, npartitions=2).values + + enc = OrdinalEncoder(client=client) + enc.fit(X) + Xt_1 = enc.transform(X_1) + + Xt_r = Xt.compute() + Xt_1_r = Xt_1.compute() + assert Xt_1_r[0, 0] == Xt_r[1, 0] + assert Xt_1_r[1, 0] == Xt_r[1, 0] + assert Xt_1_r[0, 1] == Xt_r[0, 1] + assert Xt_1_r[1, 1] == Xt_r[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + inv_Xt_1 = enc.inverse_transform(Xt_1) + + cp.testing.assert_allclose(X.compute(), inv_Xt.compute()) + cp.testing.assert_allclose(X_1.compute(), inv_Xt_1.compute()) + + assert enc.n_features_in_ == 2 @pytest.mark.mg diff --git a/python/cuml/tests/test_ordinal_encoder.py b/python/cuml/tests/test_ordinal_encoder.py index 9a9669049c..1ce0503757 100644 --- a/python/cuml/tests/test_ordinal_encoder.py +++ b/python/cuml/tests/test_ordinal_encoder.py @@ -15,20 +15,21 @@ import numpy as np import pandas as pd import pytest +from sklearn.preprocessing import OrdinalEncoder as skOrdinalEncoder + from cuml.internals.safe_imports import gpu_only_import_from from cuml.preprocessing import OrdinalEncoder -from sklearn.preprocessing import OrdinalEncoder as skOrdinalEncoder DataFrame = gpu_only_import_from("cudf", "DataFrame") -def test_ordinal_df() -> None: - X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) +def test_ordinal_encoder_df() -> None: + X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) enc = OrdinalEncoder() enc.fit(X) Xt = enc.transform(X) - X_1 = DataFrame({"gender": ["F", "F"], "int": [1, 2]}) + X_1 = DataFrame({"cat": ["F", "F"], "num": [1, 2]}) Xt_1 = enc.transform(X_1) assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0] @@ -37,7 +38,6 @@ def test_ordinal_df() -> None: assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] inv_Xt = enc.inverse_transform(Xt) - inv_Xt_1 = enc.inverse_transform(Xt_1) assert inv_Xt.equals(X) @@ -46,6 +46,29 
@@ def test_ordinal_df() -> None: assert enc.n_features_in_ == 2 +def test_ordinal_encoder_array() -> None: + X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]}).values + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"A": [1, 1], "B": [1, 2]}).values + Xt_1 = enc.transform(X_1) + + assert Xt_1[0, 0] == Xt[1, 0] + assert Xt_1[1, 0] == Xt[1, 0] + assert Xt_1[0, 1] == Xt[0, 1] + assert Xt_1[1, 1] == Xt[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + inv_Xt_1 = enc.inverse_transform(Xt_1) + + cp.testing.assert_allclose(X, inv_Xt) + cp.testing.assert_allclose(X_1, inv_Xt_1) + + assert enc.n_features_in_ == 2 + + def test_ordinal_array() -> None: X = cp.arange(32).reshape(16, 2) @@ -62,7 +85,7 @@ def test_ordinal_array() -> None: def test_output_type() -> None: - X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) enc = OrdinalEncoder(output_type="cupy").fit(X) assert isinstance(enc.transform(X), cp.ndarray) enc = OrdinalEncoder(output_type="cudf").fit(X) @@ -77,14 +100,13 @@ def test_output_type() -> None: def test_feature_names() -> None: - X = DataFrame({"gender": ["M", "F", "F"], "int": [1, 3, 2]}) + X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) enc = OrdinalEncoder().fit(X) - assert enc.feature_names_in_ == ["gender", "int"] + assert enc.feature_names_in_ == ["cat", "num"] @pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) def test_handle_unknown(as_array: bool) -> None: - X = DataFrame({"data": [0, 1]}) Y = DataFrame({"data": [3, 1]}) From dee2a996c207e360a2d82e5077852e6322d9234c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 15 Nov 2023 12:10:15 +0800 Subject: [PATCH 13/16] remove init. --- python/cuml/dask/preprocessing/encoders.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 4ae29c4cdd..8bf2503578 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -106,9 +106,6 @@ class OneHotEncoder( will be denoted as None. """ - def __init__(self, *, client=None, verbose=False, **kwargs): - super().__init__(client=client, verbose=verbose, **kwargs) - @with_cupy_rmm def fit(self, X): """Fit a multi-node multi-gpu OneHotEncoder to X. From 55269332a703cdb351f6b941041f4826dfbb07fa Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 15 Nov 2023 12:17:25 +0800 Subject: [PATCH 14/16] black. --- python/cuml/preprocessing/encoders.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index efd1f727d8..789ed1c296 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -61,7 +61,8 @@ def _check_n_features(self, X, reset: bool = False): class BaseEncoder(Base, CheckFeaturesMixIn): """Base implementation for encoding categorical values, uses - :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values.""" + :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + """ def _set_input_type(self, value): if self.input_type is None: @@ -655,9 +656,7 @@ def __init__( @generate_docstring(y=None) def fit(self, X, y=None) -> "OrdinalEncoder": - """Fit Ordinal to X. 
- - """ + """Fit Ordinal to X.""" self._fit(X, need_drop=False) return self @@ -669,9 +668,7 @@ def fit(self, X, y=None) -> "OrdinalEncoder": } ) def transform(self, X): - """Transform X using ordinal encoding. - - """ + """Transform X using ordinal encoding.""" self._check_n_features(X, reset=False) result = {} @@ -692,9 +689,7 @@ def transform(self, X): }, ) def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X). - - """ + """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X).""" X = self._check_input(X) return self.fit(X).transform(X) From dff373fa68cf4e3bb168ace5cd644867e90f2428 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 16 Nov 2023 17:17:52 +0800 Subject: [PATCH 15/16] extract a test sample. --- python/cuml/tests/test_ordinal_encoder.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/cuml/tests/test_ordinal_encoder.py b/python/cuml/tests/test_ordinal_encoder.py index 1ce0503757..c9379a43be 100644 --- a/python/cuml/tests/test_ordinal_encoder.py +++ b/python/cuml/tests/test_ordinal_encoder.py @@ -23,8 +23,14 @@ DataFrame = gpu_only_import_from("cudf", "DataFrame") -def test_ordinal_encoder_df() -> None: +@pytest.fixture +def test_sample(): X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) + return X + + +def test_ordinal_encoder_df(test_sample) -> None: + X = test_sample enc = OrdinalEncoder() enc.fit(X) Xt = enc.transform(X) @@ -84,8 +90,8 @@ def test_ordinal_array() -> None: cp.testing.assert_allclose(Xt, Xt_sk) -def test_output_type() -> None: - X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) +def test_output_type(test_sample) -> None: + X = test_sample enc = OrdinalEncoder(output_type="cupy").fit(X) assert isinstance(enc.transform(X), cp.ndarray) enc = OrdinalEncoder(output_type="cudf").fit(X) @@ -99,9 +105,8 @@ def test_output_type() -> None: assert isinstance(enc.transform(X), DataFrame) -def test_feature_names() -> None: - X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) - enc = OrdinalEncoder().fit(X) +def test_feature_names(test_sample) -> None: + enc = OrdinalEncoder().fit(test_sample) assert enc.feature_names_in_ == ["cat", "num"] From 1030d5b768a7c27697dfa4ee7d5ca55416909a14 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 21 Nov 2023 19:45:41 +0800 Subject: [PATCH 16/16] doc checker is now happy. --- python/cuml/preprocessing/encoders.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 789ed1c296..272655b552 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -62,6 +62,26 @@ def _check_n_features(self, X, reset: bool = False): class BaseEncoder(Base, CheckFeaturesMixIn): """Base implementation for encoding categorical values, uses :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + + Parameters + ---------- + + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for + computations in this model. Most importantly, this specifies the CUDA + stream that will be used for the model's computations, so users can + run different models concurrently in different streams by creating + handles in several streams. + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. + See :ref:`verbosity-levels` for more info. 
+ output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. """ def _set_input_type(self, value):
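(To make the output_type behavior documented above concrete, a small single-GPU sketch follows; it mirrors the checks in test_output_type, and the column names and values are arbitrary.)

# Sketch only: how output_type controls the container returned by transform().
import cudf
import cupy as cp

from cuml.preprocessing import OrdinalEncoder

X = cudf.DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]})

# An explicit output_type overrides the input type ...
enc = OrdinalEncoder(output_type="cupy").fit(X)
assert isinstance(enc.transform(X), cp.ndarray)

enc = OrdinalEncoder(output_type="cudf").fit(X)
assert isinstance(enc.transform(X), cudf.DataFrame)

# ... while the default mirrors the input: DataFrame in, DataFrame out,
# and cupy array in, cupy array out.
enc = OrdinalEncoder().fit(X)
assert isinstance(enc.transform(X), cudf.DataFrame)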