From 87a90f4843dc4b58c3aa0a021142034480cdeaa2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 25 Aug 2022 08:45:53 -0700 Subject: [PATCH 1/7] updates, tests --- python/cudf/cudf/core/column/categorical.py | 7 ++----- python/cudf/cudf/core/dataframe.py | 7 +++---- python/cudf/cudf/tests/test_series.py | 10 ++++++++++ python/cudf/cudf/utils/utils.py | 6 ++++++ 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 38b610d0991..2b5d4f88ba7 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -25,6 +25,7 @@ min_signed_type, min_unsigned_type, ) +from cudf.utils.utils import dedup_preserve_order if TYPE_CHECKING: from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex @@ -1456,11 +1457,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): # drop_duplicates() instead of unique() to preserve order - new_cats = ( - cudf.Series(new_cats) - .drop_duplicates(ignore_index=True) - ._column - ) + new_cats = dedup_preserve_order(cudf.Series(new_cats))._column cur_codes = self.codes max_cat_size = ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7a8745371c1..c87496850a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -102,6 +102,7 @@ GetAttrGetItemMixin, _cudf_nvtx_annotate, _external_only_api, + dedup_preserve_order, ) T = TypeVar("T", bound="DataFrame") @@ -7244,11 +7245,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): isinstance(col, cudf.core.column.CategoricalColumn) for col in cols ): # Combine and de-dupe the categories - categories[idx] = ( + categories[idx] = dedup_preserve_order( cudf.Series(concat_columns([col.categories for col in cols])) - .drop_duplicates(ignore_index=True) - ._column - ) + )._column # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6de27980ec2..e18f0997e60 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -21,6 +21,7 @@ assert_exceptions_equal, gen_rand, ) +from cudf.utils.utils import dedup_preserve_order def _series_na_data(): @@ -1928,3 +1929,12 @@ def test_default_integer_bitwidth_construction(default_integer_bitwidth, data): def test_default_float_bitwidth_construction(default_float_bitwidth, data): s = cudf.Series(data) assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}") + + +def test_series_ordered_dedup(): + # part of https://github.com/rapidsai/cudf/issues/11486 + sr = cudf.Series(np.random.randint(0, 100, 1000)) + # pandas unique() preserves order + expect = pd.Series(sr.to_pandas().unique()) + got = dedup_preserve_order(sr) + assert_eq(expect.values, got.values) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 48d4c07f1fe..007e20e66da 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -368,6 +368,12 @@ def _cudf_nvtx_annotate(func, domain="cudf_python"): )(func) +def dedup_preserve_order(sr): + lhs = sr.reset_index(drop=True) + deduped = lhs.drop_duplicates() + return deduped.sort_index() + + _dask_cudf_nvtx_annotate = partial( _cudf_nvtx_annotate, domain="dask_cudf_python" ) From 9a78e66ecec020f3f13521239add573938f2a755 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 30 Aug 2022 09:35:32 -0700 Subject: [PATCH 2/7] address reviews --- python/cudf/cudf/core/column/categorical.py | 3 +-- python/cudf/cudf/core/column/column.py | 10 ++++++++++ python/cudf/cudf/core/dataframe.py | 7 +++---- python/cudf/cudf/tests/test_series.py | 3 +-- python/cudf/cudf/utils/utils.py | 6 ------ 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 2b5d4f88ba7..ea8c8c15626 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -25,7 +25,6 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import dedup_preserve_order if TYPE_CHECKING: from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex @@ -1457,7 +1456,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): # drop_duplicates() instead of unique() to preserve order - new_cats = dedup_preserve_order(cudf.Series(new_cats))._column + new_cats = cudf.Series(new_cats)._column._dedup_preserve_order() cur_codes = self.codes max_cat_size = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fbf9ab13da8..d12a325cd35 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -217,6 +217,16 @@ def dropna(self, drop_nan: bool = False) -> ColumnBase: # The drop_nan argument is only used for numerical columns. return drop_nulls([self])[0] + def _dedup_preserve_order(self): + ind = as_column(cupy.arange(0, len(self))) + + # dedup based on the column of data only + ind, col = drop_duplicates([ind, self], keys=[1]) + + # sort sort col based on ind + map = ind.argsort() + return col.take(map) + def to_arrow(self) -> pa.Array: """Convert to PyArrow Array diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c87496850a2..6e47830409a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -102,7 +102,6 @@ GetAttrGetItemMixin, _cudf_nvtx_annotate, _external_only_api, - dedup_preserve_order, ) T = TypeVar("T", bound="DataFrame") @@ -7245,9 +7244,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): isinstance(col, cudf.core.column.CategoricalColumn) for col in cols ): # Combine and de-dupe the categories - categories[idx] = dedup_preserve_order( - cudf.Series(concat_columns([col.categories for col in cols])) - )._column + categories[idx] = cudf.Series( + concat_columns([col.categories for col in cols]) + )._column._dedup_preserve_order() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index e18f0997e60..24dbafa141e 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -21,7 +21,6 @@ assert_exceptions_equal, gen_rand, ) -from cudf.utils.utils import dedup_preserve_order def _series_na_data(): @@ -1936,5 +1935,5 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = dedup_preserve_order(sr) + got = cudf.Series(sr._column._dedup_preserve_order()) assert_eq(expect.values, got.values) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 007e20e66da..48d4c07f1fe 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -368,12 +368,6 @@ def _cudf_nvtx_annotate(func, domain="cudf_python"): )(func) -def dedup_preserve_order(sr): - lhs = sr.reset_index(drop=True) - deduped = lhs.drop_duplicates() - return deduped.sort_index() - - _dask_cudf_nvtx_annotate = partial( _cudf_nvtx_annotate, domain="dask_cudf_python" ) From 4ea9d5e1bab5836a0c8afe6aa5a7f8f84091e9b5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 1 Sep 2022 06:12:53 -0700 Subject: [PATCH 3/7] add tests for failing concat cases --- python/cudf/cudf/tests/test_concat.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 5094d938ea1..bf1e9de5d1a 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1761,3 +1761,19 @@ def test_concat_decimal_non_numeric(s1, s2, expected): def test_concat_struct_column(s1, s2, expected): s = gd.concat([s1, s2]) assert_eq(s, expected, check_index_type=True) + + +def test_concat_categorical_ordering(): + # https://github.com/rapidsai/cudf/issues/11486 + sr = pd.Series( + ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" + ) + sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) + + df = pd.DataFrame({"a": sr}) + gdf = gd.from_pandas(df) + + expect = pd.concat([df, df, df]) + got = gd.concat([gdf, gdf, gdf]) + + assert_eq(expect, got) From 04cd1899f4d0b3bc18469a26b954410770329f53 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 1 Sep 2022 08:30:43 -0700 Subject: [PATCH 4/7] dedupe and preserve order in CategoricalColumn._concat --- python/cudf/cudf/core/column/categorical.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index ea8c8c15626..1268422bc60 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1152,6 +1152,7 @@ def fillna( fill_value = column.as_column(fill_value, nan_as_null=False) if isinstance(fill_value, CategoricalColumn): if self.dtype != fill_value.dtype: + breakpoint() raise TypeError( "Cannot set a Categorical with another, " "without identical categories" @@ -1316,7 +1317,9 @@ def _concat( head = next((obj for obj in objs if obj.valid_count), objs[0]) # Combine and de-dupe the categories - cats = column.concat_columns([o.categories for o in objs]).unique() + cats = column.concat_columns( + [o.categories for o in objs] + )._dedup_preserve_order() objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] From 987aa263e57456fa288b85f2b019d2ada2a09560 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 1 Sep 2022 09:03:09 -0700 Subject: [PATCH 5/7] cleanup --- python/cudf/cudf/core/column/categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 02ccdc14a97..f8eb5092868 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1152,7 +1152,6 @@ def fillna( fill_value = column.as_column(fill_value, nan_as_null=False) if isinstance(fill_value, CategoricalColumn): if self.dtype != fill_value.dtype: - breakpoint() raise TypeError( "Cannot set a Categorical with another, " "without identical categories" From 65529226932cac5293398f69916b4879a861d378 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 1 Sep 2022 11:26:36 -0700 Subject: [PATCH 6/7] use unique(preserve_order=True) instead --- python/cudf/cudf/core/column/categorical.py | 14 ++++++++------ python/cudf/cudf/core/column/column.py | 21 ++++++++++----------- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f8eb5092868..d438f47e1c4 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -945,8 +945,8 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view(self) -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view - def unique(self) -> CategoricalColumn: - codes = self.as_numerical.unique() + def unique(self, preserve_order=False) -> CategoricalColumn: + codes = self.as_numerical.unique(preserve_order=preserve_order) return column.build_categorical_column( categories=self.categories, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -1316,9 +1316,9 @@ def _concat( head = next((obj for obj in objs if obj.valid_count), objs[0]) # Combine and de-dupe the categories - cats = column.concat_columns( - [o.categories for o in objs] - )._dedup_preserve_order() + cats = column.concat_columns([o.categories for o in objs]).unique( + preserve_order=True + ) objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] @@ -1458,7 +1458,9 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): # drop_duplicates() instead of unique() to preserve order - new_cats = cudf.Series(new_cats)._column._dedup_preserve_order() + new_cats = cudf.Series(new_cats)._column.unique( + preserve_order=True + ) cur_codes = self.codes max_cat_size = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 97c566903a9..08e97ede46f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -217,16 +217,6 @@ def dropna(self, drop_nan: bool = False) -> ColumnBase: # The drop_nan argument is only used for numerical columns. return drop_nulls([self])[0] - def _dedup_preserve_order(self): - ind = as_column(cupy.arange(0, len(self))) - - # dedup based on the column of data only - ind, col = drop_duplicates([ind, self], keys=[1]) - - # sort sort col based on ind - map = ind.argsort() - return col.take(map) - def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -1038,7 +1028,7 @@ def searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self) -> ColumnBase: + def unique(self, preserve_order=False) -> ColumnBase: """ Get unique values in the data """ @@ -1047,6 +1037,15 @@ def unique(self) -> ColumnBase: # Few things to note before we can do this optimization is # the following issue resolved: # https://github.com/rapidsai/cudf/issues/5286 + if preserve_order: + ind = as_column(cupy.arange(0, len(self))) + + # dedup based on the column of data only + ind, col = drop_duplicates([ind, self], keys=[1]) + + # sort sort col based on ind + map = ind.argsort() + return col.take(map) return drop_duplicates([self], keep="first")[0] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6e47830409a..97f06efe642 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7246,7 +7246,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Combine and de-dupe the categories categories[idx] = cudf.Series( concat_columns([col.categories for col in cols]) - )._column._dedup_preserve_order() + )._column.unique(preserve_order=True) # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 24dbafa141e..b1ecb38e4d4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1935,5 +1935,5 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series(sr._column._dedup_preserve_order()) + got = cudf.Series(sr._column.unique(preserve_order=True)) assert_eq(expect.values, got.values) From 0624a3c742f2ef88820866250792a576de8187b5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 2 Sep 2022 06:48:18 -0700 Subject: [PATCH 7/7] minor comment update --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 08e97ede46f..649be68f032 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1043,7 +1043,7 @@ def unique(self, preserve_order=False) -> ColumnBase: # dedup based on the column of data only ind, col = drop_duplicates([ind, self], keys=[1]) - # sort sort col based on ind + # sort col based on ind map = ind.argsort() return col.take(map)