diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 3c454c85720..74ce6e42d7e 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -230,10 +230,13 @@ std::unique_ptr make_collect_list_aggregation( * @param null_handling Indicates whether to include/exclude nulls during collection * @param nulls_equal Flag to specify whether null entries within each list should be considered * equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal */ std::unique_ptr make_collect_set_aggregation( null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL); /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 18bef301e03..0bfe6b84ae2 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation */ struct collect_set_aggregation final : derived_aggregation { explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL) - : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal) + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL) + : derived_aggregation{COLLECT_SET}, + _null_handling{null_handling}, + _nulls_equal(nulls_equal), + _nans_equal(nans_equal) { } null_policy _null_handling; ///< include or exclude nulls - null_equality _null_equal; ///< whether to consider nulls as equal values + null_equality _nulls_equal; ///< whether to consider nulls as 
equal values + nan_equality _nans_equal; ///< whether to consider NaNs as equal values (applicable only to + ///< floating point types) protected: friend class derived_aggregation; bool operator==(collect_set_aggregation const& other) const { - return _null_handling == other._null_handling && _null_equal == other._null_equal; + return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal && + _nans_equal == other._nans_equal; } size_t hash_impl() const { - return std::hash{}(static_cast(_null_handling) ^ static_cast(_null_equal)); + return std::hash{}(static_cast(_null_handling) ^ static_cast(_nulls_equal) ^ + static_cast(_nans_equal)); } }; diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp index ba3e1d17d7f..53b31015145 100644 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -31,6 +31,7 @@ namespace detail { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 0939bd7956a..f1ce3b7f0e3 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -41,6 +41,8 @@ namespace lists { * * @param lists_column The input lists_column_view * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered equal (only + * applicable to floating-point data columns) * @param mr Device resource used to allocate memory * * @code{.pseudo} @@ -56,6 +58,7 @@ namespace lists { std::unique_ptr drop_list_duplicates( lists_column_view 
const& lists_column, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 1b8d83883b3..789bb3037f4 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -137,6 +137,15 @@ enum class nan_policy : bool { NAN_IS_VALID ///< treat nans as valid elements (non-null) }; +/** + * @brief Enum to consider different elements (of floating point types) holding NaN value as equal + * or unequal + */ +enum class nan_equality /*unspecified*/ { + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL ///< All NaNs compare unequal (IEEE754 behavior) +}; + /** * @brief */ diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 33c19617308..3a044a42101 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -132,9 +132,10 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand } /// Factory to create a COLLECT_SET aggregation std::unique_ptr make_collect_set_aggregation(null_policy null_handling, - null_equality null_equal) + null_equality nulls_equal, + nan_equality nans_equal) { - return std::make_unique(null_handling, null_equal); + return std::make_unique(null_handling, nulls_equal, nans_equal); } /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 4e2303c8b9b..46185e07600 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -379,11 +379,14 @@ void aggregrate_result_functor::operator()(aggregation auto const collect_result = detail::group_collect( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); auto const nulls_equal = - 
static_cast(agg)._null_equal; - cache.add_result(col_idx, - agg, - lists::detail::drop_list_duplicates( - lists_column_view(collect_result->view()), nulls_equal, stream, mr)); + static_cast(agg)._nulls_equal; + auto const nans_equal = + static_cast(agg)._nans_equal; + cache.add_result( + col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, nans_equal, stream, mr)); }; } // namespace detail diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 584b9791d19..564d919b65d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -16,15 +16,16 @@ #include #include -#include +#include #include #include #include #include #include -#include +#include #include +#include #include #include @@ -34,62 +35,100 @@ namespace cudf { namespace lists { namespace detail { namespace { +template +struct has_negative_nans { + column_device_view const d_entries; + bool const has_nulls; + + __device__ Type operator()(size_type idx) const noexcept + { + if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } + + auto const val = d_entries.element(idx); + return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative + } +}; /** - * @brief Copy list entries and entry list offsets ignoring duplicates - * - * Given an array of all entries flattened from a list column and an array that maps each entry to - * the offset of the list containing that entry, those entries and list offsets are copied into - * new arrays such that the duplicated entries within each list will be ignored. 
- * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * @brief A structure to be used along with type_dispatcher to check if a + * `column_view` has any negative NaN entry */ -template -std::vector> get_unique_entries_and_list_offsets( - column_view const& all_lists_entries, - column_view const& entries_list_offsets, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Create an intermediate table, since the comparator only work on tables - auto const device_input_table = - cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); - auto const comp = row_equality_comparator( - *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); +struct has_negative_nans_fn { + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept + { + auto const d_entries = column_device_view::create(lists_entries, stream); + return thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + } - auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries - auto const unique_indices = cudf::make_numeric_column( - entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); - auto const unique_indices_begin = unique_indices->mutable_view().begin(); 
+ template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + { + // Columns of non floating-point data will never contain NaN + return false; + } +}; - auto const copy_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); +template +struct replace_negative_nans { + __device__ Type operator()(Type val) const noexcept + { + return std::isnan(val) ? std::numeric_limits::quiet_NaN() : val; + } +}; - // Collect unique entries and entry list offsets - auto const indices = cudf::detail::slice( - unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); - return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); +/** + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries + * of a floating-point data column + */ +struct replace_negative_nans_fn { + template >* = nullptr> + void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const + { + CUDF_FAIL("Cannot operate on a type that is not floating-point."); + } + + template >* = nullptr> + void operator()(column_view const& lists_entries, + mutable_column_view const& new_entries, + rmm::cuda_stream_view stream) const noexcept + { + // Do not care whether an entry is null or not, just consider it as a floating-point value + thrust::transform(rmm::exec_policy(stream), + lists_entries.begin(), + lists_entries.end(), + new_entries.begin(), + detail::replace_negative_nans{}); + } +}; + +/** + * @brief Transform a given lists column to a new lists column in which all the list entries 
holding + * -NaN value are replaced by (positive) NaN + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + auto new_offsets = std::make_unique(lists_column.offsets()); + auto new_entries = std::make_unique(lists_entries); + + type_dispatcher(lists_entries.type(), + detail::replace_negative_nans_fn{}, + lists_entries, + new_entries->mutable_view(), + stream); + + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); } /** @@ -165,6 +204,189 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, return entry_list_offsets; } +/** + * @brief Performs an equality comparison between two entries in a lists column + * + * Two elements that are not in the same list of the lists column are always considered + * different. If they are from the same list and their type is not a floating-point + * type, this functor will return the same comparison result as + * `cudf::element_equality_comparator`. + * + * For floating-point types, entries holding NaN value can be considered as different values or the + * same value depending on the nans_equal parameter. 
+ * + * @tparam Type The data type of entries + * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating-point data column) + */ +template +class list_entry_comparator { + public: + list_entry_comparator(offset_type const* list_offsets, + column_device_view d_view, + null_equality nulls_equal, + bool has_nulls) + : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) + { + } + + template + std::enable_if_t and nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept + { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + + if (has_nulls) { + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison + // will return `true`. This is the desired behavior in Pandas. 
+ auto const lhs = d_view.element(i); + auto const rhs = d_view.element(j); + if (std::isnan(lhs) and std::isnan(rhs)) { return true; } + return lhs == rhs; + } + + template + std::enable_if_t or not nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept + { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + + if (has_nulls) { + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison + // will return `false`. This is the desired behavior in Apache Spark. + return d_view.element(i) == d_view.element(j); + } + + private: + offset_type const* list_offsets; + column_device_view d_view; + null_equality nulls_equal; + bool has_nulls; +}; + +/** + * @brief Construct type-dispatched function object for copying indices of the list entries + * ignoring duplicates + */ +struct get_unique_entries_fn { + template ()>* = nullptr> + offset_type* operator()(offset_type const*, + column_device_view&, + size_type, + offset_type*, + null_equality, + nan_equality, + bool, + rmm::cuda_stream_view) const + { + CUDF_FAIL("Cannot operate on types that are not equally comparable."); + } + + template ()>* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_device_view& d_view, + size_type num_entries, + offset_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + rmm::cuda_stream_view stream) const noexcept + { + if (nans_equal == nan_equality::ALL_EQUAL) { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + 
return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } else { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + } +}; + +/** + * @brief Copy list entries and entry list offsets ignoring duplicates + * + * Given an array of all entries flattened from a list column and an array that maps each entry to + * the offset of the list containing that entry, those entries and list offsets are copied into + * new arrays such that the duplicated entries within each list will be ignored. + * + * @param all_lists_entries The input array containing all list entries + * @param entries_list_offsets A map from list entries to their corresponding list offsets + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal + * value (only applicable for floating-point data column) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A pair of columns, the first one contains unique list entries and the second one + * contains their corresponding list offsets + */ +std::vector> get_unique_entries_and_list_offsets( + column_view const& all_lists_entries, + column_view const& entries_list_offsets, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_entries = all_lists_entries.size(); + auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + + // Allocate memory to store the indices of the unique entries + auto unique_indices = 
rmm::device_uvector(num_entries, stream); + auto const output_begin = unique_indices.begin(); + auto const output_end = type_dispatcher(all_lists_entries.type(), + get_unique_entries_fn{}, + entries_list_offsets.begin(), + *d_view_entries, + num_entries, + output_begin, + nulls_equal, + nans_equal, + all_lists_entries.has_nulls(), + stream); + + // Collect unique entries and entry list offsets + // The new null_count and bitmask of the unique entries will also be generated + // by the gather function + return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, + output_begin, + output_end, + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr) + ->release(); +} + /** * @brief Generate list offsets from entry offsets * @@ -225,6 +447,7 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } + } // anonymous namespace /** @@ -234,6 +457,7 @@ void generate_offsets(size_type num_entries, */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -242,27 +466,40 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); } - // Call segmented sort on the list elements and store them in a temporary column sorted_list - auto const sorted_lists = - detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); - // Flatten all entries (depth = 1) of the lists column - auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); + auto const lists_entries = lists_column.get_sliced_child(stream); + + // sorted_lists will store the results of the original lists after calling segmented_sort + auto const sorted_lists = [&]() { + // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, + // we need 
to replace -NaN by NaN before sorting + auto const replace_negative_nan = + nans_equal == nan_equality::ALL_EQUAL and + type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + if (replace_negative_nan) { + // The column new_lists_column is temporary, thus we will not pass in `mr` + auto const new_lists_column = + detail::replace_negative_nans_entries(lists_entries, lists_column, stream); + return detail::sort_lists( + lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + } else { + return detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + } + }(); + + auto const sorted_lists_entries = + lists_column_view(sorted_lists->view()).get_sliced_child(stream); // Generate a 0-based offset column auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); // Generate a mapping from list entries to offsets of the lists containing those entries auto const entries_list_offsets = - detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); + detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays - auto unique_entries_and_list_offsets = - all_lists_entries.has_nulls() - ? 
detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) - : detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( + sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), @@ -271,6 +508,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu stream); // Construct a new lists column without duplicated entries + // Reuse the null_count and bitmask of the lists_column: those are the null information for + // the list elements (rows) + // For the entries of those lists (rows), their null_count and bitmask were generated separately + // during the step `get_unique_entries_and_list_offsets` above return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), @@ -285,10 +526,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates( + lists_column, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 0948ba96f62..bc413fd220a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,174 +14,241 @@ * limitations under the License. 
*/ -#include - #include #include +#include + +#include +#include + +#include +#include -using float_type = float; using int_type = int32_t; -using INT_LCW = cudf::test::lists_column_wrapper; -using FLT_LCW = cudf::test::lists_column_wrapper; -using STR_LCW = cudf::test::lists_column_wrapper; +using float_type = float; + +using LIST_COL_FLT = cudf::test::lists_column_wrapper; +using LIST_COL_STR = cudf::test::lists_column_wrapper; -template +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); + +template void test_once(cudf::column_view const& input, LCW const& expected, cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) { auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (equal_test) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + if (cudf::is_floating_point(input.type())) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } else { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; -TEST_F(DropListDuplicatesTest, InvalidCasesTests) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // Lists of nested types are not supported - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), - cudf::logic_error); + // -0.0 and 0.0 should be considered equal + test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 
2, -0.0, -0.0, 0.0, 0.0}, + LIST_COL_FLT{0, 1, 2}); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) +{ + // Lists contain inf + test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); + test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, + LIST_COL_FLT{neg_Inf, 0, Inf}); +} + +// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to +// check for validity +// We will not store NaN in the results_expected variable (an unordered_set) because we can't check +// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare +// with the number of NaNs in the output. +static void test_floating_point(std::vector const& h_input, + std::unordered_set const& results_expected, + cudf::nan_equality nans_equal) +{ + // If NaNs are considered as equal value, the final result should always contain at max ONE NaN + // entry per list + std::size_t const num_NaNs = + nans_equal == cudf::nan_equality::ALL_EQUAL + ? 
std::size_t{1} + : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::null_equality::EQUAL, + nans_equal); + auto const results_arr = + cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + + EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) +{ + std::vector h_input{ + 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + std::unordered_set results_expected{-2, -1, 0, 1, 2}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) +{ + std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, + neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, + neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; + std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, StringTestsNonNull) { // Trivial cases - test_once(FLT_LCW{{}}, FLT_LCW{{}}); - test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); + test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", 
"this"}); - // Multiple empty lists - test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // One list column + test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, + LIST_COL_STR{"a", "is", "string", "this"}); - auto constexpr p_inf = std::numeric_limits::infinity(); - auto constexpr m_inf = -std::numeric_limits::infinity(); + // Multiple lists column + test_once( + LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, + LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); +} - // Lists contain inf - // We can't test for lists containing nan because the order of nan is - // undefined after sorting - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, - FLT_LCW{0, 1, 2, p_inf}); - test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, - FLT_LCW{m_inf, 0, p_inf}); +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + // One list column with null entries + test_once( + LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + LIST_COL_STR{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + // Multiple lists column with null lists and null entries + test_once( + LIST_COL_STR{ + {LIST_COL_STR{ + {"this", null, "is", null, "a", null, "no duplicate", null, 
"string"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, + LIST_COL_STR{}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i <= 4; })}, + LIST_COL_STR{}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); } -TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) +template +struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { +}; +#define LIST_COL cudf::test::lists_column_wrapper + +using TypesForTest = + cudf::test::Concat; +TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); + +TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { + // Lists of nested types are not supported + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::logic_error); +} + +TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) +{ + // Empty input + test_once(LIST_COL{{}}, LIST_COL{{}}); + // Trivial cases - test_once(INT_LCW{{}}, INT_LCW{{}}); - test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); // Multiple empty lists - test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); +} +TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) +{ // Adjacent lists containing the same entries - test_once( - INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - INT_LCW{{1}, {1, 2}, {2, 3}}); + 
test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + LIST_COL{{1}, {1, 2}, {2, 3}}); // Sliced list column - auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list0 = + LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {0, 5})[0]; auto const list2 = cudf::slice(list0, {1, 5})[0]; auto const list3 = cudf::slice(list0, {1, 3})[0]; auto const list4 = cudf::slice(list0, {0, 3})[0]; - test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); - test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); + test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); } -TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) +TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { - auto constexpr null = std::numeric_limits::max(); + auto constexpr null = TypeParam{0}; // null lists - test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); + test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 
10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); // null entries are equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{{1, 3, 5, 7, 9, null}, - std::initializer_list{true, true, true, true, true, false}}); + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); // nulls entries are not equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{ - {1, 3, 5, 7, 9, null, null, null, null, null}, - std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, cudf::null_equality::UNEQUAL); } - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - test_once(STR_LCW{{}}, STR_LCW{{}}); - test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); - - // One list column - test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, - STR_LCW{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, - STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, - STR_LCW{"this", "is", "is", "is", "is", "a", "three 
duplicates", "string"}}, - STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}, - STR_LCW{"a", "is", "string", "this", "two duplicates"}, - STR_LCW{"a", "is", "string", "this", "three duplicates"}}); -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries - test_once( - STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - STR_LCW{{"a", "is", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); - - // Multiple lists column with null lists and null entries - test_once( - STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0; })}, - STR_LCW{}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - STR_LCW{ - {STR_LCW{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })}, - STR_LCW{}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); -}