From 81e0d7961020a57f4646773e61cd7f22e2c3661d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 17:47:32 -0600 Subject: [PATCH 01/34] Add tests for drop_list_duplicates in case of input containing floating point numbers with NaN --- .../lists/drop_list_duplicates_tests.cpp | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 0948ba96f62..0a0f1e08330 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,11 +14,13 @@ * limitations under the License. */ -#include - #include #include +#include + +#include + using float_type = float; using int_type = int32_t; using INT_LCW = cudf::test::lists_column_wrapper; @@ -70,12 +72,52 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) auto constexpr m_inf = -std::numeric_limits::infinity(); // Lists contain inf - // We can't test for lists containing nan because the order of nan is - // undefined after sorting test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, FLT_LCW{0, 1, 2, p_inf}); test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, FLT_LCW{m_inf, 0, p_inf}); + + // Lists contain NaN + // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to + // check for validity + auto constexpr NaN = std::numeric_limits::quiet_NaN(); + + // We will not store NaN in an unordered_set because it can't check for NaN existence + std::unordered_set results_expected{0, 1, 2}; + auto results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{FLT_LCW{0, 1, NaN, 2, 0, NaN, 1, 2, 0, 1, 2, NaN, NaN, NaN}}); + auto results_arr = + cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + EXPECT_EQ(results_arr.size(), results_expected.size() + 1); + int NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 1); + + // Lists contain both NaN and inf + // We will not store NaN in an unordered_set because it can't check for NaN existence + results_expected = std::unordered_set{0, 1, 2, m_inf, p_inf}; + results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{ + m_inf, 0, 1, p_inf, NaN, 2, 0, NaN, 1, 2, p_inf, 0, 1, m_inf, 2, NaN, p_inf, NaN, NaN, m_inf}}); + results_arr = + cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + EXPECT_EQ(results_arr.size(), results_expected.size() + 1); + results.clear(); + NaN_count = 0; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 1); } TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) From e4315494b7e33a99aeba04e0a6ba9b2593d86d06 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 14:02:03 -0600 Subject: [PATCH 02/34] Add negative NaN into the tests --- .../lists/drop_list_duplicates_tests.cpp | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 0a0f1e08330..27d14f3c434 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -80,15 +80,16 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) // Lists contain NaN // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to // check for validity - auto constexpr NaN = std::numeric_limits::quiet_NaN(); + auto constexpr m_NaN = -std::numeric_limits::quiet_NaN(); + auto constexpr p_NaN = std::numeric_limits::quiet_NaN(); // We will not store NaN in an unordered_set because it can't check for NaN existence - std::unordered_set results_expected{0, 1, 2}; - auto results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{FLT_LCW{0, 1, NaN, 2, 0, NaN, 1, 2, 0, 1, 2, NaN, NaN, NaN}}); + std::unordered_set results_expected{-2, -1, 0, 1, 2}; + auto results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{ + FLT_LCW{0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}}); auto results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - EXPECT_EQ(results_arr.size(), results_expected.size() + 1); + EXPECT_EQ(results_arr.size(), results_expected.size() + 2); int NaN_count{0}; std::unordered_set results; for (auto const x : results_arr) { @@ -98,16 +99,17 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) results.insert(x); } } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 1); + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); // Lists contain both NaN and inf // We will not store NaN in an unordered_set because it can't check for NaN existence - results_expected = std::unordered_set{0, 1, 2, m_inf, p_inf}; - results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{ - m_inf, 0, 1, p_inf, NaN, 2, 0, NaN, 1, 2, p_inf, 0, 1, m_inf, 2, NaN, p_inf, NaN, NaN, m_inf}}); + results_expected = std::unordered_set{-2, -1, 0, 1, 2, m_inf, p_inf}; + results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{ + FLT_LCW{m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, m_NaN, 2, -1, 0, m_NaN, + 1, 2, p_inf, 0, 1, m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}}); results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - EXPECT_EQ(results_arr.size(), results_expected.size() + 1); + EXPECT_EQ(results_arr.size(), results_expected.size() + 2); results.clear(); NaN_count = 0; for (auto const x : results_arr) { @@ -117,7 +119,7 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) results.insert(x); } } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 1); + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); } TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) From 84b06a3b428f6270f5b9a5816289dc6f9cd9f230 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 15:28:29 -0600 Subject: [PATCH 03/34] Rewrite tests: split tests into smaller tests with some improvements --- .../lists/drop_list_duplicates_tests.cpp | 275 +++++++++--------- 1 file changed, 142 insertions(+), 133 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 27d14f3c434..cffb10930ba 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -16,80 +16,63 @@ #include #include +#include #include +#include #include -using float_type = float; using int_type = int32_t; -using INT_LCW = cudf::test::lists_column_wrapper; -using FLT_LCW = cudf::test::lists_column_wrapper; -using STR_LCW = cudf::test::lists_column_wrapper; +using float_type = float; + +using LIST_COL_FLT = cudf::test::lists_column_wrapper; +using LIST_COL_STR = cudf::test::lists_column_wrapper; -template +auto constexpr m_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr m_inf = -std::numeric_limits::infinity(); +auto constexpr p_inf = std::numeric_limits::infinity(); +auto constexpr p_NaN = std::numeric_limits::quiet_NaN(); + +template void test_once(cudf::column_view const& input, LCW const& expected, cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) { auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (equal_test) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + if (cudf::is_floating_point(input.type())) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } else { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; -TEST_F(DropListDuplicatesTest, InvalidCasesTests) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { - // Lists of nested types are not supported - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), - cudf::logic_error); + // Lists contain inf + test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, + LIST_COL_FLT{0, 1, 2, p_inf}); + test_once(LIST_COL_FLT{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, + LIST_COL_FLT{m_inf, 0, p_inf}); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) { - // Trivial cases - test_once(FLT_LCW{{}}, FLT_LCW{{}}); - test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); - - // Multiple empty lists - test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); - - auto constexpr p_inf = std::numeric_limits::infinity(); - auto constexpr m_inf = -std::numeric_limits::infinity(); - - // Lists contain inf - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, - FLT_LCW{0, 1, 2, p_inf}); - test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, - FLT_LCW{m_inf, 0, p_inf}); - - // Lists contain NaN // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to // check for validity - auto constexpr m_NaN = -std::numeric_limits::quiet_NaN(); - auto constexpr p_NaN = std::numeric_limits::quiet_NaN(); // We will not store NaN in an unordered_set because it can't check for NaN existence std::unordered_set results_expected{-2, -1, 0, 1, 2}; - auto results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{ - FLT_LCW{0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}}); - auto results_arr = + auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL_FLT{ + 0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}}); + auto const results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; EXPECT_EQ(results_arr.size(), results_expected.size() + 2); + int NaN_count{0}; std::unordered_set results; for (auto const x : results_arr) { @@ -100,18 +83,22 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) } } EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); +} +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfAndNaN) +{ // Lists contain both NaN and inf // We will not store NaN in an unordered_set because it can't check for NaN existence - results_expected = std::unordered_set{-2, -1, 0, 1, 2, m_inf, p_inf}; - results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{ - FLT_LCW{m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, m_NaN, 2, -1, 0, m_NaN, - 1, 2, p_inf, 0, 1, m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}}); - results_arr = + std::unordered_set results_expected{-2, -1, 0, 1, 2, m_inf, p_inf}; + auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL_FLT{ + m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, m_NaN, 2, -1, 0, + m_NaN, 1, 2, p_inf, 0, 1, m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}}); + auto const results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; EXPECT_EQ(results_arr.size(), results_expected.size() + 2); - results.clear(); - NaN_count = 0; + + int NaN_count{0}; + std::unordered_set results; for (auto const x : results_arr) { if (std::isnan(x)) { ++NaN_count; @@ -122,110 +109,132 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); } -TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) +TEST_F(DropListDuplicatesTest, StringTestsNonNull) +{ + // Trivial cases + test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); + test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); + + // One list column + test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, + LIST_COL_STR{"a", "is", "string", "this"}); + + // Multiple lists column + test_once( + LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, + LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); +} + +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + // One list column with null entries + test_once( + LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + LIST_COL_STR{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + // Multiple lists column with null lists and null entries + test_once( + LIST_COL_STR{ + {LIST_COL_STR{ + {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, + LIST_COL_STR{}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i <= 4; })}, + LIST_COL_STR{}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); +} + +template +struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { +}; +#define LIST_COL cudf::test::lists_column_wrapper + +using TypesForTest = + cudf::test::Concat; +TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); + +TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { + // Lists of nested types are not supported + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::logic_error); +} + +TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) +{ + // Empty input + test_once(LIST_COL{{}}, LIST_COL{{}}); + // Trivial cases - test_once(INT_LCW{{}}, INT_LCW{{}}); - test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); // Multiple empty lists - test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); +} +TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) +{ // Adjacent lists containing the same entries - test_once( - INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - INT_LCW{{1}, {1, 2}, {2, 3}}); + test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + LIST_COL{{1}, {1, 2}, {2, 3}}); // Sliced list column - auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list0 = + LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {0, 5})[0]; auto const list2 = cudf::slice(list0, {1, 5})[0]; auto const list3 = cudf::slice(list0, {1, 3})[0]; auto const list4 = cudf::slice(list0, {0, 3})[0]; - test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); - test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); + test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); } -TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) +TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { - auto constexpr null = std::numeric_limits::max(); + auto constexpr null = TypeParam{0}; // null lists - test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); + test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); // null entries are equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{{1, 3, 5, 7, 9, null}, - std::initializer_list{true, true, true, true, true, false}}); + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); // nulls entries are not equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{ - {1, 3, 5, 7, 9, null, null, null, null, null}, - std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, cudf::null_equality::UNEQUAL); } - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - test_once(STR_LCW{{}}, STR_LCW{{}}); - test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); - - // One list column - test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, - STR_LCW{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, - STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, - STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}, - STR_LCW{"a", "is", "string", "this", "two duplicates"}, - STR_LCW{"a", "is", "string", "this", "three duplicates"}}); -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries - test_once( - STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - STR_LCW{{"a", "is", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); - - // Multiple lists column with null lists and null entries - test_once( - STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0; })}, - STR_LCW{}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - STR_LCW{ - {STR_LCW{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })}, - STR_LCW{}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); -} From 34305f27eca07096a9a7ccd07c05b74a81d1a50b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 15:41:10 -0600 Subject: [PATCH 04/34] Some improvement to floating point tests with NaNs --- .../lists/drop_list_duplicates_tests.cpp | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index cffb10930ba..6094d7a249a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include using int_type = int32_t; @@ -60,18 +61,22 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) LIST_COL_FLT{m_inf, 0, p_inf}); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) +// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to +// check for validity +// We will not store NaN in the results_expected variable (an unordered_set) because we can't check +// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare +// with the number of NaNs in the output. +void test_floating_point(std::vector const& h_input, + std::unordered_set const& results_expected) { - // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to - // check for validity + const auto num_NaNs = + std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); - // We will not store NaN in an unordered_set because it can't check for NaN existence - std::unordered_set results_expected{-2, -1, 0, 1, 2}; - auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL_FLT{ - 0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}}); + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}); auto const results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - EXPECT_EQ(results_arr.size(), results_expected.size() + 2); + EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); int NaN_count{0}; std::unordered_set results; @@ -82,31 +87,24 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) results.insert(x); } } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) +{ + std::vector h_input{ + 0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}; + std::unordered_set results_expected{-2, -1, 0, 1, 2}; + test_floating_point(h_input, results_expected); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfAndNaN) { - // Lists contain both NaN and inf - // We will not store NaN in an unordered_set because it can't check for NaN existence + std::vector h_input{m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, + m_NaN, 2, -1, 0, m_NaN, 1, 2, p_inf, 0, 1, + m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}; std::unordered_set results_expected{-2, -1, 0, 1, 2, m_inf, p_inf}; - auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL_FLT{ - m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, m_NaN, 2, -1, 0, - m_NaN, 1, 2, p_inf, 0, 1, m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}}); - auto const results_arr = - cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - EXPECT_EQ(results_arr.size(), results_expected.size() + 2); - - int NaN_count{0}; - std::unordered_set results; - for (auto const x : results_arr) { - if (std::isnan(x)) { - ++NaN_count; - } else { - results.insert(x); - } - } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == 2); + test_floating_point(h_input, results_expected); } TEST_F(DropListDuplicatesTest, StringTestsNonNull) From fa46446c7e15d9bacdf6be4daef9cbe118378c87 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 18:26:37 -0600 Subject: [PATCH 05/34] Add customized comparators for drop_list_duplicates, still need to update doc --- cpp/src/lists/drop_list_duplicates.cu | 130 +++++++++++++++++++++----- 1 file changed, 109 insertions(+), 21 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 1eb105d296d..e53486d9d71 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -35,6 +35,90 @@ namespace lists { namespace detail { namespace { using offset_type = lists_column_view::offset_type; + +/** + * @brief Performs an equality comparison between two elements in two columns. + * + * @tparam has_nulls Indicates the potential for null values in either column. + */ +template +class customized_element_comparator { + public: + /** + * @brief Construct type-dispatched function object for comparing equality + * between two elements. + * + * @note `lhs` and `rhs` may be the same. + * + * @param lhs The column containing the first element + * @param rhs The column containing the second element (may be the same as lhs) + * @param nulls_are_equal Indicates if two null elements are treated as equivalent + */ + __host__ __device__ customized_element_comparator(column_device_view _d_view, + bool _nulls_are_equal) + : d_view{_d_view}, nulls_are_equal{_nulls_are_equal} + { + } + + /** + * @brief Compares the specified elements for equality. + * + * @param i The index of the first element + * @param j The index of the second element + * + */ + template ()>* = nullptr> + __device__ bool operator()(size_type i, size_type j) const noexcept + { + if (has_nulls) { + bool const lhs_is_null{d_view.nullable() and d_view.is_null(i)}; + bool const rhs_is_null{d_view.nullable() and d_view.is_null(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_are_equal; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + return d_view.element(i) == d_view.element(j); + } + + template ()>* = nullptr> + __device__ bool operator()(size_type, size_type) + { + release_assert(false && "Attempted to compare elements of uncomparable types."); + return false; + } + + private: + column_device_view d_view; + bool nulls_are_equal; +}; + +template +class customized_row_comparator { + public: + customized_row_comparator(column_device_view _d_view, null_equality _nulls_equal) + : d_view{_d_view}, nulls_equal{_nulls_equal} + { + } + + __device__ bool operator()(size_type i, size_type j) const noexcept + { + return cudf::type_dispatcher( + d_view.type(), + customized_element_comparator{d_view, nulls_equal == null_equality::EQUAL}, + i, + j); + } + + private: + column_device_view d_view; + null_equality nulls_equal; +}; + /** * @brief Copy list entries and entry list offsets ignoring duplicates * @@ -51,7 +135,6 @@ using offset_type = lists_column_view::offset_type; * @return A pair of columns, the first one contains unique list entries and the second one * contains their corresponding list offsets */ -template std::vector> get_unique_entries_and_list_offsets( column_view const& all_lists_entries, column_view const& entries_list_offsets, @@ -59,26 +142,35 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Create an intermediate table, since the comparator only work on tables - auto const device_input_table = - cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); - auto const comp = row_equality_comparator( - *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto const num_entries = all_lists_entries.size(); // Allocate memory to store the indices of the unique entries auto const unique_indices = cudf::make_numeric_column( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); auto const unique_indices_begin = unique_indices->mutable_view().begin(); - auto const copy_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); + offset_type* copy_end{0}; + auto const d_view = column_device_view::create(all_lists_entries, stream); + if (all_lists_entries.has_nulls()) { + auto const comp = customized_row_comparator(*d_view, nulls_equal); + copy_end = thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); + } else { + auto const comp = customized_row_comparator(*d_view, nulls_equal); + copy_end = thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); + } // Collect unique entries and entry list offsets auto const indices = cudf::detail::slice( @@ -255,12 +347,8 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays - auto unique_entries_and_list_offsets = - all_lists_entries.has_nulls() - ? detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) - : detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( + all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), From 452835bf47176ce4654926b630efc6bcc97ee1a2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 19:09:08 -0600 Subject: [PATCH 06/34] Some cleanup --- cpp/src/lists/drop_list_duplicates.cu | 41 +++++++++++++-------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index e53486d9d71..0b49874fd67 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -80,7 +79,6 @@ class customized_element_comparator { return false; } } - return d_view.element(i) == d_view.element(j); } @@ -97,7 +95,7 @@ class customized_element_comparator { bool nulls_are_equal; }; -template +template class customized_row_comparator { public: customized_row_comparator(column_device_view _d_view, null_equality _nulls_equal) @@ -149,27 +147,26 @@ std::vector> get_unique_entries_and_list_offsets( auto const unique_indices_begin = unique_indices->mutable_view().begin(); offset_type* copy_end{0}; - auto const d_view = column_device_view::create(all_lists_entries, stream); + auto const d_view = column_device_view::create(all_lists_entries, stream); + auto const list_offsets = entries_list_offsets.begin(); if (all_lists_entries.has_nulls()) { - auto const comp = customized_row_comparator(*d_view, nulls_equal); - copy_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); + customized_row_comparator const comp{*d_view, nulls_equal}; + copy_end = thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets, comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); } else { - auto const comp = customized_row_comparator(*d_view, nulls_equal); - copy_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); + customized_row_comparator const comp{*d_view, nulls_equal}; + copy_end = thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets, comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); } // Collect unique entries and entry list offsets From 42535a01eb55c9e9a4acad5a9a741a49329c4b04 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 19 Mar 2021 21:13:24 -0600 Subject: [PATCH 07/34] Rewrite doc for element_comparator and element_comparator_fn --- cpp/src/lists/drop_list_duplicates.cu | 74 +++++++++++++++------------ 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 0b49874fd67..04e90d0f73c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -36,55 +36,51 @@ namespace { using offset_type = lists_column_view::offset_type; /** - * @brief Performs an equality comparison between two elements in two columns. + * @brief Performs an equality comparison between two rows in a single column * - * @tparam has_nulls Indicates the potential for null values in either column. + * If the row elements are not of floating point types, this functor will return the + * same comparison result as `cudf::element_equality_comparator`. + * For floating point types, elements that hold NaN value will be considered as unequal. + * + * @tparam has_nulls Indicates the potential for null values in the column */ template -class customized_element_comparator { +class element_comparator_fn { public: /** * @brief Construct type-dispatched function object for comparing equality - * between two elements. - * - * @note `lhs` and `rhs` may be the same. + * between two rows * - * @param lhs The column containing the first element - * @param rhs The column containing the second element (may be the same as lhs) - * @param nulls_are_equal Indicates if two null elements are treated as equivalent + * @param d_view The column containing the rows to compare + * @param nulls_are_equal Indicates if two null rows are treated as having the same value */ - __host__ __device__ customized_element_comparator(column_device_view _d_view, - bool _nulls_are_equal) - : d_view{_d_view}, nulls_are_equal{_nulls_are_equal} + __host__ __device__ element_comparator_fn(column_device_view d_view, bool nulls_are_equal) + : d_view{d_view}, nulls_are_equal{nulls_are_equal} { } - /** - * @brief Compares the specified elements for equality. - * - * @param i The index of the first element - * @param j The index of the second element - * - */ - template ()>* = nullptr> + template ()>* = nullptr> __device__ bool operator()(size_type i, size_type j) const noexcept { if (has_nulls) { - bool const lhs_is_null{d_view.nullable() and d_view.is_null(i)}; - bool const rhs_is_null{d_view.nullable() and d_view.is_null(j)}; + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null(i)}; + bool const rhs_is_null{nullable and d_view.is_null(j)}; if (lhs_is_null and rhs_is_null) { return nulls_are_equal; } else if (lhs_is_null != rhs_is_null) { return false; } } - return d_view.element(i) == d_view.element(j); + + // For floating point types, if both element(i) and element(j) are NaNs then this comparison + // will return `false`. This is the desired behavior for `drop_list_duplicates`. + return d_view.element(i) == d_view.element(j); } template ()>* = nullptr> - __device__ bool operator()(size_type, size_type) + __device__ bool operator()(size_type, size_type) const { release_assert(false && "Attempted to compare elements of uncomparable types."); return false; @@ -95,10 +91,15 @@ class customized_element_comparator { bool nulls_are_equal; }; +/** + * @brief Perform an equality comparison between two rows in a single column + * + * @tparam has_nulls Indicates the potential for null values in the column + */ template -class customized_row_comparator { +class element_comparator { public: - customized_row_comparator(column_device_view _d_view, null_equality _nulls_equal) + element_comparator(column_device_view _d_view, null_equality _nulls_equal) : d_view{_d_view}, nulls_equal{_nulls_equal} { } @@ -107,7 +108,7 @@ class customized_row_comparator { { return cudf::type_dispatcher( d_view.type(), - customized_element_comparator{d_view, nulls_equal == null_equality::EQUAL}, + element_comparator_fn{d_view, nulls_equal == null_equality::EQUAL}, i, j); } @@ -141,16 +142,17 @@ std::vector> get_unique_entries_and_list_offsets( rmm::mr::device_memory_resource* mr) { auto const num_entries = all_lists_entries.size(); + // Allocate memory to store the indices of the unique entries auto const unique_indices = cudf::make_numeric_column( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); auto const unique_indices_begin = unique_indices->mutable_view().begin(); - offset_type* copy_end{0}; - auto const d_view = column_device_view::create(all_lists_entries, stream); - auto const list_offsets = entries_list_offsets.begin(); + offset_type* copy_end{nullptr}; + auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + auto const list_offsets = entries_list_offsets.begin(); if (all_lists_entries.has_nulls()) { - customized_row_comparator const comp{*d_view, nulls_equal}; + element_comparator const comp{*d_view_entries, nulls_equal}; copy_end = thrust::unique_copy(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -159,7 +161,7 @@ std::vector> get_unique_entries_and_list_offsets( return list_offsets[i] == list_offsets[j] && comp(i, j); }); } else { - customized_row_comparator const comp{*d_view, nulls_equal}; + element_comparator const comp{*d_view_entries, nulls_equal}; copy_end = thrust::unique_copy(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -170,6 +172,8 @@ std::vector> get_unique_entries_and_list_offsets( } // Collect unique entries and entry list offsets + // The new null_count and bitmask of the unique entries will also be generated + // by the gather function auto const indices = cudf::detail::slice( unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, @@ -354,6 +358,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu stream); // Construct a new lists column without duplicated entries + // Reuse the null_count and bitmask of the lists_column: those are the null information for + // the list elements (rows) + // For the entries of those lists (rows), their null_count and bitmask were generated separately + // during the step `get_unique_entries_and_list_offsets` above return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), From 2b5b8e4139fba0ce3b4cb0b18a5ad479a33dc0e5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 22 Mar 2021 10:31:27 -0600 Subject: [PATCH 08/34] Using type_dispatcher only for host code --- cpp/src/lists/drop_list_duplicates.cu | 132 +++++++++++++------------- 1 file changed, 64 insertions(+), 68 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 04e90d0f73c..013b2cefa20 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -36,38 +36,38 @@ namespace { using offset_type = lists_column_view::offset_type; /** - * @brief Performs an equality comparison between two rows in a single column + * @brief Performs an equality comparison between two entries in a lists column * - * If the row elements are not of floating point types, this functor will return the - * same comparison result as `cudf::element_equality_comparator`. - * For floating point types, elements that hold NaN value will be considered as unequal. + * For the two elements that are in the same list in the lists column, they will always be + * considered as different. If they are from the same list, and their type is one of floating + * point types, this functor will return the same comparison result as + * `cudf::element_equality_comparator`. For floating point types, entries holding NaN value will + * be considered as different. * * @tparam has_nulls Indicates the potential for null values in the column + * @tparam Type The data type of entries */ -template -class element_comparator_fn { +template +class list_entry_comparator { public: - /** - * @brief Construct type-dispatched function object for comparing equality - * between two rows - * - * @param d_view The column containing the rows to compare - * @param nulls_are_equal Indicates if two null rows are treated as having the same value - */ - __host__ __device__ element_comparator_fn(column_device_view d_view, bool nulls_are_equal) - : d_view{d_view}, nulls_are_equal{nulls_are_equal} + __host__ __device__ list_entry_comparator(offset_type const* list_offsets, + column_device_view const& d_view, + null_equality nulls_equal) + : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal} { } - template ()>* = nullptr> __device__ bool operator()(size_type i, size_type j) const noexcept { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + if (has_nulls) { bool const nullable = d_view.nullable(); bool const lhs_is_null{nullable and d_view.is_null(i)}; bool const rhs_is_null{nullable and d_view.is_null(j)}; if (lhs_is_null and rhs_is_null) { - return nulls_are_equal; + return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { return false; } @@ -75,47 +75,55 @@ class element_comparator_fn { // For floating point types, if both element(i) and element(j) are NaNs then this comparison // will return `false`. This is the desired behavior for `drop_list_duplicates`. - return d_view.element(i) == d_view.element(j); - } - - template ()>* = nullptr> - __device__ bool operator()(size_type, size_type) const - { - release_assert(false && "Attempted to compare elements of uncomparable types."); - return false; + return d_view.element(i) == d_view.element(j); } private: - column_device_view d_view; - bool nulls_are_equal; + offset_type const* list_offsets; + column_device_view const& d_view; + null_equality nulls_equal; }; /** - * @brief Perform an equality comparison between two rows in a single column - * - * @tparam has_nulls Indicates the potential for null values in the column + * @brief Construct type-dispatched function object for copying indices of the list entries + * ignoring duplicates */ -template -class element_comparator { +class get_unique_entries_fn { public: - element_comparator(column_device_view _d_view, null_equality _nulls_equal) - : d_view{_d_view}, nulls_equal{_nulls_equal} + template ()>* = nullptr> + offset_type* operator()(offset_type const*, + column_device_view&, + size_type, + offset_type*, + bool, + null_equality, + rmm::cuda_stream_view) const { + CUDF_FAIL("Cannot operate on types that are not equally comparable."); } - __device__ bool operator()(size_type i, size_type j) const noexcept + template ()>* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_device_view& d_view, + size_type num_entries, + offset_type* output_begin, + bool has_nulls, + null_equality nulls_equal, + rmm::cuda_stream_view stream) const noexcept { - return cudf::type_dispatcher( - d_view.type(), - element_comparator_fn{d_view, nulls_equal == null_equality::EQUAL}, - i, - j); + return has_nulls ? thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + list_entry_comparator{list_offsets, d_view, nulls_equal}) + : thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + list_entry_comparator{list_offsets, d_view, nulls_equal}); } - - private: - column_device_view d_view; - null_equality nulls_equal; }; /** @@ -141,35 +149,23 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_entries = all_lists_entries.size(); + auto const num_entries = all_lists_entries.size(); + auto const d_view_entries = column_device_view::create(all_lists_entries, stream); // Allocate memory to store the indices of the unique entries auto const unique_indices = cudf::make_numeric_column( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); auto const unique_indices_begin = unique_indices->mutable_view().begin(); - offset_type* copy_end{nullptr}; - auto const d_view_entries = column_device_view::create(all_lists_entries, stream); - auto const list_offsets = entries_list_offsets.begin(); - if (all_lists_entries.has_nulls()) { - element_comparator const comp{*d_view_entries, nulls_equal}; - copy_end = thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets, comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); - } else { - element_comparator const comp{*d_view_entries, nulls_equal}; - copy_end = thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets, comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); - } + auto const copy_end = type_dispatcher(all_lists_entries.type(), + get_unique_entries_fn{}, + entries_list_offsets.begin(), + *d_view_entries, + num_entries, + unique_indices_begin, + all_lists_entries.has_nulls(), + nulls_equal, + stream); // Collect unique entries and entry list offsets // The new null_count and bitmask of the unique entries will also be generated From dfa1c8a05cc8e03eb9b9f80ba119299d4f2ef027 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 22 Mar 2021 11:14:05 -0600 Subject: [PATCH 09/34] Fix memory access violation bug when using reference to column_device_view --- cpp/src/lists/drop_list_duplicates.cu | 33 +++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 013b2cefa20..69fa6933d42 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -39,7 +39,7 @@ using offset_type = lists_column_view::offset_type; * @brief Performs an equality comparison between two entries in a lists column * * For the two elements that are in the same list in the lists column, they will always be - * considered as different. If they are from the same list, and their type is one of floating + * considered as different. If they are from the same list and their type is one of floating * point types, this functor will return the same comparison result as * `cudf::element_equality_comparator`. For floating point types, entries holding NaN value will * be considered as different. @@ -51,7 +51,7 @@ template class list_entry_comparator { public: __host__ __device__ list_entry_comparator(offset_type const* list_offsets, - column_device_view const& d_view, + column_device_view d_view, null_equality nulls_equal) : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal} { @@ -80,7 +80,7 @@ class list_entry_comparator { private: offset_type const* list_offsets; - column_device_view const& d_view; + column_device_view d_view; null_equality nulls_equal; }; @@ -111,18 +111,21 @@ class get_unique_entries_fn { null_equality nulls_equal, rmm::cuda_stream_view stream) const noexcept { - return has_nulls ? thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - list_entry_comparator{list_offsets, d_view, nulls_equal}) - : thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - list_entry_comparator{list_offsets, d_view, nulls_equal}); + if (has_nulls) { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } else { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } } }; From 9d07cc72fec186f98f557b8926ebf1643596ee32 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 22 Mar 2021 15:43:40 -0600 Subject: [PATCH 10/34] Add test case when the list contains both -0.0 and 0.0 --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 6094d7a249a..5ccb7bbf349 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -52,6 +52,13 @@ void test_once(cudf::column_view const& input, struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) +{ + // -0.0 and 0.0 should be considered equal + test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, + LIST_COL_FLT{0, 1, 2}); +} + TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { // Lists contain inf From ef6d7e24cb25786a80d03ad8fa52e2916ddd3747 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Mar 2021 12:59:08 -0600 Subject: [PATCH 11/34] Rename constants --- .../lists/drop_list_duplicates_tests.cpp | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 5ccb7bbf349..9352fca9fd8 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -30,10 +30,10 @@ using float_type = float; using LIST_COL_FLT = cudf::test::lists_column_wrapper; using LIST_COL_STR = cudf::test::lists_column_wrapper; -auto constexpr m_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr m_inf = -std::numeric_limits::infinity(); -auto constexpr p_inf = std::numeric_limits::infinity(); -auto constexpr p_NaN = std::numeric_limits::quiet_NaN(); +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); template void test_once(cudf::column_view const& input, @@ -62,10 +62,9 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { // Lists contain inf - test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, - LIST_COL_FLT{0, 1, 2, p_inf}); - test_once(LIST_COL_FLT{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, - LIST_COL_FLT{m_inf, 0, p_inf}); + test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); + test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, + LIST_COL_FLT{neg_Inf, 0, Inf}); } // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to @@ -100,17 +99,17 @@ void test_floating_point(std::vector const& h_input, TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) { std::vector h_input{ - 0, -1, 1, p_NaN, 2, 0, m_NaN, 1, -2, 2, 0, 1, 2, m_NaN, p_NaN, p_NaN, p_NaN, m_NaN}; + 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; std::unordered_set results_expected{-2, -1, 0, 1, 2}; test_floating_point(h_input, results_expected); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfAndNaN) { - std::vector h_input{m_inf, 0, m_NaN, 1, -1, -2, p_NaN, p_NaN, p_inf, p_NaN, - m_NaN, 2, -1, 0, m_NaN, 1, 2, p_inf, 0, 1, - m_inf, 2, m_NaN, p_inf, m_NaN, m_NaN, p_NaN, m_inf}; - std::unordered_set results_expected{-2, -1, 0, 1, 2, m_inf, p_inf}; + std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, + neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, + neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; + std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; test_floating_point(h_input, results_expected); } From 725155b6a1900066707a525f3372c77cffee2319 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Mar 2021 12:59:32 -0600 Subject: [PATCH 12/34] Change has_null from template parameter to runtime parameter --- cpp/src/lists/drop_list_duplicates.cu | 38 +++++++++++---------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 69fa6933d42..69f54ec3e14 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -44,16 +44,16 @@ using offset_type = lists_column_view::offset_type; * `cudf::element_equality_comparator`. For floating point types, entries holding NaN value will * be considered as different. * - * @tparam has_nulls Indicates the potential for null values in the column - * @tparam Type The data type of entries + * @tparam Type The data type of entries */ -template +template class list_entry_comparator { public: __host__ __device__ list_entry_comparator(offset_type const* list_offsets, column_device_view d_view, - null_equality nulls_equal) - : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal} + null_equality nulls_equal, + bool has_nulls) + : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) { } @@ -82,6 +82,7 @@ class list_entry_comparator { offset_type const* list_offsets; column_device_view d_view; null_equality nulls_equal; + bool has_nulls; }; /** @@ -95,8 +96,8 @@ class get_unique_entries_fn { column_device_view&, size_type, offset_type*, - bool, null_equality, + bool, rmm::cuda_stream_view) const { CUDF_FAIL("Cannot operate on types that are not equally comparable."); @@ -107,25 +108,16 @@ class get_unique_entries_fn { column_device_view& d_view, size_type num_entries, offset_type* output_begin, - bool has_nulls, null_equality nulls_equal, + bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (has_nulls) { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } else { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); } }; @@ -166,8 +158,8 @@ std::vector> get_unique_entries_and_list_offsets( *d_view_entries, num_entries, unique_indices_begin, - all_lists_entries.has_nulls(), nulls_equal, + all_lists_entries.has_nulls(), stream); // Collect unique entries and entry list offsets From 6f76f8e31c90b45a27ff95205f5c2ce349233bbe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Mar 2021 17:58:59 -0600 Subject: [PATCH 13/34] Remove redundant qualifiers from class constructor --- cpp/src/lists/drop_list_duplicates.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 28e466929e7..316695c8d23 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -49,10 +49,10 @@ using offset_type = lists_column_view::offset_type; template class list_entry_comparator { public: - __host__ __device__ list_entry_comparator(offset_type const* list_offsets, - column_device_view d_view, - null_equality nulls_equal, - bool has_nulls) + list_entry_comparator(offset_type const* list_offsets, + column_device_view d_view, + null_equality nulls_equal, + bool has_nulls) : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) { } From c66b88fd9a21c4d90b5f96988cf1268bbef4346d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 19:47:43 -0600 Subject: [PATCH 14/34] AddAdd `nan_equality` enum to specify whether NaN elements should be considered as equal or not --- cpp/include/cudf/types.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 7a3316a0571..d550f477cfa 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -136,6 +136,15 @@ enum class nan_policy : bool { NAN_IS_VALID ///< treat nans as valid elements (non-null) }; +/** + * @brief Enum to consider different elements (of floating point types) holding NaN value as equal + * or unequal + */ +enum class nan_equality : bool { + EQUAL, ///< NaNs compare equal + UNEQUAL ///< NaNs compare unequal +}; + /** * @brief */ From 806b90052ffbe4d865de214d65f95234a8727e7f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 19:50:38 -0600 Subject: [PATCH 15/34] Rewrite `drop_list_duplicate`, adding `nans_equal` parameter, allowing to specify whether to handle NaNs as the same value or different values --- .../lists/detail/drop_list_duplicates.hpp | 1 + .../cudf/lists/drop_list_duplicates.hpp | 3 + cpp/src/lists/drop_list_duplicates.cu | 368 +++++++++++++----- 3 files changed, 277 insertions(+), 95 deletions(-) diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp index ba3e1d17d7f..53b31015145 100644 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -31,6 +31,7 @@ namespace detail { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 0939bd7956a..f1ce3b7f0e3 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -41,6 +41,8 @@ namespace lists { * * @param lists_column The input lists_column_view * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column) * @param mr Device resource used to allocate memory * * @code{.pseudo} @@ -56,6 +58,7 @@ namespace lists { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 316695c8d23..8567dc8ea89 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -22,11 +22,13 @@ #include #include #include +#include #include #include #include +#include #include namespace cudf { @@ -35,18 +37,205 @@ namespace detail { namespace { using offset_type = lists_column_view::offset_type; +template +struct has_negative_nans { + column_device_view const d_entries; + bool const has_nulls; + + __device__ Type operator()(size_type idx) + { + if (has_nulls && d_entries.is_null(idx)) { return false; } + + auto const val = d_entries.element(idx); + return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative + } +}; + +/** + * @brief A structure to be used along with type_dispatcher to check if a + * `column_view` has any negative NaN entry + */ +struct has_negative_nans_fn { + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept + { + auto const d_entries = column_device_view::create(lists_entries, stream); + return thrust::any_of(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + } + + template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + { + return false; + } +}; + +template +struct replace_negative_nans { + __device__ Type operator()(Type val) const noexcept + { + return std::isnan(val) ? std::numeric_limits::quiet_NaN() : val; + } +}; + +/** + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries + * of a floating point data column + */ +struct replace_negative_nans_fn { + template >* = nullptr> + void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const + { + CUDF_FAIL("Cannot operate on a type that is not floating point."); + } + + template >* = nullptr> + void operator()(column_view const& lists_entries, + mutable_column_view const& new_entries, + rmm::cuda_stream_view stream) const noexcept + { + // Do not care whether an entry is null or not, just consider it as a floating point value + thrust::transform(rmm::exec_policy(stream), + lists_entries.begin(), + lists_entries.end(), + new_entries.begin(), + detail::replace_negative_nans{}); + } +}; + +/** + * @brief Transform a given lists column to a new lists column in which all the list entries holding + * -NaN value are replaced by (positive) NaN + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + auto const mr = rmm::mr::get_current_device_resource(); + + // Copy list offsets from the given lists column to the new lists column + auto new_offsets = std::make_unique( + data_type{type_id::INT32}, + lists_column.size() + 1, + rmm::device_buffer{(lists_column.size() + 1) * sizeof(offset_type), stream, mr}); + thrust::copy_n(rmm::exec_policy(stream), + lists_column.offsets_begin(), + lists_column.size() + 1, + new_offsets->mutable_view().begin()); + + // Copy entries from the given lists column to the new lists column, replacing all -NaNs by NaNs + auto new_entries = std::make_unique( + lists_entries.type(), + lists_entries.size(), + rmm::device_buffer{lists_entries.size() * cudf::size_of(lists_entries.type()), stream, mr}, + cudf::detail::copy_bitmask(lists_entries, stream, mr), + lists_entries.null_count()); + + type_dispatcher(lists_entries.type(), + detail::replace_negative_nans_fn{}, + lists_entries, + new_entries->mutable_view(), + stream); + + return make_lists_column(lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); +} + +/** + * @brief Generate a 0-based offset column for a lists column + * + * Given a lists_column_view, which may have a non-zero offset, generate a new column containing + * 0-based list offsets. This is done by subtracting each of the input list offset by the first + * offset. + * + * @code{.pseudo} + * Given a list column having offsets = { 3, 7, 9, 13 }, + * then output_offsets = { 0, 4, 6, 10 } + * @endcode + * + * @param lists_column The input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A column containing 0-based list offsets + */ +std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto output_offsets = make_numeric_column(data_type{type_to_id()}, + lists_column.size() + 1, + mask_state::UNALLOCATED, + stream, + mr); + thrust::transform( + rmm::exec_policy(stream), + lists_column.offsets_begin(), + lists_column.offsets_end(), + output_offsets->mutable_view().begin(), + [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; }); + return output_offsets; +} + +/** + * @brief Populate list offsets for all list entries + * + * Given an `offsets` column_view containing offsets of a lists column and a number of all list + * entries in the column, generate an array that maps from each list entry to the offset of the list + * containing that entry. + * + * @code{.pseudo} + * num_entries = 10, offsets = { 0, 4, 6, 10 } + * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } + * @endcode + * + * @param num_entries The number of list entries + * @param offsets Column view to the list offsets + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A column containing entry list offsets + */ +std::unique_ptr generate_entry_list_offsets(size_type num_entries, + column_view const& offsets, + rmm::cuda_stream_view stream) +{ + auto entry_list_offsets = make_numeric_column(offsets.type(), + num_entries, + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + thrust::upper_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + entry_list_offsets->mutable_view().begin()); + return entry_list_offsets; +} + /** * @brief Performs an equality comparison between two entries in a lists column * * For the two elements that are in the same list in the lists column, they will always be * considered as different. If they are from the same list and their type is one of floating * point types, this functor will return the same comparison result as - * `cudf::element_equality_comparator`. For floating point types, entries holding NaN value will - * be considered as different. + * `cudf::element_equality_comparator`. + * + * For floating point types, entries holding NaN value can be considered as different values or the + * same value depending on the nans_equal parameter. * - * @tparam Type The data type of entries + * @tparam Type The data type of entries + * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column) */ -template +template class list_entry_comparator { public: list_entry_comparator(offset_type const* list_offsets, @@ -57,7 +246,9 @@ class list_entry_comparator { { } - __device__ bool operator()(size_type i, size_type j) const noexcept + template + std::enable_if_t and nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists if (list_offsets[i] != list_offsets[j]) { return false; } @@ -74,7 +265,33 @@ class list_entry_comparator { } // For floating point types, if both element(i) and element(j) are NaNs then this comparison - // will return `false`. This is the desired behavior for `drop_list_duplicates`. + // will return `true`. This is the desired behavior in Pandas. + auto const lhs = d_view.element(i); + auto const rhs = d_view.element(j); + if (std::isnan(lhs) and std::isnan(rhs)) { return true; } + return lhs == rhs; + } + + template + std::enable_if_t or not nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept + { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + + if (has_nulls) { + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null(i)}; + bool const rhs_is_null{nullable and d_view.is_null(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + // For floating point types, if both element(i) and element(j) are NaNs then this comparison + // will return `false`. This is the desired behavior in Apache Spark. return d_view.element(i) == d_view.element(j); } @@ -89,14 +306,14 @@ class list_entry_comparator { * @brief Construct type-dispatched function object for copying indices of the list entries * ignoring duplicates */ -class get_unique_entries_fn { - public: +struct get_unique_entries_fn { template ()>* = nullptr> offset_type* operator()(offset_type const*, column_device_view&, size_type, offset_type*, null_equality, + nan_equality, bool, rmm::cuda_stream_view) const { @@ -109,15 +326,25 @@ class get_unique_entries_fn { size_type num_entries, offset_type* output_begin, null_equality nulls_equal, + nan_equality nans_equal, bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); + if (nans_equal == nan_equality::EQUAL) { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } else { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } } }; @@ -131,6 +358,8 @@ class get_unique_entries_fn { * @param all_lists_entries The input array containing all list entries * @param entries_list_offsets A map from list entries to their corresponding list offsets * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal + * value (only applicable for floating point data column) * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device resource used to allocate memory * @@ -141,6 +370,7 @@ std::vector> get_unique_entries_and_list_offsets( column_view const& all_lists_entries, column_view const& entries_list_offsets, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -159,6 +389,7 @@ std::vector> get_unique_entries_and_list_offsets( num_entries, unique_indices_begin, nulls_equal, + nans_equal, all_lists_entries.has_nulls(), stream); @@ -176,79 +407,6 @@ std::vector> get_unique_entries_and_list_offsets( ->release(); } -/** - * @brief Generate a 0-based offset column for a lists column - * - * Given a lists_column_view, which may have a non-zero offset, generate a new column containing - * 0-based list offsets. This is done by subtracting each of the input list offset by the first - * offset. - * - * @code{.pseudo} - * Given a list column having offsets = { 3, 7, 9, 13 }, - * then output_offsets = { 0, 4, 6, 10 } - * @endcode - * - * @param lists_column The input lists column - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing 0-based list offsets - */ -std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto output_offsets = make_numeric_column(data_type{type_to_id()}, - lists_column.size() + 1, - mask_state::UNALLOCATED, - stream, - mr); - thrust::transform( - rmm::exec_policy(stream), - lists_column.offsets_begin(), - lists_column.offsets_end(), - output_offsets->mutable_view().begin(), - [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; }); - return output_offsets; -} - -/** - * @brief Populate list offsets for all list entries - * - * Given an `offsets` column_view containing offsets of a lists column and a number of all list - * entries in the column, generate an array that maps from each list entry to the offset of the list - * containing that entry. - * - * @code{.pseudo} - * num_entries = 10, offsets = { 0, 4, 6, 10 } - * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } - * @endcode - * - * @param num_entries The number of list entries - * @param offsets Column view to the list offsets - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing entry list offsets - */ -std::unique_ptr generate_entry_list_offsets(size_type num_entries, - column_view const& offsets, - rmm::cuda_stream_view stream) -{ - auto entry_list_offsets = make_numeric_column(offsets.type(), - num_entries, - mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - thrust::upper_bound(rmm::exec_policy(stream), - offsets.begin(), - offsets.end(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - entry_list_offsets->mutable_view().begin()); - return entry_list_offsets; -} - /** * @brief Generate list offsets from entry offsets * @@ -309,6 +467,7 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } + } // anonymous namespace /** @@ -318,6 +477,7 @@ void generate_offsets(size_type num_entries, */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -326,23 +486,39 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); } - // Call segmented sort on the list elements and store them in a temporary column sorted_list - auto const sorted_lists = - detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); - // Flatten all entries (depth = 1) of the lists column - auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); + auto const lists_entries = lists_column.get_sliced_child(stream); + + // sorted_lists will store the results of the original lists after calling segmented_sort + std::unique_ptr sorted_lists; + + // If the column contains lists of floating point data type and NaNs are considered as equal, we + // need to replace -NaN by NaN before sorting + auto const has_negative_nan = + type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + if (has_negative_nan) { + // The column new_lists_column is temporary, thus we will not pass in `mr` + auto const new_lists_column = + detail::replace_negative_nans_entries(lists_entries, lists_column, stream); + sorted_lists = detail::sort_lists( + lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + } else { + sorted_lists = detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + } + + auto const sorted_lists_entries = + lists_column_view(sorted_lists->view()).get_sliced_child(stream); // Generate a 0-based offset column auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); // Generate a mapping from list entries to offsets of the lists containing those entries auto const entries_list_offsets = - detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); + detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), @@ -369,10 +545,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates( + lists_column, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); } } // namespace lists From d96bc79364e2d496932a5324156b374b0ee93dd2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 19:51:39 -0600 Subject: [PATCH 16/34] Rewrite tests for `drop_list_duplicates` --- .../lists/drop_list_duplicates_tests.cpp | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 9352fca9fd8..07f7d09bb26 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -72,19 +72,27 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) // We will not store NaN in the results_expected variable (an unordered_set) because we can't check // for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare // with the number of NaNs in the output. -void test_floating_point(std::vector const& h_input, - std::unordered_set const& results_expected) +static void test_floating_point(std::vector const& h_input, + std::unordered_set const& results_expected, + cudf::nan_equality nans_equal) { - const auto num_NaNs = - std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + // If NaNs are considered as equal value, the final result should always contain at max ONE NaN + // entry per list + std::size_t const num_NaNs = + nans_equal == cudf::nan_equality::EQUAL + ? std::size_t{1} + : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}); + cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::null_equality::EQUAL, + nans_equal); auto const results_arr = cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); - int NaN_count{0}; + std::size_t NaN_count{0}; std::unordered_set results; for (auto const x : results_arr) { if (std::isnan(x)) { @@ -96,21 +104,23 @@ void test_floating_point(std::vector const& h_input, EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaN) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) { std::vector h_input{ 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; std::unordered_set results_expected{-2, -1, 0, 1, 2}; - test_floating_point(h_input, results_expected); + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::EQUAL); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfAndNaN) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) { std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; - test_floating_point(h_input, results_expected); + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::EQUAL); } TEST_F(DropListDuplicatesTest, StringTestsNonNull) From 0246c78b10a04d569b6266b83bb736be74f72409 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 19:53:43 -0600 Subject: [PATCH 17/34] Rewrite `collect_set_aggregation`, adding `nans_equal` parameter --- cpp/include/cudf/aggregation.hpp | 5 ++++- .../cudf/detail/aggregation/aggregation.hpp | 18 +++++++++++++----- cpp/src/aggregation/aggregation.cpp | 5 +++-- cpp/src/groupby/sort/aggregate.cpp | 13 ++++++++----- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 3c454c85720..2ea46a5685c 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -230,10 +230,13 @@ std::unique_ptr make_collect_list_aggregation( * @param null_handling Indicates whether to include/exclude nulls during collection * @param nulls_equal Flag to specify whether null entries within each list should be considered * equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal */ std::unique_ptr make_collect_set_aggregation( null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL); + null_equality null_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL); /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 18bef301e03..0bfe6b84ae2 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation */ struct collect_set_aggregation final : derived_aggregation { explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL) - : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal) + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL) + : derived_aggregation{COLLECT_SET}, + _null_handling{null_handling}, + _nulls_equal(nulls_equal), + _nans_equal(nans_equal) { } null_policy _null_handling; ///< include or exclude nulls - null_equality _null_equal; ///< whether to consider nulls as equal values + null_equality _nulls_equal; ///< whether to consider nulls as equal values + nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to + ///< floating point types) protected: friend class derived_aggregation; bool operator==(collect_set_aggregation const& other) const { - return _null_handling == other._null_handling && _null_equal == other._null_equal; + return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal && + _nans_equal == other._nans_equal; } size_t hash_impl() const { - return std::hash{}(static_cast(_null_handling) ^ static_cast(_null_equal)); + return std::hash{}(static_cast(_null_handling) ^ static_cast(_nulls_equal) ^ + static_cast(_nans_equal)); } }; diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 33c19617308..3a044a42101 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -132,9 +132,10 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand } /// Factory to create a COLLECT_SET aggregation std::unique_ptr make_collect_set_aggregation(null_policy null_handling, - null_equality null_equal) + null_equality nulls_equal, + nan_equality nans_equal) { - return std::make_unique(null_handling, null_equal); + return std::make_unique(null_handling, nulls_equal, nans_equal); } /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index b171b19413b..02c46a9095e 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -375,11 +375,14 @@ void aggregrate_result_functor::operator()(aggregation auto const collect_result = detail::group_collect( get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); auto const nulls_equal = - static_cast(agg)._null_equal; - cache.add_result(col_idx, - agg, - lists::detail::drop_list_duplicates( - lists_column_view(collect_result->view()), nulls_equal, stream, mr)); + static_cast(agg)._nulls_equal; + auto const nans_equal = + static_cast(agg)._nans_equal; + cache.add_result( + col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, nans_equal, stream, mr)); }; } // namespace detail From 2185d8a3d08a2ec8fead6de0507da895eb1e163d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 19:57:31 -0600 Subject: [PATCH 18/34] Fix typo --- cpp/include/cudf/aggregation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 2ea46a5685c..74ce6e42d7e 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -235,7 +235,7 @@ std::unique_ptr make_collect_list_aggregation( */ std::unique_ptr make_collect_set_aggregation( null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL, + null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); /// Factory to create a LAG aggregation From 919e8592f26ef714b8aab8d7bb7782184cae170a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 08:26:40 -0600 Subject: [PATCH 19/34] Change `nan_equality` enum names --- cpp/include/cudf/types.hpp | 6 +++--- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index d550f477cfa..a4016b2db9b 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -140,9 +140,9 @@ enum class nan_policy : bool { * @brief Enum to consider different elements (of floating point types) holding NaN value as equal * or unequal */ -enum class nan_equality : bool { - EQUAL, ///< NaNs compare equal - UNEQUAL ///< NaNs compare unequal +enum class nan_equality /*unspecified*/ { + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL ///< All NaNs compare unequal (IEE754 behavior) }; /** diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 8567dc8ea89..588dab6ab9e 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -42,7 +42,7 @@ struct has_negative_nans { column_device_view const d_entries; bool const has_nulls; - __device__ Type operator()(size_type idx) + __device__ Type operator()(size_type idx) const noexcept { if (has_nulls && d_entries.is_null(idx)) { return false; } @@ -330,7 +330,7 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (nans_equal == nan_equality::EQUAL) { + if (nans_equal == nan_equality::ALL_EQUAL) { list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; return thrust::unique_copy(rmm::exec_policy(stream), thrust::make_counting_iterator(0), From a002e62215ff68f17a6c75941199e66b62665b28 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 09:15:20 -0600 Subject: [PATCH 20/34] Fix enum in unit tests for `drop_list_duplicates` --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 07f7d09bb26..bc413fd220a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -79,7 +79,7 @@ static void test_floating_point(std::vector const& h_input, // If NaNs are considered as equal value, the final result should always contain at max ONE NaN // entry per list std::size_t const num_NaNs = - nans_equal == cudf::nan_equality::EQUAL + nans_equal == cudf::nan_equality::ALL_EQUAL ? std::size_t{1} : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); @@ -110,7 +110,7 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; std::unordered_set results_expected{-2, -1, 0, 1, 2}; test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) @@ -120,7 +120,7 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); } TEST_F(DropListDuplicatesTest, StringTestsNonNull) From 26cae096abb920bd7560c35d6e27e2f95ece9032 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 10:40:13 -0600 Subject: [PATCH 21/34] Add an option to specify NaNs are compared equal only if they have the same sign --- cpp/include/cudf/types.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index a4016b2db9b..1fdc5a131bc 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -141,8 +141,9 @@ enum class nan_policy : bool { * or unequal */ enum class nan_equality /*unspecified*/ { - ALL_EQUAL, ///< All NaNs compare equal, regardless of sign - UNEQUAL ///< All NaNs compare unequal (IEE754 behavior) + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL, ///< All NaNs compare unequal (IEE754 behavior) + SAME_SIGN_EQUAL ///< NaNs compare equal only if they have the same sign }; /** From 4356bf67290d31f998f759ae76db0c9dd2f1f6c6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 10:42:27 -0600 Subject: [PATCH 22/34] Rework `drop_list_duplicates` for the new `nan_equality` option --- cpp/src/lists/drop_list_duplicates.cu | 75 +++++++++++++++++---------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 588dab6ab9e..6de7941909d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -235,7 +235,7 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only * applicable for floating point data column) */ -template +template class list_entry_comparator { public: list_entry_comparator(offset_type const* list_offsets, @@ -246,9 +246,10 @@ class list_entry_comparator { { } - template - std::enable_if_t and nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + template + std::enable_if_t and nans_equal_ != nan_equality::UNEQUAL, + bool> + __device__ operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists if (list_offsets[i] != list_offsets[j]) { return false; } @@ -264,17 +265,22 @@ class list_entry_comparator { } } - // For floating point types, if both element(i) and element(j) are NaNs then this comparison - // will return `true`. This is the desired behavior in Pandas. + // For floating point types, if both element(i) and element(j) are NaNs and: + // nans_equal_ == ALL_EQUAL, this will always return `true` + // nans_equal_ == SAME_SIGN_EQUAL: returns `true' only if both elements have the same sign auto const lhs = d_view.element(i); auto const rhs = d_view.element(j); - if (std::isnan(lhs) and std::isnan(rhs)) { return true; } + if (std::isnan(lhs) and std::isnan(rhs)) { + if (nans_equal_ == nan_equality::ALL_EQUAL) { return true; } + return std::signbit(lhs) == std::signbit(rhs); + } return lhs == rhs; } - template - std::enable_if_t or not nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + template + std::enable_if_t or nans_equal_ == nan_equality::UNEQUAL, + bool> + __device__ operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists if (list_offsets[i] != list_offsets[j]) { return false; } @@ -291,7 +297,7 @@ class list_entry_comparator { } // For floating point types, if both element(i) and element(j) are NaNs then this comparison - // will return `false`. This is the desired behavior in Apache Spark. + // will always return `false`. This is the desired behavior in Apache Spark. return d_view.element(i) == d_view.element(j); } @@ -330,20 +336,34 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (nans_equal == nan_equality::ALL_EQUAL) { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } else { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); + switch (nans_equal) { + case nan_equality::ALL_EQUAL: { + list_entry_comparator const comp{ + list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + case nan_equality::UNEQUAL: { + list_entry_comparator const comp{ + list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + default: { // case nan_equality::SAME_SIGN_EQUAL: + list_entry_comparator const comp{ + list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } } } }; @@ -492,9 +512,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu // sorted_lists will store the results of the original lists after calling segmented_sort std::unique_ptr sorted_lists; - // If the column contains lists of floating point data type and NaNs are considered as equal, we - // need to replace -NaN by NaN before sorting + // If the column contains lists of floating point data type and NaNs are considered as all equal + // regardless of sign, we need to replace -NaN by NaN before sorting auto const has_negative_nan = + (nans_equal == nan_equality::ALL_EQUAL) && type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); if (has_negative_nan) { // The column new_lists_column is temporary, thus we will not pass in `mr` From e4cfa11e920fb4a2f041964e3b2479d1135c0d21 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 10:45:16 -0600 Subject: [PATCH 23/34] Rewrite unit tests for `drop_list_duplicates` that can test for all cases of `nan_equality` --- .../lists/drop_list_duplicates_tests.cpp | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index bc413fd220a..9ca48c4ecfd 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -76,12 +76,28 @@ static void test_floating_point(std::vector const& h_input, std::unordered_set const& results_expected, cudf::nan_equality nans_equal) { - // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list - std::size_t const num_NaNs = - nans_equal == cudf::nan_equality::ALL_EQUAL - ? std::size_t{1} - : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + std::size_t num_NaNs; + switch (nans_equal) { + case cudf::nan_equality::ALL_EQUAL: { + auto const has_nan = + std::any_of(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + num_NaNs = static_cast(has_nan); + break; + } + case cudf::nan_equality::UNEQUAL: { + num_NaNs = + std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + break; + } + default: { // case cudf::nan_equality::SAME_SIGN_EQUAL: + auto const has_positive_nan = std::any_of(h_input.begin(), h_input.end(), [](auto x) { + return std::isnan(x) and not std::signbit(x); + }); + auto const has_negative_nan = std::any_of( + h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x) and std::signbit(x); }); + num_NaNs = static_cast(has_positive_nan) + static_cast(has_negative_nan); + } + } auto const results_col = cudf::lists::drop_list_duplicates( cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, @@ -109,8 +125,9 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) std::vector h_input{ 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; std::unordered_set results_expected{-2, -1, 0, 1, 2}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::SAME_SIGN_EQUAL); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) @@ -119,8 +136,9 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::SAME_SIGN_EQUAL); } TEST_F(DropListDuplicatesTest, StringTestsNonNull) From 98ec9b12b0df69b26252342a6fd5350aa9dfe3f0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 12:41:10 -0600 Subject: [PATCH 24/34] Revert "Rewrite unit tests for `drop_list_duplicates` that can test for all cases of `nan_equality`" This reverts commit e4cfa11e920fb4a2f041964e3b2479d1135c0d21. --- .../lists/drop_list_duplicates_tests.cpp | 34 +++++-------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 9ca48c4ecfd..bc413fd220a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -76,28 +76,12 @@ static void test_floating_point(std::vector const& h_input, std::unordered_set const& results_expected, cudf::nan_equality nans_equal) { - std::size_t num_NaNs; - switch (nans_equal) { - case cudf::nan_equality::ALL_EQUAL: { - auto const has_nan = - std::any_of(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); - num_NaNs = static_cast(has_nan); - break; - } - case cudf::nan_equality::UNEQUAL: { - num_NaNs = - std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); - break; - } - default: { // case cudf::nan_equality::SAME_SIGN_EQUAL: - auto const has_positive_nan = std::any_of(h_input.begin(), h_input.end(), [](auto x) { - return std::isnan(x) and not std::signbit(x); - }); - auto const has_negative_nan = std::any_of( - h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x) and std::signbit(x); }); - num_NaNs = static_cast(has_positive_nan) + static_cast(has_negative_nan); - } - } + // If NaNs are considered as equal value, the final result should always contain at max ONE NaN + // entry per list + std::size_t const num_NaNs = + nans_equal == cudf::nan_equality::ALL_EQUAL + ? std::size_t{1} + : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); auto const results_col = cudf::lists::drop_list_duplicates( cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, @@ -125,9 +109,8 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) std::vector h_input{ 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; std::unordered_set results_expected{-2, -1, 0, 1, 2}; - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::SAME_SIGN_EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) @@ -136,9 +119,8 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::SAME_SIGN_EQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); } TEST_F(DropListDuplicatesTest, StringTestsNonNull) From 7a9f850afa0c6b23b6582130ea1d9cf8ec2fa596 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 12:41:18 -0600 Subject: [PATCH 25/34] Revert "Rework `drop_list_duplicates` for the new `nan_equality` option" This reverts commit 4356bf67290d31f998f759ae76db0c9dd2f1f6c6. --- cpp/src/lists/drop_list_duplicates.cu | 75 ++++++++++----------------- 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 6de7941909d..588dab6ab9e 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -235,7 +235,7 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only * applicable for floating point data column) */ -template +template class list_entry_comparator { public: list_entry_comparator(offset_type const* list_offsets, @@ -246,10 +246,9 @@ class list_entry_comparator { { } - template - std::enable_if_t and nans_equal_ != nan_equality::UNEQUAL, - bool> - __device__ operator()(size_type i, size_type j) const noexcept + template + std::enable_if_t and nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists if (list_offsets[i] != list_offsets[j]) { return false; } @@ -265,22 +264,17 @@ class list_entry_comparator { } } - // For floating point types, if both element(i) and element(j) are NaNs and: - // nans_equal_ == ALL_EQUAL, this will always return `true` - // nans_equal_ == SAME_SIGN_EQUAL: returns `true' only if both elements have the same sign + // For floating point types, if both element(i) and element(j) are NaNs then this comparison + // will return `true`. This is the desired behavior in Pandas. auto const lhs = d_view.element(i); auto const rhs = d_view.element(j); - if (std::isnan(lhs) and std::isnan(rhs)) { - if (nans_equal_ == nan_equality::ALL_EQUAL) { return true; } - return std::signbit(lhs) == std::signbit(rhs); - } + if (std::isnan(lhs) and std::isnan(rhs)) { return true; } return lhs == rhs; } - template - std::enable_if_t or nans_equal_ == nan_equality::UNEQUAL, - bool> - __device__ operator()(size_type i, size_type j) const noexcept + template + std::enable_if_t or not nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists if (list_offsets[i] != list_offsets[j]) { return false; } @@ -297,7 +291,7 @@ class list_entry_comparator { } // For floating point types, if both element(i) and element(j) are NaNs then this comparison - // will always return `false`. This is the desired behavior in Apache Spark. + // will return `false`. This is the desired behavior in Apache Spark. return d_view.element(i) == d_view.element(j); } @@ -336,34 +330,20 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - switch (nans_equal) { - case nan_equality::ALL_EQUAL: { - list_entry_comparator const comp{ - list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } - case nan_equality::UNEQUAL: { - list_entry_comparator const comp{ - list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } - default: { // case nan_equality::SAME_SIGN_EQUAL: - list_entry_comparator const comp{ - list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } + if (nans_equal == nan_equality::ALL_EQUAL) { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } else { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); } } }; @@ -512,10 +492,9 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu // sorted_lists will store the results of the original lists after calling segmented_sort std::unique_ptr sorted_lists; - // If the column contains lists of floating point data type and NaNs are considered as all equal - // regardless of sign, we need to replace -NaN by NaN before sorting + // If the column contains lists of floating point data type and NaNs are considered as equal, we + // need to replace -NaN by NaN before sorting auto const has_negative_nan = - (nans_equal == nan_equality::ALL_EQUAL) && type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); if (has_negative_nan) { // The column new_lists_column is temporary, thus we will not pass in `mr` From 04b120d6a002c7a3e1eaa22c264ca605eb653e52 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 12:41:31 -0600 Subject: [PATCH 26/34] Revert "Add an option to specify NaNs are compared equal only if they have the same sign" This reverts commit 26cae096abb920bd7560c35d6e27e2f95ece9032. --- cpp/include/cudf/types.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 1fdc5a131bc..a4016b2db9b 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -141,9 +141,8 @@ enum class nan_policy : bool { * or unequal */ enum class nan_equality /*unspecified*/ { - ALL_EQUAL, ///< All NaNs compare equal, regardless of sign - UNEQUAL, ///< All NaNs compare unequal (IEE754 behavior) - SAME_SIGN_EQUAL ///< NaNs compare equal only if they have the same sign + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL ///< All NaNs compare unequal (IEE754 behavior) }; /** From 0973be9ba695284c369066f0c416672bce7061d5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 25 Mar 2021 13:03:53 -0600 Subject: [PATCH 27/34] Fix typo --- cpp/include/cudf/types.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index a4016b2db9b..102c6804c5e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -142,7 +142,7 @@ enum class nan_policy : bool { */ enum class nan_equality /*unspecified*/ { ALL_EQUAL, ///< All NaNs compare equal, regardless of sign - UNEQUAL ///< All NaNs compare unequal (IEE754 behavior) + UNEQUAL ///< All NaNs compare unequal (IEEE754 behavior) }; /** From fa035a6a557d2a8ed69e55f987283233c6c37694 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Mar 2021 18:00:24 -0600 Subject: [PATCH 28/34] Avoid initialize-then-assign --- cpp/src/lists/drop_list_duplicates.cu | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 588dab6ab9e..0978ddcfcf8 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -490,21 +490,21 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu auto const lists_entries = lists_column.get_sliced_child(stream); // sorted_lists will store the results of the original lists after calling segmented_sort - std::unique_ptr sorted_lists; - - // If the column contains lists of floating point data type and NaNs are considered as equal, we - // need to replace -NaN by NaN before sorting - auto const has_negative_nan = - type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); - if (has_negative_nan) { - // The column new_lists_column is temporary, thus we will not pass in `mr` - auto const new_lists_column = - detail::replace_negative_nans_entries(lists_entries, lists_column, stream); - sorted_lists = detail::sort_lists( - lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); - } else { - sorted_lists = detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); - } + auto const sorted_lists = [&]() { + // If the column contains lists of floating point data type and NaNs are considered as equal, we + // need to replace -NaN by NaN before sorting + auto const has_negative_nan = + type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + if (has_negative_nan) { + // The column new_lists_column is temporary, thus we will not pass in `mr` + auto const new_lists_column = + detail::replace_negative_nans_entries(lists_entries, lists_column, stream); + return detail::sort_lists( + lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + } else { + return detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + } + }(); auto const sorted_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); From 8520f0d0695cca612029e4048d93ae2f41654326 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Mar 2021 18:00:47 -0600 Subject: [PATCH 29/34] Replace `thrust::any_of` by `thrust::count_if` --- cpp/src/lists/drop_list_duplicates.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 0978ddcfcf8..e49dd287f72 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -60,10 +60,10 @@ struct has_negative_nans_fn { bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept { auto const d_entries = column_device_view::create(lists_entries, stream); - return thrust::any_of(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.size()), - detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + return thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); } template >* = nullptr> From 49bfe13a9e9bceea9b144aad950aa0cfaeb8f77e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Mar 2021 20:54:14 -0600 Subject: [PATCH 30/34] Copy column by constructor --- cpp/src/lists/drop_list_duplicates.cu | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index e49dd287f72..8d3fa686384 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -114,25 +114,9 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e lists_column_view const& lists_column, rmm::cuda_stream_view stream) { - auto const mr = rmm::mr::get_current_device_resource(); - - // Copy list offsets from the given lists column to the new lists column - auto new_offsets = std::make_unique( - data_type{type_id::INT32}, - lists_column.size() + 1, - rmm::device_buffer{(lists_column.size() + 1) * sizeof(offset_type), stream, mr}); - thrust::copy_n(rmm::exec_policy(stream), - lists_column.offsets_begin(), - lists_column.size() + 1, - new_offsets->mutable_view().begin()); - - // Copy entries from the given lists column to the new lists column, replacing all -NaNs by NaNs - auto new_entries = std::make_unique( - lists_entries.type(), - lists_entries.size(), - rmm::device_buffer{lists_entries.size() * cudf::size_of(lists_entries.type()), stream, mr}, - cudf::detail::copy_bitmask(lists_entries, stream, mr), - lists_entries.null_count()); + auto const mr = rmm::mr::get_current_device_resource(); + auto new_offsets = std::make_unique(lists_column.offsets()); + auto new_entries = std::make_unique(lists_entries); type_dispatcher(lists_entries.type(), detail::replace_negative_nans_fn{}, From 27b5bebffe0acc2ae9725c98854d0145b3f3b6c5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Mar 2021 21:13:39 -0600 Subject: [PATCH 31/34] Minor cleanup --- cpp/src/lists/drop_list_duplicates.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 90a77834db8..76a5820aeb3 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -112,7 +112,6 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e lists_column_view const& lists_column, rmm::cuda_stream_view stream) { - auto const mr = rmm::mr::get_current_device_resource(); auto new_offsets = std::make_unique(lists_column.offsets()); auto new_entries = std::make_unique(lists_entries); @@ -122,11 +121,13 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e new_entries->mutable_view(), stream); - return make_lists_column(lists_column.size(), - std::move(new_offsets), - std::move(new_entries), - lists_column.null_count(), - cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); } /** From 1406a2567f57ead6a78c36f10d20b9235ac10c01 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Mar 2021 07:33:41 -0600 Subject: [PATCH 32/34] Replace `is_null` by `is_null_nocheck` --- cpp/src/lists/drop_list_duplicates.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 76a5820aeb3..914f46d70cc 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -42,7 +42,7 @@ struct has_negative_nans { __device__ Type operator()(size_type idx) const noexcept { - if (has_nulls && d_entries.is_null(idx)) { return false; } + if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } auto const val = d_entries.element(idx); return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative @@ -238,8 +238,8 @@ class list_entry_comparator { if (has_nulls) { bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null(i)}; - bool const rhs_is_null{nullable and d_view.is_null(j)}; + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; if (lhs_is_null and rhs_is_null) { return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { @@ -264,8 +264,8 @@ class list_entry_comparator { if (has_nulls) { bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null(i)}; - bool const rhs_is_null{nullable and d_view.is_null(j)}; + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; if (lhs_is_null and rhs_is_null) { return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { From 32b439383082078fc505178cd6e88436ff003178 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Mar 2021 07:53:39 -0600 Subject: [PATCH 33/34] Replace `make_numeric_column` by `device_uvector` --- cpp/src/lists/drop_list_duplicates.cu | 36 ++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 914f46d70cc..6da0465f472 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include @@ -25,10 +25,10 @@ #include #include +#include #include #include -#include #include namespace cudf { @@ -361,30 +361,26 @@ std::vector> get_unique_entries_and_list_offsets( auto const d_view_entries = column_device_view::create(all_lists_entries, stream); // Allocate memory to store the indices of the unique entries - auto const unique_indices = cudf::make_numeric_column( - entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); - auto const unique_indices_begin = unique_indices->mutable_view().begin(); - - auto const copy_end = type_dispatcher(all_lists_entries.type(), - get_unique_entries_fn{}, - entries_list_offsets.begin(), - *d_view_entries, - num_entries, - unique_indices_begin, - nulls_equal, - nans_equal, - all_lists_entries.has_nulls(), - stream); + auto unique_indices = rmm::device_uvector(num_entries, stream); + auto const output_begin = unique_indices.begin(); + auto const output_end = type_dispatcher(all_lists_entries.type(), + get_unique_entries_fn{}, + entries_list_offsets.begin(), + *d_view_entries, + num_entries, + output_begin, + nulls_equal, + nans_equal, + all_lists_entries.has_nulls(), + stream); // Collect unique entries and entry list offsets // The new null_count and bitmask of the unique entries will also be generated // by the gather function - auto const indices = cudf::detail::slice( - unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - indices, + output_begin, + output_end, cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr) ->release(); From b5af91ecbd49351b6d2b61533f9d8c522790aa5b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Mar 2021 08:03:21 -0600 Subject: [PATCH 34/34] Rewrite comments, and add a condition check for nans_equal == ALL_EQUAL before calling to `has_negative_nans` --- cpp/src/lists/drop_list_duplicates.cu | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 6da0465f472..564d919b65d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -67,6 +67,7 @@ struct has_negative_nans_fn { template >* = nullptr> bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept { + // Columns of non floating-point data will never contain NaN return false; } }; @@ -81,13 +82,13 @@ struct replace_negative_nans { /** * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries - * of a floating point data column + * of a floating-point data column */ struct replace_negative_nans_fn { template >* = nullptr> void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const { - CUDF_FAIL("Cannot operate on a type that is not floating point."); + CUDF_FAIL("Cannot operate on a type that is not floating-point."); } template >* = nullptr> @@ -95,7 +96,7 @@ struct replace_negative_nans_fn { mutable_column_view const& new_entries, rmm::cuda_stream_view stream) const noexcept { - // Do not care whether an entry is null or not, just consider it as a floating point value + // Do not care whether an entry is null or not, just consider it as a floating-point value thrust::transform(rmm::exec_policy(stream), lists_entries.begin(), lists_entries.end(), @@ -211,12 +212,12 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, * point types, this functor will return the same comparison result as * `cudf::element_equality_comparator`. * - * For floating point types, entries holding NaN value can be considered as different values or the + * For floating-point types, entries holding NaN value can be considered as different values or the * same value depending on the nans_equal parameter. * * @tparam Type The data type of entries * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating point data column) + * applicable for floating-point data column) */ template class list_entry_comparator { @@ -247,7 +248,7 @@ class list_entry_comparator { } } - // For floating point types, if both element(i) and element(j) are NaNs then this comparison + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison // will return `true`. This is the desired behavior in Pandas. auto const lhs = d_view.element(i); auto const rhs = d_view.element(j); @@ -273,7 +274,7 @@ class list_entry_comparator { } } - // For floating point types, if both element(i) and element(j) are NaNs then this comparison + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison // will return `false`. This is the desired behavior in Apache Spark. return d_view.element(i) == d_view.element(j); } @@ -342,7 +343,7 @@ struct get_unique_entries_fn { * @param entries_list_offsets A map from list entries to their corresponding list offsets * @param nulls_equal Flag to specify whether null entries should be considered equal * @param nans_equal Flag to specify whether NaN entries should be considered as equal - * value (only applicable for floating point data column) + * value (only applicable for floating-point data column) * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device resource used to allocate memory * @@ -470,11 +471,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu // sorted_lists will store the results of the original lists after calling segmented_sort auto const sorted_lists = [&]() { - // If the column contains lists of floating point data type and NaNs are considered as equal, we - // need to replace -NaN by NaN before sorting - auto const has_negative_nan = + // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, + // we need to replace -NaN by NaN before sorting + auto const replace_negative_nan = + nans_equal == nan_equality::ALL_EQUAL and type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); - if (has_negative_nan) { + if (replace_negative_nan) { // The column new_lists_column is temporary, thus we will not pass in `mr` auto const new_lists_column = detail::replace_negative_nans_entries(lists_entries, lists_column, stream);