Skip to content

Commit

Permalink
Update distinct/unique_count to experimental::row hasher/comparator (#12776)
Browse files Browse the repository at this point in the history

This PR is a part of #11844

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12776
  • Loading branch information
divyegala committed Mar 7, 2023
1 parent 6d1f8e3 commit 97d8d12
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 48 deletions.
48 changes: 32 additions & 16 deletions cpp/src/stream_compaction/distinct_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cudf/detail/sorting.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -125,33 +126,48 @@ cudf::size_type distinct_count(table_view const& keys,
null_equality nulls_equal,
rmm::cuda_stream_view stream)
{
auto table_ptr = cudf::table_device_view::create(keys, stream);
auto const num_rows = table_ptr->num_rows();
auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)};
auto const num_rows = keys.num_rows();
auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};

hash_map_type key_map{compute_hash_table_size(num_rows),
cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL},
cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL},
detail::hash_table_allocator_type{default_allocator<char>{}, stream},
stream.value()};

compaction_hash hash_key{has_null, *table_ptr};
row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal);
auto const preprocessed_input =
cudf::experimental::row::hash::preprocessed_table::create(keys, stream);

auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));

auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);

auto iter = cudf::detail::make_counting_transform_iterator(
0, [] __device__(size_type i) { return cuco::make_pair(i, i); });

// when nulls are equal, insert non-null rows only to improve efficiency
if (nulls_equal == null_equality::EQUAL and has_null) {
thrust::counting_iterator<size_type> stencil(0);
auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream);
row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())};

key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value());
return key_map.get_size() + static_cast<std::size_t>((null_count > 0) ? 1 : 0);
auto const comparator_helper = [&](auto const row_equal) {
// when nulls are equal, insert non-null rows only to improve efficiency
if (nulls_equal == null_equality::EQUAL and has_nulls) {
thrust::counting_iterator<size_type> stencil(0);
auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream);
row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())};

key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value());
return key_map.get_size() + static_cast<std::size_t>(null_count > 0);
}
// otherwise, insert all
key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
return key_map.get_size();
};

if (cudf::detail::has_nested_columns(keys)) {
auto const row_equal = row_comp.equal_to<true>(has_nulls, nulls_equal);
return comparator_helper(row_equal);
} else {
auto const row_equal = row_comp.equal_to<false>(has_nulls, nulls_equal);
return comparator_helper(row_equal);
}
// otherwise, insert all
key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
return key_map.get_size();
}

cudf::size_type distinct_count(column_view const& input,
Expand Down
20 changes: 1 addition & 19 deletions cpp/src/stream_compaction/stream_compaction_common.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,24 +29,6 @@
namespace cudf {
namespace detail {

/**
* @brief Device callable to hash a given row.
*
* Wraps a `row_hash` constructed from the nullability flag and device table view handed to the
* constructor. Invoking the object with a row index yields the wrapped hasher's value for that
* row, except that a value equal to `COMPACTION_EMPTY_KEY_SENTINEL` is replaced by that value
* minus one.
*
* @tparam Nullate Type of the nullability flag accepted by the constructor
*/
template <typename Nullate>
class compaction_hash {
public:
/**
* @brief Builds the wrapped row hasher.
*
* @param has_nulls Nullability flag forwarded to the wrapped `row_hash`
* @param t Device view of the table whose rows will be hashed
*/
compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {}

/**
* @brief Hashes row `i`.
*
* @param i Index of the row to hash
* @return The row's hash, or that hash minus one when it equals
* `COMPACTION_EMPTY_KEY_SENTINEL`
*/
__device__ inline auto operator()(size_type i) const noexcept
{
auto hash = _hash(i);
// NOTE(review): presumably the remap keeps hash values distinct from the sentinel the hash map
// reserves for empty slots — confirm against the map's requirements.
return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash;
}

private:
row_hash _hash;  ///< Underlying row hasher invoked by operator()
};

namespace experimental {

/**
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/stream_compaction/stream_compaction_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,5 @@ using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allo
using hash_map_type =
cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;

using row_hash = cudf::row_hasher<default_hash, cudf::nullate::DYNAMIC>;

} // namespace detail
} // namespace cudf
29 changes: 20 additions & 9 deletions cpp/src/stream_compaction/unique_count.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,6 +26,7 @@
#include <cudf/detail/sorting.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -70,14 +71,24 @@ cudf::size_type unique_count(table_view const& keys,
null_equality nulls_equal,
rmm::cuda_stream_view stream)
{
auto table_ptr = cudf::table_device_view::create(keys, stream);
row_equality_comparator comp(
nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
auto const row_comp = cudf::experimental::row::equality::self_comparator(keys, stream);
if (cudf::detail::has_nested_columns(keys)) {
auto const comp =
row_comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
} else {
auto const comp =
row_comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
}
}

cudf::size_type unique_count(column_view const& input,
Expand Down
53 changes: 52 additions & 1 deletion cpp/tests/stream_compaction/distinct_count_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,12 +24,18 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/table_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <algorithm>
#include <cmath>

using lists_col = cudf::test::lists_column_wrapper<int32_t>;
using structs_col = cudf::test::structs_column_wrapper;

using cudf::test::iterators::nulls_at;

using cudf::nan_policy;
using cudf::null_equality;
using cudf::null_policy;
Expand Down Expand Up @@ -306,3 +312,48 @@ TEST_F(DistinctCount, TableWithStringColumnWithNull)
EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL));
EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL));
}

TEST_F(DistinctCount, NullableLists)
{
  // Rows 3 and 5 are null; the `{}` written at those positions is only a placeholder.
  auto const null_rows = nulls_at({3, 5});
  auto const list_keys = lists_col{
    {{}, {1, 1}, {1}, {} /*NULL*/, {1}, {} /*NULL*/, {2}, {2, 1}, {2}, {2, 2}, {}, {2, 2}},
    null_rows};
  auto const keys_table = cudf::table_view{{list_keys}};

  // Six distinct non-null lists; the two null rows add one more entry when nulls compare equal
  // and two more when they do not.
  EXPECT_EQ(7, cudf::distinct_count(keys_table, null_equality::EQUAL));
  EXPECT_EQ(8, cudf::distinct_count(keys_table, null_equality::UNEQUAL));
}

// Counts distinct rows of a nullable struct-of-structs column under both null-equality modes.
TEST_F(DistinctCount, NullableStructOfStructs)
{
// +-----------------+
// | s1{s2{a,b}, c} |
// +-----------------+
// 0 | { {1, 1}, 5} |
// 1 | { Null, 4} |
// 2 | { {1, 1}, 5} | // Same as 0
// 3 | { {1, 2}, 4} |
// 4 | { Null, 6} |
// 5 | { Null, 4} | // Same as 1
// 6 | Null |
// 7 | { {2, 1}, 5} |
// 8 | Null | // Same as 6

auto const keys = [&] {
// NOTE(review): XXX is defined elsewhere in this file; presumably a placeholder for child values
// that sit under a null s2 row (1, 4, 5) — confirm.
auto a = cudf::test::fixed_width_column_wrapper<int32_t>{1, XXX, 1, 1, XXX, XXX, 0, 2, 0};
auto b = cudf::test::fixed_width_column_wrapper<int32_t>{1, XXX, 1, 2, XXX, XXX, 0, 1, 0};
auto s2 = structs_col{{a, b}, nulls_at({1, 4, 5})};

auto c = cudf::test::fixed_width_column_wrapper<int32_t>{5, 4, 5, 4, 6, 4, 0, 5, 0};
// Child order matters: s2 is added first, then c, matching the s1{s2{a,b}, c} layout above.
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
// Top-level s1 rows 6 and 8 are null; the validity vector spans all 9 rows.
auto const null_it = nulls_at({6, 8});
return structs_col(std::move(s1_children), std::vector<bool>{null_it, null_it + 9});
}();

auto const input = cudf::table_view{{keys}};

// Nulls equal: rows 0, 1, 3, 4, 6 and 7 are the distinct rows (2, 5 and 8 are duplicates).
EXPECT_EQ(6, cudf::distinct_count(input, null_equality::EQUAL));
// Nulls unequal: the expected 8 of 9 is consistent with rows 0 and 2 being the only duplicates.
EXPECT_EQ(8, cudf::distinct_count(input, null_equality::UNEQUAL));
}
53 changes: 52 additions & 1 deletion cpp/tests/stream_compaction/unique_count_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,12 +24,18 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/table_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <algorithm>
#include <cmath>

using lists_col = cudf::test::lists_column_wrapper<int32_t>;
using structs_col = cudf::test::structs_column_wrapper;

using cudf::test::iterators::nulls_at;

using cudf::nan_policy;
using cudf::null_equality;
using cudf::null_policy;
Expand Down Expand Up @@ -237,3 +243,48 @@ TEST_F(UniqueCount, EmptyColumn)
constexpr auto expected = 0;
EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL));
}

TEST_F(UniqueCount, NullableLists)
{
  // Rows 5 and 6 are null; the `{}` written at those positions is only a placeholder.
  // Equal rows are laid out next to each other in this input.
  auto const null_rows = nulls_at({5, 6});
  auto const list_keys = lists_col{
    {{}, {}, {1, 1}, {1}, {1}, {} /*NULL*/, {} /*NULL*/, {2}, {2}, {2, 1}, {2, 2}, {2, 2}},
    null_rows};
  auto const keys_table = cudf::table_view{{list_keys}};

  // Six runs of non-null lists; the adjacent null pair adds one run when nulls compare equal
  // and two when they do not.
  EXPECT_EQ(7, cudf::unique_count(keys_table, null_equality::EQUAL));
  EXPECT_EQ(8, cudf::unique_count(keys_table, null_equality::UNEQUAL));
}

// Counts runs of consecutive equal rows in a nullable struct-of-structs column under both
// null-equality modes.
TEST_F(UniqueCount, NullableStructOfStructs)
{
// +-----------------+
// | s1{s2{a,b}, c} |
// +-----------------+
// 0 | { {1, 1}, 5} |
// 1 | { {1, 1}, 5} | // Same as 0
// 2 | { {1, 2}, 4} |
// 3 | { Null, 6} |
// 4 | { Null, 4} |
// 5 | { Null, 4} | // Same as 4
// 6 | Null |
// 7 | Null | // Same as 6
// 8 | { {2, 1}, 5} |

auto const keys = [&] {
// NOTE(review): XXX is defined elsewhere in this file; presumably a placeholder for child values
// that sit under a null s2 row (3, 4, 5) — confirm.
auto a = cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 1, XXX, XXX, XXX, 2, 1, 2};
auto b = cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 2, XXX, XXX, XXX, 2, 1, 1};
auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})};

auto c = cudf::test::fixed_width_column_wrapper<int32_t>{5, 5, 4, 6, 4, 4, 3, 3, 5};
// Child order matters: s2 is added first, then c, matching the s1{s2{a,b}, c} layout above.
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
// Top-level s1 rows 6 and 7 are null; the validity vector spans all 9 rows.
auto const null_it = nulls_at({6, 7});
return structs_col(std::move(s1_children), std::vector<bool>{null_it, null_it + 9});
}();

auto const input = cudf::table_view{{keys}};

// Nulls equal: the runs are [0,1], [2], [3], [4,5], [6,7], [8].
EXPECT_EQ(6, cudf::unique_count(input, null_equality::EQUAL));
// Nulls unequal: the expected 8 of 9 is consistent with rows 0 and 1 being the only adjacent
// match.
EXPECT_EQ(8, cudf::unique_count(input, null_equality::UNEQUAL));
}

0 comments on commit 97d8d12

Please sign in to comment.