[BUG] size_type overflow in cudf::groupby::detail::hash::extract_populated_keys #12058

Closed
abellina opened this issue Nov 3, 2022 · 14 comments · Fixed by #12079
Assignees: davidwendt
Labels: bug (Something isn't working), Spark (Functionality that helps Spark RAPIDS)

Comments


abellina commented Nov 3, 2022

We appear to be overflowing size_type in extract_populated_keys. The iterator on rmm::device_uvector is signed, so I think we need to go to unsigned in order to be able to group by up to the 2B row limit.

I believe the issue is actually in the call to populated_keys.resize, where the std::distance result may be negative, but the stack trace I got (on a non-debug build) points at copy_if. Either way, it definitely looks to be an issue triggered in this function.

Scala repro (using the cudf Java API):

ai.rapids.cudf.Rmm.initialize(0, null, 0)

// build 1000 columns of 1.2M distinct int keys each and concatenate them
// into a single 1.2B-row column
val cvs = new scala.collection.mutable.ArrayBuffer[ai.rapids.cudf.ColumnVector]()
cvs += ai.rapids.cudf.ColumnVector.fromInts((0 until 1200000).toArray:_*)
(1 until 1000).foreach(ix => cvs += cvs(ix-1).add(ai.rapids.cudf.Scalar.fromInt(1200000)))
val concat = ai.rapids.cudf.ColumnVector.concatenate(cvs:_*)
cvs.foreach(_.close)
cvs.clear()

// group by column 0 and sum it; every key is distinct, so there is no reduction
val tbl = new ai.rapids.cudf.Table(concat)
val agg = tbl.groupBy(0).aggregate(ai.rapids.cudf.GroupByAggregation.sum().onColumn(0))

C++ repro:

#include <cudf/types.hpp>
#include <cudf/table/table.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <rmm/exec_policy.hpp>

#include <memory>
#include <vector>

int main(int argc, char** argv) {
  auto values = thrust::make_counting_iterator(0);
  std::size_t num_rows = 1200000000L;
  auto columns = std::vector<std::unique_ptr<cudf::column>>();
  columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows));
  thrust::copy(rmm::exec_policy(rmm::cuda_stream_default),
               values,
               values + num_rows,
               columns[0]->mutable_view().begin<int32_t>());
  auto tbl = cudf::table(std::move(columns));

  cudf::groupby::groupby grouper(tbl,
                                 cudf::null_policy::INCLUDE,
                                 cudf::sorted::NO,
                                 std::vector<cudf::order>{},
                                 std::vector<cudf::null_order>{});

  std::vector<cudf::groupby::aggregation_request> requests;
  std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggs;
  aggs.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  requests.emplace_back(cudf::groupby::aggregation_request{tbl.get_column(0), std::move(aggs)});

  auto result = grouper.aggregate(requests);
}

Stack trace:

java.lang.OutOfMemoryError: Could not allocate native memory: std::bad_alloc: out_of_memory: std::bad_alloc: out_of_memory: CUDA error at: include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory. Attempted size: 18446744073696393216 attempted at: #0 in rmm::detail::stack_trace::stack_trace() from /tmp/cudf5256769593043486578.so
#1 in /tmp/cudf5256769593043486578.so(+0x1170e6a) [0x7f73c1cbfe6a]
#2 in int* thrust::cuda_cub::copy_if<thrust::detail::execute_with_allocator<rmm::mr::thrust_allocator<char>, thrust::cuda_cub::execute_on_stream_base>, thrust::transform_iterator<__nv_dl_wrapper_t<__nv_dl_tag<rmm::device_uvector<int> (*)(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view), &cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys, 1u>>, thrust::pair<int, int>*, thrust::use_default, thrust::use_default>, int*, __nv_dl_wrapper_t<__nv_dl_tag<rmm::device_uvector<int> (*)(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view), &cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys, 2u>, int> >(thrust::cuda_cub::execution_policy<thrust::detail::execute_with_allocator<rmm::mr::thrust_allocator<char>, thrust::cuda_cub::execute_on_stream_base> >&, thrust::transform_iterator<__nv_dl_wrapper_t<__nv_dl_tag<rmm::device_uvector<int> (*)(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view), &cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys, 1u>>, thrust::pair<int, int>*, thrust::use_default, thrust::use_default>, thrust::transform_iterator<__nv_dl_wrapper_t<__nv_dl_tag<rmm::device_uvector<int> (*)(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view), &cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys, 1u>>, thrust::pair<int, int>*, thrust::use_default, thrust::use_default>, int*, __nv_dl_wrapper_t<__nv_dl_tag<rmm::device_uvector<int> (*)(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view), &cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys, 2u>, int>) from /tmp/cudf5256769593043486578.so
#3 in cudf::groupby::detail::hash::(anonymous namespace)::extract_populated_keys(concurrent_unordered_map<int, int, cudf::experimental::row::hash::device_row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>, default_allocator<thrust::pair<int, int> > > const&, int, rmm::cuda_stream_view) from /tmp/cudf5256769593043486578.so
#4 in cudf::groupby::detail::hash::(anonymous namespace)::groupby(cudf::table_view const&, cudf::host_span<cudf::groupby::aggregation_request const, 18446744073709551615ul>, cudf::detail::result_cache*, bool, cudf::null_policy, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) from /tmp/cudf5256769593043486578.so
#5 in cudf::groupby::detail::hash::groupby(cudf::table_view const&, cudf::host_span<cudf::groupby::aggregation_request const, 18446744073709551615ul>, cudf::null_policy, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) from /tmp/cudf5256769593043486578.so
#6 in cudf::groupby::groupby::dispatch_aggregation(cudf::host_span<cudf::groupby::aggregation_request const, 18446744073709551615ul>, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) from /tmp/cudf5256769593043486578.so
#7 in cudf::groupby::groupby::aggregate(cudf::host_span<cudf::groupby::aggregation_request const, 18446744073709551615ul>, rmm::mr::device_memory_resource*) from /tmp/cudf5256769593043486578.so
#8 in Java_ai_rapids_cudf_Table_groupByAggregate from /tmp/cudf5256769593043486578.so
#9 in [0x7f796d017de7]

  at ai.rapids.cudf.Table.groupByAggregate(Native Method)
  at ai.rapids.cudf.Table.access$3000(Table.java:41)
  at ai.rapids.cudf.Table$GroupByOperation.aggregate(Table.java:3657)
  ... 47 elided

abellina commented Nov 3, 2022

@jrhemstad FYI

@davidwendt

Can you provide a C++ example? I'm not following the syntax of the example in the description very well.
There are 1.2B elements with 1000 unique values? Are the elements int32 or int64?


abellina commented Nov 3, 2022

Elements are int32 and they are monotonically increasing from 0 to 1.2B, such that when we do the groupby there is no reduction. I can try a C++ repro, but let me know if the above info is enough.


abellina commented Nov 3, 2022

I am working on a C++ repro with @nvdbaranec's help as well.


abellina commented Nov 3, 2022

Added a C++ repro in the description, @davidwendt. I also added some logging in the rmm cuda_memory_resource to print the requested sizes, and I got the following on a 40GB GPU (note that a smaller GPU will fail earlier with an OOM):

Attempting to allocate  4800000000
Attempting to allocate  80
Attempting to allocate  19200000000
Attempting to allocate  9600000000
Attempting to allocate  80
Attempting to allocate  80
Attempting to allocate  8
Attempting to allocate  4800000000
Attempting to allocate  18446744073696393216
terminate called after throwing an instance of 'rmm::out_of_memory'
  what():  std::bad_alloc: out_of_memory: CUDA error at: /include/rmm/mr/device/cuda_memory_resource.hpp:72: cudaErrorMemoryAllocation out of memory

davidwendt self-assigned this Nov 4, 2022
abellina added the Spark (Functionality that helps Spark RAPIDS) label Nov 4, 2022
@davidwendt

It looks like the error is actually in the thrust::copy_if call in extract_populated_keys

auto end_it = thrust::copy_if(rmm::exec_policy(stream),

In the example from the description, the map.capacity() value is 2.4B, so the get_key_it iterator spans more than an int32 can represent. The copy_if hardcodes the iterator distance type to be an int:
https://github.com/NVIDIA/thrust/blob/dbd144ed543b60c4ff9d456edd19869e82fe8873/thrust/system/cuda/detail/copy_if.h#L699-L708
So this is where the out-of-memory is occurring. The copy_if is trying to allocate temporary memory for itself with an overflowed num_items value.

I can work around this but it looks like this will still run out of memory later in the gather step (on my 48GB GPU).
I would recommend limiting the possible keys to 1B or fewer.
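
A minimal standalone sketch of the truncation described above (the 2.4B capacity comes from this issue; everything else is illustrative and is not the actual thrust/libcudf code):

#include <cstdint>
#include <cstdio>

int main()
{
  // Hash-map capacity from the repro (roughly 2x the 1.2B input rows).
  std::size_t capacity = 2'400'000'000ULL;

  // thrust::copy_if computes its num_items with a hardcoded `int` distance
  // type, so a distance above INT_MAX wraps to a negative value on typical
  // platforms.
  int num_items = static_cast<int>(capacity);
  std::printf("num_items as int: %d\n", num_items);  // prints a negative number

  // If that negative count later feeds an unsigned allocation size, it wraps
  // to an enormous value near 2^64, the same flavor of number reported as the
  // attempted allocation size in the stack trace above.
  std::printf("as size_t: %zu\n", static_cast<std::size_t>(num_items));
}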


ttnghia commented Nov 4, 2022

I'm not very familiar with the code in that file, so I'm unclear on why the copy_if is iterating to map.capacity() instead of map.size().

I assume that the map capacity is around 1.5 to 2.0 times the map size (which is the number of actual elements).

Update: I found that this map type doesn't have a size(). So it's pretty non-standard.

@davidwendt

Adding up the numbers in #12058 (comment), we are at 38.4GB.
I traced the gather call that would occur if the copy_if did not fail, and it requests another allocation of 9.6GB, which would push this right past the 48GB. It appears the copy_if call would only need 16MB for itself (again, if it did not fail).

So, just pointing out that this would have run out of memory on a 48GB GPU anyway, even without the overflow error.
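
For reference, summing the large allocations from the log above: 4.8GB + 19.2GB + 9.6GB + 4.8GB = 38.4GB (the 80-byte and 8-byte allocations are negligible). Adding the ~9.6GB the gather would request brings the total to roughly 48GB.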

@davidwendt

Regardless, I can create a PR to work around the overflow error in the thrust::copy_if call.


abellina commented Nov 4, 2022

Thanks for taking a look @davidwendt, @ttnghia. It doesn't make a lot of sense to run out of memory with 48GB for a ~5GB input on a simple sum. Looking at it with @jlowe and @nvdbaranec, the second overflow in gather is likely due to the size_type usage in cudf::distance; would that make sense?

The main thing is not having the overflow; if there is another overflow via the gather, we should fix that as well.


davidwendt commented Nov 4, 2022

There is no overflow in gather. This is doing a gather on 1.2B elements of INT64 (1.2B rows x 8 bytes = 9.6GB). We just don't have that much memory left.


ttnghia commented Nov 4, 2022

@abellina I found that there is a check for overflow in gather:

CUDF_EXPECTS(gather_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
             "invalid gather map size");


davidwendt commented Nov 4, 2022

This check passes so there is no overflow in gather.


abellina commented Nov 4, 2022

And I can run this on an 80GB GPU once there's a patch somewhere, to see if that succeeds.

rapids-bot bot pushed a commit that referenced this issue Nov 10, 2022
Workaround for a limitation in `thrust::copy_if`, which fails if the input iterator spans more than int-max.
The `thrust::copy_if` hardcodes the iterator distance type to be an int:
https://github.com/NVIDIA/thrust/blob/dbd144ed543b60c4ff9d456edd19869e82fe8873/thrust/system/cuda/detail/copy_if.h#L699-L708

Found existing thrust issue: https://github.com/NVIDIA/thrust/issues/1302

This calls the `copy_if` in chunks if the iterator can span more than int-max.

Closes #12058

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)

URL: #12079
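
A rough sketch of the chunking approach described in that PR (illustrative only, not the code from #12079; the helper name and chunk size are assumptions):

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>
#include <thrust/distance.h>

#include <algorithm>
#include <cstdint>
#include <limits>

// Hypothetical helper: apply thrust::copy_if to a range that may be longer
// than INT_MAX by splitting the input into chunks copy_if can handle.
template <typename InputIterator, typename OutputIterator, typename Predicate>
OutputIterator copy_if_chunked(rmm::cuda_stream_view stream,
                               InputIterator first,
                               InputIterator last,
                               OutputIterator out,
                               Predicate pred)
{
  // Keep each chunk within copy_if's hardcoded int distance type.
  constexpr std::int64_t chunk_size = std::numeric_limits<int>::max();

  std::int64_t remaining = thrust::distance(first, last);
  while (remaining > 0) {
    auto const this_chunk = std::min(remaining, chunk_size);
    // Each call appends its surviving elements right after the previous chunk's output.
    out = thrust::copy_if(rmm::exec_policy(stream), first, first + this_chunk, out, pred);
    first += this_chunk;
    remaining -= this_chunk;
  }
  return out;
}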
bdice removed the Needs Triage (Need team to review and classify) label Mar 4, 2024