Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use offsetalator in gather_chars #14700

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions cpp/include/cudf/detail/offsets_iterator.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,10 +63,11 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
*
* Use the indexalator_factory to create an iterator instance.
*
* @param data Pointer to an integer array in device memory.
* @param dtype Type of data in data
* @param data Pointer to an integer array in device memory
* @param dtype Type of data in data
* @param offset Index value within `offsets` to use as the beginning of the iterator
*/
CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype, size_type offset = 0)
: base_normalator<input_offsetalator, int64_t>(
dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
p_{static_cast<char const*>(data)}
Expand All @@ -78,6 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
"Unexpected offsets type");
#endif
p_ += (this->width_ * offset);
}

protected:
Expand Down
11 changes: 8 additions & 3 deletions cpp/include/cudf/detail/offsets_iterator_factory.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,14 +28,19 @@ namespace detail {
struct offsetalator_factory {
/**
* @brief Create an input offsetalator instance from an offsets column
*
* @param offsets Column to wrap with an offsetalator
* @param offset Index value within `offsets` to use as the beginning of the iterator
*/
static input_offsetalator make_input_iterator(column_view const& offsets)
static input_offsetalator make_input_iterator(column_view const& offsets, size_type offset = 0)
{
return input_offsetalator(offsets.head(), offsets.type());
return input_offsetalator(offsets.head(), offsets.type(), offset);
}

/**
* @brief Create an output offsetalator instance from an offsets column
*
* @param offsets Column to wrap with an offsetalator
*/
static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
{
Expand Down
41 changes: 21 additions & 20 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/utilities.hpp>
Expand Down Expand Up @@ -79,7 +80,7 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr)
template <typename StringIterator, typename MapIterator>
__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
char* out_chars,
cudf::device_span<int32_t const> const out_offsets,
cudf::detail::input_offsetalator const out_offsets,
MapIterator string_indices,
size_type total_out_strings)
{
Expand Down Expand Up @@ -109,28 +110,25 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
// between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
// `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid
// `load_uint4` reading beyond string boundaries.
int32_t out_start_aligned =
int64_t const out_start_aligned =
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
(out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) /
out_datatype_size * out_datatype_size -
alignment_offset;
int32_t out_end_aligned =
int64_t const out_end_aligned =
(out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size -
alignment_offset;

for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size;
ichar < out_end_aligned;
for (int64_t ichar = out_start_aligned + warp_lane * out_datatype_size; ichar < out_end_aligned;
ichar += cudf::detail::warp_size * out_datatype_size) {
*(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
load_uint4(in_start + ichar - out_start);
}

// Tail logic: copy characters of the current string outside `[out_start_aligned,
// out_end_aligned)`.
// Copy characters of the current string outside [out_start_aligned, out_end_aligned)
if (out_end_aligned <= out_start_aligned) {
// In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
// entire string.
for (int32_t ichar = out_start + warp_lane; ichar < out_end;
ichar += cudf::detail::warp_size) {
for (auto ichar = out_start + warp_lane; ichar < out_end; ichar += cudf::detail::warp_size) {
out_chars[ichar] = in_start[ichar - out_start];
}
} else {
Expand All @@ -139,7 +137,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
out_chars[out_start + warp_lane] = in_start[warp_lane];
}
// Copy characters in range `[out_end_aligned, out_end)`.
int32_t ichar = out_end_aligned + warp_lane;
auto const ichar = out_end_aligned + warp_lane;
if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
}
}
Expand All @@ -164,11 +162,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
char* out_chars,
cudf::device_span<int32_t const> const out_offsets,
cudf::detail::input_offsetalator const out_offsets,
MapIterator string_indices,
size_type total_out_strings)
{
__shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1];
__shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1];

// Current thread block will process output strings starting at `begin_out_string_idx`.
size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock;
Expand All @@ -185,7 +183,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
}
__syncthreads();

for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
for (int64_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
out_ibyte < out_offsets_threadblock[strings_current_threadblock];
out_ibyte += blockDim.x) {
// binary search for the string index corresponding to out_ibyte
Expand All @@ -197,7 +195,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter);

// calculate which character to load within the string
int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
auto const icharacter = out_ibyte - out_offsets_threadblock[string_idx];

size_type in_string_idx = string_indices[begin_out_string_idx + string_idx];
out_chars[out_ibyte] = strings_begin[in_string_idx].data()[icharacter];
Expand Down Expand Up @@ -227,7 +225,7 @@ template <typename StringIterator, typename MapIterator>
std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::device_span<int32_t const> const offsets,
cudf::detail::input_offsetalator const offsets,
size_type chars_bytes,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
Expand Down Expand Up @@ -300,22 +298,25 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,

// build offsets column
auto const d_strings = column_device_view::create(strings.parent(), stream);
auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr;
auto const d_in_offsets = cudf::detail::offsetalator_factory::make_input_iterator(
strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
strings.offset());

auto offsets_itr = thrust::make_transform_iterator(
begin,
cuda::proclaim_return_type<size_type>(
[d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; }
if (not d_strings.is_valid(idx)) { return 0; }
return d_in_offsets[idx + 1] - d_in_offsets[idx];
return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
}));
auto [out_offsets_column, total_bytes] =
cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);

// build chars column
auto const offsets_view = out_offsets_column->view();
auto out_chars_column = gather_chars(
auto const offsets_view =
cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
auto out_chars_column = gather_chars(
d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);

return make_strings_column(output_count,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -103,9 +103,8 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
// use a character-parallel kernel for long string lengths
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const d_data = offsets_view.template data<size_type>();
auto const d_offsets =
device_span<size_type const>{d_data, static_cast<std::size_t>(offsets_view.size())};
cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
auto const str_begin = thrust::make_transform_iterator(
begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
return string_view{ip.first, ip.second};
Expand Down
Loading