Skip to content

Commit

Permalink
Performance improvement for nvtext tokenize/token functions (#13480)
Browse files Browse the repository at this point in the history
Improves performance for nvtext tokenize functions by minimizing character counting in the `characters_tokenize` utility functor in `src/text/utilities/tokenize_ops.cuh`.

Functions this change affects are:
- [`nvtext::count_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga5323d94dac99bf42f0cbb07c4fcd7242) (single delimiter and whitespace)
- [`nvtext::tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga64c2806c398ce476fa5174f3155ea0fb) (single delimiter and whitespace)
- [`nvtext::replace_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__replace.html#ga66219b7db6155c4e14bf6f6147e1fc81)
- [`nvtext::normalize_spaces`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__normalize.html#ga9104dffc71baf77e710bc63e5e2a8837)
- [`nvtext::ngrams_tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__ngrams.html#gace17045b4ee5a3b10157ed40f9575298)

This change improved performance by at least 10% for all string lengths for most of these functions.

Reference #13048

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)

URL: #13480
  • Loading branch information
davidwendt committed Jun 29, 2023
1 parent 1df1549 commit 1296ebd
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 33 deletions.
20 changes: 11 additions & 9 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ namespace {
*/
struct normalize_spaces_fn {
cudf::column_device_view const d_strings; // strings to normalize
int32_t* d_offsets{}; // offsets into d_buffer
cudf::size_type* d_offsets{}; // offsets into d_chars
char* d_chars{}; // output buffer for characters

__device__ void operator()(cudf::size_type idx)
Expand All @@ -70,24 +70,26 @@ struct normalize_spaces_fn {
cudf::string_view const single_space(" ", 1);
auto const d_str = d_strings.element<cudf::string_view>(idx);
char* buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
char* optr = buffer; // running output pointer
int32_t nbytes = 0; // holds the number of bytes per output string
char* optr = buffer; // running output pointer

cudf::size_type nbytes = 0; // holds the number of bytes per output string

// create a tokenizer for this string with whitespace delimiter (default)
characters_tokenizer tokenizer(d_str);

// this will retrieve tokens automatically skipping runs of whitespace
while (tokenizer.next_token()) {
auto const token_pos = tokenizer.token_byte_positions();
nbytes += token_pos.second - token_pos.first + 1; // token size plus a single space
auto const token =
cudf::string_view(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
if (optr) {
cudf::string_view const token(d_str.data() + token_pos.first,
token_pos.second - token_pos.first);
if (optr != buffer) // prepend space unless we are at the beginning
optr = cudf::strings::detail::copy_string(optr, single_space);
// prepend space unless we are at the beginning
if (optr != buffer) { optr = cudf::strings::detail::copy_string(optr, single_space); }
// write token to output buffer
optr = cudf::strings::detail::copy_string(optr, token);
thrust::copy_n(thrust::seq, token.data(), token.size_bytes(), optr);
optr += token.size_bytes();
}
nbytes += token.size_bytes() + 1; // token size plus a single space
}
// remove trailing space
if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
Expand Down
8 changes: 5 additions & 3 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
* limitations under the License.
*/

#include <text/utilities/tokenize_ops.cuh>

#include <nvtext/detail/tokenize.hpp>
#include <nvtext/tokenize.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
Expand All @@ -24,9 +29,6 @@
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <nvtext/detail/tokenize.hpp>
#include <nvtext/tokenize.hpp>
#include <text/utilities/tokenize_ops.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
Expand Down
48 changes: 27 additions & 21 deletions cpp/src/text/utilities/tokenize_ops.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.cuh>

#include <thrust/execution_policy.h>
Expand Down Expand Up @@ -50,7 +51,7 @@ struct characters_tokenizer {
: d_str{d_str},
d_delimiter{d_delimiter},
spaces{true},
itr{d_str.begin()},
current_position{0},
start_position(0),
end_position(d_str.size_bytes())
{
Expand All @@ -64,7 +65,7 @@ struct characters_tokenizer {
* @param chr The character to test.
* @return true if the character is a delimiter
*/
__device__ bool is_delimiter(cudf::char_utf8 chr)
__device__ bool is_delimiter(cudf::char_utf8 chr) const
{
return d_delimiter.empty() ? (chr <= ' ') : // whitespace check
thrust::any_of(thrust::seq,
Expand All @@ -78,7 +79,7 @@ struct characters_tokenizer {
* string at the specified iterator position.
*
* For empty delimiter, whitespace code-point is checked.
* Starting at the given iterator (itr) position, a token
* Starting at the current_position, a token
* start position is identified when a delimiter is
* not found. Once found, the end position is identified
* when a delimiter or the end of the string is found.
Expand All @@ -87,27 +88,32 @@ struct characters_tokenizer {
*/
__device__ bool next_token()
{
if (itr != d_str.begin()) { // skip these 2 lines the first time through
++itr;
start_position = itr.byte_offset();
auto const src_ptr = d_str.data();
if (current_position != 0) { // skip these 2 lines the first time through
current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]);
start_position = current_position;
}
if (start_position >= d_str.size_bytes()) return false;
if (start_position >= d_str.size_bytes()) { return false; }
// continue search for the next token
end_position = d_str.size_bytes();
for (; itr != d_str.end(); ++itr) {
cudf::char_utf8 ch = *itr;
while (current_position < d_str.size_bytes()) {
cudf::char_utf8 ch = 0;
auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch);
if (spaces == is_delimiter(ch)) {
if (spaces)
start_position = (itr + 1).byte_offset();
else
end_position = (itr + 1).byte_offset();
current_position += chr_width;
if (spaces) {
start_position = current_position;
} else {
end_position = current_position;
}
continue;
}
spaces = !spaces;
if (spaces) {
end_position = itr.byte_offset();
end_position = current_position;
break;
}
current_position += chr_width;
}
return start_position < end_position;
}
Expand All @@ -118,18 +124,18 @@ struct characters_tokenizer {
*
* @return Byte positions of the current token.
*/
__device__ position_pair token_byte_positions()
__device__ position_pair token_byte_positions() const
{
return position_pair{start_position, end_position};
}

private:
cudf::string_view const d_str; ///< string to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters
bool spaces; ///< true if current position is delimiter
cudf::string_view::const_iterator itr; ///< current position in d_str
cudf::size_type start_position; ///< starting character position of token found
cudf::size_type end_position; ///< ending character position (excl) of token found
cudf::string_view const d_str; ///< string to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters
bool spaces; ///< true if current position is delimiter
cudf::size_type current_position; ///< current position in d_str
cudf::size_type start_position; ///< starting byte position of token found
cudf::size_type end_position; ///< ending byte position (exclusive) of token found
};

/**
Expand Down

0 comments on commit 1296ebd

Please sign in to comment.