From 1296ebdc2e25445ae12b4dddd0dd8ddd53daee18 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 29 Jun 2023 19:02:04 -0400
Subject: [PATCH] Performance improvement for nvtext tokenize/token functions (#13480)

Improves performance for nvtext tokenize functions by minimizing character
counting in the `characters_tokenize` utility functor in
`src/text/utilities/tokenize_ops.cuh`. Functions this change affects are:

- [`nvtext::count_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga5323d94dac99bf42f0cbb07c4fcd7242) (single delimiter and whitespace)
- [`nvtext::tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga64c2806c398ce476fa5174f3155ea0fb) (single delimiter and whitespace)
- [`nvtext::replace_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__replace.html#ga66219b7db6155c4e14bf6f6147e1fc81)
- [`nvtext::normalize_spaces`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__normalize.html#ga9104dffc71baf77e710bc63e5e2a8837)
- [`nvtext::ngrams_tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__ngrams.html#gace17045b4ee5a3b10157ed40f9575298)

This change improved performance by at least 10% across all string lengths
for most of these functions.

Reference #13048

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13480
---
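Review notes (this section sits below the scissor line and is ignored by `git am`):
the heart of the change is replacing the `string_view` iterator walk in
`characters_tokenizer::next_token()` with explicit byte offsets, so advancing past a
character costs one UTF-8 width lookup instead of an iterator increment that may
re-count characters. Below is a minimal host-side C++ sketch of that byte-offset
technique; `bytes_in_utf8_char` is a stand-in for
`cudf::strings::detail::bytes_in_char_utf8`, and the single-space delimiter is a
simplification, so treat it as illustrative rather than the patch's device code.

```cpp
#include <cstddef>
#include <cstdio>
#include <string>

// Stand-in for cudf::strings::detail::bytes_in_char_utf8: width in bytes of
// the UTF-8 character whose first byte is `byte` (continuation bytes -> 0).
int bytes_in_utf8_char(unsigned char byte)
{
  if (byte < 0x80) { return 1; }            // ASCII
  if ((byte & 0xC0) == 0x80) { return 0; }  // continuation byte
  if ((byte & 0xE0) == 0xC0) { return 2; }
  if ((byte & 0xF0) == 0xE0) { return 3; }
  return 4;
}

int main()
{
  std::string const str = "  héllo  wörld ";
  std::size_t pos       = 0;
  while (pos < str.size()) {
    // skip the run of delimiters before the token
    while (pos < str.size() && str[pos] == ' ') { ++pos; }
    auto const start = pos;
    // advance one whole UTF-8 character at a time; `pos` only moves forward,
    // so no byte is examined twice and nothing is re-counted from the start
    while (pos < str.size() && str[pos] != ' ') {
      pos += bytes_in_utf8_char(static_cast<unsigned char>(str[pos]));
    }
    if (start < pos) { std::printf("token: [%s]\n", str.substr(start, pos - start).c_str()); }
  }
  return 0;
}
```

The same idea appears in `next_token()` in the diff: `current_position` only ever
moves forward by `chr_width` bytes, so a token boundary is never re-derived by
counting characters from the beginning of the string.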
 cpp/src/text/normalize.cu               | 20 ++++++-----
 cpp/src/text/tokenize.cu                |  8 +++--
 cpp/src/text/utilities/tokenize_ops.cuh | 48 ++++++++++++++-----------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 73d01c9f3ec..bc2b0607193 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -58,7 +58,7 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  int32_t* d_offsets{};                      // offsets into d_buffer
+  cudf::size_type* d_offsets{};              // offsets into d_chars
   char* d_chars{};                           // output buffer for characters
 
   __device__ void operator()(cudf::size_type idx)
@@ -70,8 +70,9 @@ struct normalize_spaces_fn {
     cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     char* buffer     = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    char* optr       = buffer;  // running output pointer
-    int32_t nbytes   = 0;       // holds the number of bytes per output string
+    char* optr       = buffer;  // running output pointer
+
+    cudf::size_type nbytes = 0;  // holds the number of bytes per output string
 
     // create a tokenizer for this string with whitespace delimiter (default)
     characters_tokenizer tokenizer(d_str);
@@ -79,15 +80,16 @@ struct normalize_spaces_fn {
     // this will retrieve tokens automatically skipping runs of whitespace
     while (tokenizer.next_token()) {
       auto const token_pos = tokenizer.token_byte_positions();
-      nbytes += token_pos.second - token_pos.first + 1;  // token size plus a single space
+      auto const token =
+        cudf::string_view(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
       if (optr) {
-        cudf::string_view const token(d_str.data() + token_pos.first,
-                                      token_pos.second - token_pos.first);
-        if (optr != buffer)  // prepend space unless we are at the beginning
-          optr = cudf::strings::detail::copy_string(optr, single_space);
+        // prepend space unless we are at the beginning
+        if (optr != buffer) { optr = cudf::strings::detail::copy_string(optr, single_space); }
         // write token to output buffer
-        optr = cudf::strings::detail::copy_string(optr, token);
+        thrust::copy_n(thrust::seq, token.data(), token.size_bytes(), optr);
+        optr += token.size_bytes();
       }
+      nbytes += token.size_bytes() + 1;  // token size plus a single space
     }
     // remove trailing space
     if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index d0e366cb081..8604152099c 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -14,6 +14,11 @@
  * limitations under the License.
  */
 
+#include 
+
+#include 
+#include 
+
 #include 
 #include 
 #include 
@@ -24,9 +29,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
 #include 
 #include 
 
diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh
index b00b6e22184..003c041c0bf 100644
--- a/cpp/src/text/utilities/tokenize_ops.cuh
+++ b/cpp/src/text/utilities/tokenize_ops.cuh
@@ -15,6 +15,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 
@@ -50,7 +51,7 @@ struct characters_tokenizer {
     : d_str{d_str},
       d_delimiter{d_delimiter},
       spaces{true},
-      itr{d_str.begin()},
+      current_position{0},
       start_position(0),
       end_position(d_str.size_bytes())
   {
@@ -64,7 +65,7 @@ struct characters_tokenizer {
    * @param chr The character to test.
    * @return true if the character is a delimiter
    */
-  __device__ bool is_delimiter(cudf::char_utf8 chr)
+  __device__ bool is_delimiter(cudf::char_utf8 chr) const
   {
     return d_delimiter.empty() ? (chr <= ' ') :  // whitespace check
                                  thrust::any_of(thrust::seq,
@@ -78,7 +79,7 @@ struct characters_tokenizer {
    * string at the specified iterator position.
    *
    * For empty delimiter, whitespace code-point is checked.
-   * Starting at the given iterator (itr) position, a token
+   * Starting at the current_position, a token
    * start position is identified when a delimiter is
    * not found. Once found, the end position is identified
    * when a delimiter or the end of the string is found.
@@ -87,27 +88,32 @@
    */
   __device__ bool next_token()
   {
-    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
-      ++itr;
-      start_position = itr.byte_offset();
+    auto const src_ptr = d_str.data();
+    if (current_position != 0) {  // skip these 2 lines the first time through
+      current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]);
+      start_position = current_position;
     }
-    if (start_position >= d_str.size_bytes()) return false;
+    if (start_position >= d_str.size_bytes()) { return false; }
     // continue search for the next token
     end_position = d_str.size_bytes();
-    for (; itr != d_str.end(); ++itr) {
-      cudf::char_utf8 ch = *itr;
+    while (current_position < d_str.size_bytes()) {
+      cudf::char_utf8 ch   = 0;
+      auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch);
       if (spaces == is_delimiter(ch)) {
-        if (spaces)
-          start_position = (itr + 1).byte_offset();
-        else
-          end_position = (itr + 1).byte_offset();
+        current_position += chr_width;
+        if (spaces) {
+          start_position = current_position;
+        } else {
+          end_position = current_position;
+        }
         continue;
       }
       spaces = !spaces;
       if (spaces) {
-        end_position = itr.byte_offset();
+        end_position = current_position;
         break;
       }
+      current_position += chr_width;
     }
     return start_position < end_position;
   }
@@ -118,18 +124,18 @@
    *
    * @return Byte positions of the current token.
    */
-  __device__ position_pair token_byte_positions()
+  __device__ position_pair token_byte_positions() const
   {
     return position_pair{start_position, end_position};
   }
 
  private:
-  cudf::string_view const d_str;          ///< string to tokenize
-  cudf::string_view const d_delimiter;    ///< delimiter characters
-  bool spaces;                            ///< true if current position is delimiter
-  cudf::string_view::const_iterator itr;  ///< current position in d_str
-  cudf::size_type start_position;         ///< starting character position of token found
-  cudf::size_type end_position;           ///< ending character position (excl) of token found
+  cudf::string_view const d_str;        ///< string to tokenize
+  cudf::string_view const d_delimiter;  ///< delimiter characters
+  bool spaces;                          ///< true if current position is delimiter
+  cudf::size_type current_position;     ///< current position in d_str
+  cudf::size_type start_position;       ///< starting byte position of token found
+  cudf::size_type end_position;         ///< ending byte position (exclusive) of token found
 };
 
 /**
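Review notes (after the diff, also ignored by `git am`): for anyone wanting to
exercise the affected entry points, a rough libcudf usage sketch follows. It assumes
the 23.08-era public signatures (default whitespace delimiter, default memory
resource) and uses the `cudf::test` column wrapper purely for brevity, so treat it
as a sketch rather than a benchmark harness.

```cpp
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <nvtext/normalize.hpp>
#include <nvtext/tokenize.hpp>

int main()
{
  // small strings column; the test wrapper keeps the sketch short
  cudf::test::strings_column_wrapper input({"  the quick  brown fox  ", "jumps   over"});
  auto const view = cudf::strings_column_view(input);

  // all three calls run through the characters_tokenizer::next_token() path
  // touched by this patch (default arguments select whitespace tokenization)
  auto const tokens     = nvtext::tokenize(view);
  auto const counts     = nvtext::count_tokens(view);
  auto const normalized = nvtext::normalize_spaces(view);

  return 0;
}
```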