From 1296ebdc2e25445ae12b4dddd0dd8ddd53daee18 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 29 Jun 2023 19:02:04 -0400
Subject: [PATCH] Performance improvement for nvtext tokenize/token functions (#13480)

Improves performance for nvtext tokenize functions by minimizing character
counting in the `characters_tokenize` utility functor in
`src/text/utilities/tokenize_ops.cuh`. Functions this change affects are:

- [`nvtext::count_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga5323d94dac99bf42f0cbb07c4fcd7242) (single delimiter and whitespace)
- [`nvtext::tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__tokenize.html#ga64c2806c398ce476fa5174f3155ea0fb) (single delimiter and whitespace)
- [`nvtext::replace_tokens`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__replace.html#ga66219b7db6155c4e14bf6f6147e1fc81)
- [`nvtext::normalize_spaces`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__normalize.html#ga9104dffc71baf77e710bc63e5e2a8837)
- [`nvtext::ngrams_tokenize`](https://docs.rapids.ai/api/libcudf/stable/group__nvtext__ngrams.html#gace17045b4ee5a3b10157ed40f9575298)

This change improved performance by at least 10% across all string lengths
for most of these functions.

Reference #13048

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13480
---
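Review notes (this section sits below the scissor line and is ignored by `git am`):
the heart of the change is replacing the `string_view` iterator walk in
`characters_tokenizer::next_token()` with explicit byte offsets, so advancing past a
character costs one UTF-8 width lookup instead of an iterator increment that may
re-count characters. Below is a minimal host-side C++ sketch of that byte-offset
technique; `bytes_in_utf8_char` is a stand-in for
`cudf::strings::detail::bytes_in_char_utf8`, and the single-space delimiter is a
simplification, so treat it as illustrative rather than the patch's device code.

```cpp
#include <cstddef>
#include <cstdio>
#include <string>

// Stand-in for cudf::strings::detail::bytes_in_char_utf8: width in bytes of
// the UTF-8 character whose first byte is `byte` (continuation bytes -> 0).
int bytes_in_utf8_char(unsigned char byte)
{
  if (byte < 0x80) { return 1; }            // ASCII
  if ((byte & 0xC0) == 0x80) { return 0; }  // continuation byte
  if ((byte & 0xE0) == 0xC0) { return 2; }
  if ((byte & 0xF0) == 0xE0) { return 3; }
  return 4;
}

int main()
{
  std::string const str = "  héllo  wörld ";
  std::size_t pos       = 0;
  while (pos < str.size()) {
    // skip the run of delimiters before the token
    while (pos < str.size() && str[pos] == ' ') { ++pos; }
    auto const start = pos;
    // advance one whole UTF-8 character at a time; `pos` only moves forward,
    // so no byte is examined twice and nothing is re-counted from the start
    while (pos < str.size() && str[pos] != ' ') {
      pos += bytes_in_utf8_char(static_cast<unsigned char>(str[pos]));
    }
    if (start < pos) { std::printf("token: [%s]\n", str.substr(start, pos - start).c_str()); }
  }
  return 0;
}
```

The same idea appears in `next_token()` in the diff: `current_position` only ever
moves forward by `chr_width` bytes, so a token boundary is never re-derived by
counting characters from the beginning of the string.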
 cpp/src/text/normalize.cu               | 20 ++++++-----
 cpp/src/text/tokenize.cu                |  8 +++--
 cpp/src/text/utilities/tokenize_ops.cuh | 48 ++++++++++++++-----------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 73d01c9f3ec..bc2b0607193 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -58,7 +58,7 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  int32_t* d_offsets{};                      // offsets into d_buffer
+  cudf::size_type* d_offsets{};              // offsets into d_chars
   char* d_chars{};                           // output buffer for characters
 
   __device__ void operator()(cudf::size_type idx)
@@ -70,8 +70,9 @@ struct normalize_spaces_fn {
     cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     char* buffer     = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    char* optr       = buffer;  // running output pointer
-    int32_t nbytes   = 0;       // holds the number of bytes per output string
+    char* optr       = buffer;  // running output pointer
+
+    cudf::size_type nbytes = 0;  // holds the number of bytes per output string
 
     // create a tokenizer for this string with whitespace delimiter (default)
     characters_tokenizer tokenizer(d_str);
@@ -79,15 +80,16 @@ struct normalize_spaces_fn {
     // this will retrieve tokens automatically skipping runs of whitespace
     while (tokenizer.next_token()) {
       auto const token_pos = tokenizer.token_byte_positions();
-      nbytes += token_pos.second - token_pos.first + 1;  // token size plus a single space
+      auto const token =
+        cudf::string_view(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
       if (optr) {
-        cudf::string_view const token(d_str.data() + token_pos.first,
-                                      token_pos.second - token_pos.first);
-        if (optr != buffer)  // prepend space unless we are at the beginning
-          optr = cudf::strings::detail::copy_string(optr, single_space);
+        // prepend space unless we are at the beginning
+        if (optr != buffer) { optr = cudf::strings::detail::copy_string(optr, single_space); }
         // write token to output buffer
-        optr = cudf::strings::detail::copy_string(optr, token);
+        thrust::copy_n(thrust::seq, token.data(), token.size_bytes(), optr);
+        optr += token.size_bytes();
       }
+      nbytes += token.size_bytes() + 1;  // token size plus a single space
     }
     // remove trailing space
     if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index d0e366cb081..8604152099c 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -14,6 +14,11 @@
  * limitations under the License.
  */
 
+#include 
+
+#include 
+#include 
+
 #include 
 #include 
 #include 
@@ -24,9 +29,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
 #include 
 #include 
 
diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh
index b00b6e22184..003c041c0bf 100644
--- a/cpp/src/text/utilities/tokenize_ops.cuh
+++ b/cpp/src/text/utilities/tokenize_ops.cuh
@@ -15,6 +15,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 
@@ -50,7 +51,7 @@ struct characters_tokenizer {
     : d_str{d_str},
       d_delimiter{d_delimiter},
       spaces{true},
-      itr{d_str.begin()},
+      current_position{0},
       start_position(0),
       end_position(d_str.size_bytes())
   {
@@ -64,7 +65,7 @@ struct characters_tokenizer {
    * @param chr The character to test.
    * @return true if the character is a delimiter
    */
-  __device__ bool is_delimiter(cudf::char_utf8 chr)
+  __device__ bool is_delimiter(cudf::char_utf8 chr) const
   {
     return d_delimiter.empty() ? (chr <= ' ') :  // whitespace check
                                  thrust::any_of(thrust::seq,
@@ -78,7 +79,7 @@ struct characters_tokenizer {
    * string at the specified iterator position.
    *
    * For empty delimiter, whitespace code-point is checked.
-   * Starting at the given iterator (itr) position, a token
+   * Starting at the current_position, a token
    * start position is identified when a delimiter is
    * not found. Once found, the end position is identified
    * when a delimiter or the end of the string is found.
@@ -87,27 +88,32 @@
    */
   __device__ bool next_token()
   {
-    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
-      ++itr;
-      start_position = itr.byte_offset();
+    auto const src_ptr = d_str.data();
+    if (current_position != 0) {  // skip these 2 lines the first time through
+      current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]);
+      start_position = current_position;
     }
-    if (start_position >= d_str.size_bytes()) return false;
+    if (start_position >= d_str.size_bytes()) { return false; }
     // continue search for the next token
     end_position = d_str.size_bytes();
-    for (; itr != d_str.end(); ++itr) {
-      cudf::char_utf8 ch = *itr;
+    while (current_position < d_str.size_bytes()) {
+      cudf::char_utf8 ch   = 0;
+      auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch);
       if (spaces == is_delimiter(ch)) {
-        if (spaces)
-          start_position = (itr + 1).byte_offset();
-        else
-          end_position = (itr + 1).byte_offset();
+        current_position += chr_width;
+        if (spaces) {
+          start_position = current_position;
+        } else {
+          end_position = current_position;
+        }
         continue;
       }
       spaces = !spaces;
       if (spaces) {
-        end_position = itr.byte_offset();
+        end_position = current_position;
         break;
       }
+      current_position += chr_width;
     }
     return start_position < end_position;
   }
@@ -118,18 +124,18 @@
    *
    * @return Byte positions of the current token.
    */
-  __device__ position_pair token_byte_positions()
+  __device__ position_pair token_byte_positions() const
   {
     return position_pair{start_position, end_position};
   }
 
  private:
-  cudf::string_view const d_str;          ///< string to tokenize
-  cudf::string_view const d_delimiter;    ///< delimiter characters
-  bool spaces;                            ///< true if current position is delimiter
-  cudf::string_view::const_iterator itr;  ///< current position in d_str
-  cudf::size_type start_position;         ///< starting character position of token found
-  cudf::size_type end_position;           ///< ending character position (excl) of token found
+  cudf::string_view const d_str;        ///< string to tokenize
+  cudf::string_view const d_delimiter;  ///< delimiter characters
+  bool spaces;                          ///< true if current position is delimiter
+  cudf::size_type current_position;     ///< current position in d_str
+  cudf::size_type start_position;       ///< starting byte position of token found
+  cudf::size_type end_position;         ///< ending byte position (exclusive) of token found
 };
 
 /**
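Review notes (after the diff, also ignored by `git am`): for anyone wanting to
exercise the affected entry points, a rough libcudf usage sketch follows. It assumes
the 23.08-era public signatures (default whitespace delimiter, default memory
resource) and uses the `cudf::test` column wrapper purely for brevity, so treat it
as a sketch rather than a benchmark harness.

```cpp
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <nvtext/normalize.hpp>
#include <nvtext/tokenize.hpp>

int main()
{
  // small strings column; the test wrapper keeps the sketch short
  cudf::test::strings_column_wrapper input({"  the quick  brown fox  ", "jumps   over"});
  auto const view = cudf::strings_column_view(input);

  // all three calls run through the characters_tokenizer::next_token() path
  // touched by this patch (default arguments select whitespace tokenization)
  auto const tokens     = nvtext::tokenize(view);
  auto const counts     = nvtext::count_tokens(view);
  auto const normalized = nvtext::normalize_spaces(view);

  return 0;
}
```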