From 420502d4828836b0fe539ef2c6b4f9400e6bb854 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Aug 2023 11:45:33 -0400 Subject: [PATCH 1/2] Fix character-ngrams performance regression for longer strings --- cpp/src/text/generate_ngrams.cu | 113 ++++++++++++-------------------- 1 file changed, 41 insertions(+), 72 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index e0a5b860005..05eb32ca3dc 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -160,82 +160,41 @@ namespace detail { namespace { /** - * @brief Base class for generating character ngrams - * - * The ngrams are produced for each string and the derived class's - * `process_ngram` function is called for each ngram/substring. + * @brief Generate character ngrams for each string * - * @tparam Derived class uses the CRTP pattern to reuse code logic. + * Each string produces many strings depending on the ngram width and the string size. + * This functor can be used with `make_strings_children` to build the offsets and + * the chars child columns. */ -template -struct base_character_ngram_fn { +struct character_ngram_generator_fn { cudf::column_device_view const d_strings; cudf::size_type ngrams; - cudf::size_type const* d_ngram_offsets{}; - - base_character_ngram_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets) - : d_strings(d_strings), ngrams(ngrams), d_ngram_offsets(d_ngram_offsets) - { - } + int32_t const* d_ngram_offsets{}; + cudf::size_type* d_offsets{}; + char* d_chars{}; - __device__ void operator()(cudf::size_type idx) const + __device__ void operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) return; auto const d_str = d_strings.element(idx); if (d_str.empty()) return; - auto const& derived = static_cast(*this); auto itr = d_str.begin(); auto const ngram_offset = d_ngram_offsets[idx]; auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset; + auto d_sizes = d_offsets + ngram_offset; + auto out_ptr = d_chars ? d_chars + *d_sizes : nullptr; for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) { auto const begin = itr.byte_offset(); auto const end = (itr + ngrams).byte_offset(); - auto const ngram = cudf::string_view(d_str.data() + begin, end - begin); - derived.process_ngram(ngram, n + ngram_offset); + if (d_chars) { + out_ptr = + cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin)); + } else { + *d_sizes++ = end - begin; + } } } }; - -/** - * @brief Generate character ngrams for each string - * - * Each string produces many strings depending on the ngram width and the string size. - * This functor can be used with `make_strings_children` to build the offsets and - * the chars child columns. - */ -struct character_ngram_generator_fn : base_character_ngram_fn { - cudf::size_type* d_offsets{}; - char* d_chars{}; - - character_ngram_generator_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets) - : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets) - { - } - - /** - * @brief Called through the base class for each ngram - * - * Either stores the size of each string or copies the string to the output - * - * @param d_str The ngram substring to process - * @param offset The output position relative to d_offsets - */ - __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const - { - auto d_str_offsets = d_offsets + offset; - if (d_chars) { - auto out_ptr = d_chars + *d_str_offsets; - cudf::strings::detail::copy_string(out_ptr, d_str); - } else { - *d_str_offsets = d_str.size_bytes(); - } - } -}; - } // namespace std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, @@ -282,23 +241,33 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie namespace { /** - * @brief Computes the hash of each ngram as produced by the base class + * @brief Computes the hash of each character ngram + * + * Each thread processes a single string. Substrings are resolved for every character + * of the string and hashed. */ -struct character_ngram_hash_fn : base_character_ngram_fn { - cudf::hash_value_type* d_hashes; - - character_ngram_hash_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets, - cudf::hash_value_type* d_hashes) - : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets), d_hashes(d_hashes) - { - } +struct character_ngram_hash_fn { + cudf::column_device_view const d_strings; + cudf::size_type ngrams; + cudf::size_type const* d_ngram_offsets; + cudf::hash_value_type* d_results; - __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const + __device__ void operator()(cudf::size_type idx) const { - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; - d_hashes[offset] = hasher(d_str); + if (d_strings.is_null(idx)) return; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return; + auto itr = d_str.begin(); + auto const ngram_offset = d_ngram_offsets[idx]; + auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto d_hashes = d_results + ngram_offset; + for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) { + auto const begin = itr.byte_offset(); + auto const end = (itr + ngrams).byte_offset(); + auto const ngram = cudf::string_view(d_str.data() + begin, end - begin); + *d_hashes++ = hasher(ngram); + } } }; } // namespace From 24c4ac286784d49393bef7501a424584bd88a6b1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Aug 2023 13:54:10 -0400 Subject: [PATCH 2/2] change int32 to size_type --- cpp/src/text/generate_ngrams.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 05eb32ca3dc..938fd45246d 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -169,7 +169,7 @@ namespace { struct character_ngram_generator_fn { cudf::column_device_view const d_strings; cudf::size_type ngrams; - int32_t const* d_ngram_offsets{}; + cudf::size_type const* d_ngram_offsets{}; cudf::size_type* d_offsets{}; char* d_chars{}; @@ -212,7 +212,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto const d_strings = *strings_column; // create a vector of ngram offsets for each string - rmm::device_uvector ngram_offsets(strings_count + 1, stream); + rmm::device_uvector ngram_offsets(strings_count + 1, stream); thrust::transform_exclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -221,7 +221,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie [d_strings, strings_count, ngrams] __device__(auto idx) { if (d_strings.is_null(idx) || (idx == strings_count)) return 0; auto const length = d_strings.element(idx).length(); - return std::max(0, static_cast(length + 1 - ngrams)); + return std::max(0, static_cast(length + 1 - ngrams)); }, cudf::size_type{0}, thrust::plus());