rapidsai · rapids-bot · Jul 12, 2023 · Jul 3, 2023 · Jul 3, 2023 · Jul 5, 2023
@@ -275,7 +275,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c
 ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/minhash.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp
+  TEXT_NVBENCH text/hash_ngrams.cpp text/minhash.cpp text/normalize.cpp text/replace.cpp
+  text/tokenize.cpp
 )
 
 # ##################################################################################################

@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <nvtext/generate_ngrams.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <rmm/device_buffer.hpp>
+
+/**
+ * @brief Benchmark for nvtext::hash_character_ngrams
+ *
+ * Generates a random strings column from the `num_rows` and `row_width` axes
+ * (`row_width` is passed as the upper bound of the NORMAL string-length distribution)
+ * and measures hashing its character ngrams of width `ngrams`.
+ *
+ * @param state nvbench state providing the axis values and collecting the measurements
+ */
+static void bench_hash_ngrams(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const ngrams    = static_cast<cudf::size_type>(state.get_int64("ngrams"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+    // skip() only flags this configuration; return so that no input is generated
+    // and nothing is executed for it
+    return;
+  }
+
+  data_profile const strings_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const strings_table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
+  cudf::strings_column_view input(strings_table->view().column(0));
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  auto chars_size = input.chars_size();
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  // output are hashes: approximate total number of hashes
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows * ngrams);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = nvtext::hash_character_ngrams(input, ngrams);
+  });
+}
+
+// NOTE(review): 16364 looks like a typo for 16384 (2^14) since every other num_rows
+// value is a power of two -- confirm before changing, it alters the measured sizes.
+NVBENCH_BENCH(bench_hash_ngrams)
+  .set_name("hash_ngrams")
+  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
+  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("ngrams", {5, 10});
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <nvtext/generate_ngrams.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace nvtext {
+namespace detail {
+
+/**
+ * @copydoc hash_character_ngrams(cudf::strings_column_view const&,
+ * cudf::size_type, rmm::mr::device_memory_resource*)
+ *
+ * This overload takes the CUDA stream explicitly.
+ *
+ * @param stream CUDA stream used for allocating/copying device memory and launching kernels
+ */
+std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
+                                                    cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace nvtext
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -90,5 +90,38 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
   cudf::size_type ngrams              = 2,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Hashes ngrams of characters within each string
+ *
+ * Each character of a string is used to build the ngrams, and ngrams are not
+ * produced across adjacent string rows.
+ *
+ * ```
+ * "abcdefg" would generate ngrams=5 as ["abcde", "bcdef", "cdefg"]
+ * ```
+ *
+ * The ngrams for each string are hashed and returned in a list column where
+ * the offsets specify rows of hash values for each string.
+ *
+ * The size of the child column will be the total number of ngrams generated from
+ * the input strings column.
+ *
+ * All null row entries are ignored (no ngrams are generated for them) and the
+ * output contains all valid rows.
+ *
+ * The hash algorithm uses MurmurHash3_32 with a seed of 0 on each ngram.
+ *
+ * @throw cudf::logic_error if `ngrams < 5`
+ * @throw cudf::logic_error if there are not enough characters to generate any ngrams
+ *
+ * @param strings Strings column to produce ngrams from.
+ * @param ngrams The number of characters in each ngram. Default is 5.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return A lists column of hash values
+ */
+std::unique_ptr<cudf::column> hash_character_ngrams(
+  cudf::strings_column_view const& strings,
+  cudf::size_type ngrams              = 5,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace nvtext
@@ -14,13 +14,17 @@
  * limitations under the License.
  */
 
-#include <nvtext/generate_ngrams.hpp>
+#include <nvtext/detail/generate_ngrams.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy_if.cuh>
+#include <cudf/detail/hashing.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/hash_functions.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -156,31 +160,79 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 namespace detail {
 namespace {
 
-struct character_ngram_generator_fn {
+/**
+ * @brief Base class for generating character ngrams
+ *
+ * The ngrams are produced for each string and the derived class's
+ * `process_ngram` function is called for each ngram/substring.
+ *
+ * Null and empty rows are skipped. For row `idx`, the number of ngrams is
+ * `d_ngram_offsets[idx + 1] - d_ngram_offsets[idx]`. Each ngram is passed as a
+ * `cudf::string_view` over the row's own character data (no copy is made),
+ * together with its output index: `d_ngram_offsets[idx]` plus the ngram's
+ * position within the row.
+ *
+ * @tparam Derived class uses the CRTP pattern to reuse code logic. It must provide a
+ *         const device member function `process_ngram(cudf::string_view, cudf::size_type)`.
+ */
+template <typename Derived>
+struct base_character_ngram_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
-  int32_t const* d_ngram_offsets{};
-  cudf::size_type* d_offsets{};
-  char* d_chars{};
+  cudf::size_type const* d_ngram_offsets{};
 
-  __device__ void operator()(cudf::size_type idx)
+  base_character_ngram_fn(cudf::column_device_view const& d_strings,
+                          cudf::size_type ngrams,
+                          cudf::size_type const* d_ngram_offsets)
+    : d_strings(d_strings), ngrams(ngrams), d_ngram_offsets(d_ngram_offsets)
+  {
+  }
+
+  __device__ void operator()(cudf::size_type idx) const
   {
     if (d_strings.is_null(idx)) return;
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     if (d_str.empty()) return;
+    auto const& derived     = static_cast<Derived const&>(*this);
     auto itr                = d_str.begin();
     auto const ngram_offset = d_ngram_offsets[idx];
     auto const ngram_count  = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto d_sizes            = d_offsets + ngram_offset;
-    auto out_ptr            = d_chars ? d_chars + *d_sizes : nullptr;
     for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
       auto const begin = itr.byte_offset();
       auto const end   = (itr + ngrams).byte_offset();
-      if (out_ptr)
-        out_ptr =
-          cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
-      else
-        *d_sizes++ = end - begin;
+      auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
+      derived.process_ngram(ngram, n + ngram_offset);
+    }
+  }
+};
+
+/**
+ * @brief Functor that emits the character ngrams of each string as new strings
+ *
+ * A single input string yields one output string per ngram, so the number of
+ * outputs depends on the ngram width and the string size.
+ * The functor is intended for `make_strings_children`, which uses it to build
+ * the offsets and the chars child columns.
+ */
+struct character_ngram_generator_fn : base_character_ngram_fn<character_ngram_generator_fn> {
+  cudf::size_type* d_offsets{};
+  char* d_chars{};
+
+  character_ngram_generator_fn(cudf::column_device_view const& d_strings,
+                               cudf::size_type ngrams,
+                               cudf::size_type const* d_ngram_offsets)
+    : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets)
+  {
+  }
+
+  /**
+   * @brief Invoked by the base class once per ngram
+   *
+   * While `d_chars` is null, the ngram's size in bytes is recorded in `d_offsets`;
+   * otherwise the ngram's bytes are copied into `d_chars` at the position read
+   * from `d_offsets`.
+   *
+   * @param d_str The ngram substring to process
+   * @param offset Index of this ngram's entry in d_offsets
+   */
+  __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
+  {
+    auto& entry = d_offsets[offset];
+    if (d_chars == nullptr) {
+      entry = d_str.size_bytes();
+    } else {
+      cudf::strings::detail::copy_string(d_chars + entry, d_str);
     }
   }
 };
@@ -229,6 +281,70 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
     total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
 }
 
+namespace {
+/**
+ * @brief Functor that stores the hash of every ngram handed over by the base class
+ */
+struct character_ngram_hash_fn : base_character_ngram_fn<character_ngram_hash_fn> {
+  cudf::hash_value_type* d_hashes;
+
+  character_ngram_hash_fn(cudf::column_device_view const& d_strings,
+                          cudf::size_type ngrams,
+                          cudf::size_type const* d_ngram_offsets,
+                          cudf::hash_value_type* d_hashes)
+    : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets), d_hashes(d_hashes)
+  {
+  }
+
+  /**
+   * @brief Invoked by the base class once per ngram
+   *
+   * @param d_str The ngram substring to hash
+   * @param offset Index in d_hashes that receives the hash value
+   */
+  __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
+  {
+    // hasher is constructed with a fixed seed of 0
+    d_hashes[offset] = cudf::detail::MurmurHash3_32<cudf::string_view>{0}(d_str);
+  }
+};
+}  // namespace
+
+// Stream-aware implementation: counts the ngrams per row to build the list offsets,
+// then hashes every ngram into the child column of the returned lists column.
+std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
+                                                    cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(ngrams >= 5, "Parameter ngrams should be an integer value of 5 or greater");
+
+  auto const hash_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
+  // NOTE(review): for empty input this returns an empty column of the hash type rather
+  // than an empty lists column as the API documents -- confirm this is intended.
+  if (input.is_empty()) { return cudf::make_empty_column(hash_type); }
+
+  auto const strings_dv = cudf::column_device_view::create(input.parent(), stream);
+
+  // ngrams per row: null rows and rows with fewer than `ngrams` characters yield 0
+  auto counts = cudf::detail::make_counting_transform_iterator(
+    0, [d_strings = *strings_dv, ngrams] __device__(auto idx) {
+      if (d_strings.is_null(idx)) { return 0; }
+      auto const count = d_strings.element<cudf::string_view>(idx).length() + 1 - ngrams;
+      return count > 0 ? count : 0;
+    });
+  auto [offsets, total_ngrams] =
+    cudf::detail::make_offsets_child_column(counts, counts + input.size(), stream, mr);
+
+  CUDF_EXPECTS(total_ngrams > 0,
+               "Insufficient number of characters in each string to generate ngrams");
+
+  auto const d_offsets = offsets->view().data<cudf::size_type>();
+
+  // one hash value per ngram; the functor is invoked once per input row
+  auto hashes =
+    cudf::make_numeric_column(hash_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr);
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::counting_iterator<cudf::size_type>(0),
+    input.size(),
+    character_ngram_hash_fn{
+      *strings_dv, ngrams, d_offsets, hashes->mutable_view().data<cudf::hash_value_type>()});
+
+  return cudf::make_lists_column(
+    input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
@@ -239,4 +355,12 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
   return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
 }
 
+// Public entry point: forwards to the detail implementation on the default stream.
+std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
+                                                    cudf::size_type ngrams,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const stream = cudf::get_default_stream();
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
+}
+
 }  // namespace nvtext
@@ -132,4 +132,37 @@ TEST_F(TextGenerateNgramsTest, Errors)
                cudf::logic_error);
 }
 
+TEST_F(TextGenerateNgramsTest, NgramsHash)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+
+  // default ngram width (5) is used
+  auto results = nvtext::hash_character_ngrams(cudf::strings_column_view(input));
+
+  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
+  // Row 0 has 19 characters -> 15 hashes; row 1 has 25 characters -> 21 hashes
+  // clang-format off
+  LCW expected({LCW{2169381797u, 3924065905u, 1634753325u, 3766025829u,  771291085u,
+                    2286480985u, 2815102125u, 2383213292u, 1587939873u, 3417728802u,
+                     741580288u, 1721912990u, 3322339040u, 2530504717u, 1448945146u},
+                LCW{3542029734u, 2351937583u, 2373822151u, 2610417165u, 1303810911u,
+                    2541942822u, 1736466351u, 3466558519u,  408633648u, 1698719372u,
+                     620653030u,   16851044u,  608863326u,  948572753u, 3672211877u,
+                    4097451013u, 1444462157u, 3762829398u,  743082018u, 2953783152u,
+                    2319357747u}});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
+TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
+{
+  auto single_chars = cudf::test::strings_column_wrapper({"1", "2", "3"});
+  auto sv           = cudf::strings_column_view(single_chars);
+
+  // an ngram width below the minimum of 5 is rejected
+  EXPECT_THROW(nvtext::hash_character_ngrams(sv, 1), cudf::logic_error);
+  // with the default width, single-character rows cannot produce any ngram
+  EXPECT_THROW(nvtext::hash_character_ngrams(sv), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()