Move some nvtext benchmarks to nvbench (#13368)
Moves some of the nvtext benchmarks to nvbench to help provide a more useful baseline when making improvements to performance. These run with varying parameters for column size and string length. The remaining benchmarks are more involved and may be updated in follow-on PRs.

Reference: #13048
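
For orientation, the migrated files all follow the same nvbench shape: read the axis values from nvbench::state, skip combinations that would overflow cudf's 32-bit size_type, report the character bytes as global-memory throughput, and time the call inside state.exec. The sketch below is a minimal, self-contained illustration of that shape, not code from this PR: the bench_example name, the cudaMemsetAsync stand-in workload, and the buffer handling are illustrative only, and it assumes the executable links against nvbench::main to supply main(), as cudf's ConfigureNVBench does.

// A minimal sketch of the nvbench pattern used by the migrated benchmarks.
// The benchmark name, axis values, and the cudaMemsetAsync stand-in workload
// are illustrative; the real benchmarks call into nvtext/cudf instead.
#include <nvbench/nvbench.cuh>

#include <cuda_runtime_api.h>

#include <cstdint>
#include <limits>

static void bench_example(nvbench::state& state)
{
  auto const num_rows  = state.get_int64("num_rows");
  auto const row_width = state.get_int64("row_width");

  // Mirror the guard added in each benchmark: skip combinations whose total
  // character count would not fit in cudf's 32-bit size_type.
  if (num_rows * row_width >= std::numeric_limits<std::int32_t>::max()) {
    state.skip("Skip benchmarks greater than size_type limit");
    return;
  }

  auto const bytes = static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width);
  void* buffer     = nullptr;
  cudaMalloc(&buffer, bytes);

  // Throughput counters let nvbench report bytes/second for each configuration.
  state.add_global_memory_reads<nvbench::int8_t>(bytes);
  state.add_global_memory_writes<nvbench::int8_t>(bytes);

  state.exec([&](nvbench::launch& launch) {
    // Stand-in for the nvtext API call being measured.
    cudaMemsetAsync(buffer, 0, bytes, launch.get_stream());
  });

  cudaFree(buffer);
}

// main() comes from linking nvbench::main, as ConfigureNVBench provides in cudf.
NVBENCH_BENCH(bench_example)
  .set_name("example")
  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});

The migrated benchmarks additionally call state.set_cuda_stream(...) so that timing happens on cudf's default stream, and pass nvbench::exec_tag::sync to indicate that the measured nvtext call synchronizes.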

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #13368
davidwendt committed May 24, 2023
1 parent 132540e commit 19554a1
Showing 5 changed files with 103 additions and 208 deletions.
9 changes: 4 additions & 5 deletions cpp/benchmarks/CMakeLists.txt
@@ -272,12 +272,11 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c

# ##################################################################################################
# * nvtext benchmark -------------------------------------------------------------------
ConfigureBench(
TEXT_BENCH text/ngrams.cpp text/normalize.cpp text/normalize_spaces.cpp text/replace.cpp
text/subword.cpp text/tokenize.cpp
)
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp)
ConfigureNVBench(
TEXT_NVBENCH text/minhash.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp
)

# ##################################################################################################
# * strings benchmark -------------------------------------------------------------------
69 changes: 30 additions & 39 deletions cpp/benchmarks/text/normalize.cpp
@@ -16,59 +16,50 @@

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvtext/normalize.hpp>

class TextNormalize : public cudf::benchmark {};
#include <nvbench/nvbench.cuh>

static void BM_normalize(benchmark::State& state, bool to_lower)
static void bench_normalize(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const normalize_type = state.get_string("type");

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
nvtext::normalize_characters(input, to_lower);
}
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.SetBytesProcessed(state.iterations() * input.chars_size());
}
auto chars_size = input.chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int8_t>(chars_size);

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
// avoid generating combinations that exceed the cudf column limit
size_t total_chars = static_cast<size_t>(row_count) * rowlen * 4;
if (total_chars < static_cast<size_t>(std::numeric_limits<cudf::size_type>::max())) {
b->Args({row_count, rowlen});
}
}
if (normalize_type == "spaces") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
} else {
bool const to_lower = (normalize_type == "to_lower");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::normalize_characters(input, to_lower);
});
}
}

#define NVTEXT_BENCHMARK_DEFINE(name, lower) \
BENCHMARK_DEFINE_F(TextNormalize, name) \
(::benchmark::State & st) { BM_normalize(st, lower); } \
BENCHMARK_REGISTER_F(TextNormalize, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

NVTEXT_BENCHMARK_DEFINE(characters, false)
NVTEXT_BENCHMARK_DEFINE(to_lower, true)
NVBENCH_BENCH(bench_normalize)
.set_name("normalize")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_string_axis("type", {"spaces", "characters", "to_lower"});
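
(For context: the size_type guard in bench_normalize above estimates the output character bytes as num_rows * row_width, so with these axes only the largest combinations are skipped, namely 16,777,216 rows at widths of 128 and above and 2,097,152 rows at width 1024, since 16,777,216 × 128 = 2,147,483,648 already exceeds the 2,147,483,647 size_type limit. The same guard and the same num_rows/row_width axes appear in the replace and tokenize benchmarks below.)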
66 changes: 0 additions & 66 deletions cpp/benchmarks/text/normalize_spaces.cpp

This file was deleted.

59 changes: 24 additions & 35 deletions cpp/benchmarks/text/replace.cpp
@@ -15,23 +15,26 @@
*/

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/string/string_bench_args.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/replace.hpp>

#include <random>
#include <nvbench/nvbench.cuh>

class TextReplace : public cudf::benchmark {};
#include <random>

static void BM_replace(benchmark::State& state)
static void bench_replace(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const n_length = static_cast<cudf::size_type>(state.range(1));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

std::vector<std::string> words{" ", "one ", "two ", "three ", "four ",
"five ", "six ", "sevén ", "eight ", "nine ",
@@ -41,46 +41,32 @@ static void BM_replace(benchmark::State& state)
std::default_random_engine generator;
std::uniform_int_distribution<int> tokens_dist(0, words.size() - 1);
std::string row; // build a row of random tokens
while (static_cast<int>(row.size()) < n_length)
while (static_cast<cudf::size_type>(row.size()) < row_width)
row += words[tokens_dist(generator)];

std::uniform_int_distribution<int> position_dist(0, 16);

auto elements = cudf::detail::make_counting_transform_iterator(
0, [&](auto idx) { return row.c_str() + position_dist(generator); });
cudf::test::strings_column_wrapper input(elements, elements + n_rows);
cudf::test::strings_column_wrapper input(elements, elements + num_rows);
cudf::strings_column_view view(input);

cudf::test::strings_column_wrapper targets({"one", "two", "sevén", "zero"});
cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"});

for (auto _ : state) {
cuda_event_timer raii(state, true);
nvtext::replace_tokens(
view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements));
}
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.SetBytesProcessed(state.iterations() * view.chars_size());
}
auto chars_size = view.chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int8_t>(chars_size);

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_multiplier = 8;
int const min_row_length = 1 << 5;
int const max_row_length = 1 << 13;
int const length_multiplier = 4;
generate_string_bench_args(
b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier);
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::replace_tokens(
view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements));
});
}

#define NVTEXT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(TextReplace, name) \
(::benchmark::State & st) { BM_replace(st); } \
BENCHMARK_REGISTER_F(TextReplace, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

NVTEXT_BENCHMARK_DEFINE(replace)
NVBENCH_BENCH(bench_replace)
.set_name("replace")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
108 changes: 45 additions & 63 deletions cpp/benchmarks/text/tokenize.cpp
@@ -16,8 +16,6 @@

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/string/string_bench_args.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

@@ -28,73 +26,57 @@
#include <nvtext/ngrams_tokenize.hpp>
#include <nvtext/tokenize.hpp>

class TextTokenize : public cudf::benchmark {};
#include <nvbench/nvbench.cuh>

enum class tokenize_type { single, multi, count, count_multi, ngrams, characters };

static void BM_tokenize(benchmark::State& state, tokenize_type tt)
static void bench_tokenize(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const tokenize_type = state.get_string("type");

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());
cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
switch (tt) {
case tokenize_type::single:
// single whitespace delimiter
nvtext::tokenize(input);
break;
case tokenize_type::multi:
nvtext::tokenize(input, cudf::strings_column_view(delimiters));
break;
case tokenize_type::count:
// single whitespace delimiter
nvtext::count_tokens(input);
break;
case tokenize_type::count_multi:
nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
break;
case tokenize_type::ngrams:
// default is bigrams
nvtext::ngrams_tokenize(input);
break;
case tokenize_type::characters:
// every character becomes a string
nvtext::character_tokenize(input);
break;
}
}
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.SetBytesProcessed(state.iterations() * input.chars_size());
}
auto chars_size = input.chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int8_t>(chars_size);

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
if (tokenize_type == "whitespace") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::tokenize(input); });
} else if (tokenize_type == "multi") {
cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::tokenize(input, cudf::strings_column_view(delimiters));
});
} else if (tokenize_type == "count") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::count_tokens(input); });
} else if (tokenize_type == "count_multi") {
cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
});
} else if (tokenize_type == "ngrams") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
} else if (tokenize_type == "characters") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
}
}

#define NVTEXT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(TextTokenize, name) \
(::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \
BENCHMARK_REGISTER_F(TextTokenize, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

NVTEXT_BENCHMARK_DEFINE(single)
NVTEXT_BENCHMARK_DEFINE(multi)
NVTEXT_BENCHMARK_DEFINE(count)
NVTEXT_BENCHMARK_DEFINE(count_multi)
NVTEXT_BENCHMARK_DEFINE(ngrams)
NVTEXT_BENCHMARK_DEFINE(characters)
NVBENCH_BENCH(bench_tokenize)
.set_name("tokenize")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"});
