Add num_aggregations axis and remove multistream benchmark

rapidsai · Jul 5, 2024 · 32181eb
1 parent ca049ff
commit 32181eb
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 109 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -226,13 +226,8 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  GROUPBY_NVBENCH
-  groupby/group_max.cpp
-  groupby/group_max_multistream.cpp
-  groupby/group_max_multithreaded.cpp
-  groupby/group_nunique.cpp
-  groupby/group_rank.cpp
-  groupby/group_struct_keys.cpp
+  GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
+  groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
 )
 
 # ##################################################################################################

diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
@@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state,
       cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
   }();
 
+  auto const num_aggregations = state.get_int64("num_aggregations");
+
   auto keys_view = keys->view();
   auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
-  requests[0].values = vals->view();
-  requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  for (int64_t i = 0; i < num_aggregations; i++) {
+    requests.emplace_back(cudf::groupby::aggregation_request());
+    requests[i].values = vals->view();
+    requests[i].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  }
 
   auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_element_count(
+    static_cast<double>(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
@@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max,
   .set_name("groupby_max")
   .add_int64_axis("cardinality", {0})
   .add_int64_power_of_two_axis("num_rows", {12, 18, 24})
-  .add_float64_axis("null_probability", {0, 0.1, 0.9});
+  .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32});
 
 NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t>))
   .set_name("groupby_max_cardinality")

diff --git a/cpp/benchmarks/groupby/group_max_multistream.cpp b/cpp/benchmarks/groupby/group_max_multistream.cpp
diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp
@@ -31,6 +31,7 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<T
   auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const null_probability = state.get_float64("null_probability");
   auto const num_threads      = state.get_int64("num_threads");
+  auto const num_aggregations = state.get_int64("num_aggregations");
 
   auto const keys = [&] {
     data_profile const profile =
@@ -61,28 +62,31 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<T
 
   std::vector<std::vector<cudf::groupby::aggregation_request>> requests(num_threads);
   for (int64_t i = 0; i < num_threads; i++) {
-    requests[i].emplace_back(cudf::groupby::aggregation_request());
-    requests[i][0].values = vals->view();
-    requests[i][0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+    for (int64_t j = 0; j < num_aggregations; j++) {
+      requests[i].emplace_back(cudf::groupby::aggregation_request());
+      requests[i][j].values = vals->view();
+      requests[i][j].aggregations.push_back(
+        cudf::make_max_aggregation<cudf::groupby_aggregation>());
+    }
   }
 
   auto const mem_stats_logger = cudf::memory_stats_logger();
   state.exec(
     nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
       auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); };
-      threads.paused   = true;
       for (int64_t i = 0; i < num_threads; ++i) {
         threads.submit(perform_agg, i);
       }
       timer.start();
-      threads.paused = false;
       threads.wait_for_tasks();
       cudf::detail::join_streams(streams, cudf::get_default_stream());
       timer.stop();
     });
 
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_element_count(
+    static_cast<double>(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000.,
+    "Mrows/s");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
@@ -93,4 +97,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded,
   .add_int64_axis("cardinality", {0})
   .add_int64_power_of_two_axis("num_rows", {12, 18})
   .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32})
   .add_int64_axis("num_threads", {1, 2, 4, 8});