Convert benchmarks to use device_uvector #8208

Merged May 13, 2021 (5 commits)
Changes from 3 commits
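
The recurring change in this PR is replacing rmm::device_vector (a thrust-backed container) with rmm::device_uvector. A minimal sketch of the pattern, not taken from the diff and using only the rmm APIs the diff itself exercises (the (size, stream) constructor, data(), and rmm::cuda_stream_default):

// Illustrative sketch only; make_benchmark_buffer is a hypothetical helper.
#include <cstddef>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T> make_benchmark_buffer(std::size_t n, rmm::cuda_stream_view stream)
{
  // Before: rmm::device_vector<T> buf(n, T{0});   // no stream argument, zero-filled
  rmm::device_uvector<T> buf(n, stream);           // stream-ordered, elements uninitialized

  // Before: buf.data().get()  (thrust device_ptr unwrapped to a raw pointer)
  // After:  buf.data() is already a raw device pointer, usable directly in
  //         kernel launches, cub calls, and cudaMemcpy.
  return buf;
}

Because device_uvector does not value-initialize its elements, call sites that relied on device_vector's zero-fill have to zero the buffer explicitly, which is what the make_zeroed_device_uvector_sync calls in the diff below take care of.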
30 changes: 17 additions & 13 deletions cpp/benchmarks/iterator/iterator_benchmark.cu
@@ -17,16 +17,20 @@
#include <benchmark/benchmark.h>

#include <cudf_test/column_wrapper.hpp>
#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

#include <cudf/detail/iterator.cuh> // include iterator header
// for reduction tests
#include <thrust/device_vector.h>
#include <cub/device/device_reduce.cuh>
#include <cudf/detail/utilities/device_operators.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <cudf/detail/iterator.cuh>

#include <rmm/device_uvector.hpp>

#include <cub/device/device_reduce.cuh>

#include <random>

template <typename T>
T random_int(T min, T max)
@@ -59,7 +63,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite

// -----------------------------------------------------------------------------
template <typename T>
void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "raw stream cub: " << "\t";

@@ -71,7 +75,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
};

template <typename T, bool has_null>
void iterator_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "iterator cub " << ( (has_null) ? "<true>: " : "<false>: " ) << "\t";

@@ -89,7 +93,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)

// -----------------------------------------------------------------------------
template <typename T>
void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &result)
void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "raw stream thust: " << "\t\t";

@@ -100,7 +104,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &resu
}

template <typename T, bool has_null>
void iterator_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &result)
void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "iterator thust " << ( (has_null) ? "<true>: " : "<false>: " ) << "\t";

@@ -131,7 +135,7 @@ void BM_iterator(benchmark::State &state)
cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_F(num_gen, num_gen + column_size);
cudf::column_view hasnull_F = wrap_hasnull_F;

rmm::device_vector<T> dev_result(1, T{0});
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<TypeParam>(1);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
@@ -163,7 +167,7 @@ __device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pa
// -----------------------------------------------------------------------------
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view &col,
rmm::device_vector<thrust::pair<T, bool>> &result)
rmm::device_uvector<thrust::pair<T, bool>> &result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
@@ -174,7 +178,7 @@ void pair_iterator_bench_cub(cudf::column_view &col,

template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view &col,
rmm::device_vector<thrust::pair<T, bool>> &result)
rmm::device_uvector<thrust::pair<T, bool>> &result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
@@ -198,7 +202,7 @@ void BM_pair_iterator(benchmark::State &state)
cudf::column_view hasnull_F = wrap_hasnull_F;
cudf::column_view hasnull_T = wrap_hasnull_T;

rmm::device_vector<thrust::pair<T, bool>> dev_result(1, {T{0}, false});
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(1);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
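In the hunks above, the one-element result buffers previously built as rmm::device_vector<T>(1, T{0}) are now created with cudf::detail::make_zeroed_device_uvector_sync. As a hedged sketch of what such a factory amounts to (the real signature and defaults live in <cudf/detail/utilities/vector_factories.hpp> and may differ), assuming CUDA_TRY and rmm::cuda_stream_default as used elsewhere in this diff:

// Illustrative sketch; not the cuDF implementation.
#include <cstddef>

#include <cuda_runtime_api.h>

#include <cudf/utilities/error.hpp>  // CUDA_TRY, as used in generate_input_tables.cuh

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T> make_zeroed_uvector(std::size_t size,
                                           rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
  rmm::device_uvector<T> v(size, stream);  // allocation only, elements uninitialized
  CUDA_TRY(cudaMemsetAsync(v.data(), 0, size * sizeof(T), stream.value()));
  stream.synchronize();  // the "_sync" flavor: contents are ready when this returns
  return v;
}
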
35 changes: 19 additions & 16 deletions cpp/benchmarks/join/generate_input_tables.cuh
@@ -26,6 +26,8 @@

#include <cudf/detail/utilities/device_atomics.cuh>
#include <cudf/utilities/error.hpp>
#include "rmm/cuda_stream_view.hpp"
#include "rmm/exec_policy.hpp"

__global__ static void init_curand(curandState* state, const int nstates)
{
@@ -188,61 +190,62 @@ void generate_input_tables(key_type* const build_tbl,

const int num_states =
num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
rmm::device_vector<curandState> devStates(num_states);
rmm::device_uvector<curandState> devStates(num_states, rmm::cuda_stream_default);

init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data().get(),
num_states);
init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);

CHECK_CUDA(0);

rmm::device_vector<key_type> build_tbl_sorted(build_tbl_size);

size_type lottery_size =
rand_max < std::numeric_limits<key_type>::max() - 1 ? rand_max + 1 : rand_max;
rmm::device_vector<key_type> lottery(lottery_size);
rmm::device_uvector<key_type> lottery(lottery_size, rmm::cuda_stream_default);

if (uniq_build_tbl_keys) { thrust::sequence(thrust::device, lottery.begin(), lottery.end(), 0); }
if (uniq_build_tbl_keys) {
thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0);
}

init_build_tbl<key_type, size_type>
<<<num_sms * num_blocks_init_build_tbl, block_size>>>(build_tbl,
build_tbl_size,
rand_max,
uniq_build_tbl_keys,
lottery.data().get(),
lottery.data(),
lottery_size,
devStates.data().get(),
devStates.data(),
num_states);

CHECK_CUDA(0);

CUDA_TRY(cudaMemcpy(build_tbl_sorted.data().get(),
rmm::device_uvector<key_type> build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default);

CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(),
build_tbl,
build_tbl_size * sizeof(key_type),
cudaMemcpyDeviceToDevice));

thrust::sort(thrust::device, build_tbl_sorted.begin(), build_tbl_sorted.end());
thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end());

// Exclude keys used in build table from lottery
thrust::counting_iterator<key_type> first_lottery_elem(0);
thrust::counting_iterator<key_type> last_lottery_elem = first_lottery_elem + lottery_size;
key_type* lottery_end = thrust::set_difference(thrust::device,
key_type* lottery_end = thrust::set_difference(rmm::exec_policy(),
first_lottery_elem,
last_lottery_elem,
build_tbl_sorted.begin(),
build_tbl_sorted.end(),
lottery.data().get());
lottery.data());

lottery_size = thrust::distance(lottery.data().get(), lottery_end);
lottery_size = thrust::distance(lottery.data(), lottery_end);

init_probe_tbl<key_type, size_type>
<<<num_sms * num_blocks_init_build_tbl, block_size>>>(probe_tbl,
probe_tbl_size,
build_tbl,
build_tbl_size,
lottery.data().get(),
lottery.data(),
lottery_size,
selectivity,
devStates.data().get(),
devStates.data(),
num_states);

CHECK_CUDA(0);
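Two related changes recur in this file: thrust algorithms now take rmm::exec_policy() instead of thrust::device, and kernels receive raw pointers straight from data(). A small combined sketch, assuming the stream-taking form of rmm::exec_policy (the kernel and helper below are hypothetical, not from the diff):

// Illustrative sketch only.
#include <cstddef>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>

__global__ void bump(int* data, std::size_t n)
{
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { data[i] += 1; }
}

void fill_and_launch(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  // Temporaries allocated by thrust go through RMM and the work is ordered on
  // `stream`, instead of using the generic thrust::device policy.
  thrust::sequence(rmm::exec_policy(stream), v.begin(), v.end(), 0);

  // device_uvector::data() is already a raw pointer, so no data().get().
  auto const blocks = static_cast<unsigned>((v.size() + 255) / 256);
  bump<<<blocks, 256, 0, stream.value()>>>(v.data(), v.size());
}
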
56 changes: 25 additions & 31 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,26 +14,21 @@
* limitations under the License.
*/

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/table_utilities.hpp>

#include <cudf/detail/utilities/cuda.cuh>
#include <rmm/cuda_stream_view.hpp>

#include <cudf/utilities/traits.hpp>
#include <random>
#include <type_traits>
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

using namespace cudf;

enum DispatchingType { HOST_DISPATCHING, DEVICE_DISPATCHING, NO_DISPATCHING };

@@ -75,7 +70,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_

// This is for HOST_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void host_dispatching_kernel(mutable_column_device_view source_column)
__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
@@ -89,7 +84,7 @@ __global__ void host_dispatching_kernel(mutable_column_device_view source_column
template <FunctorType functor_type>
struct ColumnHandle {
template <typename ColumnType, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<ColumnType>())>
void operator()(mutable_column_device_view source_column, int work_per_thread)
void operator()(cudf::mutable_column_device_view source_column, int work_per_thread)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
@@ -98,7 +93,7 @@ struct ColumnHandle {
}

template <typename ColumnType, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<ColumnType>())>
void operator()(mutable_column_device_view source_column, int work_per_thread)
void operator()(cudf::mutable_column_device_view source_column, int work_per_thread)
{
CUDF_FAIL("Invalid type to benchmark.");
}
@@ -112,22 +107,22 @@
template <FunctorType functor_type>
struct RowHandle {
template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
__device__ void operator()(mutable_column_device_view source, cudf::size_type index)
__device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index)
{
using F = Functor<T, functor_type>;
source.data<T>()[index] = F::f(source.data<T>()[index]);
}

template <typename T, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
__device__ void operator()(mutable_column_device_view source, cudf::size_type index)
__device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index)
{
cudf_assert(false && "Unsupported type.");
}
};

// This is for DEVICE_DISPATCHING
template <FunctorType functor_type>
__global__ void device_dispatching_kernel(mutable_table_device_view source)
__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
const cudf::size_type n_rows = source.num_rows();
cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -142,7 +137,7 @@ __global__ void device_dispatching_kernel(mutable_table_device_view source)
}

template <FunctorType functor_type, DispatchingType dispatching_type, class T>
void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread)
{
const cudf::size_type n_rows = input.num_rows();
const cudf::size_type n_cols = input.num_columns();
@@ -153,12 +148,12 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
if (dispatching_type == HOST_DISPATCHING) {
// std::vector<cudf::util::cuda::scoped_stream> v_stream(n_cols);
for (int c = 0; c < n_cols; c++) {
auto d_column = mutable_column_device_view::create(input.column(c));
auto d_column = cudf::mutable_column_device_view::create(input.column(c));
cudf::type_dispatcher(
d_column->type(), ColumnHandle<functor_type>{}, *d_column, work_per_thread);
}
} else if (dispatching_type == DEVICE_DISPATCHING) {
auto d_table_view = mutable_table_device_view::create(input);
auto d_table_view = cudf::mutable_table_device_view::create(input);
auto f = device_dispatching_kernel<functor_type>;
// Launch the kernel
f<<<grid_size, block_size>>>(*d_table_view);
@@ -191,25 +186,24 @@ void type_dispatcher_benchmark(::benchmark::State& state)
cudf::mutable_table_view source_table{source_columns};

// For no dispatching
std::vector<rmm::device_vector<TypeParam>> h_vec(n_cols,
rmm::device_vector<TypeParam>(source_size, 0));
std::vector<rmm::device_buffer> h_vec(n_cols,
rmm::device_buffer(source_size * sizeof(TypeParam)));
std::vector<TypeParam*> h_vec_p(n_cols);
for (int c = 0; c < n_cols; c++) { h_vec_p[c] = h_vec[c].data().get(); }
rmm::device_vector<TypeParam*> d_vec(n_cols);
for (int c = 0; c < n_cols; c++) { h_vec_p[c] = static_cast<TypeParam*>(h_vec[c].data()); }
rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);

if (dispatching_type == NO_DISPATCHING) {
CUDA_TRY(cudaMemcpy(
d_vec.data().get(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
}

// Warm up
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data().get(), work_per_thread);
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
CUDA_TRY(cudaDeviceSynchronize());

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
launch_kernel<functor_type, dispatching_type>(
source_table, d_vec.data().get(), work_per_thread);
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * source_size * n_cols * 2 *
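
For the NO_DISPATCHING path above, the per-column storage becomes untyped rmm::device_buffer allocations, with a device_uvector<TypeParam*> holding the pointer array the kernel consumes. A sketch of that pointer-table setup, mirroring the cudaMemcpy approach the benchmark itself uses (the helper name is made up):

// Illustrative sketch only.
#include <cstddef>
#include <vector>

#include <cuda_runtime_api.h>

#include <cudf/utilities/error.hpp>  // CUDA_TRY, as used elsewhere in this diff

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T*> make_pointer_table(std::vector<rmm::device_buffer>& columns,
                                           rmm::cuda_stream_view stream)
{
  // device_buffer is untyped, so each pointer is cast once on the host ...
  std::vector<T*> host_ptrs(columns.size());
  for (std::size_t i = 0; i < columns.size(); ++i) {
    host_ptrs[i] = static_cast<T*>(columns[i].data());
  }

  // ... and the pointer array itself is copied into a device_uvector so the
  // kernel can index it from device code.
  rmm::device_uvector<T*> dev_ptrs(columns.size(), stream);
  CUDA_TRY(cudaMemcpy(
    dev_ptrs.data(), host_ptrs.data(), host_ptrs.size() * sizeof(T*), cudaMemcpyHostToDevice));
  return dev_ptrs;
}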