Convert benchmarks to use device_uvector #8208

Merged May 13, 2021 (5 commits)
Changes from 3 commits
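
The recurring change in this PR is replacing rmm::device_vector (a thrust-backed container) with rmm::device_uvector. A minimal sketch of the pattern, not taken from the diff and using only the rmm APIs the diff itself exercises (the (size, stream) constructor, data(), and rmm::cuda_stream_default):

// Illustrative sketch only; make_benchmark_buffer is a hypothetical helper.
#include <cstddef>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T> make_benchmark_buffer(std::size_t n, rmm::cuda_stream_view stream)
{
  // Before: rmm::device_vector<T> buf(n, T{0});   // no stream argument, zero-filled
  rmm::device_uvector<T> buf(n, stream);           // stream-ordered, elements uninitialized

  // Before: buf.data().get()  (thrust device_ptr unwrapped to a raw pointer)
  // After:  buf.data() is already a raw device pointer, usable directly in
  //         kernel launches, cub calls, and cudaMemcpy.
  return buf;
}

Because device_uvector does not value-initialize its elements, call sites that relied on device_vector's zero-fill have to zero the buffer explicitly, which is what the make_zeroed_device_uvector_sync calls in the diff below take care of.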
30 changes: 17 additions & 13 deletions cpp/benchmarks/iterator/iterator_benchmark.cu
@@ -17,16 +17,20 @@
#include <benchmark/benchmark.h>

#include <cudf_test/column_wrapper.hpp>
#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

#include <cudf/detail/iterator.cuh> // include iterator header
// for reduction tests
#include <thrust/device_vector.h>
#include <cub/device/device_reduce.cuh>
#include <cudf/detail/utilities/device_operators.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <cudf/detail/iterator.cuh>

#include <rmm/device_uvector.hpp>

#include <cub/device/device_reduce.cuh>

#include <random>

template <typename T>
T random_int(T min, T max)
@@ -59,7 +63,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite

// -----------------------------------------------------------------------------
template <typename T>
void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "raw stream cub: " << "\t";

@@ -71,7 +75,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
};

template <typename T, bool has_null>
void iterator_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)
void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "iterator cub " << ( (has_null) ? "<true>: " : "<false>: " ) << "\t";

@@ -89,7 +93,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_vector<T> &result)

// -----------------------------------------------------------------------------
template <typename T>
void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &result)
void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "raw stream thust: " << "\t\t";

@@ -100,7 +104,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &resu
}

template <typename T, bool has_null>
void iterator_bench_thrust(cudf::column_view &col, rmm::device_vector<T> &result)
void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector<T> &result)
{
// std::cout << "iterator thust " << ( (has_null) ? "<true>: " : "<false>: " ) << "\t";

@@ -131,7 +135,7 @@ void BM_iterator(benchmark::State &state)
cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_F(num_gen, num_gen + column_size);
cudf::column_view hasnull_F = wrap_hasnull_F;

rmm::device_vector<T> dev_result(1, T{0});
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<TypeParam>(1);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
@@ -163,7 +167,7 @@ __device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pa
// -----------------------------------------------------------------------------
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view &col,
rmm::device_vector<thrust::pair<T, bool>> &result)
rmm::device_uvector<thrust::pair<T, bool>> &result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
@@ -174,7 +178,7 @@ void pair_iterator_bench_cub(cudf::column_view &col,

template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view &col,
rmm::device_vector<thrust::pair<T, bool>> &result)
rmm::device_uvector<thrust::pair<T, bool>> &result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
@@ -198,7 +202,7 @@ void BM_pair_iterator(benchmark::State &state)
cudf::column_view hasnull_F = wrap_hasnull_F;
cudf::column_view hasnull_T = wrap_hasnull_T;

rmm::device_vector<thrust::pair<T, bool>> dev_result(1, {T{0}, false});
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(1);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
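In the hunks above, the one-element result buffers previously built as rmm::device_vector<T>(1, T{0}) are now created with cudf::detail::make_zeroed_device_uvector_sync. As a hedged sketch of what such a factory amounts to (the real signature and defaults live in <cudf/detail/utilities/vector_factories.hpp> and may differ), assuming CUDA_TRY and rmm::cuda_stream_default as used elsewhere in this diff:

// Illustrative sketch; not the cuDF implementation.
#include <cstddef>

#include <cuda_runtime_api.h>

#include <cudf/utilities/error.hpp>  // CUDA_TRY, as used in generate_input_tables.cuh

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T> make_zeroed_uvector(std::size_t size,
                                           rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
  rmm::device_uvector<T> v(size, stream);  // allocation only, elements uninitialized
  CUDA_TRY(cudaMemsetAsync(v.data(), 0, size * sizeof(T), stream.value()));
  stream.synchronize();  // the "_sync" flavor: contents are ready when this returns
  return v;
}
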
35 changes: 19 additions & 16 deletions cpp/benchmarks/join/generate_input_tables.cuh
@@ -26,6 +26,8 @@

#include <cudf/detail/utilities/device_atomics.cuh>
#include <cudf/utilities/error.hpp>
#include "rmm/cuda_stream_view.hpp"
#include "rmm/exec_policy.hpp"

__global__ static void init_curand(curandState* state, const int nstates)
{
@@ -188,61 +190,62 @@ void generate_input_tables(key_type* const build_tbl,

const int num_states =
num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
rmm::device_vector<curandState> devStates(num_states);
rmm::device_uvector<curandState> devStates(num_states, rmm::cuda_stream_default);

init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data().get(),
num_states);
init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);

CHECK_CUDA(0);

rmm::device_vector<key_type> build_tbl_sorted(build_tbl_size);

size_type lottery_size =
rand_max < std::numeric_limits<key_type>::max() - 1 ? rand_max + 1 : rand_max;
rmm::device_vector<key_type> lottery(lottery_size);
rmm::device_uvector<key_type> lottery(lottery_size, rmm::cuda_stream_default);

if (uniq_build_tbl_keys) { thrust::sequence(thrust::device, lottery.begin(), lottery.end(), 0); }
if (uniq_build_tbl_keys) {
thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0);
}

init_build_tbl<key_type, size_type>
<<<num_sms * num_blocks_init_build_tbl, block_size>>>(build_tbl,
build_tbl_size,
rand_max,
uniq_build_tbl_keys,
lottery.data().get(),
lottery.data(),
lottery_size,
devStates.data().get(),
devStates.data(),
num_states);

CHECK_CUDA(0);

CUDA_TRY(cudaMemcpy(build_tbl_sorted.data().get(),
rmm::device_uvector<key_type> build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default);

CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(),
build_tbl,
build_tbl_size * sizeof(key_type),
cudaMemcpyDeviceToDevice));

thrust::sort(thrust::device, build_tbl_sorted.begin(), build_tbl_sorted.end());
thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end());

// Exclude keys used in build table from lottery
thrust::counting_iterator<key_type> first_lottery_elem(0);
thrust::counting_iterator<key_type> last_lottery_elem = first_lottery_elem + lottery_size;
key_type* lottery_end = thrust::set_difference(thrust::device,
key_type* lottery_end = thrust::set_difference(rmm::exec_policy(),
first_lottery_elem,
last_lottery_elem,
build_tbl_sorted.begin(),
build_tbl_sorted.end(),
lottery.data().get());
lottery.data());

lottery_size = thrust::distance(lottery.data().get(), lottery_end);
lottery_size = thrust::distance(lottery.data(), lottery_end);

init_probe_tbl<key_type, size_type>
<<<num_sms * num_blocks_init_build_tbl, block_size>>>(probe_tbl,
probe_tbl_size,
build_tbl,
build_tbl_size,
lottery.data().get(),
lottery.data(),
lottery_size,
selectivity,
devStates.data().get(),
devStates.data(),
num_states);

CHECK_CUDA(0);
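Two related changes recur in this file: thrust algorithms now take rmm::exec_policy() instead of thrust::device, and kernels receive raw pointers straight from data(). A small combined sketch, assuming the stream-taking form of rmm::exec_policy (the kernel and helper below are hypothetical, not from the diff):

// Illustrative sketch only.
#include <cstddef>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>

__global__ void bump(int* data, std::size_t n)
{
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { data[i] += 1; }
}

void fill_and_launch(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  // Temporaries allocated by thrust go through RMM and the work is ordered on
  // `stream`, instead of using the generic thrust::device policy.
  thrust::sequence(rmm::exec_policy(stream), v.begin(), v.end(), 0);

  // device_uvector::data() is already a raw pointer, so no data().get().
  auto const blocks = static_cast<unsigned>((v.size() + 255) / 256);
  bump<<<blocks, 256, 0, stream.value()>>>(v.data(), v.size());
}
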
56 changes: 25 additions & 31 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,26 +14,21 @@
* limitations under the License.
*/

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/table_utilities.hpp>

#include <cudf/detail/utilities/cuda.cuh>
#include <rmm/cuda_stream_view.hpp>

#include <cudf/utilities/traits.hpp>
#include <random>
#include <type_traits>
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

using namespace cudf;

enum DispatchingType { HOST_DISPATCHING, DEVICE_DISPATCHING, NO_DISPATCHING };

@@ -75,7 +70,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_

// This is for HOST_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void host_dispatching_kernel(mutable_column_device_view source_column)
__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
@@ -89,7 +84,7 @@ __global__ void host_dispatching_kernel(mutable_column_device_view source_column
template <FunctorType functor_type>
struct ColumnHandle {
template <typename ColumnType, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<ColumnType>())>
void operator()(mutable_column_device_view source_column, int work_per_thread)
void operator()(cudf::mutable_column_device_view source_column, int work_per_thread)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
@@ -98,7 +93,7 @@ struct ColumnHandle {
}

template <typename ColumnType, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<ColumnType>())>
void operator()(mutable_column_device_view source_column, int work_per_thread)
void operator()(cudf::mutable_column_device_view source_column, int work_per_thread)
{
CUDF_FAIL("Invalid type to benchmark.");
}
@@ -112,22 +107,22 @@
template <FunctorType functor_type>
struct RowHandle {
template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
__device__ void operator()(mutable_column_device_view source, cudf::size_type index)
__device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index)
{
using F = Functor<T, functor_type>;
source.data<T>()[index] = F::f(source.data<T>()[index]);
}

template <typename T, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
__device__ void operator()(mutable_column_device_view source, cudf::size_type index)
__device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index)
{
cudf_assert(false && "Unsupported type.");
}
};

// This is for DEVICE_DISPATCHING
template <FunctorType functor_type>
__global__ void device_dispatching_kernel(mutable_table_device_view source)
__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
const cudf::size_type n_rows = source.num_rows();
cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -142,7 +137,7 @@ __global__ void device_dispatching_kernel(mutable_table_device_view source)
}

template <FunctorType functor_type, DispatchingType dispatching_type, class T>
void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread)
{
const cudf::size_type n_rows = input.num_rows();
const cudf::size_type n_cols = input.num_columns();
@@ -153,12 +148,12 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
if (dispatching_type == HOST_DISPATCHING) {
// std::vector<cudf::util::cuda::scoped_stream> v_stream(n_cols);
for (int c = 0; c < n_cols; c++) {
auto d_column = mutable_column_device_view::create(input.column(c));
auto d_column = cudf::mutable_column_device_view::create(input.column(c));
cudf::type_dispatcher(
d_column->type(), ColumnHandle<functor_type>{}, *d_column, work_per_thread);
}
} else if (dispatching_type == DEVICE_DISPATCHING) {
auto d_table_view = mutable_table_device_view::create(input);
auto d_table_view = cudf::mutable_table_device_view::create(input);
auto f = device_dispatching_kernel<functor_type>;
// Launch the kernel
f<<<grid_size, block_size>>>(*d_table_view);
@@ -191,25 +186,24 @@ void type_dispatcher_benchmark(::benchmark::State& state)
cudf::mutable_table_view source_table{source_columns};

// For no dispatching
std::vector<rmm::device_vector<TypeParam>> h_vec(n_cols,
rmm::device_vector<TypeParam>(source_size, 0));
std::vector<rmm::device_buffer> h_vec(n_cols,
rmm::device_buffer(source_size * sizeof(TypeParam)));
std::vector<TypeParam*> h_vec_p(n_cols);
for (int c = 0; c < n_cols; c++) { h_vec_p[c] = h_vec[c].data().get(); }
rmm::device_vector<TypeParam*> d_vec(n_cols);
for (int c = 0; c < n_cols; c++) { h_vec_p[c] = static_cast<TypeParam*>(h_vec[c].data()); }
rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);

if (dispatching_type == NO_DISPATCHING) {
CUDA_TRY(cudaMemcpy(
d_vec.data().get(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
}

// Warm up
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data().get(), work_per_thread);
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
CUDA_TRY(cudaDeviceSynchronize());

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
launch_kernel<functor_type, dispatching_type>(
source_table, d_vec.data().get(), work_per_thread);
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * source_size * n_cols * 2 *
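
For the NO_DISPATCHING path above, the per-column storage becomes untyped rmm::device_buffer allocations, with a device_uvector<TypeParam*> holding the pointer array the kernel consumes. A sketch of that pointer-table setup, mirroring the cudaMemcpy approach the benchmark itself uses (the helper name is made up):

// Illustrative sketch only.
#include <cstddef>
#include <vector>

#include <cuda_runtime_api.h>

#include <cudf/utilities/error.hpp>  // CUDA_TRY, as used elsewhere in this diff

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
rmm::device_uvector<T*> make_pointer_table(std::vector<rmm::device_buffer>& columns,
                                           rmm::cuda_stream_view stream)
{
  // device_buffer is untyped, so each pointer is cast once on the host ...
  std::vector<T*> host_ptrs(columns.size());
  for (std::size_t i = 0; i < columns.size(); ++i) {
    host_ptrs[i] = static_cast<T*>(columns[i].data());
  }

  // ... and the pointer array itself is copied into a device_uvector so the
  // kernel can index it from device code.
  rmm::device_uvector<T*> dev_ptrs(columns.size(), stream);
  CUDA_TRY(cudaMemcpy(
    dev_ptrs.data(), host_ptrs.data(), host_ptrs.size() * sizeof(T*), cudaMemcpyHostToDevice));
  return dev_ptrs;
}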