diff --git a/cpp/benchmarks/iterator/iterator_benchmark.cu b/cpp/benchmarks/iterator/iterator_benchmark.cu index 6c3255328cb..d83fc8af3a3 100644 --- a/cpp/benchmarks/iterator/iterator_benchmark.cu +++ b/cpp/benchmarks/iterator/iterator_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,21 @@ * limitations under the License. */ -#include +#include "../fixture/benchmark_fixture.hpp" +#include "../synchronization/synchronization.hpp" +#include +#include +#include #include -#include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include -#include // include iterator header -// for reduction tests -#include #include -#include + +#include + +#include template T random_int(T min, T max) @@ -59,7 +61,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite // ----------------------------------------------------------------------------- template -void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector &result) +void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "raw stream cub: " << "\t"; @@ -71,7 +73,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector &result) }; template -void iterator_bench_cub(cudf::column_view &col, rmm::device_vector &result) +void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "iterator cub " << ( (has_null) ? ": " : ": " ) << "\t"; @@ -89,7 +91,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_vector &result) // ----------------------------------------------------------------------------- template -void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector &result) +void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "raw stream thust: " << "\t\t"; @@ -100,7 +102,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector &resu } template -void iterator_bench_thrust(cudf::column_view &col, rmm::device_vector &result) +void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "iterator thust " << ( (has_null) ? ": " : ": " ) << "\t"; @@ -131,7 +133,8 @@ void BM_iterator(benchmark::State &state) cudf::test::fixed_width_column_wrapper wrap_hasnull_F(num_gen, num_gen + column_size); cudf::column_view hasnull_F = wrap_hasnull_F; - rmm::device_vector dev_result(1, T{0}); + // Initialize dev_result to false + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync(1); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -163,7 +166,7 @@ __device__ thrust::pair operator+(thrust::pair lhs, thrust::pa // ----------------------------------------------------------------------------- template void pair_iterator_bench_cub(cudf::column_view &col, - rmm::device_vector> &result) + rmm::device_uvector> &result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -174,7 +177,7 @@ void pair_iterator_bench_cub(cudf::column_view &col, template void pair_iterator_bench_thrust(cudf::column_view &col, - rmm::device_vector> &result) + rmm::device_uvector> &result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -198,7 +201,8 @@ void BM_pair_iterator(benchmark::State &state) cudf::column_view hasnull_F = wrap_hasnull_F; cudf::column_view hasnull_T = wrap_hasnull_T; - rmm::device_vector> dev_result(1, {T{0}, false}); + // Initialize dev_result to false + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>(1); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 79cb2d3e44d..285a9241a26 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,22 @@ * limitations under the License. */ -#ifndef __GENERATE_INPUT_TABLES_CUH -#define __GENERATE_INPUT_TABLES_CUH +#pragma once + +#include +#include + +#include +#include -#include -#include #include #include #include -#include -#include -#include +#include +#include + +#include __global__ static void init_curand(curandState* state, const int nstates) { @@ -188,64 +192,63 @@ void generate_input_tables(key_type* const build_tbl, const int num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; - rmm::device_vector devStates(num_states); + rmm::device_uvector devStates(num_states, rmm::cuda_stream_default); - init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data().get(), - num_states); + init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states); CHECK_CUDA(0); - rmm::device_vector build_tbl_sorted(build_tbl_size); - size_type lottery_size = rand_max < std::numeric_limits::max() - 1 ? rand_max + 1 : rand_max; - rmm::device_vector lottery(lottery_size); + rmm::device_uvector lottery(lottery_size, rmm::cuda_stream_default); - if (uniq_build_tbl_keys) { thrust::sequence(thrust::device, lottery.begin(), lottery.end(), 0); } + if (uniq_build_tbl_keys) { + thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0); + } init_build_tbl <<>>(build_tbl, build_tbl_size, rand_max, uniq_build_tbl_keys, - lottery.data().get(), + lottery.data(), lottery_size, - devStates.data().get(), + devStates.data(), num_states); CHECK_CUDA(0); - CUDA_TRY(cudaMemcpy(build_tbl_sorted.data().get(), + rmm::device_uvector build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default); + + CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(), build_tbl, build_tbl_size * sizeof(key_type), cudaMemcpyDeviceToDevice)); - thrust::sort(thrust::device, build_tbl_sorted.begin(), build_tbl_sorted.end()); + thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end()); // Exclude keys used in build table from lottery thrust::counting_iterator first_lottery_elem(0); thrust::counting_iterator last_lottery_elem = first_lottery_elem + lottery_size; - key_type* lottery_end = thrust::set_difference(thrust::device, + key_type* lottery_end = thrust::set_difference(rmm::exec_policy(), first_lottery_elem, last_lottery_elem, build_tbl_sorted.begin(), build_tbl_sorted.end(), - lottery.data().get()); + lottery.data()); - lottery_size = thrust::distance(lottery.data().get(), lottery_end); + lottery_size = thrust::distance(lottery.data(), lottery_end); init_probe_tbl <<>>(probe_tbl, probe_tbl_size, build_tbl, build_tbl_size, - lottery.data().get(), + lottery.data(), lottery_size, selectivity, - devStates.data().get(), + devStates.data(), num_states); CHECK_CUDA(0); } - -#endif // __GENERATE_INPUT_TABLES_CUH diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 18ef5a1168e..14e79629fee 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,21 @@ * limitations under the License. */ +#include "../fixture/benchmark_fixture.hpp" +#include "../synchronization/synchronization.hpp" + +#include + #include #include +#include +#include #include #include -#include -#include -#include -#include -#include - -#include +#include -#include -#include #include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" - -using namespace cudf; enum DispatchingType { HOST_DISPATCHING, DEVICE_DISPATCHING, NO_DISPATCHING }; @@ -75,7 +70,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ // This is for HOST_DISPATCHING template -__global__ void host_dispatching_kernel(mutable_column_device_view source_column) +__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { using F = Functor; T* A = source_column.data(); @@ -89,7 +84,7 @@ __global__ void host_dispatching_kernel(mutable_column_device_view source_column template struct ColumnHandle { template ())> - void operator()(mutable_column_device_view source_column, int work_per_thread) + void operator()(cudf::mutable_column_device_view source_column, int work_per_thread) { cudf::detail::grid_1d grid_config{source_column.size(), block_size}; int grid_size = grid_config.num_blocks; @@ -98,7 +93,7 @@ struct ColumnHandle { } template ())> - void operator()(mutable_column_device_view source_column, int work_per_thread) + void operator()(cudf::mutable_column_device_view source_column, int work_per_thread) { CUDF_FAIL("Invalid type to benchmark."); } @@ -112,14 +107,14 @@ struct ColumnHandle { template struct RowHandle { template ())> - __device__ void operator()(mutable_column_device_view source, cudf::size_type index) + __device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index) { using F = Functor; source.data()[index] = F::f(source.data()[index]); } template ())> - __device__ void operator()(mutable_column_device_view source, cudf::size_type index) + __device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index) { cudf_assert(false && "Unsupported type."); } @@ -127,7 +122,7 @@ struct RowHandle { // This is for DEVICE_DISPATCHING template -__global__ void device_dispatching_kernel(mutable_table_device_view source) +__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) { const cudf::size_type n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; @@ -142,7 +137,7 @@ __global__ void device_dispatching_kernel(mutable_table_device_view source) } template -void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) +void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread) { const cudf::size_type n_rows = input.num_rows(); const cudf::size_type n_cols = input.num_columns(); @@ -153,12 +148,12 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) if (dispatching_type == HOST_DISPATCHING) { // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { - auto d_column = mutable_column_device_view::create(input.column(c)); + auto d_column = cudf::mutable_column_device_view::create(input.column(c)); cudf::type_dispatcher( d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { - auto d_table_view = mutable_table_device_view::create(input); + auto d_table_view = cudf::mutable_table_device_view::create(input); auto f = device_dispatching_kernel; // Launch the kernel f<<>>(*d_table_view); @@ -191,25 +186,24 @@ void type_dispatcher_benchmark(::benchmark::State& state) cudf::mutable_table_view source_table{source_columns}; // For no dispatching - std::vector> h_vec(n_cols, - rmm::device_vector(source_size, 0)); + std::vector h_vec(n_cols, + rmm::device_buffer(source_size * sizeof(TypeParam))); std::vector h_vec_p(n_cols); - for (int c = 0; c < n_cols; c++) { h_vec_p[c] = h_vec[c].data().get(); } - rmm::device_vector d_vec(n_cols); + for (int c = 0; c < n_cols; c++) { h_vec_p[c] = static_cast(h_vec[c].data()); } + rmm::device_uvector d_vec(n_cols, rmm::cuda_stream_default); if (dispatching_type == NO_DISPATCHING) { CUDA_TRY(cudaMemcpy( - d_vec.data().get(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice)); + d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice)); } // Warm up - launch_kernel(source_table, d_vec.data().get(), work_per_thread); + launch_kernel(source_table, d_vec.data(), work_per_thread); CUDA_TRY(cudaDeviceSynchronize()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - launch_kernel( - source_table, d_vec.data().get(), work_per_thread); + launch_kernel(source_table, d_vec.data(), work_per_thread); } state.SetBytesProcessed(static_cast(state.iterations()) * source_size * n_cols * 2 *