Store and set the correct CUDA device in device_buffer #1370

Merged
Changes from 4 commits
29 changes: 26 additions & 3 deletions include/rmm/cuda_device.hpp
@@ -38,6 +38,28 @@ struct cuda_device_id {
/// @briefreturn{The wrapped integer value}
[[nodiscard]] constexpr value_type value() const noexcept { return id_; }

/**
* @brief Equality comparison operator
*
* @param other The other `cuda_device_id` to compare to
* @return true if the two `cuda_device_id`s wrap the same integer value, false otherwise
*/
[[nodiscard]] constexpr bool operator==(cuda_device_id const& other) const noexcept
{
return value() == other.value();
}

/**
* @brief Inequality comparison operator
*
* @param other The other `cuda_device_id` to compare to
* @return true if the two `cuda_device_id`s wrap different integer values, false otherwise
*/
[[nodiscard]] constexpr bool operator!=(cuda_device_id const& other) const noexcept
{
return value() != other.value();
}

private:
value_type id_;
};
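For illustration, a minimal standalone sketch of the new comparison operators in use (not part of this diff; both operators are constexpr, so they can be exercised at compile time):

```cpp
#include <rmm/cuda_device.hpp>

int main()
{
  constexpr rmm::cuda_device_id id0{0};
  constexpr rmm::cuda_device_id id1{1};
  static_assert(id0 == rmm::cuda_device_id{0});  // same wrapped value
  static_assert(id0 != id1);                     // different wrapped values
  return 0;
}
```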
@@ -79,16 +101,17 @@ struct cuda_set_device_raii {
* @param dev_id The device to set as the current CUDA device
*/
explicit cuda_set_device_raii(cuda_device_id dev_id)
- : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
+ : old_device_{get_current_cuda_device()},
+ needs_reset_{dev_id.value() >= 0 && old_device_.value() != dev_id.value()}
{
- if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
+ if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); }
}
/**
* @brief Reactivates the previous CUDA device
*/
~cuda_set_device_raii() noexcept
{
- if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
+ if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); }
}

cuda_set_device_raii(cuda_set_device_raii const&) = delete;
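The new `dev_id.value() >= 0` term means a negative sentinel id (as assigned to moved-from `device_buffer`s below) turns the guard into a no-op. A small sketch of both behaviors, assuming device 0 is active on entry:

```cpp
{
  rmm::cuda_set_device_raii guard{rmm::cuda_device_id{1}};
  // device 1 is active here; the destructor restores device 0
}
{
  rmm::cuda_set_device_raii guard{rmm::cuda_device_id{-1}};
  // sentinel id: needs_reset_ is false, so no cudaSetDevice calls are made
}
```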
15 changes: 14 additions & 1 deletion include/rmm/device_buffer.hpp
@@ -15,6 +15,7 @@
*/
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
@@ -104,6 +105,7 @@ class device_buffer {
mr::device_memory_resource* mr = mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
allocate_async(size);
Comment on lines +113 to 114
Contributor:
question: Should the setting of the current device live inside allocate/deallocate rather than it being the responsibility of the caller to ensure the device is correct? Or, is this deliberate because we might want more than just the allocate call to occur with the same device active and this approach avoids excessive device switching?

Member Author:
Yeah, it's deliberate. I wanted to put it in allocate_async/deallocate_async, but those calls are often made in places where the correct device is also needed for other operations, and we don't want to cuda_set_device_raii multiple times. There are also places such as resize / shrink_to_fit where a new device_buffer is created and we want that to happen with the original device active, but inside it we call allocate_async and that would cause redundant current device checking.

Member Author:
I think what I've arrived at is that in order to minimize device switching, we want to do it at the highest level in device_buffer possible, which means the public API functions (when necessary). For the same reason, we assume the user has set the device before constructing the device_buffer, and we just store the ID at that stage.
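To make that contract concrete, a hedged sketch of the intended usage on a two-GPU system (mirroring the typed tests added below):

```cpp
#include <rmm/device_buffer.hpp>

#include <cuda_runtime_api.h>

void example()
{
  cudaSetDevice(0);  // caller establishes the device before construction
  rmm::device_buffer buf(128, rmm::cuda_stream_view{});  // ctor records device 0

  cudaSetDevice(1);  // unrelated work switches the active device...
}  // ...but ~device_buffer still deallocates on device 0, then restores device 1
```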

}

@@ -132,6 +134,7 @@ class device_buffer {
mr::device_memory_resource* mr = mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
allocate_async(size);
copy_async(source_data, size);
}
@@ -188,6 +191,7 @@ class device_buffer {
other._size = 0;
other._capacity = 0;
other.set_stream(cuda_stream_view{});
other._device = cuda_device_id{-1};
}

/**
@@ -207,18 +211,21 @@
device_buffer& operator=(device_buffer&& other) noexcept
{
if (&other != this) {
cuda_set_device_raii dev{_device};
deallocate_async();

_data = other._data;
_size = other._size;
_capacity = other._capacity;
set_stream(other.stream());
- _mr = other._mr;
+ _mr     = other._mr;
+ _device = other._device;

other._data = nullptr;
other._size = 0;
other._capacity = 0;
other.set_stream(cuda_stream_view{});
other._device = cuda_device_id{-1};
}
return *this;
}
@@ -232,6 +239,7 @@
*/
~device_buffer() noexcept
{
cuda_set_device_raii dev{_device};
deallocate_async();
_mr = nullptr;
_stream = cuda_stream_view{};
@@ -259,6 +267,7 @@
{
set_stream(stream);
if (new_capacity > capacity()) {
cuda_set_device_raii dev{_device};
auto tmp = device_buffer{new_capacity, stream, _mr};
Comment on lines +274 to 275
Contributor:
question: Does the appearance of this pattern suggest that the device_buffer constructor should have an (optional) device argument that one can provide, rather than relying on the implicit current cuda device (which is then managed by this raii object here)?

Member Author:
If we did that and eliminated the cuda_set_device_raii here, the constructor would have to first call cudaSetDevice(device), and I assume it would do so using cuda_set_device_raii, which means that on exiting the ctor the previous device would be reset (if different).

So then we would need to call cuda_set_device_raii again after calling the constructor with the optional device argument because of the subsequent cudaMemcpyAsync. That could mean two calls to cudaGetDevice and four calls to cudaSetDevice, worst case. The way it is now, there is at most 1 cudaGetDevice and at most 2 cudaSetDevice.

Contributor:
Hmm, my understanding from the docs was that runtime calls (excepting [some] of those to do with events, where the call has to happen with the live device matching the event's stream) don't care about the current device, and hence allocation/deallocation (which, with a pool MR, record events) are the only places we need to handle it.
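For reference, a sketch of the event restriction being described, under the assumption of a stream `stream0` that was created on device 0 (the stream name is illustrative):

```cpp
cudaSetDevice(0);
cudaEvent_t event;
RMM_ASSERT_CUDA_SUCCESS(cudaEventCreate(&event));          // event is tied to device 0
RMM_ASSERT_CUDA_SUCCESS(cudaEventRecord(event, stream0));  // OK: stream0 is on device 0
// Recording this event on a stream belonging to device 1 would fail, which is
// why a pool resource that records events needs the owning device active.
```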

auto const old_size = size();
RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
@@ -300,6 +309,7 @@ class device_buffer {
if (new_size <= capacity()) {
_size = new_size;
} else {
cuda_set_device_raii dev{_device};
auto tmp = device_buffer{new_size, stream, _mr};
RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
*this = std::move(tmp);
@@ -323,6 +333,7 @@
{
set_stream(stream);
if (size() != capacity()) {
cuda_set_device_raii dev{_device};
// Invoke copy ctor on self which only copies `[0, size())` and swap it
// with self. The temporary `device_buffer` will hold the old contents
// which will then be destroyed
@@ -404,6 +415,7 @@ class device_buffer {
mr::device_memory_resource* _mr{
mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};  ///< The CUDA device that was active when this buffer was constructed

/**
* @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
@@ -454,6 +466,7 @@
{
if (bytes > 0) {
RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");
RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
}
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
@@ -186,4 +186,7 @@ ConfigureTest(BINNING_MR_TEST mr/device/binning_mr_tests.cpp)
# callback memory resource tests
ConfigureTest(CALLBACK_MR_TEST mr/device/callback_mr_tests.cpp)

# container multidevice tests
ConfigureTest(CONTAINER_MULTIDEVICE_TEST container_multidevice_tests.cu)

rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/librmm)
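The tests below rely on a `device_check_resource_adaptor.hpp` helper that is not shown in this diff view. A hypothetical sketch of what such an adaptor checks (names and signatures here are assumptions, not RMM's actual header): it records the device current at construction and asserts that every allocation and deallocation arrives with that same device active.

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <gtest/gtest.h>

#include <cstddef>

class device_check_resource_adaptor final : public rmm::mr::device_memory_resource {
 public:
  explicit device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream)
    : device_{rmm::get_current_cuda_device()}, upstream_{upstream}
  {
  }

 private:
  [[nodiscard]] bool check_device() const { return device_ == rmm::get_current_cuda_device(); }

  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
  {
    EXPECT_TRUE(check_device());  // fail the test if the wrong device is active
    return upstream_->allocate(bytes, stream);
  }

  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
  {
    EXPECT_TRUE(check_device());
    upstream_->deallocate(ptr, bytes, stream);
  }

  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
  {
    return this == &other;
  }

  rmm::cuda_device_id device_;
  rmm::mr::device_memory_resource* upstream_;
};
```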
149 changes: 149 additions & 0 deletions tests/container_multidevice_tests.cu
@@ -0,0 +1,149 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "device_check_resource_adaptor.hpp"
#include "rmm/mr/device/per_device_resource.hpp"

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <gtest/gtest.h>

#include <type_traits>

template <typename ContainerType>
struct ContainerMultiDeviceTest : public ::testing::Test {};

using containers =
::testing::Types<rmm::device_buffer, rmm::device_uvector<int>, rmm::device_scalar<int>>;

TYPED_TEST_CASE(ContainerMultiDeviceTest, containers);

TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice)
{
// Get the number of cuda devices
int num_devices = rmm::get_num_cuda_devices();

// only run on multidevice systems
if (num_devices >= 2) {
rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
auto* orig_mr = rmm::mr::get_current_device_resource();
auto check_mr = device_check_resource_adaptor{orig_mr};
rmm::mr::set_current_device_resource(&check_mr);

{
if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
auto buf = TypeParam(rmm::cuda_stream_view{});
RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1)); // force dtor with different active device
} else {
auto buf = TypeParam(128, rmm::cuda_stream_view{});
RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1)); // force dtor with different active device
}
}

RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
rmm::mr::set_current_device_resource(orig_mr);
}
}

TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice)
{
// Get the number of cuda devices
int num_devices = rmm::get_num_cuda_devices();

// only run on multidevice systems
if (num_devices >= 2) {
rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
auto* orig_mr = rmm::mr::get_current_device_resource();
auto check_mr = device_check_resource_adaptor{orig_mr};
rmm::mr::set_current_device_resource(&check_mr);

{
auto buf_1 = []() {
if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
return TypeParam(rmm::cuda_stream_view{});
} else {
return TypeParam(128, rmm::cuda_stream_view{});
}
}();

{
if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
// device_scalar's constructor takes only a stream (no size argument)
auto buf_0 = TypeParam(rmm::cuda_stream_view{});
buf_1 = std::move(buf_0);
} else {
auto buf_0 = TypeParam(128, rmm::cuda_stream_view{});
buf_1 = std::move(buf_0);
}
}

RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1)); // force dtor with different active device
}

RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
rmm::mr::set_current_device_resource(orig_mr);
}
}

TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice)
{
// Get the number of cuda devices
int num_devices = rmm::get_num_cuda_devices();

// only run on multidevice systems
if (num_devices >= 2) {
rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
auto* orig_mr = rmm::mr::get_current_device_resource();
auto check_mr = device_check_resource_adaptor{orig_mr};
rmm::mr::set_current_device_resource(&check_mr);

if constexpr (not std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
auto buf = TypeParam(128, rmm::cuda_stream_view{});
RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1)); // force resize with different active device
buf.resize(1024, rmm::cuda_stream_view{});
}

RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
rmm::mr::set_current_device_resource(orig_mr);
}
}

TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice)
{
// Get the number of cuda devices
int num_devices = rmm::get_num_cuda_devices();

// only run on multidevice systems
if (num_devices >= 2) {
rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
auto* orig_mr = rmm::mr::get_current_device_resource();
auto check_mr = device_check_resource_adaptor{orig_mr};
rmm::mr::set_current_device_resource(&check_mr);

if constexpr (not std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
auto buf = TypeParam(128, rmm::cuda_stream_view{});
RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1)); // force resize/shrink with different active device
buf.resize(64, rmm::cuda_stream_view{});
buf.shrink_to_fit(rmm::cuda_stream_view{});
}

RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
rmm::mr::set_current_device_resource(orig_mr);
}
}
7 changes: 4 additions & 3 deletions tests/device_buffer_tests.cu
@@ -14,8 +14,6 @@
* limitations under the License.
*/

- #include <gtest/gtest.h>

#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
@@ -29,9 +27,12 @@

#include <thrust/equal.h>
#include <thrust/sequence.h>

+ #include <gtest/gtest.h>

namespace testing {
namespace thrust = THRUST_NS_QUALIFIER;
- }
+ }  // namespace testing
using namespace testing;

#include <cuda_runtime_api.h>
Expand Down