Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor rmm::device_scalar in terms of rmm::device_uvector #789

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 55 additions & 139 deletions include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

Expand All @@ -37,6 +37,35 @@ class device_scalar {
public:
static_assert(std::is_trivially_copyable<T>::value, "Scalar type must be trivially copyable");

using value_type = typename device_uvector<T>::value_type;
using reference = typename device_uvector<T>::reference;
using const_reference = typename device_uvector<T>::const_reference;
using pointer = typename device_uvector<T>::pointer;
using const_pointer = typename device_uvector<T>::const_pointer;

RMM_EXEC_CHECK_DISABLE
~device_scalar() = default;

RMM_EXEC_CHECK_DISABLE
device_scalar(device_scalar &&) = default;

device_scalar &operator=(device_scalar &&) = default;

/**
* @brief Copy ctor is deleted as it doesn't allow a stream argument
*/
device_scalar(device_scalar const &) = delete;

/**
* @brief Copy assignment is deleted as it doesn't allow a stream argument
*/
device_scalar &operator=(device_scalar const &) = delete;

/**
* @brief Default constructor is deleted as it doesn't allow a stream argument
*/
device_scalar() = delete;

/**
* @brief Construct a new uninitialized `device_scalar`.
*
Expand All @@ -54,7 +83,7 @@ class device_scalar {
explicit device_scalar(
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
: _storage{1, stream, mr}
{
}

Expand All @@ -75,12 +104,12 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(
T const &initial_value,
cuda_stream_view stream = cuda_stream_view{},
value_type const &initial_value,
cuda_stream_view stream,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove the default? It seems that it would be a common use case to use the default stream, especially with PTDS enabled.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we want to make all stream-ordered APIs explicit. See e.g. #418. Note that device_uvector and now device_buffer have no default stream parameters.

rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
: _storage{1, stream, mr}
{
set_value(initial_value, stream);
set_value_async(initial_value, stream);
}

/**
Expand All @@ -96,9 +125,9 @@ class device_scalar {
* @param mr The resource to use for allocating the new `device_scalar`
*/
device_scalar(device_scalar const &other,
cuda_stream_view stream = {},
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{other.buffer, stream, mr}
: _storage{other._storage, stream, mr}
{
}

Expand All @@ -118,31 +147,25 @@ class device_scalar {
* @return T The value of the scalar.
* @param stream CUDA stream on which to perform the copy and synchronize.
*/
T value(cuda_stream_view stream = cuda_stream_view{}) const
{
T host_value{};
_memcpy(&host_value, buffer.data(), stream);
stream.synchronize();
return host_value;
}
value_type value(cuda_stream_view stream) const { return _storage.front_element(stream); }

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
* @brief Sets the value of the `device_scalar` to the value of `v`.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `host_value` is zero.
* `v` is zero.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. Therefore, the object
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* referenced by `v` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy or device memset and should be used
* sparingly.
* carefully.
*
* Example:
* \code{cpp}
Expand All @@ -151,7 +174,7 @@ class device_scalar {
* int v{42};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* vec.set_value_async(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
Expand All @@ -160,112 +183,20 @@ class device_scalar {
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails.
*
* @param host_value The host value which will be copied to device
* @param v The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value && not std::is_same<U, bool>::value, void>
void set_value_async(value_type const &v, cuda_stream_view s)
{
if (host_value == U{0}) {
set_value_zero(stream);
} else {
_memcpy(buffer.data(), &host_value, stream);
}
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for `bool` is optimized to always use `cudaMemsetAsync`.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. `host_value` is passed by value
* so a host-side copy may be performed before calling a device memset.
*
* @note: This function incurs a device memset.
*
* Example:
* \code{cpp}
* rmm::device_scalar<bool> s;
*
* bool v{true};
*
* // Copies `true` to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = false;
* \endcode
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param host_value The host value which the scalar will be set to (true or false)
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_same<U, bool>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value()));
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* Specialization for non-fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. Therefore, the object
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.

* Example:
* \code{cpp}
* rmm::device_scalar<my_type> s;
*
* my_type v{42, "text"};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v.value = 21;
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails
* @throws `rmm::cuda_error` if synchronizing `stream` fails
*
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename U = T>
auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<U>::value, void>
{
_memcpy(buffer.data(), &host_value, stream);
_storage.set_element_async(0, v, s);
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
// literal can be freed before the async memcpy / memset executes.
void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;
void set_value_async(value_type &&, cuda_stream_view) = delete;

/**
* @brief Sets the value of the `device_scalar` to zero.
*
* Only supported for fundamental types.
* @brief Sets the value of the `device_scalar` to zero on the specified stream.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
Expand All @@ -274,17 +205,13 @@ class device_scalar {
*
* This function does not synchronize `stream` before returning.
*
* @note: This function incurs a device memset and should be used sparingly.
*
* @throws `rmm::cuda_error` if the device memset fails.
* @note: This function incurs a device memset and should be used carefully.
*
* @param stream CUDA stream on which to perform the device memset
* @param stream CUDA stream on which to perform the copy
*/
template <typename U = T>
auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value, void>
void set_value_to_zero_async(cuda_stream_view s)
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value()));
_storage.set_element_to_zero_async(value_type{0}, s);
}

/**
Expand All @@ -295,7 +222,7 @@ class device_scalar {
* streams (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may
* be a race condition.
*/
T *data() noexcept { return static_cast<T *>(buffer.data()); }
pointer data() noexcept { return static_cast<pointer>(_storage.data()); }

/**
* @brief Returns const pointer to object in device memory.
Expand All @@ -305,20 +232,9 @@ class device_scalar {
* streams (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may
* be a race condition.
*/
T const *data() const noexcept { return static_cast<T const *>(buffer.data()); }

device_scalar() = default;
~device_scalar() = default;
device_scalar(device_scalar &&) = default;
device_scalar &operator=(device_scalar const &) = delete;
device_scalar &operator=(device_scalar &&) = delete;
const_pointer data() const noexcept { return static_cast<const_pointer>(_storage.data()); }

private:
rmm::device_buffer buffer{sizeof(T), cuda_stream_default};

inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const
{
RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value()));
}
rmm::device_uvector<T> _storage;
};
} // namespace rmm
Loading