From 5d7728fa2e9837def034174abcf6cccbfdc9fe09 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 19 May 2026 10:15:40 -0700 Subject: [PATCH 1/6] Remove deprecated RMM `Buffer` support --- cpp/CMakeLists.txt | 19 +--- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/examples/CMakeLists.txt | 6 +- cpp/examples/basic.cpp | 42 +++----- cpp/include/ucxx/api.h | 6 +- cpp/include/ucxx/buffer.h | 106 -------------------- cpp/include/ucxx/endpoint.h | 5 +- cpp/include/ucxx/request_tag_multi.h | 9 +- cpp/include/ucxx/worker.h | 12 ++- cpp/python/CMakeLists.txt | 13 +-- cpp/src/buffer.cpp | 49 +-------- cpp/src/buffer_cccl.cpp | 4 +- cpp/src/worker.cpp | 6 +- cpp/tests/CMakeLists.txt | 6 +- cpp/tests/buffer.cpp | 57 ----------- cpp/tests/request.cpp | 142 ++++++++++++++++----------- python/ucxx/ucxx/_lib/libucxx.pyx | 52 ++-------- python/ucxx/ucxx/_lib/ucxx_api.pxd | 13 --- 18 files changed, 139 insertions(+), 410 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5d739509e..45971ca94 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -42,7 +42,7 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF) option(BUILD_EXAMPLES "Configure CMake to build examples" OFF) option(BUILD_SHARED_LIBS "Build UCXX shared libraries" ON) -option(UCXX_ENABLE_RMM "Enable support for CUDA multi-buffer transfer with RMM" OFF) +option(UCXX_ENABLE_RMM "Enable RMM-backed test and benchmark code paths" OFF) # TODO: Flip UCXX_ENABLE_CCCL default to OFF once devcontainer builds pass -DUCXX_ENABLE_CCCL=ON option(UCXX_ENABLE_CCCL "Enable support for CUDA buffer with CCCL" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) @@ -51,10 +51,7 @@ message(VERBOSE "UCXX: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "UCXX: Configure CMake to build benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "UCXX: Configure CMake to build examples: ${BUILD_EXAMPLES}") message(VERBOSE "UCXX: Build UCXX shared libraries: ${BUILD_SHARED_LIBS}") -message( - VERBOSE - "UCXX: Enable support for CUDA multi-buffer transfer with RMM (DEPRECATED): ${UCXX_ENABLE_RMM}" -) +message(VERBOSE "UCXX: Enable RMM-backed test and benchmark code paths: ${UCXX_ENABLE_RMM}") message(VERBOSE "UCXX: Enable support for CUDA buffer with CCCL: ${UCXX_ENABLE_CCCL}") message( VERBOSE @@ -97,11 +94,8 @@ rapids_find_package( # add third party dependencies using CPM rapids_cpm_init() -# find rmm +# find rmm for tests and benchmarks if(UCXX_ENABLE_RMM) - message(DEPRECATION "UCXX_ENABLE_RMM is deprecated and will be removed in a future release. " - "Use UCXX_ENABLE_CCCL instead." - ) include(cmake/thirdparty/get_rmm.cmake) endif() # find cccl @@ -181,13 +175,6 @@ target_include_directories( target_compile_definitions(ucxx PUBLIC "$<$:${UCXX_CXX_DEFINITIONS}>") -# Enable RMM if necessary -if(UCXX_ENABLE_RMM) - target_link_libraries(ucxx PUBLIC rmm::rmm) - - target_compile_definitions(ucxx PUBLIC UCXX_ENABLE_RMM) -endif() - # Enable CCCL if necessary if(UCXX_ENABLE_CCCL) find_package(CUDAToolkit REQUIRED) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fadc1436f..3dbfc4936 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -45,7 +45,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE CUDA::cudart_static) endif() - # RMM memory resources for CUDA benchmarks (requires UCXX_ENABLE_RMM and get_rmm.cmake) + # RMM memory resources for CUDA benchmarks. if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_ENABLE_RMM) target_compile_definitions(${CMAKE_BENCH_NAME} PRIVATE UCXX_BENCHMARKS_ENABLE_RMM) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE rmm::rmm) diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt index 983b787a9..493946ef5 100644 --- a/cpp/examples/CMakeLists.txt +++ b/cpp/examples/CMakeLists.txt @@ -1,6 +1,6 @@ # ================================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause # cmake-format: on # ================================================================================= @@ -31,6 +31,10 @@ function(ConfigureBench CMAKE_BENCH_NAME) CXX_STANDARD_REQUIRED ON ) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE ucxx $) + if(UCXX_ENABLE_CCCL) + find_package(CUDAToolkit REQUIRED) + target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE CUDA::cudart_static) + endif() add_custom_command( OUTPUT UCXX_EXAMPLES COMMAND ${CMAKE_BENCH_NAME} diff --git a/cpp/examples/basic.cpp b/cpp/examples/basic.cpp index 354f6a48d..14a5e8ac9 100644 --- a/cpp/examples/basic.cpp +++ b/cpp/examples/basic.cpp @@ -18,8 +18,8 @@ #include #include -#if UCXX_ENABLE_RMM -#include +#if UCXX_ENABLE_CCCL +#include #endif class ListenerContext { @@ -91,14 +91,12 @@ static void printUsage() std::cerr << " 'thread-polling', 'thread-blocking' and 'wait' (default: 'blocking')" << std::endl; std::cerr << " -p Port number to listen at" << std::endl; - std::cerr - << " -s Send buffer type, valid values are: 'host', 'rmm', 'cccl' " - "(default: 'host')" - << std::endl; - std::cerr - << " -r Recv buffer type, valid values are: 'host', 'rmm', 'cccl' " - "(default: 'host')" - << std::endl; + std::cerr << " -s Send buffer type, valid values are: 'host', 'cccl' " + "(default: 'host')" + << std::endl; + std::cerr << " -r Recv buffer type, valid values are: 'host', 'cccl' " + "(default: 'host')" + << std::endl; std::cerr << " -h Print this help" << std::endl; std::cerr << std::endl; } @@ -124,14 +122,6 @@ struct args { auto parseBufferType = [](const std::string& bufferTypeString) { if (bufferTypeString == "host") { return ucxx::BufferType::Host; - } else if (bufferTypeString == "rmm") { -#if UCXX_ENABLE_RMM - return ucxx::BufferType::RMM; -#else - std::cerr << "RMM support not enabled, please compile with -DUCXX_ENABLE_RMM=1" - << std::endl; - return ucxx::BufferType::Invalid; -#endif } else if (bufferTypeString == "cccl") { #if UCXX_ENABLE_CCCL return ucxx::BufferType::CCCL; @@ -228,15 +218,6 @@ std::shared_ptr makeBuffer(ucxx::BufferType bufferType, T* values, switch (bufferType) { case ucxx::BufferType::Host: return std::make_shared(values, size * sizeof(T)); - case ucxx::BufferType::RMM: -#if UCXX_ENABLE_RMM - { - auto buf = - std::make_unique(values, size * sizeof(T), rmm::cuda_stream_default); - rmm::cuda_stream_default.synchronize(); - return std::make_shared(std::move(buf)); - } -#endif case ucxx::BufferType::CCCL: #if UCXX_ENABLE_CCCL { @@ -255,7 +236,7 @@ auto verify_buffers(ucxx::Buffer* expected, ucxx::Buffer* actual) std::vector host_expected, host_actual; void *host_expected_ptr, *host_actual_ptr; -#if UCXX_ENABLE_CCCL || UCXX_ENABLE_RMM +#if UCXX_ENABLE_CCCL auto copy_to_host = [](auto& buffer, auto& host_buffer) { // copy device buffer to host host_buffer.resize(buffer->getSize()); @@ -272,14 +253,13 @@ auto verify_buffers(ucxx::Buffer* expected, ucxx::Buffer* actual) }; #endif - if (expected->getType() == ucxx::BufferType::RMM || - expected->getType() == ucxx::BufferType::CCCL) { + if (expected->getType() == ucxx::BufferType::CCCL) { host_expected_ptr = copy_to_host(expected, host_expected); } else { host_expected_ptr = expected->data(); } - if (actual->getType() == ucxx::BufferType::RMM || actual->getType() == ucxx::BufferType::CCCL) { + if (actual->getType() == ucxx::BufferType::CCCL) { host_actual_ptr = copy_to_host(actual, host_actual); } else { host_actual_ptr = actual->data(); diff --git a/cpp/include/ucxx/api.h b/cpp/include/ucxx/api.h index ac77ea866..82a253ba9 100644 --- a/cpp/include/ucxx/api.h +++ b/cpp/include/ucxx/api.h @@ -1,13 +1,9 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: BSD-3-Clause */ #pragma once -#ifndef UCXX_ENABLE_RMM -#define UCXX_ENABLE_RMM 0 -#endif - #include #include #include diff --git a/cpp/include/ucxx/buffer.h b/cpp/include/ucxx/buffer.h index 3319615c6..b0d13439c 100644 --- a/cpp/include/ucxx/buffer.h +++ b/cpp/include/ucxx/buffer.h @@ -8,11 +8,6 @@ #include -namespace rmm { -// Forward declaration to prevent symbols from being added to symbol table unnecessarily. -class device_buffer; -} // namespace rmm - namespace ucxx { /** * @brief The type of a buffer. @@ -21,7 +16,6 @@ namespace ucxx { */ enum class BufferType { Host = 0, - RMM, CCCL, Invalid, }; @@ -195,106 +189,6 @@ class HostBuffer : public Buffer { [[nodiscard]] void* data() override; }; -#if UCXX_ENABLE_RMM -/** - * @brief A simple object containing a RMM (CUDA) buffer. - * - * A buffer encapsulating an RMM (CUDA) buffer with its properties. - */ -class RMMBuffer : public Buffer { - private: - std::unique_ptr _buffer; ///< RMM-allocated device buffer - - public: - RMMBuffer() = delete; - RMMBuffer(const RMMBuffer&) = delete; - RMMBuffer& operator=(RMMBuffer const&) = delete; - RMMBuffer(RMMBuffer&& o) = delete; - RMMBuffer& operator=(RMMBuffer&& o) = delete; - - ~RMMBuffer() override; - - /** - * @brief Constructor of concrete type `RMMBuffer`. - * - * Constructor to materialize a buffer holding device memory. The internal - * buffer holds a `std::unique_ptr` and is destroyed - * when the object goes out-of-scope or is explicitly deleted. - * - * @param[in] size the size of the device buffer to allocate. - * - * @code{.cpp} - * // Allocate host buffer of 1KiB - * auto buffer = RMMBuffer(1024); - * @endcode - */ - [[deprecated( - "RMMBuffer is deprecated and will be removed in a future release. Use CCCL buffers instead " - "(UCXX_ENABLE_CCCL).")]] - explicit RMMBuffer(const size_t size); - - /** - * @brief Construct from an existing `rmm::device_buffer`. - * - * @param[in] rmm_buffer the `rmm::device_buffer` to hold. - */ - [[deprecated( - "RMMBuffer is deprecated and will be removed in a future release. Use CCCL buffers instead " - "(UCXX_ENABLE_CCCL).")]] - explicit RMMBuffer(std::unique_ptr rmm_buffer); - - /** - * @brief Release the allocated `rmm::device_buffer` to the caller. - * - * Release ownership of the `rmm::device_buffer` to the caller. After this - * method is called, the caller becomes responsible for the destruction of - * the object once it is not needed anymore. The `rmm::device_buffer` is held - * owned by the `unique_ptr` and will be deallocated once it goes out-of-scope - * or gets explicitly deleted. - * - * The original `RMMBuffer` object becomes invalid. - * - * @code{.cpp} - * // Allocate RMM buffer of 1KiB - * auto buffer = RMMBuffer(1024); - * std::unique_ptr rmmBuffer= buffer.release(); - * - * // do work on rmmBuffer - * - * // `rmm::device_buffer` is destroyed and device Memory is freed once - * // `rmmBuffer` goes out-of-scope. - * @endcode - * - * @throws std::runtime_error if object has been released. - * - * @return the void pointer to the buffer. - */ - [[nodiscard]] std::unique_ptr release(); - - /** - * @brief Get a pointer to the allocated raw device buffer. - * - * Get a pointer to the underlying buffer, but does not release ownership. - * - * @code{.cpp} - * // Allocate device buffer of 1KiB - * auto buffer = RMMBuffer(1024); - * void* bufferPtr = buffer.data(); - * - * // do work on bufferPtr - * - * // `rmm::device_buffer` is destroyed and device Memory is freed once - * // `buffer` goes out-of-scope. - * @endcode - * - * @throws std::runtime_error if object has been released. - * - * @return the void pointer to the device buffer. - */ - [[nodiscard]] void* data() override; -}; -#endif - #if UCXX_ENABLE_CCCL /** * @brief Opaque implementation struct for CCCLBuffer (defined in buffer_cccl.cu). diff --git a/cpp/include/ucxx/endpoint.h b/cpp/include/ucxx/endpoint.h index 7a3cd8604..f7b047420 100644 --- a/cpp/include/ucxx/endpoint.h +++ b/cpp/include/ucxx/endpoint.h @@ -789,9 +789,8 @@ class Endpoint : public Component { * `std::shared` that can be later awaited and checked for errors. * This is a non-blocking operation, and because the receiver has no a priori knowledge * of the data being received, memory allocations are automatically handled internally. - * The receiver must have the same capabilities of the sender, so that if the sender is - * compiled with RMM support to allow for CUDA transfers, the receiver must have the - * ability to understand and allocate CUDA memory. + * Receiving CUDA frames requires UCXX to be compiled with CCCL support (`UCXX_ENABLE_CCCL`) + * and the receiving worker to be configured to allocate CCCL buffers for CUDA frames. * * Using a Python future may be requested by specifying `enablePythonFuture`. If a * Python future is requested, the Python application must then await on this future to diff --git a/cpp/include/ucxx/request_tag_multi.h b/cpp/include/ucxx/request_tag_multi.h index 4e9b490aa..11a24d7d4 100644 --- a/cpp/include/ucxx/request_tag_multi.h +++ b/cpp/include/ucxx/request_tag_multi.h @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -144,10 +144,9 @@ class RequestTagMulti : public Request { * This is a non-blocking operation, and the status of a send transfer must be verified * from the resulting request object before the data can be released. If this is a receive * transfer and because the receiver has no a priori knowledge of the data being received, - * memory allocations are automatically handled internally. The receiver must have the - * same capabilities of the sender, so that if the sender is compiled with RMM support to - * allow for CUDA transfers, the receiver must have the ability to understand and allocate - * CUDA memory. + * memory allocations are automatically handled internally. Receiving CUDA frames requires UCXX + * to be compiled with CCCL support (`UCXX_ENABLE_CCCL`) and the receiving worker to be + * configured to allocate CCCL buffers for CUDA frames. * * The primary use of multi-buffer transfers is in Python where we want to reduce the * amount of futures needed to watch for, thus reducing Python overhead. However, this diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index 20e2705f6..95aaff4e4 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -150,10 +150,10 @@ class Worker : public Component { * Configure which buffer type to use when allocating CUDA buffers for incoming * multi-buffer tag receives. * - * @param[in] bufferType the preferred buffer type (must be `BufferType::RMM` or - * `BufferType::CCCL`). + * @param[in] bufferType the preferred buffer type (currently only `BufferType::CCCL` + * is supported). * - * @throws std::invalid_argument if bufferType is not RMM or CCCL. + * @throws std::invalid_argument if bufferType is not CCCL. */ void setCudaBufferType(BufferType bufferType); @@ -527,7 +527,7 @@ class Worker : public Component { * * Returns the buffer type used when allocating CUDA buffers for incoming * multi-buffer tag receives. Defaults to CCCL if compiled with CCCL support, - * otherwise RMM if compiled with RMM support, otherwise Invalid. + * otherwise Invalid. * * @returns The preferred `BufferType` for CUDA allocations. */ @@ -938,7 +938,9 @@ class Worker : public Component { * // context is `std::shared_ptr` * auto worker = context->createWorker(false); * - * worker->registerAmAllocator(`UCS_MEMORY_TYPE_CUDA`, ucxx::RMMBuffer); + * worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { + * return std::make_shared(length); + * }); * @endcode * * @param[in] memoryType the memory type the allocator will be used for. diff --git a/cpp/python/CMakeLists.txt b/cpp/python/CMakeLists.txt index 35d60a3ff..c659cbb95 100644 --- a/cpp/python/CMakeLists.txt +++ b/cpp/python/CMakeLists.txt @@ -37,9 +37,6 @@ option(FIND_UCXX_CPP "Search for existing UCXX C++ installations before defaulti # add third party dependencies using CPM rapids_cpm_init() -# find rmm -include(../cmake/thirdparty/get_rmm.cmake) - if(FIND_UCXX_CPP) rapids_find_package( ucxx REQUIRED @@ -47,7 +44,6 @@ if(FIND_UCXX_CPP) INSTALL_EXPORT_SET ucxx-python-exports ) else() - set(UCXX_ENABLE_RMM ON) set(UCXX_ENABLE_CCCL ON) add_subdirectory(.. ucxx-cpp) endif() @@ -99,10 +95,7 @@ target_include_directories( target_compile_definitions(ucxx_python PUBLIC "$<$:${UCXX_CXX_DEFINITIONS}>") -target_compile_definitions( - ucxx_python PUBLIC UCXX_ENABLE_PYTHON - "RMM_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LEVEL_${RMM_LOGGING_LEVEL}" -) +target_compile_definitions(ucxx_python PUBLIC UCXX_ENABLE_PYTHON) if(UCXX_ENABLE_CCCL) target_compile_definitions(ucxx_python PUBLIC UCXX_ENABLE_CCCL) @@ -112,9 +105,7 @@ endif() # not need to be linked since its symbols will always be available at runtime since we are running # inide the Python interpreter, and setting it up this way ensures that we will work if the # interpreter links to Python statically instead of dynamically. -target_link_libraries( - ucxx_python PUBLIC rmm::rmm ucx::ucp ucxx::ucxx "$" -) +target_link_libraries(ucxx_python PUBLIC ucx::ucp ucxx::ucxx "$") # Add Conda library, and include paths if specified if(TARGET conda_env) diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp index b88a42ee2..2862f3cb7 100644 --- a/cpp/src/buffer.cpp +++ b/cpp/src/buffer.cpp @@ -5,14 +5,11 @@ #include #include #include +#include #include #include -#if UCXX_ENABLE_RMM -#include -#endif - namespace ucxx { Buffer::Buffer(const BufferType bufferType, const size_t size) @@ -61,51 +58,9 @@ void* HostBuffer::data() return _buffer; } -#if UCXX_ENABLE_RMM -RMMBuffer::RMMBuffer(const size_t size) - : Buffer(BufferType::RMM, size), - _buffer{std::make_unique(size, rmm::cuda_stream_default)} -{ - ucxx_trace_data("ucxx::RMMBuffer created: %p, buffer: %p, size: %lu", this, _buffer.get(), size); -} - -RMMBuffer::RMMBuffer(std::unique_ptr rmm_buffer) - : Buffer(BufferType::RMM, rmm_buffer->size()), _buffer{std::move(rmm_buffer)} -{ - ucxx_trace_data("ucxx::RMMBuffer created: %p, buffer: %p, size: %lu", this, _buffer.get(), _size); -} - -RMMBuffer::~RMMBuffer() = default; - -std::unique_ptr RMMBuffer::release() -{ - ucxx_trace_data("ucxx::RMMBuffer::%s, RMMBuffer: %p, _buffer: %p", __func__, this, _buffer.get()); - if (!_buffer) throw std::runtime_error("Invalid object or already released"); - - _bufferType = ucxx::BufferType::Invalid; - _size = 0; - - return std::move(_buffer); -} - -void* RMMBuffer::data() -{ - ucxx_trace_data("ucxx::RMMBuffer::%s, RMMBuffer: %p, buffer: %p", __func__, this, _buffer.get()); - if (!_buffer) throw std::runtime_error("Invalid object or already released"); - - return _buffer->data(); -} -#endif - std::shared_ptr allocateBuffer(const BufferType bufferType, const size_t size) { - if (bufferType == BufferType::RMM) { -#if UCXX_ENABLE_RMM - return std::make_shared(size); -#else - throw std::runtime_error("RMM support not enabled, please compile with -DUCXX_ENABLE_RMM=1"); -#endif - } else if (bufferType == BufferType::CCCL) { + if (bufferType == BufferType::CCCL) { #if UCXX_ENABLE_CCCL return std::make_shared(size); #else diff --git a/cpp/src/buffer_cccl.cpp b/cpp/src/buffer_cccl.cpp index 53e3a3a09..60741f410 100644 --- a/cpp/src/buffer_cccl.cpp +++ b/cpp/src/buffer_cccl.cpp @@ -24,8 +24,7 @@ struct CCCLBufferImpl { cccl_buffer_type buffer; // CCCL's cuda::device_default_memory_pool() requires an active CUDA primary context. - // Unlike RMM (which initializes context internally via its device resource setup), - // CCCL needs explicit initialization. cudaFree(0) is the standard zero-cost idiom. + // cudaFree(0) is the standard zero-cost idiom to initialize it. static auto get_device_pool() { cudaFree(0); // Ensure CUDA primary context is initialized @@ -52,7 +51,6 @@ void* CCCLBuffer::data() if (!_impl) throw std::runtime_error("Invalid object or already released"); // Explicit cast required: cuda::buffer::data() returns cuda::std::byte*, not void*. - // RMMBuffer::data() needs no cast since rmm::device_buffer::data() returns void* directly. return static_cast(_impl->buffer.data()); } diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index b9b9063d1..bcf85c6b2 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -41,8 +41,6 @@ Worker::Worker(std::shared_ptr context, throw std::runtime_error("Context not initialized"); #if UCXX_ENABLE_CCCL _cudaBufferType = BufferType::CCCL; -#elif UCXX_ENABLE_RMM - _cudaBufferType = BufferType::RMM; #endif ucp_worker_params_t params = {.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE, @@ -230,8 +228,8 @@ BufferType Worker::getCudaBufferType() const { return _cudaBufferType; } void Worker::setCudaBufferType(BufferType bufferType) { - if (bufferType != BufferType::RMM && bufferType != BufferType::CCCL) - throw std::invalid_argument("cudaBufferType must be BufferType::RMM or BufferType::CCCL"); + if (bufferType != BufferType::CCCL) + throw std::invalid_argument("cudaBufferType must be BufferType::CCCL"); _cudaBufferType = bufferType; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8e458adaa..ab8390102 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,6 +1,6 @@ # ====================================================================================================== # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # cmake-format: on # ====================================================================================================== @@ -29,6 +29,10 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PRIVATE ucxx GTest::gmock_main GTest::gtest_main $ ) + if(UCXX_ENABLE_RMM) + target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm) + endif() add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) install( TARGETS ${CMAKE_TEST_NAME} diff --git a/cpp/tests/buffer.cpp b/cpp/tests/buffer.cpp index 7591fb511..c5266a091 100644 --- a/cpp/tests/buffer.cpp +++ b/cpp/tests/buffer.cpp @@ -12,10 +12,6 @@ #include -#if UCXX_ENABLE_RMM -#include -#endif - namespace { class BufferAllocator : public ::testing::Test, @@ -48,17 +44,6 @@ TEST_P(BufferAllocator, TestType) ASSERT_EQ(buffer->getType(), ucxx::BufferType::Invalid); free(releasedBuffer); - } else if (_type == ucxx::BufferType::RMM) { -#if UCXX_ENABLE_RMM - auto buffer = std::dynamic_pointer_cast(_buffer); - ASSERT_EQ(buffer->getType(), _type); - - auto releasedBuffer = buffer->release(); - - ASSERT_EQ(buffer->getType(), ucxx::BufferType::Invalid); -#else - GTEST_SKIP() << "UCXX was not built with RMM support"; -#endif } else if (_type == ucxx::BufferType::CCCL) { #if UCXX_ENABLE_CCCL auto buffer = std::dynamic_pointer_cast(_buffer); @@ -85,17 +70,6 @@ TEST_P(BufferAllocator, TestSize) ASSERT_EQ(buffer->getSize(), 0u); free(releasedBuffer); - } else if (_type == ucxx::BufferType::RMM) { -#if UCXX_ENABLE_RMM - auto buffer = std::dynamic_pointer_cast(_buffer); - ASSERT_EQ(buffer->getSize(), _size); - - auto releasedBuffer = buffer->release(); - - ASSERT_EQ(buffer->getSize(), 0u); -#else - GTEST_SKIP() << "UCXX was not built with RMM support"; -#endif } else if (_type == ucxx::BufferType::CCCL) { #if UCXX_ENABLE_CCCL auto buffer = std::dynamic_pointer_cast(_buffer); @@ -122,19 +96,6 @@ TEST_P(BufferAllocator, TestData) ASSERT_NE(releasedBuffer, nullptr); free(releasedBuffer); - } else if (_type == ucxx::BufferType::RMM) { -#if UCXX_ENABLE_RMM - auto buffer = std::dynamic_pointer_cast(_buffer); - ASSERT_EQ(buffer->data(), _buffer->data()); - - auto releasedBuffer = buffer->release(); - - EXPECT_THROW(buffer->data(), std::runtime_error); - - ASSERT_NE(releasedBuffer, nullptr); -#else - GTEST_SKIP() << "UCXX was not built with RMM support"; -#endif } else if (_type == ucxx::BufferType::CCCL) { #if UCXX_ENABLE_CCCL auto buffer = std::dynamic_pointer_cast(_buffer); @@ -158,16 +119,6 @@ TEST_P(BufferAllocator, TestThrowAfterRelease) EXPECT_THROW(std::ignore = buffer->release(), std::runtime_error); free(releasedBuffer); - } else if (_type == ucxx::BufferType::RMM) { -#if UCXX_ENABLE_RMM - auto buffer = std::dynamic_pointer_cast(_buffer); - auto releasedBuffer = buffer->release(); - - EXPECT_THROW(buffer->data(), std::runtime_error); - EXPECT_THROW(std::ignore = buffer->release(), std::runtime_error); -#else - GTEST_SKIP() << "UCXX was not built with RMM support"; -#endif } else if (_type == ucxx::BufferType::CCCL) { #if UCXX_ENABLE_CCCL GTEST_SKIP() << "CCCLBuffer does not expose release()"; @@ -184,14 +135,6 @@ INSTANTIATE_TEST_SUITE_P(Host, std::make_pair(ucxx::BufferType::Host, 1000), std::make_pair(ucxx::BufferType::Host, 1000000))); -#if UCXX_ENABLE_RMM -INSTANTIATE_TEST_SUITE_P(RMM, - BufferAllocator, - testing::Values(std::make_pair(ucxx::BufferType::RMM, 1), - std::make_pair(ucxx::BufferType::RMM, 1000), - std::make_pair(ucxx::BufferType::RMM, 1000000))); -#endif - #if UCXX_ENABLE_CCCL INSTANTIATE_TEST_SUITE_P(CCCL, BufferAllocator, diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp index e28584df4..afe047dd4 100644 --- a/cpp/tests/request.cpp +++ b/cpp/tests/request.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,10 @@ #include "ucxx/constructors.h" #include "ucxx/utils/ucx.h" +#ifndef UCXX_ENABLE_RMM +#define UCXX_ENABLE_RMM 0 +#endif + #if UCXX_ENABLE_RMM #include #endif @@ -37,15 +42,43 @@ using ::testing::Values; typedef std::vector DataContainerType; -class RequestTest : public ::testing::TestWithParam< - std::tuple> { +enum class TestBufferType { + Host, + RMM, + CCCL, +}; + +bool isCudaBufferType(TestBufferType bufferType) { return bufferType != TestBufferType::Host; } + +#if UCXX_ENABLE_RMM +class RMMTestBuffer : public ucxx::Buffer { + private: + std::unique_ptr _buffer; + + public: + explicit RMMTestBuffer(size_t size) + : ucxx::Buffer(ucxx::BufferType::Invalid, size), + _buffer{std::make_unique(size, rmm::cuda_stream_default)} + { + } + + void* data() override + { + if (!_buffer) throw std::runtime_error("Invalid object"); + return _buffer->data(); + } +}; +#endif + +class RequestTest + : public ::testing::TestWithParam> { protected: std::shared_ptr _context{nullptr}; std::shared_ptr _worker{nullptr}; std::shared_ptr _ep{nullptr}; std::function _progressWorker; - ucxx::BufferType _bufferType; + TestBufferType _bufferType; ucs_memory_type_t _memoryType; bool _registerCustomAmAllocator; bool _enableDelayedSubmission; @@ -68,8 +101,9 @@ class RequestTest : public ::testing::TestWithParam< .delayedSubmission(_enableDelayedSubmission) .requestAttributes(enableRequestAttributes); - if (_bufferType == ucxx::BufferType::RMM || _bufferType == ucxx::BufferType::CCCL) - builder.cudaBufferType(_bufferType); +#if UCXX_ENABLE_CCCL + if (isCudaBufferType(_bufferType)) builder.cudaBufferType(ucxx::BufferType::CCCL); +#endif _worker = builder.build(); @@ -104,20 +138,19 @@ class RequestTest : public ::testing::TestWithParam< _progressMode, _messageLength) = GetParam(); - if (_bufferType == ucxx::BufferType::RMM) { + if (_bufferType == TestBufferType::RMM) { #if !UCXX_ENABLE_RMM GTEST_SKIP() << "UCXX was not built with RMM support"; #endif } - if (_bufferType == ucxx::BufferType::CCCL) { + if (_bufferType == TestBufferType::CCCL) { #if !UCXX_ENABLE_CCCL GTEST_SKIP() << "UCXX was not built with CCCL support"; #endif } - _memoryType = - (_bufferType != ucxx::BufferType::Host) ? UCS_MEMORY_TYPE_CUDA : UCS_MEMORY_TYPE_HOST; + _memoryType = isCudaBufferType(_bufferType) ? UCS_MEMORY_TYPE_CUDA : UCS_MEMORY_TYPE_HOST; _messageSize = _messageLength * sizeof(int); _context = ucxx::createContext({{"RNDV_THRESH", std::to_string(_rndvThresh)}}, @@ -151,16 +184,16 @@ class RequestTest : public ::testing::TestWithParam< std::iota(_send[i].begin(), _send[i].end(), i); - if (_bufferType == ucxx::BufferType::Host) { + if (_bufferType == TestBufferType::Host) { _sendBuffer[i] = std::make_unique(_messageSize); if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); #if UCXX_ENABLE_RMM - } else if (_bufferType == ucxx::BufferType::RMM) { - _sendBuffer[i] = std::make_unique(_messageSize); - if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); + } else if (_bufferType == TestBufferType::RMM) { + _sendBuffer[i] = std::make_unique(_messageSize); + if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); #endif #if UCXX_ENABLE_CCCL - } else if (_bufferType == ucxx::BufferType::CCCL) { + } else if (_bufferType == TestBufferType::CCCL) { _sendBuffer[i] = std::make_unique(_messageSize); if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); #endif @@ -172,9 +205,9 @@ class RequestTest : public ::testing::TestWithParam< if (allocateRecvBuffer) _recvPtr[i] = _recvBuffer[i]->data(); } #if UCXX_ENABLE_RMM - if (_bufferType == ucxx::BufferType::RMM) { rmm::cuda_stream_default.synchronize(); } + if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); } #endif - if (_bufferType == ucxx::BufferType::CCCL) { cudaStreamSynchronize(nullptr); } + if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); } } void copyResults() @@ -182,9 +215,9 @@ class RequestTest : public ::testing::TestWithParam< for (size_t i = 0; i < _numBuffers; ++i) copyMemoryTypeAware(_recv[i].data(), _recvPtr[i], _messageSize, false); #if UCXX_ENABLE_RMM - if (_bufferType == ucxx::BufferType::RMM) { rmm::cuda_stream_default.synchronize(); } + if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); } #endif - if (_bufferType == ucxx::BufferType::CCCL) { cudaStreamSynchronize(nullptr); } + if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); } } void copyMemoryTypeAware(void* dst, const void* src, size_t size, bool synchronize = true) @@ -192,7 +225,7 @@ class RequestTest : public ::testing::TestWithParam< if (_memoryType == UCS_MEMORY_TYPE_HOST) { memcpy(dst, src, size); #if UCXX_ENABLE_RMM - } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == ucxx::BufferType::RMM) { + } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == TestBufferType::RMM) { RMM_CUDA_TRY( cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, rmm::cuda_stream_default.value())); if (synchronize) rmm::cuda_stream_default.synchronize(); @@ -211,19 +244,12 @@ TEST_P(RequestTest, ProgressAm) } if (_registerCustomAmAllocator && _memoryType == UCS_MEMORY_TYPE_CUDA) { -#if UCXX_ENABLE_RMM - if (_bufferType == ucxx::BufferType::RMM) { - _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { - return std::make_shared(length); - }); - } -#endif #if UCXX_ENABLE_CCCL - if (_bufferType == ucxx::BufferType::CCCL) { - _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { - return std::make_shared(length); - }); - } + _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { + return std::make_shared(length); + }); +#else + GTEST_SKIP() << "CCCL support is required for CUDA receive allocations"; #endif } @@ -240,9 +266,10 @@ TEST_P(RequestTest, ProgressAm) // Messages of size `_rndvThresh` or larger are rendezvous and will use the custom // allocator, smaller messages are eager and will always be host-allocated. - ASSERT_THAT(recvReq->getRecvBuffer()->getType(), - (_registerCustomAmAllocator && _messageSize >= _rndvThresh) ? _bufferType - : ucxx::BufferType::Host); + const auto expectedRecvBufferType = (_registerCustomAmAllocator && _messageSize >= _rndvThresh) + ? ucxx::BufferType::CCCL + : ucxx::BufferType::Host; + ASSERT_THAT(recvReq->getRecvBuffer()->getType(), expectedRecvBufferType); copyResults(); @@ -349,19 +376,12 @@ TEST_P(RequestTest, ProgressAmReceiverCallback) } if (_registerCustomAmAllocator && _memoryType == UCS_MEMORY_TYPE_CUDA) { -#if UCXX_ENABLE_RMM - if (_bufferType == ucxx::BufferType::RMM) { - _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { - return std::make_shared(length); - }); - } -#endif #if UCXX_ENABLE_CCCL - if (_bufferType == ucxx::BufferType::CCCL) { - _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { - return std::make_shared(length); - }); - } + _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { + return std::make_shared(length); + }); +#else + GTEST_SKIP() << "CCCL support is required for CUDA receive allocations"; #endif } @@ -399,10 +419,10 @@ TEST_P(RequestTest, ProgressAmReceiverCallback) // Messages larger than `_rndvThresh` are rendezvous and will use custom allocator, // smaller messages are eager and will always be host-allocated. - ASSERT_THAT(receivedRequests[0]->getRecvBuffer()->getType(), - (_registerCustomAmAllocator && _messageSize >= _rndvThresh) - ? _bufferType - : ucxx::BufferType::Host); + const auto expectedRecvBufferType = (_registerCustomAmAllocator && _messageSize >= _rndvThresh) + ? ucxx::BufferType::CCCL + : ucxx::BufferType::Host; + ASSERT_THAT(receivedRequests[0]->getRecvBuffer()->getType(), expectedRecvBufferType); } copyResults(); @@ -813,11 +833,15 @@ TEST_P(RequestTest, ProgressTagMulti) const size_t numMulti = 8; const bool allocateRecvBuffer = false; + if (isCudaBufferType(_bufferType) && _worker->getCudaBufferType() == ucxx::BufferType::Invalid) { + GTEST_SKIP() << "CUDA buffer allocation support not enabled"; + } + allocate(numMulti, allocateRecvBuffer); // Allocate buffers for request sizes/types std::vector multiSize(numMulti, _messageSize); - std::vector multiIsCUDA(numMulti, _bufferType != ucxx::BufferType::Host); + std::vector multiIsCUDA(numMulti, isCudaBufferType(_bufferType)); // Submit and wait for transfers to complete std::vector> requests; @@ -835,7 +859,9 @@ TEST_P(RequestTest, ProgressTagMulti) std::dynamic_pointer_cast(requests[1])->_bufferRequests) { // br->buffer == nullptr are headers if (br->buffer) { - ASSERT_EQ(br->buffer->getType(), _bufferType); + auto expectedBufferType = + isCudaBufferType(_bufferType) ? _worker->getCudaBufferType() : ucxx::BufferType::Host; + ASSERT_EQ(br->buffer->getType(), expectedBufferType); ASSERT_EQ(br->buffer->getSize(), _messageSize); _recvPtr[transferIdx] = br->buffer->data(); @@ -1102,7 +1128,7 @@ TEST_P(RequestTest, MemoryPutWithOffset) INSTANTIATE_TEST_SUITE_P(ProgressModes, RequestTest, - Combine(Values(ucxx::BufferType::Host), + Combine(Values(TestBufferType::Host), Values(false), Values(false), Values(ProgressMode::Polling, @@ -1114,7 +1140,7 @@ INSTANTIATE_TEST_SUITE_P(ProgressModes, INSTANTIATE_TEST_SUITE_P(DelayedSubmission, RequestTest, - Combine(Values(ucxx::BufferType::Host), + Combine(Values(TestBufferType::Host), Values(false), Values(true), Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), @@ -1123,7 +1149,7 @@ INSTANTIATE_TEST_SUITE_P(DelayedSubmission, #if UCXX_ENABLE_RMM INSTANTIATE_TEST_SUITE_P(RMMProgressModes, RequestTest, - Combine(Values(ucxx::BufferType::RMM), + Combine(Values(TestBufferType::RMM), Values(false, true), Values(false), Values(ProgressMode::Polling, @@ -1135,7 +1161,7 @@ INSTANTIATE_TEST_SUITE_P(RMMProgressModes, INSTANTIATE_TEST_SUITE_P(RMMDelayedSubmission, RequestTest, - Combine(Values(ucxx::BufferType::RMM), + Combine(Values(TestBufferType::RMM), Values(false, true), Values(true), Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), @@ -1145,7 +1171,7 @@ INSTANTIATE_TEST_SUITE_P(RMMDelayedSubmission, #if UCXX_ENABLE_CCCL INSTANTIATE_TEST_SUITE_P(CCCLProgressModes, RequestTest, - Combine(Values(ucxx::BufferType::CCCL), + Combine(Values(TestBufferType::CCCL), Values(false, true), Values(false), Values(ProgressMode::Polling, @@ -1157,7 +1183,7 @@ INSTANTIATE_TEST_SUITE_P(CCCLProgressModes, INSTANTIATE_TEST_SUITE_P(CCCLDelayedSubmission, RequestTest, - Combine(Values(ucxx::BufferType::CCCL), + Combine(Values(TestBufferType::CCCL), Values(false, true), Values(true), Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), diff --git a/python/ucxx/ucxx/_lib/libucxx.pyx b/python/ucxx/ucxx/_lib/libucxx.pyx index 18c0777bf..12f99a872 100644 --- a/python/ucxx/ucxx/_lib/libucxx.pyx +++ b/python/ucxx/ucxx/_lib/libucxx.pyx @@ -39,8 +39,6 @@ cdef extern from "cuda_runtime.h" nogil: import numpy as np -from rmm.pylibrmm.device_buffer cimport DeviceBuffer - from .arr cimport Array from .ucxx_api cimport * @@ -187,21 +185,11 @@ cdef class HostBufferAdapter: free(self._ptr) -def _get_rmm_buffer(uintptr_t recv_buffer_ptr): - cdef RMMBuffer* rmm_buffer = recv_buffer_ptr - return DeviceBuffer.c_from_unique_ptr(move(rmm_buffer.release())) - - def _get_host_buffer(uintptr_t recv_buffer_ptr): cdef HostBuffer* host_buffer = recv_buffer_ptr return np.asarray(HostBufferAdapter._from_host_buffer(host_buffer)) -cdef shared_ptr[Buffer] _rmm_am_allocator(size_t length) noexcept nogil: - cdef shared_ptr[RMMBuffer] rmm_buffer = make_shared[RMMBuffer](length) - return dynamic_pointer_cast[Buffer, RMMBuffer](rmm_buffer) - -# Unlike RMM which has a Python DeviceBuffer class (from the rmm package), # CCCL has no Python buffer equivalent. This wrapper provides __cuda_array_interface__ # for interoperability with CuPy/cuDF without requiring an external CCCL Python package. cdef class CCCLBufferWrapper: @@ -680,7 +668,6 @@ cdef class UCXWorker(): ) -> None: cdef bint ucxx_enable_delayed_submission = enable_delayed_submission cdef bint ucxx_enable_python_future = enable_python_future - cdef AmAllocatorType rmm_am_allocator cdef AmAllocatorType cccl_am_allocator self._context_feature_flags = (context.feature_flags) @@ -697,22 +684,7 @@ cdef class UCXWorker(): self._enable_python_future = self._worker.get().isFutureEnabled() if self._context_feature_flags & UCP_FEATURE_AM: - if self._worker.get().getCudaBufferType() == BufferType.RMM: - with gil: - warnings.warn( - "RMM CUDA buffer support is deprecated and " - "will be removed in a future release. Use " - "CCCL buffers instead (set " - "UCXX_ENABLE_CCCL=ON and " - "UCXX_ENABLE_RMM=OFF).", - FutureWarning, - stacklevel=2, - ) - rmm_am_allocator = (&_rmm_am_allocator) - self._worker.get().registerAmAllocator( - UCS_MEMORY_TYPE_CUDA, rmm_am_allocator - ) - elif self._worker.get().getCudaBufferType() == BufferType.CCCL: + if self._worker.get().getCudaBufferType() == BufferType.CCCL: cccl_am_allocator = (&_cccl_am_allocator) self._worker.get().registerAmAllocator( UCS_MEMORY_TYPE_CUDA, cccl_am_allocator @@ -768,13 +740,11 @@ cdef class UCXWorker(): @property def cuda_buffer_type(self) -> str: - """Return the preferred CUDA buffer type ('rmm', 'cccl', or 'none').""" + """Return the preferred CUDA buffer type ('cccl', or 'none').""" cdef BufferType bt with nogil: bt = self._worker.get().getCudaBufferType() - if bt == BufferType.RMM: - return "rmm" - elif bt == BufferType.CCCL: + if bt == BufferType.CCCL: return "cccl" else: return "none" @@ -1083,7 +1053,7 @@ cdef class UCXRequest(): return future_ptr @property - def recv_buffer(self) -> None|np.ndarray|DeviceBuffer: + def recv_buffer(self) -> object: cdef shared_ptr[Buffer] buf cdef BufferType bufType @@ -1094,8 +1064,6 @@ cdef class UCXRequest(): # If buf == NULL, it's not allocated by the request but rather the user if buf == NULL: return None - elif bufType == BufferType.RMM: - return _get_rmm_buffer(buf.get()) elif bufType == BufferType.CCCL: return _get_cccl_buffer(buf) elif bufType == BufferType.Host: @@ -1144,7 +1112,7 @@ cdef class UCXRequest(): else: await self.wait_yield() - def get_recv_buffer(self) -> None|np.ndarray|DeviceBuffer: + def get_recv_buffer(self) -> object: warnings.warn( "UCXRequest.get_recv_buffer() is deprecated and will soon be removed, " "use the UCXRequest.recv_buffer property instead", @@ -1175,7 +1143,7 @@ cdef class UCXBufferRequest: ) @property - def py_buffer(self) -> None|np.ndarray|DeviceBuffer: + def py_buffer(self) -> object: cdef shared_ptr[Buffer] buf cdef BufferType bufType @@ -1186,8 +1154,6 @@ cdef class UCXBufferRequest: # If buf == NULL, it holds a header if buf == NULL: return None - elif bufType == BufferType.RMM: - return _get_rmm_buffer(buf.get()) elif bufType == BufferType.CCCL: return _get_cccl_buffer(buf) elif bufType == BufferType.Host: @@ -1202,7 +1168,7 @@ cdef class UCXBufferRequest: ) return self.request - def get_py_buffer(self) -> None|np.ndarray|DeviceBuffer: + def get_py_buffer(self) -> object: warnings.warn( "UCXBufferRequest.get_py_buffer() is deprecated and will soon be removed, " "use the UCXBufferRequest.py_buffer property instead", @@ -1289,7 +1255,7 @@ cdef class UCXBufferRequests: return future_ptr @property - def py_buffers(self) -> tuple[None|np.ndarray|DeviceBuffer, ...]: + def py_buffers(self) -> tuple[object, ...]: if not self.completed: raise RuntimeError("Some requests are not completed yet") @@ -1367,7 +1333,7 @@ cdef class UCXBufferRequests: ) return self.requests - def get_py_buffers(self) -> tuple[None|np.ndarray|DeviceBuffer, ...]: + def get_py_buffers(self) -> tuple[object, ...]: warnings.warn( "UCXBufferRequests.get_py_buffers() is deprecated and will soon be " "removed, use the UCXBufferRequests.py_buffers property instead", diff --git a/python/ucxx/ucxx/_lib/ucxx_api.pxd b/python/ucxx/ucxx/_lib/ucxx_api.pxd index fb2d291a1..939d9e7ee 100644 --- a/python/ucxx/ucxx/_lib/ucxx_api.pxd +++ b/python/ucxx/ucxx/_lib/ucxx_api.pxd @@ -92,11 +92,6 @@ cdef extern from "ucp/api/ucp.h" nogil: unsigned *release_number) -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_buffer: - pass - - cdef extern from "" namespace "ucxx::python" nogil: cdef PyObject* UCXXError @@ -149,7 +144,6 @@ cdef extern from "" namespace "ucxx::python" nogil: cdef extern from "" namespace "ucxx" nogil: cdef enum class BufferType: Host - RMM CCCL Invalid @@ -166,13 +160,6 @@ cdef extern from "" namespace "ucxx" nogil: void* release() except +raise_py_error void* data() except +raise_py_error - cdef cppclass RMMBuffer: - RMMBuffer(const size_t size_t) except +raise_py_error - BufferType getType() - size_t getSize() - unique_ptr[device_buffer] release() except +raise_py_error - void* data() except +raise_py_error - cdef extern from "" namespace "ucxx" nogil: cdef cppclass CCCLBuffer: CCCLBuffer(const size_t size) except +raise_py_error From 7ce37135e80d91a3d4b5cb319348320b30018205 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 19 May 2026 11:15:12 -0700 Subject: [PATCH 2/6] Remove unnecessary build dependencies --- build.sh | 1 - ci/build_wheel_libucxx.sh | 4 +--- ci/build_wheel_ucxx.sh | 2 -- conda/recipes/libucxx/recipe.yaml | 4 ---- conda/recipes/ucxx/recipe.yaml | 3 --- 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/build.sh b/build.sh index 1e03bab96..ead413426 100755 --- a/build.sh +++ b/build.sh @@ -151,7 +151,6 @@ if hasArg ucxx_tests && ! hasArg ucxx; then fi if buildAll || hasArg libucxx_python || hasArg libucxx_tests || hasArg libucxx_examples; then - UCXX_ENABLE_RMM=ON UCXX_ENABLE_CCCL=ON fi diff --git a/ci/build_wheel_libucxx.sh b/ci/build_wheel_libucxx.sh index 41f4db8c5..d0bb2e937 100755 --- a/ci/build_wheel_libucxx.sh +++ b/ci/build_wheel_libucxx.sh @@ -28,14 +28,12 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -export SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_RMM=ON -DUCXX_ENABLE_CCCL=ON" +export SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_CCCL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ --exclude "libucp.so.0" \ - --exclude "librapids_logger.so" \ - --exclude "librmm.so" \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/ci/build_wheel_ucxx.sh b/ci/build_wheel_ucxx.sh index 9ac98453b..1268be6fb 100755 --- a/ci/build_wheel_ucxx.sh +++ b/ci/build_wheel_ucxx.sh @@ -31,8 +31,6 @@ export RAPIDS_PY_API python -m auditwheel repair \ --exclude "libucp.so.0" \ --exclude "libucxx.so" \ - --exclude "librapids_logger.so" \ - --exclude "librmm.so" \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/conda/recipes/libucxx/recipe.yaml b/conda/recipes/libucxx/recipe.yaml index f3dd820e9..d34d716db 100644 --- a/conda/recipes/libucxx/recipe.yaml +++ b/conda/recipes/libucxx/recipe.yaml @@ -100,11 +100,9 @@ outputs: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - librmm ${{ rapids_version }} - ucx run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - ${{ pin_compatible("librmm", upper_bound="x.x") }} - ucx >=1.18.0,<1.21.0 run_exports: - ${{ pin_subpackage("libucxx", upper_bound="x.x") }} @@ -112,7 +110,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - librmm - ucx tests: - script: @@ -173,7 +170,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - librmm - libucxx - ucx about: diff --git a/conda/recipes/ucxx/recipe.yaml b/conda/recipes/ucxx/recipe.yaml index 02ddcce1b..f1e5b2744 100644 --- a/conda/recipes/ucxx/recipe.yaml +++ b/conda/recipes/ucxx/recipe.yaml @@ -93,7 +93,6 @@ outputs: - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* - rapids-build-backend >=0.4.0,<0.5.0 - - rmm ${{ rapids_version }} - scikit-build-core>=0.11.0 - ucx - libucxx =${{ version }} @@ -106,7 +105,6 @@ outputs: - python - ucx >=1.18.0,<1.21.0 - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - ${{ pin_compatible("rmm", upper_bound="x.x") }} - libucxx =${{ version }} run_constraints: - cupy >=13.6.0 @@ -114,7 +112,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - librmm - libucxx - python_abi - ucx From 50820706488a3b067a2c244f28a45fe462822b03 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 19 May 2026 13:04:39 -0700 Subject: [PATCH 3/6] Update README with CCCL info --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b526260c1..d9ecc1a86 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Additionally, there is a `./build_and_run.sh` script that will call `./build.sh` ### C++ -To build and install C++ library to `${CONDA_PREFIX}`, with both Python and RMM support, as well as building all tests and benchmarks (with CUDA support) run: +To build and install the C++ library to `${CONDA_PREFIX}`, with Python support and CCCL CUDA buffer support, as well as building all tests and benchmarks with CUDA/CCCL support, run: ``` mkdir cpp/build @@ -53,8 +53,9 @@ cmake .. -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} \ -DBUILD_BENCHMARKS=ON \ -DCMAKE_BUILD_TYPE=Release \ -DUCXX_ENABLE_PYTHON=ON \ - -DUCXX_ENABLE_RMM=ON \ - -DUCXX_BENCHMARKS_ENABLE_CUDA=ON + -DUCXX_ENABLE_CCCL=ON \ + -DUCXX_BENCHMARKS_ENABLE_CUDA=ON \ + -DUCXX_BENCHMARKS_ENABLE_CCCL=ON make -j install ``` @@ -122,7 +123,8 @@ It is recommended to use `UCX_TCP_CM_REUSEADDR=y` when binding to interfaces wit #### CCCL Memory Support -When built with `UCXX_ENABLE_CCCL=ON`, additional CCCL-based memory types are available: +When built with `UCXX_ENABLE_CCCL=ON`, `UCXX_BENCHMARKS_ENABLE_CUDA=ON`, and +`UCXX_BENCHMARKS_ENABLE_CCCL=ON`, additional CCCL-based memory types are available: ``` # Server with CCCL device memory pool @@ -138,7 +140,7 @@ $ UCX_TCP_CM_REUSEADDR=y ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n $ ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n 10 127.0.0.1 ``` -**Additional CCCL Memory Types (with `-DUCXX_ENABLE_CCCL=ON`):** +**Additional CCCL Memory Types:** - `cccl-device` - CCCL device memory pool - `cccl-shared` - CCCL shared memory resource - `cccl-cuda-async` - CCCL CUDA async memory resource @@ -146,6 +148,8 @@ $ ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n 10 127.0.0.1 **Requirements for CCCL Support:** - UCXX compiled with `UCXX_ENABLE_CCCL=ON` +- Benchmarks compiled with `UCXX_BENCHMARKS_ENABLE_CUDA=ON` +- Benchmarks compiled with `UCXX_BENCHMARKS_ENABLE_CCCL=ON` - CCCL library available (fetched automatically via CMake) ### Python From e8ff9b0e19514d3d88dc581e762c1b3287adb4bc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 20 May 2026 13:28:37 -0700 Subject: [PATCH 4/6] Add missing cudart_static linking --- cpp/tests/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ab8390102..2b7ced8de 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -29,6 +29,10 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PRIVATE ucxx GTest::gmock_main GTest::gtest_main $ ) + if(UCXX_ENABLE_CCCL) + find_package(CUDAToolkit REQUIRED) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE CUDA::cudart_static) + endif() if(UCXX_ENABLE_RMM) target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm) From 7e8ca764e01eac9fc07966d152beceba40d976e8 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 20 May 2026 13:40:17 -0700 Subject: [PATCH 5/6] Rename RMM CMake flags --- build.sh | 6 ++++-- cpp/CMakeLists.txt | 15 +++++++++++---- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/tests/CMakeLists.txt | 4 ++-- cpp/tests/request.cpp | 22 +++++++++++----------- 5 files changed, 29 insertions(+), 20 deletions(-) diff --git a/build.sh b/build.sh index ead413426..d44e12006 100755 --- a/build.sh +++ b/build.sh @@ -56,7 +56,8 @@ BUILD_TESTS=OFF BUILD_EXAMPLES=OFF BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_COMPILE_COMMANDS=OFF -UCXX_ENABLE_RMM=OFF +UCXX_TESTS_ENABLE_RMM=OFF +UCXX_BENCHMARKS_ENABLE_RMM=OFF UCXX_ENABLE_CCCL=OFF UCXX_BENCHMARKS_ENABLE_CUDA=OFF @@ -187,7 +188,8 @@ if buildAll || hasArg libucxx; then -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=${BUILD_COMPILE_COMMANDS} \ - -DUCXX_ENABLE_RMM=${UCXX_ENABLE_RMM} \ + -DUCXX_TESTS_ENABLE_RMM=${UCXX_TESTS_ENABLE_RMM} \ + -DUCXX_BENCHMARKS_ENABLE_RMM=${UCXX_BENCHMARKS_ENABLE_RMM} \ -DUCXX_ENABLE_CCCL=${UCXX_ENABLE_CCCL} \ -DUCXX_BENCHMARKS_ENABLE_CUDA=${UCXX_BENCHMARKS_ENABLE_CUDA} \ "${EXTRA_CMAKE_ARGS[@]}" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 45971ca94..bc7fd2487 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -42,7 +42,8 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF) option(BUILD_EXAMPLES "Configure CMake to build examples" OFF) option(BUILD_SHARED_LIBS "Build UCXX shared libraries" ON) -option(UCXX_ENABLE_RMM "Enable RMM-backed test and benchmark code paths" OFF) +option(UCXX_TESTS_ENABLE_RMM "Enable RMM-backed test code paths" OFF) +option(UCXX_BENCHMARKS_ENABLE_RMM "Enable RMM-backed benchmark code paths" OFF) # TODO: Flip UCXX_ENABLE_CCCL default to OFF once devcontainer builds pass -DUCXX_ENABLE_CCCL=ON option(UCXX_ENABLE_CCCL "Enable support for CUDA buffer with CCCL" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) @@ -51,7 +52,8 @@ message(VERBOSE "UCXX: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "UCXX: Configure CMake to build benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "UCXX: Configure CMake to build examples: ${BUILD_EXAMPLES}") message(VERBOSE "UCXX: Build UCXX shared libraries: ${BUILD_SHARED_LIBS}") -message(VERBOSE "UCXX: Enable RMM-backed test and benchmark code paths: ${UCXX_ENABLE_RMM}") +message(VERBOSE "UCXX: Enable RMM-backed test code paths: ${UCXX_TESTS_ENABLE_RMM}") +message(VERBOSE "UCXX: Enable RMM-backed benchmark code paths: ${UCXX_BENCHMARKS_ENABLE_RMM}") message(VERBOSE "UCXX: Enable support for CUDA buffer with CCCL: ${UCXX_ENABLE_CCCL}") message( VERBOSE @@ -94,8 +96,13 @@ rapids_find_package( # add third party dependencies using CPM rapids_cpm_init() -# find rmm for tests and benchmarks -if(UCXX_ENABLE_RMM) +# find rmm for enabled test and benchmark code paths +if((BUILD_TESTS AND UCXX_TESTS_ENABLE_RMM) + OR (BUILD_BENCHMARKS + AND UCXX_BENCHMARKS_ENABLE_CUDA + AND UCXX_BENCHMARKS_ENABLE_RMM + ) +) include(cmake/thirdparty/get_rmm.cmake) endif() # find cccl diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3dbfc4936..bdea55ded 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -46,7 +46,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) endif() # RMM memory resources for CUDA benchmarks. - if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_ENABLE_RMM) + if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_BENCHMARKS_ENABLE_RMM) target_compile_definitions(${CMAKE_BENCH_NAME} PRIVATE UCXX_BENCHMARKS_ENABLE_RMM) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE rmm::rmm) endif() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2b7ced8de..5b936a875 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,8 +33,8 @@ function(ConfigureTest CMAKE_TEST_NAME) find_package(CUDAToolkit REQUIRED) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE CUDA::cudart_static) endif() - if(UCXX_ENABLE_RMM) - target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM) + if(UCXX_TESTS_ENABLE_RMM) + target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_TESTS_ENABLE_RMM) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm) endif() add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp index afe047dd4..ac38a1852 100644 --- a/cpp/tests/request.cpp +++ b/cpp/tests/request.cpp @@ -22,11 +22,11 @@ #include "ucxx/constructors.h" #include "ucxx/utils/ucx.h" -#ifndef UCXX_ENABLE_RMM -#define UCXX_ENABLE_RMM 0 +#ifndef UCXX_TESTS_ENABLE_RMM +#define UCXX_TESTS_ENABLE_RMM 0 #endif -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM #include #endif @@ -50,7 +50,7 @@ enum class TestBufferType { bool isCudaBufferType(TestBufferType bufferType) { return bufferType != TestBufferType::Host; } -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM class RMMTestBuffer : public ucxx::Buffer { private: std::unique_ptr _buffer; @@ -139,8 +139,8 @@ class RequestTest _messageLength) = GetParam(); if (_bufferType == TestBufferType::RMM) { -#if !UCXX_ENABLE_RMM - GTEST_SKIP() << "UCXX was not built with RMM support"; +#if !UCXX_TESTS_ENABLE_RMM + GTEST_SKIP() << "UCXX tests were not built with RMM support"; #endif } @@ -187,7 +187,7 @@ class RequestTest if (_bufferType == TestBufferType::Host) { _sendBuffer[i] = std::make_unique(_messageSize); if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM } else if (_bufferType == TestBufferType::RMM) { _sendBuffer[i] = std::make_unique(_messageSize); if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique(_messageSize); @@ -204,7 +204,7 @@ class RequestTest _sendPtr[i] = _sendBuffer[i]->data(); if (allocateRecvBuffer) _recvPtr[i] = _recvBuffer[i]->data(); } -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); } #endif if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); } @@ -214,7 +214,7 @@ class RequestTest { for (size_t i = 0; i < _numBuffers; ++i) copyMemoryTypeAware(_recv[i].data(), _recvPtr[i], _messageSize, false); -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); } #endif if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); } @@ -224,7 +224,7 @@ class RequestTest { if (_memoryType == UCS_MEMORY_TYPE_HOST) { memcpy(dst, src, size); -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == TestBufferType::RMM) { RMM_CUDA_TRY( cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, rmm::cuda_stream_default.value())); @@ -1146,7 +1146,7 @@ INSTANTIATE_TEST_SUITE_P(DelayedSubmission, Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), Values(0, 1, 1024, 2048, 1048576))); -#if UCXX_ENABLE_RMM +#if UCXX_TESTS_ENABLE_RMM INSTANTIATE_TEST_SUITE_P(RMMProgressModes, RequestTest, Combine(Values(TestBufferType::RMM), From 7afe52eddf50135d7c192adf4cb7ccc55470851d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 20 May 2026 23:54:14 -0700 Subject: [PATCH 6/6] More cudart_static linkage --- python/ucxx/ucxx/_lib/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ucxx/ucxx/_lib/CMakeLists.txt b/python/ucxx/ucxx/_lib/CMakeLists.txt index 68922afce..21c033b9b 100644 --- a/python/ucxx/ucxx/_lib/CMakeLists.txt +++ b/python/ucxx/ucxx/_lib/CMakeLists.txt @@ -1,12 +1,13 @@ # ================================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause # cmake-format: on # ================================================================================= set(cython_sources arr.pyx libucxx.pyx) -set(linked_libraries ucxx::ucxx ucxx::python) +find_package(CUDAToolkit REQUIRED) +set(linked_libraries ucxx::ucxx ucxx::python CUDA::cudart_static) rapids_cython_create_modules( CXX