From 5d7728fa2e9837def034174abcf6cccbfdc9fe09 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 19 May 2026 10:15:40 -0700
Subject: [PATCH 1/6] Remove deprecated RMM `Buffer` support

---
 cpp/CMakeLists.txt                   |  19 +---
 cpp/benchmarks/CMakeLists.txt        |   2 +-
 cpp/examples/CMakeLists.txt          |   6 +-
 cpp/examples/basic.cpp               |  42 +++-----
 cpp/include/ucxx/api.h               |   6 +-
 cpp/include/ucxx/buffer.h            | 106 --------------------
 cpp/include/ucxx/endpoint.h          |   5 +-
 cpp/include/ucxx/request_tag_multi.h |   9 +-
 cpp/include/ucxx/worker.h            |  12 ++-
 cpp/python/CMakeLists.txt            |  13 +--
 cpp/src/buffer.cpp                   |  49 +--------
 cpp/src/buffer_cccl.cpp              |   4 +-
 cpp/src/worker.cpp                   |   6 +-
 cpp/tests/CMakeLists.txt             |   6 +-
 cpp/tests/buffer.cpp                 |  57 -----------
 cpp/tests/request.cpp                | 142 ++++++++++++++++-----------
 python/ucxx/ucxx/_lib/libucxx.pyx    |  52 ++--------
 python/ucxx/ucxx/_lib/ucxx_api.pxd   |  13 ---
 18 files changed, 139 insertions(+), 410 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5d739509e..45971ca94 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -42,7 +42,7 @@ option(BUILD_TESTS "Configure CMake to build tests" ON)
 option(BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF)
 option(BUILD_EXAMPLES "Configure CMake to build examples" OFF)
 option(BUILD_SHARED_LIBS "Build UCXX shared libraries" ON)
-option(UCXX_ENABLE_RMM "Enable support for CUDA multi-buffer transfer with RMM" OFF)
+option(UCXX_ENABLE_RMM "Enable RMM-backed test and benchmark code paths" OFF)
 # TODO: Flip UCXX_ENABLE_CCCL default to OFF once devcontainer builds pass -DUCXX_ENABLE_CCCL=ON
 option(UCXX_ENABLE_CCCL "Enable support for CUDA buffer with CCCL" ON)
 option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF)
@@ -51,10 +51,7 @@ message(VERBOSE "UCXX: Configure CMake to build tests: ${BUILD_TESTS}")
 message(VERBOSE "UCXX: Configure CMake to build benchmarks: ${BUILD_BENCHMARKS}")
 message(VERBOSE "UCXX: Configure CMake to build examples: ${BUILD_EXAMPLES}")
 message(VERBOSE "UCXX: Build UCXX shared libraries: ${BUILD_SHARED_LIBS}")
-message(
-  VERBOSE
-  "UCXX: Enable support for CUDA multi-buffer transfer with RMM (DEPRECATED): ${UCXX_ENABLE_RMM}"
-)
+message(VERBOSE "UCXX: Enable RMM-backed test and benchmark code paths: ${UCXX_ENABLE_RMM}")
 message(VERBOSE "UCXX: Enable support for CUDA buffer with CCCL: ${UCXX_ENABLE_CCCL}")
 message(
   VERBOSE
@@ -97,11 +94,8 @@ rapids_find_package(
 
 # add third party dependencies using CPM
 rapids_cpm_init()
-# find rmm
+# find rmm for tests and benchmarks
 if(UCXX_ENABLE_RMM)
-  message(DEPRECATION "UCXX_ENABLE_RMM is deprecated and will be removed in a future release. "
-                      "Use UCXX_ENABLE_CCCL instead."
-  )
   include(cmake/thirdparty/get_rmm.cmake)
 endif()
 # find cccl
@@ -181,13 +175,6 @@ target_include_directories(
 
 target_compile_definitions(ucxx PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${UCXX_CXX_DEFINITIONS}>")
 
-# Enable RMM if necessary
-if(UCXX_ENABLE_RMM)
-  target_link_libraries(ucxx PUBLIC rmm::rmm)
-
-  target_compile_definitions(ucxx PUBLIC UCXX_ENABLE_RMM)
-endif()
-
 # Enable CCCL if necessary
 if(UCXX_ENABLE_CCCL)
   find_package(CUDAToolkit REQUIRED)
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index fadc1436f..3dbfc4936 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -45,7 +45,7 @@ function(ConfigureBench CMAKE_BENCH_NAME)
     target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE CUDA::cudart_static)
   endif()
 
-  # RMM memory resources for CUDA benchmarks (requires UCXX_ENABLE_RMM and get_rmm.cmake)
+  # RMM memory resources for CUDA benchmarks.
   if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_ENABLE_RMM)
     target_compile_definitions(${CMAKE_BENCH_NAME} PRIVATE UCXX_BENCHMARKS_ENABLE_RMM)
     target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE rmm::rmm)
diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt
index 983b787a9..493946ef5 100644
--- a/cpp/examples/CMakeLists.txt
+++ b/cpp/examples/CMakeLists.txt
@@ -1,6 +1,6 @@
 # =================================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 # cmake-format: on
 # =================================================================================
@@ -31,6 +31,10 @@ function(ConfigureBench CMAKE_BENCH_NAME)
                CXX_STANDARD_REQUIRED ON
   )
   target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE ucxx $<TARGET_NAME_IF_EXISTS:conda_env>)
+  if(UCXX_ENABLE_CCCL)
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE CUDA::cudart_static)
+  endif()
   add_custom_command(
     OUTPUT UCXX_EXAMPLES
     COMMAND ${CMAKE_BENCH_NAME}
diff --git a/cpp/examples/basic.cpp b/cpp/examples/basic.cpp
index 354f6a48d..14a5e8ac9 100644
--- a/cpp/examples/basic.cpp
+++ b/cpp/examples/basic.cpp
@@ -18,8 +18,8 @@
 #include <ucxx/utils/sockaddr.h>
 #include <ucxx/utils/ucx.h>
 
-#if UCXX_ENABLE_RMM
-#include <rmm/device_buffer.hpp>
+#if UCXX_ENABLE_CCCL
+#include <cuda_runtime_api.h>
 #endif
 
 class ListenerContext {
@@ -91,14 +91,12 @@ static void printUsage()
   std::cerr << "              'thread-polling', 'thread-blocking' and 'wait' (default: 'blocking')"
             << std::endl;
   std::cerr << "  -p <port>   Port number to listen at" << std::endl;
-  std::cerr
-    << "  -s <send_buffer_type>   Send buffer type, valid values are: 'host', 'rmm', 'cccl' "
-       "(default: 'host')"
-    << std::endl;
-  std::cerr
-    << "  -r <recv_buffer_type>   Recv buffer type, valid values are: 'host', 'rmm', 'cccl' "
-       "(default: 'host')"
-    << std::endl;
+  std::cerr << "  -s <send_buffer_type>   Send buffer type, valid values are: 'host', 'cccl' "
+               "(default: 'host')"
+            << std::endl;
+  std::cerr << "  -r <recv_buffer_type>   Recv buffer type, valid values are: 'host', 'cccl' "
+               "(default: 'host')"
+            << std::endl;
   std::cerr << "  -h          Print this help" << std::endl;
   std::cerr << std::endl;
 }
@@ -124,14 +122,6 @@ struct args {
     auto parseBufferType = [](const std::string& bufferTypeString) {
       if (bufferTypeString == "host") {
         return ucxx::BufferType::Host;
-      } else if (bufferTypeString == "rmm") {
-#if UCXX_ENABLE_RMM
-        return ucxx::BufferType::RMM;
-#else
-        std::cerr << "RMM support not enabled, please compile with -DUCXX_ENABLE_RMM=1"
-                  << std::endl;
-        return ucxx::BufferType::Invalid;
-#endif
       } else if (bufferTypeString == "cccl") {
 #if UCXX_ENABLE_CCCL
         return ucxx::BufferType::CCCL;
@@ -228,15 +218,6 @@ std::shared_ptr<ucxx::Buffer> makeBuffer(ucxx::BufferType bufferType, T* values,
   switch (bufferType) {
     case ucxx::BufferType::Host:
       return std::make_shared<ucxx::HostBuffer>(values, size * sizeof(T));
-    case ucxx::BufferType::RMM:
-#if UCXX_ENABLE_RMM
-    {
-      auto buf =
-        std::make_unique<rmm::device_buffer>(values, size * sizeof(T), rmm::cuda_stream_default);
-      rmm::cuda_stream_default.synchronize();
-      return std::make_shared<ucxx::RMMBuffer>(std::move(buf));
-    }
-#endif
     case ucxx::BufferType::CCCL:
 #if UCXX_ENABLE_CCCL
     {
@@ -255,7 +236,7 @@ auto verify_buffers(ucxx::Buffer* expected, ucxx::Buffer* actual)
   std::vector<uint8_t> host_expected, host_actual;
   void *host_expected_ptr, *host_actual_ptr;
 
-#if UCXX_ENABLE_CCCL || UCXX_ENABLE_RMM
+#if UCXX_ENABLE_CCCL
   auto copy_to_host = [](auto& buffer, auto& host_buffer) {
     // copy device buffer to host
     host_buffer.resize(buffer->getSize());
@@ -272,14 +253,13 @@ auto verify_buffers(ucxx::Buffer* expected, ucxx::Buffer* actual)
   };
 #endif
 
-  if (expected->getType() == ucxx::BufferType::RMM ||
-      expected->getType() == ucxx::BufferType::CCCL) {
+  if (expected->getType() == ucxx::BufferType::CCCL) {
     host_expected_ptr = copy_to_host(expected, host_expected);
   } else {
     host_expected_ptr = expected->data();
   }
 
-  if (actual->getType() == ucxx::BufferType::RMM || actual->getType() == ucxx::BufferType::CCCL) {
+  if (actual->getType() == ucxx::BufferType::CCCL) {
     host_actual_ptr = copy_to_host(actual, host_actual);
   } else {
     host_actual_ptr = actual->data();
diff --git a/cpp/include/ucxx/api.h b/cpp/include/ucxx/api.h
index ac77ea866..82a253ba9 100644
--- a/cpp/include/ucxx/api.h
+++ b/cpp/include/ucxx/api.h
@@ -1,13 +1,9 @@
 /**
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
  * SPDX-License-Identifier: BSD-3-Clause
  */
 #pragma once
 
-#ifndef UCXX_ENABLE_RMM
-#define UCXX_ENABLE_RMM 0
-#endif
-
 #include <ucxx/address.h>
 #include <ucxx/buffer.h>
 #include <ucxx/constructors.h>
diff --git a/cpp/include/ucxx/buffer.h b/cpp/include/ucxx/buffer.h
index 3319615c6..b0d13439c 100644
--- a/cpp/include/ucxx/buffer.h
+++ b/cpp/include/ucxx/buffer.h
@@ -8,11 +8,6 @@
 
 #include <ucxx/log.h>
 
-namespace rmm {
-// Forward declaration to prevent symbols from being added to symbol table unnecessarily.
-class device_buffer;
-}  // namespace rmm
-
 namespace ucxx {
 /**
  * @brief The type of a buffer.
@@ -21,7 +16,6 @@ namespace ucxx {
  */
 enum class BufferType {
   Host = 0,
-  RMM,
   CCCL,
   Invalid,
 };
@@ -195,106 +189,6 @@ class HostBuffer : public Buffer {
   [[nodiscard]] void* data() override;
 };
 
-#if UCXX_ENABLE_RMM
-/**
- * @brief A simple object containing a RMM (CUDA) buffer.
- *
- * A buffer encapsulating an RMM (CUDA) buffer with its properties.
- */
-class RMMBuffer : public Buffer {
- private:
-  std::unique_ptr<rmm::device_buffer> _buffer;  ///< RMM-allocated device buffer
-
- public:
-  RMMBuffer()                            = delete;
-  RMMBuffer(const RMMBuffer&)            = delete;
-  RMMBuffer& operator=(RMMBuffer const&) = delete;
-  RMMBuffer(RMMBuffer&& o)               = delete;
-  RMMBuffer& operator=(RMMBuffer&& o)    = delete;
-
-  ~RMMBuffer() override;
-
-  /**
-   * @brief Constructor of concrete type `RMMBuffer`.
-   *
-   * Constructor to materialize a buffer holding device memory. The internal
-   * buffer holds a `std::unique_ptr<rmm::device_buffer>` and is destroyed
-   * when the object goes out-of-scope or is explicitly deleted.
-   *
-   * @param[in] size the size of the device buffer to allocate.
-   *
-   * @code{.cpp}
-   * // Allocate host buffer of 1KiB
-   * auto buffer = RMMBuffer(1024);
-   * @endcode
-   */
-  [[deprecated(
-    "RMMBuffer is deprecated and will be removed in a future release. Use CCCL buffers instead "
-    "(UCXX_ENABLE_CCCL).")]]
-  explicit RMMBuffer(const size_t size);
-
-  /**
-   * @brief Construct from an existing `rmm::device_buffer`.
-   *
-   * @param[in] rmm_buffer the `rmm::device_buffer` to hold.
-   */
-  [[deprecated(
-    "RMMBuffer is deprecated and will be removed in a future release. Use CCCL buffers instead "
-    "(UCXX_ENABLE_CCCL).")]]
-  explicit RMMBuffer(std::unique_ptr<rmm::device_buffer> rmm_buffer);
-
-  /**
-   * @brief Release the allocated `rmm::device_buffer` to the caller.
-   *
-   * Release ownership of the `rmm::device_buffer` to the caller. After this
-   * method is called, the caller becomes responsible for the destruction of
-   * the object once it is not needed anymore. The `rmm::device_buffer` is held
-   * owned by the `unique_ptr` and will be deallocated once it goes out-of-scope
-   * or gets explicitly deleted.
-   *
-   * The original `RMMBuffer` object becomes invalid.
-   *
-   * @code{.cpp}
-   * // Allocate RMM buffer of 1KiB
-   * auto buffer = RMMBuffer(1024);
-   * std::unique_ptr<RMMBuffer> rmmBuffer= buffer.release();
-   *
-   * // do work on rmmBuffer
-   *
-   * // `rmm::device_buffer` is destroyed and device Memory is freed once
-   * // `rmmBuffer` goes out-of-scope.
-   * @endcode
-   *
-   * @throws std::runtime_error if object has been released.
-   *
-   * @return the void pointer to the buffer.
-   */
-  [[nodiscard]] std::unique_ptr<rmm::device_buffer> release();
-
-  /**
-   * @brief Get a pointer to the allocated raw device buffer.
-   *
-   * Get a pointer to the underlying buffer, but does not release ownership.
-   *
-   * @code{.cpp}
-   * // Allocate device buffer of 1KiB
-   * auto buffer = RMMBuffer(1024);
-   * void* bufferPtr = buffer.data();
-   *
-   * // do work on bufferPtr
-   *
-   * // `rmm::device_buffer` is destroyed and device Memory is freed once
-   * // `buffer` goes out-of-scope.
-   * @endcode
-   *
-   * @throws std::runtime_error if object has been released.
-   *
-   * @return the void pointer to the device buffer.
-   */
-  [[nodiscard]] void* data() override;
-};
-#endif
-
 #if UCXX_ENABLE_CCCL
 /**
  * @brief Opaque implementation struct for CCCLBuffer (defined in buffer_cccl.cu).
diff --git a/cpp/include/ucxx/endpoint.h b/cpp/include/ucxx/endpoint.h
index 7a3cd8604..f7b047420 100644
--- a/cpp/include/ucxx/endpoint.h
+++ b/cpp/include/ucxx/endpoint.h
@@ -789,9 +789,8 @@ class Endpoint : public Component {
    * `std::shared<ucxx::RequestTagMulti>` that can be later awaited and checked for errors.
    * This is a non-blocking operation, and because the receiver has no a priori knowledge
    * of the data being received, memory allocations are automatically handled internally.
-   * The receiver must have the same capabilities of the sender, so that if the sender is
-   * compiled with RMM support to allow for CUDA transfers, the receiver must have the
-   * ability to understand and allocate CUDA memory.
+   * Receiving CUDA frames requires UCXX to be compiled with CCCL support (`UCXX_ENABLE_CCCL`)
+   * and the receiving worker to be configured to allocate CCCL buffers for CUDA frames.
    *
    * Using a Python future may be requested by specifying `enablePythonFuture`. If a
    * Python future is requested, the Python application must then await on this future to
diff --git a/cpp/include/ucxx/request_tag_multi.h b/cpp/include/ucxx/request_tag_multi.h
index 4e9b490aa..11a24d7d4 100644
--- a/cpp/include/ucxx/request_tag_multi.h
+++ b/cpp/include/ucxx/request_tag_multi.h
@@ -1,5 +1,5 @@
 /**
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
  * SPDX-License-Identifier: BSD-3-Clause
  */
 #pragma once
@@ -144,10 +144,9 @@ class RequestTagMulti : public Request {
    * This is a non-blocking operation, and the status of a send transfer must be verified
    * from the resulting request object before the data can be released. If this is a receive
    * transfer and because the receiver has no a priori knowledge of the data being received,
-   * memory allocations are automatically handled internally.  The receiver must have the
-   * same capabilities of the sender, so that if the sender is compiled with RMM support to
-   * allow for CUDA transfers, the receiver must have the ability to understand and allocate
-   * CUDA memory.
+   * memory allocations are automatically handled internally. Receiving CUDA frames requires UCXX
+   * to be compiled with CCCL support (`UCXX_ENABLE_CCCL`) and the receiving worker to be
+   * configured to allocate CCCL buffers for CUDA frames.
    *
    * The primary use of multi-buffer transfers is in Python where we want to reduce the
    * amount of futures needed to watch for, thus reducing Python overhead. However, this
diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h
index 20e2705f6..95aaff4e4 100644
--- a/cpp/include/ucxx/worker.h
+++ b/cpp/include/ucxx/worker.h
@@ -150,10 +150,10 @@ class Worker : public Component {
    * Configure which buffer type to use when allocating CUDA buffers for incoming
    * multi-buffer tag receives.
    *
-   * @param[in] bufferType  the preferred buffer type (must be `BufferType::RMM` or
-   *                        `BufferType::CCCL`).
+   * @param[in] bufferType  the preferred buffer type (currently only `BufferType::CCCL`
+   *                        is supported).
    *
-   * @throws std::invalid_argument if bufferType is not RMM or CCCL.
+   * @throws std::invalid_argument if bufferType is not CCCL.
    */
   void setCudaBufferType(BufferType bufferType);
 
@@ -527,7 +527,7 @@ class Worker : public Component {
    *
    * Returns the buffer type used when allocating CUDA buffers for incoming
    * multi-buffer tag receives. Defaults to CCCL if compiled with CCCL support,
-   * otherwise RMM if compiled with RMM support, otherwise Invalid.
+   * otherwise Invalid.
    *
    * @returns The preferred `BufferType` for CUDA allocations.
    */
@@ -938,7 +938,9 @@ class Worker : public Component {
    * // context is `std::shared_ptr<ucxx::Context>`
    * auto worker = context->createWorker(false);
    *
-   * worker->registerAmAllocator(`UCS_MEMORY_TYPE_CUDA`, ucxx::RMMBuffer);
+   * worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
+   *   return std::make_shared<ucxx::CCCLBuffer>(length);
+   * });
    * @endcode
    *
    * @param[in] memoryType  the memory type the allocator will be used for.
diff --git a/cpp/python/CMakeLists.txt b/cpp/python/CMakeLists.txt
index 35d60a3ff..c659cbb95 100644
--- a/cpp/python/CMakeLists.txt
+++ b/cpp/python/CMakeLists.txt
@@ -37,9 +37,6 @@ option(FIND_UCXX_CPP "Search for existing UCXX C++ installations before defaulti
 
 # add third party dependencies using CPM
 rapids_cpm_init()
-# find rmm
-include(../cmake/thirdparty/get_rmm.cmake)
-
 if(FIND_UCXX_CPP)
   rapids_find_package(
     ucxx REQUIRED
@@ -47,7 +44,6 @@ if(FIND_UCXX_CPP)
     INSTALL_EXPORT_SET ucxx-python-exports
   )
 else()
-  set(UCXX_ENABLE_RMM ON)
   set(UCXX_ENABLE_CCCL ON)
   add_subdirectory(.. ucxx-cpp)
 endif()
@@ -99,10 +95,7 @@ target_include_directories(
 
 target_compile_definitions(ucxx_python PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${UCXX_CXX_DEFINITIONS}>")
 
-target_compile_definitions(
-  ucxx_python PUBLIC UCXX_ENABLE_PYTHON
-                     "RMM_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LEVEL_${RMM_LOGGING_LEVEL}"
-)
+target_compile_definitions(ucxx_python PUBLIC UCXX_ENABLE_PYTHON)
 
 if(UCXX_ENABLE_CCCL)
   target_compile_definitions(ucxx_python PUBLIC UCXX_ENABLE_CCCL)
@@ -112,9 +105,7 @@ endif()
 # not need to be linked since its symbols will always be available at runtime since we are running
 # inide the Python interpreter, and setting it up this way ensures that we will work if the
 # interpreter links to Python statically instead of dynamically.
-target_link_libraries(
-  ucxx_python PUBLIC rmm::rmm ucx::ucp ucxx::ucxx "$<COMPILE_ONLY:Python3::Python>"
-)
+target_link_libraries(ucxx_python PUBLIC ucx::ucp ucxx::ucxx "$<COMPILE_ONLY:Python3::Python>")
 
 # Add Conda library, and include paths if specified
 if(TARGET conda_env)
diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp
index b88a42ee2..2862f3cb7 100644
--- a/cpp/src/buffer.cpp
+++ b/cpp/src/buffer.cpp
@@ -5,14 +5,11 @@
 #include <cstring>
 #include <memory>
 #include <new>
+#include <stdexcept>
 #include <utility>
 
 #include <ucxx/buffer.h>
 
-#if UCXX_ENABLE_RMM
-#include <rmm/device_buffer.hpp>
-#endif
-
 namespace ucxx {
 
 Buffer::Buffer(const BufferType bufferType, const size_t size)
@@ -61,51 +58,9 @@ void* HostBuffer::data()
   return _buffer;
 }
 
-#if UCXX_ENABLE_RMM
-RMMBuffer::RMMBuffer(const size_t size)
-  : Buffer(BufferType::RMM, size),
-    _buffer{std::make_unique<rmm::device_buffer>(size, rmm::cuda_stream_default)}
-{
-  ucxx_trace_data("ucxx::RMMBuffer created: %p, buffer: %p, size: %lu", this, _buffer.get(), size);
-}
-
-RMMBuffer::RMMBuffer(std::unique_ptr<rmm::device_buffer> rmm_buffer)
-  : Buffer(BufferType::RMM, rmm_buffer->size()), _buffer{std::move(rmm_buffer)}
-{
-  ucxx_trace_data("ucxx::RMMBuffer created: %p, buffer: %p, size: %lu", this, _buffer.get(), _size);
-}
-
-RMMBuffer::~RMMBuffer() = default;
-
-std::unique_ptr<rmm::device_buffer> RMMBuffer::release()
-{
-  ucxx_trace_data("ucxx::RMMBuffer::%s, RMMBuffer: %p, _buffer: %p", __func__, this, _buffer.get());
-  if (!_buffer) throw std::runtime_error("Invalid object or already released");
-
-  _bufferType = ucxx::BufferType::Invalid;
-  _size       = 0;
-
-  return std::move(_buffer);
-}
-
-void* RMMBuffer::data()
-{
-  ucxx_trace_data("ucxx::RMMBuffer::%s, RMMBuffer: %p, buffer: %p", __func__, this, _buffer.get());
-  if (!_buffer) throw std::runtime_error("Invalid object or already released");
-
-  return _buffer->data();
-}
-#endif
-
 std::shared_ptr<Buffer> allocateBuffer(const BufferType bufferType, const size_t size)
 {
-  if (bufferType == BufferType::RMM) {
-#if UCXX_ENABLE_RMM
-    return std::make_shared<RMMBuffer>(size);
-#else
-    throw std::runtime_error("RMM support not enabled, please compile with -DUCXX_ENABLE_RMM=1");
-#endif
-  } else if (bufferType == BufferType::CCCL) {
+  if (bufferType == BufferType::CCCL) {
 #if UCXX_ENABLE_CCCL
     return std::make_shared<CCCLBuffer>(size);
 #else
diff --git a/cpp/src/buffer_cccl.cpp b/cpp/src/buffer_cccl.cpp
index 53e3a3a09..60741f410 100644
--- a/cpp/src/buffer_cccl.cpp
+++ b/cpp/src/buffer_cccl.cpp
@@ -24,8 +24,7 @@ struct CCCLBufferImpl {
   cccl_buffer_type buffer;
 
   // CCCL's cuda::device_default_memory_pool() requires an active CUDA primary context.
-  // Unlike RMM (which initializes context internally via its device resource setup),
-  // CCCL needs explicit initialization. cudaFree(0) is the standard zero-cost idiom.
+  // cudaFree(0) is the standard zero-cost idiom to initialize it.
   static auto get_device_pool()
   {
     cudaFree(0);  // Ensure CUDA primary context is initialized
@@ -52,7 +51,6 @@ void* CCCLBuffer::data()
   if (!_impl) throw std::runtime_error("Invalid object or already released");
 
   // Explicit cast required: cuda::buffer::data() returns cuda::std::byte*, not void*.
-  // RMMBuffer::data() needs no cast since rmm::device_buffer::data() returns void* directly.
   return static_cast<void*>(_impl->buffer.data());
 }
 
diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp
index b9b9063d1..bcf85c6b2 100644
--- a/cpp/src/worker.cpp
+++ b/cpp/src/worker.cpp
@@ -41,8 +41,6 @@ Worker::Worker(std::shared_ptr<Context> context,
     throw std::runtime_error("Context not initialized");
 #if UCXX_ENABLE_CCCL
   _cudaBufferType = BufferType::CCCL;
-#elif UCXX_ENABLE_RMM
-  _cudaBufferType = BufferType::RMM;
 #endif
 
   ucp_worker_params_t params = {.field_mask  = UCP_WORKER_PARAM_FIELD_THREAD_MODE,
@@ -230,8 +228,8 @@ BufferType Worker::getCudaBufferType() const { return _cudaBufferType; }
 
 void Worker::setCudaBufferType(BufferType bufferType)
 {
-  if (bufferType != BufferType::RMM && bufferType != BufferType::CCCL)
-    throw std::invalid_argument("cudaBufferType must be BufferType::RMM or BufferType::CCCL");
+  if (bufferType != BufferType::CCCL)
+    throw std::invalid_argument("cudaBufferType must be BufferType::CCCL");
   _cudaBufferType = bufferType;
 }
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8e458adaa..ab8390102 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 # ======================================================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # cmake-format: on
 # ======================================================================================================
@@ -29,6 +29,10 @@ function(ConfigureTest CMAKE_TEST_NAME)
     ${CMAKE_TEST_NAME} PRIVATE ucxx GTest::gmock_main GTest::gtest_main
                                $<TARGET_NAME_IF_EXISTS:conda_env>
   )
+  if(UCXX_ENABLE_RMM)
+    target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM)
+    target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm)
+  endif()
   add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
   install(
     TARGETS ${CMAKE_TEST_NAME}
diff --git a/cpp/tests/buffer.cpp b/cpp/tests/buffer.cpp
index 7591fb511..c5266a091 100644
--- a/cpp/tests/buffer.cpp
+++ b/cpp/tests/buffer.cpp
@@ -12,10 +12,6 @@
 
 #include <ucxx/api.h>
 
-#if UCXX_ENABLE_RMM
-#include <rmm/device_buffer.hpp>
-#endif
-
 namespace {
 
 class BufferAllocator : public ::testing::Test,
@@ -48,17 +44,6 @@ TEST_P(BufferAllocator, TestType)
     ASSERT_EQ(buffer->getType(), ucxx::BufferType::Invalid);
 
     free(releasedBuffer);
-  } else if (_type == ucxx::BufferType::RMM) {
-#if UCXX_ENABLE_RMM
-    auto buffer = std::dynamic_pointer_cast<ucxx::RMMBuffer>(_buffer);
-    ASSERT_EQ(buffer->getType(), _type);
-
-    auto releasedBuffer = buffer->release();
-
-    ASSERT_EQ(buffer->getType(), ucxx::BufferType::Invalid);
-#else
-    GTEST_SKIP() << "UCXX was not built with RMM support";
-#endif
   } else if (_type == ucxx::BufferType::CCCL) {
 #if UCXX_ENABLE_CCCL
     auto buffer = std::dynamic_pointer_cast<ucxx::CCCLBuffer>(_buffer);
@@ -85,17 +70,6 @@ TEST_P(BufferAllocator, TestSize)
     ASSERT_EQ(buffer->getSize(), 0u);
 
     free(releasedBuffer);
-  } else if (_type == ucxx::BufferType::RMM) {
-#if UCXX_ENABLE_RMM
-    auto buffer = std::dynamic_pointer_cast<ucxx::RMMBuffer>(_buffer);
-    ASSERT_EQ(buffer->getSize(), _size);
-
-    auto releasedBuffer = buffer->release();
-
-    ASSERT_EQ(buffer->getSize(), 0u);
-#else
-    GTEST_SKIP() << "UCXX was not built with RMM support";
-#endif
   } else if (_type == ucxx::BufferType::CCCL) {
 #if UCXX_ENABLE_CCCL
     auto buffer = std::dynamic_pointer_cast<ucxx::CCCLBuffer>(_buffer);
@@ -122,19 +96,6 @@ TEST_P(BufferAllocator, TestData)
     ASSERT_NE(releasedBuffer, nullptr);
 
     free(releasedBuffer);
-  } else if (_type == ucxx::BufferType::RMM) {
-#if UCXX_ENABLE_RMM
-    auto buffer = std::dynamic_pointer_cast<ucxx::RMMBuffer>(_buffer);
-    ASSERT_EQ(buffer->data(), _buffer->data());
-
-    auto releasedBuffer = buffer->release();
-
-    EXPECT_THROW(buffer->data(), std::runtime_error);
-
-    ASSERT_NE(releasedBuffer, nullptr);
-#else
-    GTEST_SKIP() << "UCXX was not built with RMM support";
-#endif
   } else if (_type == ucxx::BufferType::CCCL) {
 #if UCXX_ENABLE_CCCL
     auto buffer = std::dynamic_pointer_cast<ucxx::CCCLBuffer>(_buffer);
@@ -158,16 +119,6 @@ TEST_P(BufferAllocator, TestThrowAfterRelease)
     EXPECT_THROW(std::ignore = buffer->release(), std::runtime_error);
 
     free(releasedBuffer);
-  } else if (_type == ucxx::BufferType::RMM) {
-#if UCXX_ENABLE_RMM
-    auto buffer         = std::dynamic_pointer_cast<ucxx::RMMBuffer>(_buffer);
-    auto releasedBuffer = buffer->release();
-
-    EXPECT_THROW(buffer->data(), std::runtime_error);
-    EXPECT_THROW(std::ignore = buffer->release(), std::runtime_error);
-#else
-    GTEST_SKIP() << "UCXX was not built with RMM support";
-#endif
   } else if (_type == ucxx::BufferType::CCCL) {
 #if UCXX_ENABLE_CCCL
     GTEST_SKIP() << "CCCLBuffer does not expose release()";
@@ -184,14 +135,6 @@ INSTANTIATE_TEST_SUITE_P(Host,
                                          std::make_pair(ucxx::BufferType::Host, 1000),
                                          std::make_pair(ucxx::BufferType::Host, 1000000)));
 
-#if UCXX_ENABLE_RMM
-INSTANTIATE_TEST_SUITE_P(RMM,
-                         BufferAllocator,
-                         testing::Values(std::make_pair(ucxx::BufferType::RMM, 1),
-                                         std::make_pair(ucxx::BufferType::RMM, 1000),
-                                         std::make_pair(ucxx::BufferType::RMM, 1000000)));
-#endif
-
 #if UCXX_ENABLE_CCCL
 INSTANTIATE_TEST_SUITE_P(CCCL,
                          BufferAllocator,
diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp
index e28584df4..afe047dd4 100644
--- a/cpp/tests/request.cpp
+++ b/cpp/tests/request.cpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <memory>
 #include <numeric>
+#include <stdexcept>
 #include <string>
 #include <tuple>
 #include <ucp/api/ucp.h>
@@ -21,6 +22,10 @@
 #include "ucxx/constructors.h"
 #include "ucxx/utils/ucx.h"
 
+#ifndef UCXX_ENABLE_RMM
+#define UCXX_ENABLE_RMM 0
+#endif
+
 #if UCXX_ENABLE_RMM
 #include <rmm/device_buffer.hpp>
 #endif
@@ -37,15 +42,43 @@ using ::testing::Values;
 
 typedef std::vector<int> DataContainerType;
 
-class RequestTest : public ::testing::TestWithParam<
-                      std::tuple<ucxx::BufferType, bool, bool, ProgressMode, size_t>> {
+enum class TestBufferType {
+  Host,
+  RMM,
+  CCCL,
+};
+
+bool isCudaBufferType(TestBufferType bufferType) { return bufferType != TestBufferType::Host; }
+
+#if UCXX_ENABLE_RMM
+class RMMTestBuffer : public ucxx::Buffer {
+ private:
+  std::unique_ptr<rmm::device_buffer> _buffer;
+
+ public:
+  explicit RMMTestBuffer(size_t size)
+    : ucxx::Buffer(ucxx::BufferType::Invalid, size),
+      _buffer{std::make_unique<rmm::device_buffer>(size, rmm::cuda_stream_default)}
+  {
+  }
+
+  void* data() override
+  {
+    if (!_buffer) throw std::runtime_error("Invalid object");
+    return _buffer->data();
+  }
+};
+#endif
+
+class RequestTest
+  : public ::testing::TestWithParam<std::tuple<TestBufferType, bool, bool, ProgressMode, size_t>> {
  protected:
   std::shared_ptr<ucxx::Context> _context{nullptr};
   std::shared_ptr<ucxx::Worker> _worker{nullptr};
   std::shared_ptr<ucxx::Endpoint> _ep{nullptr};
   std::function<void()> _progressWorker;
 
-  ucxx::BufferType _bufferType;
+  TestBufferType _bufferType;
   ucs_memory_type_t _memoryType;
   bool _registerCustomAmAllocator;
   bool _enableDelayedSubmission;
@@ -68,8 +101,9 @@ class RequestTest : public ::testing::TestWithParam<
                      .delayedSubmission(_enableDelayedSubmission)
                      .requestAttributes(enableRequestAttributes);
 
-    if (_bufferType == ucxx::BufferType::RMM || _bufferType == ucxx::BufferType::CCCL)
-      builder.cudaBufferType(_bufferType);
+#if UCXX_ENABLE_CCCL
+    if (isCudaBufferType(_bufferType)) builder.cudaBufferType(ucxx::BufferType::CCCL);
+#endif
 
     _worker = builder.build();
 
@@ -104,20 +138,19 @@ class RequestTest : public ::testing::TestWithParam<
              _progressMode,
              _messageLength) = GetParam();
 
-    if (_bufferType == ucxx::BufferType::RMM) {
+    if (_bufferType == TestBufferType::RMM) {
 #if !UCXX_ENABLE_RMM
       GTEST_SKIP() << "UCXX was not built with RMM support";
 #endif
     }
 
-    if (_bufferType == ucxx::BufferType::CCCL) {
+    if (_bufferType == TestBufferType::CCCL) {
 #if !UCXX_ENABLE_CCCL
       GTEST_SKIP() << "UCXX was not built with CCCL support";
 #endif
     }
 
-    _memoryType =
-      (_bufferType != ucxx::BufferType::Host) ? UCS_MEMORY_TYPE_CUDA : UCS_MEMORY_TYPE_HOST;
+    _memoryType  = isCudaBufferType(_bufferType) ? UCS_MEMORY_TYPE_CUDA : UCS_MEMORY_TYPE_HOST;
     _messageSize = _messageLength * sizeof(int);
 
     _context = ucxx::createContext({{"RNDV_THRESH", std::to_string(_rndvThresh)}},
@@ -151,16 +184,16 @@ class RequestTest : public ::testing::TestWithParam<
 
       std::iota(_send[i].begin(), _send[i].end(), i);
 
-      if (_bufferType == ucxx::BufferType::Host) {
+      if (_bufferType == TestBufferType::Host) {
         _sendBuffer[i] = std::make_unique<ucxx::HostBuffer>(_messageSize);
         if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<ucxx::HostBuffer>(_messageSize);
 #if UCXX_ENABLE_RMM
-      } else if (_bufferType == ucxx::BufferType::RMM) {
-        _sendBuffer[i] = std::make_unique<ucxx::RMMBuffer>(_messageSize);
-        if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<ucxx::RMMBuffer>(_messageSize);
+      } else if (_bufferType == TestBufferType::RMM) {
+        _sendBuffer[i] = std::make_unique<RMMTestBuffer>(_messageSize);
+        if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<RMMTestBuffer>(_messageSize);
 #endif
 #if UCXX_ENABLE_CCCL
-      } else if (_bufferType == ucxx::BufferType::CCCL) {
+      } else if (_bufferType == TestBufferType::CCCL) {
         _sendBuffer[i] = std::make_unique<ucxx::CCCLBuffer>(_messageSize);
         if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<ucxx::CCCLBuffer>(_messageSize);
 #endif
@@ -172,9 +205,9 @@ class RequestTest : public ::testing::TestWithParam<
       if (allocateRecvBuffer) _recvPtr[i] = _recvBuffer[i]->data();
     }
 #if UCXX_ENABLE_RMM
-    if (_bufferType == ucxx::BufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
+    if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
 #endif
-    if (_bufferType == ucxx::BufferType::CCCL) { cudaStreamSynchronize(nullptr); }
+    if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); }
   }
 
   void copyResults()
@@ -182,9 +215,9 @@ class RequestTest : public ::testing::TestWithParam<
     for (size_t i = 0; i < _numBuffers; ++i)
       copyMemoryTypeAware(_recv[i].data(), _recvPtr[i], _messageSize, false);
 #if UCXX_ENABLE_RMM
-    if (_bufferType == ucxx::BufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
+    if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
 #endif
-    if (_bufferType == ucxx::BufferType::CCCL) { cudaStreamSynchronize(nullptr); }
+    if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); }
   }
 
   void copyMemoryTypeAware(void* dst, const void* src, size_t size, bool synchronize = true)
@@ -192,7 +225,7 @@ class RequestTest : public ::testing::TestWithParam<
     if (_memoryType == UCS_MEMORY_TYPE_HOST) {
       memcpy(dst, src, size);
 #if UCXX_ENABLE_RMM
-    } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == ucxx::BufferType::RMM) {
+    } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == TestBufferType::RMM) {
       RMM_CUDA_TRY(
         cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, rmm::cuda_stream_default.value()));
       if (synchronize) rmm::cuda_stream_default.synchronize();
@@ -211,19 +244,12 @@ TEST_P(RequestTest, ProgressAm)
   }
 
   if (_registerCustomAmAllocator && _memoryType == UCS_MEMORY_TYPE_CUDA) {
-#if UCXX_ENABLE_RMM
-    if (_bufferType == ucxx::BufferType::RMM) {
-      _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
-        return std::make_shared<ucxx::RMMBuffer>(length);
-      });
-    }
-#endif
 #if UCXX_ENABLE_CCCL
-    if (_bufferType == ucxx::BufferType::CCCL) {
-      _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
-        return std::make_shared<ucxx::CCCLBuffer>(length);
-      });
-    }
+    _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
+      return std::make_shared<ucxx::CCCLBuffer>(length);
+    });
+#else
+    GTEST_SKIP() << "CCCL support is required for CUDA receive allocations";
 #endif
   }
 
@@ -240,9 +266,10 @@ TEST_P(RequestTest, ProgressAm)
 
   // Messages of size `_rndvThresh` or larger are rendezvous and will use the custom
   // allocator, smaller messages are eager and will always be host-allocated.
-  ASSERT_THAT(recvReq->getRecvBuffer()->getType(),
-              (_registerCustomAmAllocator && _messageSize >= _rndvThresh) ? _bufferType
-                                                                          : ucxx::BufferType::Host);
+  const auto expectedRecvBufferType = (_registerCustomAmAllocator && _messageSize >= _rndvThresh)
+                                        ? ucxx::BufferType::CCCL
+                                        : ucxx::BufferType::Host;
+  ASSERT_THAT(recvReq->getRecvBuffer()->getType(), expectedRecvBufferType);
 
   copyResults();
 
@@ -349,19 +376,12 @@ TEST_P(RequestTest, ProgressAmReceiverCallback)
   }
 
   if (_registerCustomAmAllocator && _memoryType == UCS_MEMORY_TYPE_CUDA) {
-#if UCXX_ENABLE_RMM
-    if (_bufferType == ucxx::BufferType::RMM) {
-      _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
-        return std::make_shared<ucxx::RMMBuffer>(length);
-      });
-    }
-#endif
 #if UCXX_ENABLE_CCCL
-    if (_bufferType == ucxx::BufferType::CCCL) {
-      _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
-        return std::make_shared<ucxx::CCCLBuffer>(length);
-      });
-    }
+    _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) {
+      return std::make_shared<ucxx::CCCLBuffer>(length);
+    });
+#else
+    GTEST_SKIP() << "CCCL support is required for CUDA receive allocations";
 #endif
   }
 
@@ -399,10 +419,10 @@ TEST_P(RequestTest, ProgressAmReceiverCallback)
 
     // Messages larger than `_rndvThresh` are rendezvous and will use custom allocator,
     // smaller messages are eager and will always be host-allocated.
-    ASSERT_THAT(receivedRequests[0]->getRecvBuffer()->getType(),
-                (_registerCustomAmAllocator && _messageSize >= _rndvThresh)
-                  ? _bufferType
-                  : ucxx::BufferType::Host);
+    const auto expectedRecvBufferType = (_registerCustomAmAllocator && _messageSize >= _rndvThresh)
+                                          ? ucxx::BufferType::CCCL
+                                          : ucxx::BufferType::Host;
+    ASSERT_THAT(receivedRequests[0]->getRecvBuffer()->getType(), expectedRecvBufferType);
   }
 
   copyResults();
@@ -813,11 +833,15 @@ TEST_P(RequestTest, ProgressTagMulti)
   const size_t numMulti         = 8;
   const bool allocateRecvBuffer = false;
 
+  if (isCudaBufferType(_bufferType) && _worker->getCudaBufferType() == ucxx::BufferType::Invalid) {
+    GTEST_SKIP() << "CUDA buffer allocation support not enabled";
+  }
+
   allocate(numMulti, allocateRecvBuffer);
 
   // Allocate buffers for request sizes/types
   std::vector<size_t> multiSize(numMulti, _messageSize);
-  std::vector<int> multiIsCUDA(numMulti, _bufferType != ucxx::BufferType::Host);
+  std::vector<int> multiIsCUDA(numMulti, isCudaBufferType(_bufferType));
 
   // Submit and wait for transfers to complete
   std::vector<std::shared_ptr<ucxx::Request>> requests;
@@ -835,7 +859,9 @@ TEST_P(RequestTest, ProgressTagMulti)
        std::dynamic_pointer_cast<ucxx::RequestTagMulti>(requests[1])->_bufferRequests) {
     // br->buffer == nullptr are headers
     if (br->buffer) {
-      ASSERT_EQ(br->buffer->getType(), _bufferType);
+      auto expectedBufferType =
+        isCudaBufferType(_bufferType) ? _worker->getCudaBufferType() : ucxx::BufferType::Host;
+      ASSERT_EQ(br->buffer->getType(), expectedBufferType);
       ASSERT_EQ(br->buffer->getSize(), _messageSize);
 
       _recvPtr[transferIdx] = br->buffer->data();
@@ -1102,7 +1128,7 @@ TEST_P(RequestTest, MemoryPutWithOffset)
 
 INSTANTIATE_TEST_SUITE_P(ProgressModes,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::Host),
+                         Combine(Values(TestBufferType::Host),
                                  Values(false),
                                  Values(false),
                                  Values(ProgressMode::Polling,
@@ -1114,7 +1140,7 @@ INSTANTIATE_TEST_SUITE_P(ProgressModes,
 
 INSTANTIATE_TEST_SUITE_P(DelayedSubmission,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::Host),
+                         Combine(Values(TestBufferType::Host),
                                  Values(false),
                                  Values(true),
                                  Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking),
@@ -1123,7 +1149,7 @@ INSTANTIATE_TEST_SUITE_P(DelayedSubmission,
 #if UCXX_ENABLE_RMM
 INSTANTIATE_TEST_SUITE_P(RMMProgressModes,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::RMM),
+                         Combine(Values(TestBufferType::RMM),
                                  Values(false, true),
                                  Values(false),
                                  Values(ProgressMode::Polling,
@@ -1135,7 +1161,7 @@ INSTANTIATE_TEST_SUITE_P(RMMProgressModes,
 
 INSTANTIATE_TEST_SUITE_P(RMMDelayedSubmission,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::RMM),
+                         Combine(Values(TestBufferType::RMM),
                                  Values(false, true),
                                  Values(true),
                                  Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking),
@@ -1145,7 +1171,7 @@ INSTANTIATE_TEST_SUITE_P(RMMDelayedSubmission,
 #if UCXX_ENABLE_CCCL
 INSTANTIATE_TEST_SUITE_P(CCCLProgressModes,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::CCCL),
+                         Combine(Values(TestBufferType::CCCL),
                                  Values(false, true),
                                  Values(false),
                                  Values(ProgressMode::Polling,
@@ -1157,7 +1183,7 @@ INSTANTIATE_TEST_SUITE_P(CCCLProgressModes,
 
 INSTANTIATE_TEST_SUITE_P(CCCLDelayedSubmission,
                          RequestTest,
-                         Combine(Values(ucxx::BufferType::CCCL),
+                         Combine(Values(TestBufferType::CCCL),
                                  Values(false, true),
                                  Values(true),
                                  Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking),
diff --git a/python/ucxx/ucxx/_lib/libucxx.pyx b/python/ucxx/ucxx/_lib/libucxx.pyx
index 18c0777bf..12f99a872 100644
--- a/python/ucxx/ucxx/_lib/libucxx.pyx
+++ b/python/ucxx/ucxx/_lib/libucxx.pyx
@@ -39,8 +39,6 @@ cdef extern from "cuda_runtime.h" nogil:
 
 import numpy as np
 
-from rmm.pylibrmm.device_buffer cimport DeviceBuffer
-
 from .arr cimport Array
 from .ucxx_api cimport *
 
@@ -187,21 +185,11 @@ cdef class HostBufferAdapter:
         free(self._ptr)
 
 
-def _get_rmm_buffer(uintptr_t recv_buffer_ptr):
-    cdef RMMBuffer* rmm_buffer = <RMMBuffer*>recv_buffer_ptr
-    return DeviceBuffer.c_from_unique_ptr(move(rmm_buffer.release()))
-
-
 def _get_host_buffer(uintptr_t recv_buffer_ptr):
     cdef HostBuffer* host_buffer = <HostBuffer*>recv_buffer_ptr
     return np.asarray(HostBufferAdapter._from_host_buffer(host_buffer))
 
 
-cdef shared_ptr[Buffer] _rmm_am_allocator(size_t length) noexcept nogil:
-    cdef shared_ptr[RMMBuffer] rmm_buffer = make_shared[RMMBuffer](length)
-    return dynamic_pointer_cast[Buffer, RMMBuffer](rmm_buffer)
-
-# Unlike RMM which has a Python DeviceBuffer class (from the rmm package),
 # CCCL has no Python buffer equivalent. This wrapper provides __cuda_array_interface__
 # for interoperability with CuPy/cuDF without requiring an external CCCL Python package.
 cdef class CCCLBufferWrapper:
@@ -680,7 +668,6 @@ cdef class UCXWorker():
     ) -> None:
         cdef bint ucxx_enable_delayed_submission = enable_delayed_submission
         cdef bint ucxx_enable_python_future = enable_python_future
-        cdef AmAllocatorType rmm_am_allocator
         cdef AmAllocatorType cccl_am_allocator
 
         self._context_feature_flags = <uint64_t>(context.feature_flags)
@@ -697,22 +684,7 @@ cdef class UCXWorker():
             self._enable_python_future = self._worker.get().isFutureEnabled()
 
             if self._context_feature_flags & UCP_FEATURE_AM:
-                if self._worker.get().getCudaBufferType() == BufferType.RMM:
-                    with gil:
-                        warnings.warn(
-                            "RMM CUDA buffer support is deprecated and "
-                            "will be removed in a future release. Use "
-                            "CCCL buffers instead (set "
-                            "UCXX_ENABLE_CCCL=ON and "
-                            "UCXX_ENABLE_RMM=OFF).",
-                            FutureWarning,
-                            stacklevel=2,
-                        )
-                    rmm_am_allocator = <AmAllocatorType>(&_rmm_am_allocator)
-                    self._worker.get().registerAmAllocator(
-                        UCS_MEMORY_TYPE_CUDA, rmm_am_allocator
-                    )
-                elif self._worker.get().getCudaBufferType() == BufferType.CCCL:
+                if self._worker.get().getCudaBufferType() == BufferType.CCCL:
                     cccl_am_allocator = <AmAllocatorType>(&_cccl_am_allocator)
                     self._worker.get().registerAmAllocator(
                         UCS_MEMORY_TYPE_CUDA, cccl_am_allocator
@@ -768,13 +740,11 @@ cdef class UCXWorker():
 
     @property
     def cuda_buffer_type(self) -> str:
-        """Return the preferred CUDA buffer type ('rmm', 'cccl', or 'none')."""
+        """Return the preferred CUDA buffer type ('cccl', or 'none')."""
         cdef BufferType bt
         with nogil:
             bt = self._worker.get().getCudaBufferType()
-        if bt == BufferType.RMM:
-            return "rmm"
-        elif bt == BufferType.CCCL:
+        if bt == BufferType.CCCL:
             return "cccl"
         else:
             return "none"
@@ -1083,7 +1053,7 @@ cdef class UCXRequest():
         return <object>future_ptr
 
     @property
-    def recv_buffer(self) -> None|np.ndarray|DeviceBuffer:
+    def recv_buffer(self) -> object:
         cdef shared_ptr[Buffer] buf
         cdef BufferType bufType
 
@@ -1094,8 +1064,6 @@ cdef class UCXRequest():
         # If buf == NULL, it's not allocated by the request but rather the user
         if buf == NULL:
             return None
-        elif bufType == BufferType.RMM:
-            return _get_rmm_buffer(<uintptr_t><void*>buf.get())
         elif bufType == BufferType.CCCL:
             return _get_cccl_buffer(buf)
         elif bufType == BufferType.Host:
@@ -1144,7 +1112,7 @@ cdef class UCXRequest():
         else:
             await self.wait_yield()
 
-    def get_recv_buffer(self) -> None|np.ndarray|DeviceBuffer:
+    def get_recv_buffer(self) -> object:
         warnings.warn(
             "UCXRequest.get_recv_buffer() is deprecated and will soon be removed, "
             "use the UCXRequest.recv_buffer property instead",
@@ -1175,7 +1143,7 @@ cdef class UCXBufferRequest:
         )
 
     @property
-    def py_buffer(self) -> None|np.ndarray|DeviceBuffer:
+    def py_buffer(self) -> object:
         cdef shared_ptr[Buffer] buf
         cdef BufferType bufType
 
@@ -1186,8 +1154,6 @@ cdef class UCXBufferRequest:
         # If buf == NULL, it holds a header
         if buf == NULL:
             return None
-        elif bufType == BufferType.RMM:
-            return _get_rmm_buffer(<uintptr_t><void*>buf.get())
         elif bufType == BufferType.CCCL:
             return _get_cccl_buffer(buf)
         elif bufType == BufferType.Host:
@@ -1202,7 +1168,7 @@ cdef class UCXBufferRequest:
         )
         return self.request
 
-    def get_py_buffer(self) -> None|np.ndarray|DeviceBuffer:
+    def get_py_buffer(self) -> object:
         warnings.warn(
             "UCXBufferRequest.get_py_buffer() is deprecated and will soon be removed, "
             "use the UCXBufferRequest.py_buffer property instead",
@@ -1289,7 +1255,7 @@ cdef class UCXBufferRequests:
         return <object>future_ptr
 
     @property
-    def py_buffers(self) -> tuple[None|np.ndarray|DeviceBuffer, ...]:
+    def py_buffers(self) -> tuple[object, ...]:
         if not self.completed:
             raise RuntimeError("Some requests are not completed yet")
 
@@ -1367,7 +1333,7 @@ cdef class UCXBufferRequests:
         )
         return self.requests
 
-    def get_py_buffers(self) -> tuple[None|np.ndarray|DeviceBuffer, ...]:
+    def get_py_buffers(self) -> tuple[object, ...]:
         warnings.warn(
             "UCXBufferRequests.get_py_buffers() is deprecated and will soon be "
             "removed, use the UCXBufferRequests.py_buffers property instead",
diff --git a/python/ucxx/ucxx/_lib/ucxx_api.pxd b/python/ucxx/ucxx/_lib/ucxx_api.pxd
index fb2d291a1..939d9e7ee 100644
--- a/python/ucxx/ucxx/_lib/ucxx_api.pxd
+++ b/python/ucxx/ucxx/_lib/ucxx_api.pxd
@@ -92,11 +92,6 @@ cdef extern from "ucp/api/ucp.h" nogil:
                          unsigned *release_number)
 
 
-cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
-    cdef cppclass device_buffer:
-        pass
-
-
 cdef extern from "<ucxx/python/exception.h>" namespace "ucxx::python" nogil:
     cdef PyObject* UCXXError
 
@@ -149,7 +144,6 @@ cdef extern from "<ucxx/python/api.h>" namespace "ucxx::python" nogil:
 cdef extern from "<ucxx/buffer.h>" namespace "ucxx" nogil:
     cdef enum class BufferType:
         Host
-        RMM
         CCCL
         Invalid
 
@@ -166,13 +160,6 @@ cdef extern from "<ucxx/buffer.h>" namespace "ucxx" nogil:
         void* release() except +raise_py_error
         void* data() except +raise_py_error
 
-    cdef cppclass RMMBuffer:
-        RMMBuffer(const size_t size_t) except +raise_py_error
-        BufferType getType()
-        size_t getSize()
-        unique_ptr[device_buffer] release() except +raise_py_error
-        void* data() except +raise_py_error
-
 cdef extern from "<ucxx/buffer.h>" namespace "ucxx" nogil:
     cdef cppclass CCCLBuffer:
         CCCLBuffer(const size_t size) except +raise_py_error

From 7ce37135e80d91a3d4b5cb319348320b30018205 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 19 May 2026 11:15:12 -0700
Subject: [PATCH 2/6] Remove unnecessary build dependencies

---
 build.sh                          | 1 -
 ci/build_wheel_libucxx.sh         | 4 +---
 ci/build_wheel_ucxx.sh            | 2 --
 conda/recipes/libucxx/recipe.yaml | 4 ----
 conda/recipes/ucxx/recipe.yaml    | 3 ---
 5 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/build.sh b/build.sh
index 1e03bab96..ead413426 100755
--- a/build.sh
+++ b/build.sh
@@ -151,7 +151,6 @@ if hasArg ucxx_tests && ! hasArg ucxx; then
 fi
 
 if buildAll || hasArg libucxx_python || hasArg libucxx_tests || hasArg libucxx_examples; then
-  UCXX_ENABLE_RMM=ON
   UCXX_ENABLE_CCCL=ON
 fi
 
diff --git a/ci/build_wheel_libucxx.sh b/ci/build_wheel_libucxx.sh
index 41f4db8c5..d0bb2e937 100755
--- a/ci/build_wheel_libucxx.sh
+++ b/ci/build_wheel_libucxx.sh
@@ -28,14 +28,12 @@ rapids-pip-retry install \
 # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
 export PIP_NO_BUILD_ISOLATION=0
 
-export SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_RMM=ON -DUCXX_ENABLE_CCCL=ON"
+export SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_CCCL=ON"
 
 ./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 python -m auditwheel repair \
     --exclude "libucp.so.0" \
-    --exclude "librapids_logger.so" \
-    --exclude "librmm.so" \
     -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \
     ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_ucxx.sh b/ci/build_wheel_ucxx.sh
index 9ac98453b..1268be6fb 100755
--- a/ci/build_wheel_ucxx.sh
+++ b/ci/build_wheel_ucxx.sh
@@ -31,8 +31,6 @@ export RAPIDS_PY_API
 python -m auditwheel repair \
     --exclude "libucp.so.0" \
     --exclude "libucxx.so" \
-    --exclude "librapids_logger.so" \
-    --exclude "librmm.so" \
     -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \
     ${package_dir}/dist/*
 
diff --git a/conda/recipes/libucxx/recipe.yaml b/conda/recipes/libucxx/recipe.yaml
index f3dd820e9..d34d716db 100644
--- a/conda/recipes/libucxx/recipe.yaml
+++ b/conda/recipes/libucxx/recipe.yaml
@@ -100,11 +100,9 @@ outputs:
         - ${{ stdlib("c") }}
       host:
         - cuda-version =${{ cuda_version }}
-        - librmm ${{ rapids_version }}
         - ucx
       run:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
-        - ${{ pin_compatible("librmm", upper_bound="x.x") }}
         - ucx >=1.18.0,<1.21.0
       run_exports:
         - ${{ pin_subpackage("libucxx", upper_bound="x.x") }}
@@ -112,7 +110,6 @@ outputs:
         by_name:
           - cuda-cudart
           - cuda-version
-          - librmm
           - ucx
     tests:
       - script:
@@ -173,7 +170,6 @@ outputs:
         by_name:
           - cuda-cudart
           - cuda-version
-          - librmm
           - libucxx
           - ucx
     about:
diff --git a/conda/recipes/ucxx/recipe.yaml b/conda/recipes/ucxx/recipe.yaml
index 02ddcce1b..f1e5b2744 100644
--- a/conda/recipes/ucxx/recipe.yaml
+++ b/conda/recipes/ucxx/recipe.yaml
@@ -93,7 +93,6 @@ outputs:
         - python =${{ py_abi_min }}
         - python-abi3 ${{ py_abi_min }}.*
         - rapids-build-backend >=0.4.0,<0.5.0
-        - rmm ${{ rapids_version }}
         - scikit-build-core>=0.11.0
         - ucx
         - libucxx =${{ version }}
@@ -106,7 +105,6 @@ outputs:
         - python
         - ucx >=1.18.0,<1.21.0
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
-        - ${{ pin_compatible("rmm", upper_bound="x.x") }}
         - libucxx =${{ version }}
       run_constraints:
         - cupy >=13.6.0
@@ -114,7 +112,6 @@ outputs:
         by_name:
           - cuda-cudart
           - cuda-version
-          - librmm
           - libucxx
           - python_abi
           - ucx

From 50820706488a3b067a2c244f28a45fe462822b03 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 19 May 2026 13:04:39 -0700
Subject: [PATCH 3/6] Update README with CCCL info

---
 README.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index b526260c1..d9ecc1a86 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Additionally, there is a `./build_and_run.sh` script that will call `./build.sh`
 
 ### C++
 
-To build and install C++ library to `${CONDA_PREFIX}`, with both Python and RMM support, as well as building all tests and benchmarks (with CUDA support) run:
+To build and install the C++ library to `${CONDA_PREFIX}`, with Python support and CCCL CUDA buffer support, as well as building all tests and benchmarks with CUDA/CCCL support, run:
 
 ```
 mkdir cpp/build
@@ -53,8 +53,9 @@ cmake .. -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} \
       -DBUILD_BENCHMARKS=ON \
       -DCMAKE_BUILD_TYPE=Release \
       -DUCXX_ENABLE_PYTHON=ON \
-      -DUCXX_ENABLE_RMM=ON \
-      -DUCXX_BENCHMARKS_ENABLE_CUDA=ON
+      -DUCXX_ENABLE_CCCL=ON \
+      -DUCXX_BENCHMARKS_ENABLE_CUDA=ON \
+      -DUCXX_BENCHMARKS_ENABLE_CCCL=ON
 make -j install
 ```
 
@@ -122,7 +123,8 @@ It is recommended to use `UCX_TCP_CM_REUSEADDR=y` when binding to interfaces wit
 
 #### CCCL Memory Support
 
-When built with `UCXX_ENABLE_CCCL=ON`, additional CCCL-based memory types are available:
+When built with `UCXX_ENABLE_CCCL=ON`, `UCXX_BENCHMARKS_ENABLE_CUDA=ON`, and
+`UCXX_BENCHMARKS_ENABLE_CCCL=ON`, additional CCCL-based memory types are available:
 
 ```
 # Server with CCCL device memory pool
@@ -138,7 +140,7 @@ $ UCX_TCP_CM_REUSEADDR=y ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n
 $ ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n 10 127.0.0.1
 ```
 
-**Additional CCCL Memory Types (with `-DUCXX_ENABLE_CCCL=ON`):**
+**Additional CCCL Memory Types:**
 - `cccl-device` - CCCL device memory pool
 - `cccl-shared` - CCCL shared memory resource
 - `cccl-cuda-async` - CCCL CUDA async memory resource
@@ -146,6 +148,8 @@ $ ./benchmarks/ucxx_perftest -m cccl-shared -s 1048576 -n 10 127.0.0.1
 
 **Requirements for CCCL Support:**
 - UCXX compiled with `UCXX_ENABLE_CCCL=ON`
+- Benchmarks compiled with `UCXX_BENCHMARKS_ENABLE_CUDA=ON`
+- Benchmarks compiled with `UCXX_BENCHMARKS_ENABLE_CCCL=ON`
 - CCCL library available (fetched automatically via CMake)
 
 ### Python

From e8ff9b0e19514d3d88dc581e762c1b3287adb4bc Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 20 May 2026 13:28:37 -0700
Subject: [PATCH 4/6] Add missing cudart_static linking

---
 cpp/tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ab8390102..2b7ced8de 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -29,6 +29,10 @@ function(ConfigureTest CMAKE_TEST_NAME)
     ${CMAKE_TEST_NAME} PRIVATE ucxx GTest::gmock_main GTest::gtest_main
                                $<TARGET_NAME_IF_EXISTS:conda_env>
   )
+  if(UCXX_ENABLE_CCCL)
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(${CMAKE_TEST_NAME} PRIVATE CUDA::cudart_static)
+  endif()
   if(UCXX_ENABLE_RMM)
     target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM)
     target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm)

From 7e8ca764e01eac9fc07966d152beceba40d976e8 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 20 May 2026 13:40:17 -0700
Subject: [PATCH 5/6] Rename RMM CMake flags

---
 build.sh                      |  6 ++++--
 cpp/CMakeLists.txt            | 15 +++++++++++----
 cpp/benchmarks/CMakeLists.txt |  2 +-
 cpp/tests/CMakeLists.txt      |  4 ++--
 cpp/tests/request.cpp         | 22 +++++++++++-----------
 5 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/build.sh b/build.sh
index ead413426..d44e12006 100755
--- a/build.sh
+++ b/build.sh
@@ -56,7 +56,8 @@ BUILD_TESTS=OFF
 BUILD_EXAMPLES=OFF
 BUILD_DISABLE_DEPRECATION_WARNINGS=ON
 BUILD_COMPILE_COMMANDS=OFF
-UCXX_ENABLE_RMM=OFF
+UCXX_TESTS_ENABLE_RMM=OFF
+UCXX_BENCHMARKS_ENABLE_RMM=OFF
 UCXX_ENABLE_CCCL=OFF
 UCXX_BENCHMARKS_ENABLE_CUDA=OFF
 
@@ -187,7 +188,8 @@ if buildAll || hasArg libucxx; then
           -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \
           -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
           -DCMAKE_EXPORT_COMPILE_COMMANDS=${BUILD_COMPILE_COMMANDS} \
-          -DUCXX_ENABLE_RMM=${UCXX_ENABLE_RMM} \
+          -DUCXX_TESTS_ENABLE_RMM=${UCXX_TESTS_ENABLE_RMM} \
+          -DUCXX_BENCHMARKS_ENABLE_RMM=${UCXX_BENCHMARKS_ENABLE_RMM} \
           -DUCXX_ENABLE_CCCL=${UCXX_ENABLE_CCCL} \
           -DUCXX_BENCHMARKS_ENABLE_CUDA=${UCXX_BENCHMARKS_ENABLE_CUDA} \
           "${EXTRA_CMAKE_ARGS[@]}"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 45971ca94..bc7fd2487 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -42,7 +42,8 @@ option(BUILD_TESTS "Configure CMake to build tests" ON)
 option(BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF)
 option(BUILD_EXAMPLES "Configure CMake to build examples" OFF)
 option(BUILD_SHARED_LIBS "Build UCXX shared libraries" ON)
-option(UCXX_ENABLE_RMM "Enable RMM-backed test and benchmark code paths" OFF)
+option(UCXX_TESTS_ENABLE_RMM "Enable RMM-backed test code paths" OFF)
+option(UCXX_BENCHMARKS_ENABLE_RMM "Enable RMM-backed benchmark code paths" OFF)
 # TODO: Flip UCXX_ENABLE_CCCL default to OFF once devcontainer builds pass -DUCXX_ENABLE_CCCL=ON
 option(UCXX_ENABLE_CCCL "Enable support for CUDA buffer with CCCL" ON)
 option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF)
@@ -51,7 +52,8 @@ message(VERBOSE "UCXX: Configure CMake to build tests: ${BUILD_TESTS}")
 message(VERBOSE "UCXX: Configure CMake to build benchmarks: ${BUILD_BENCHMARKS}")
 message(VERBOSE "UCXX: Configure CMake to build examples: ${BUILD_EXAMPLES}")
 message(VERBOSE "UCXX: Build UCXX shared libraries: ${BUILD_SHARED_LIBS}")
-message(VERBOSE "UCXX: Enable RMM-backed test and benchmark code paths: ${UCXX_ENABLE_RMM}")
+message(VERBOSE "UCXX: Enable RMM-backed test code paths: ${UCXX_TESTS_ENABLE_RMM}")
+message(VERBOSE "UCXX: Enable RMM-backed benchmark code paths: ${UCXX_BENCHMARKS_ENABLE_RMM}")
 message(VERBOSE "UCXX: Enable support for CUDA buffer with CCCL: ${UCXX_ENABLE_CCCL}")
 message(
   VERBOSE
@@ -94,8 +96,13 @@ rapids_find_package(
 
 # add third party dependencies using CPM
 rapids_cpm_init()
-# find rmm for tests and benchmarks
-if(UCXX_ENABLE_RMM)
+# find rmm for enabled test and benchmark code paths
+if((BUILD_TESTS AND UCXX_TESTS_ENABLE_RMM)
+   OR (BUILD_BENCHMARKS
+       AND UCXX_BENCHMARKS_ENABLE_CUDA
+       AND UCXX_BENCHMARKS_ENABLE_RMM
+      )
+)
   include(cmake/thirdparty/get_rmm.cmake)
 endif()
 # find cccl
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 3dbfc4936..bdea55ded 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -46,7 +46,7 @@ function(ConfigureBench CMAKE_BENCH_NAME)
   endif()
 
   # RMM memory resources for CUDA benchmarks.
-  if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_ENABLE_RMM)
+  if(UCXX_BENCHMARKS_ENABLE_CUDA AND UCXX_BENCHMARKS_ENABLE_RMM)
     target_compile_definitions(${CMAKE_BENCH_NAME} PRIVATE UCXX_BENCHMARKS_ENABLE_RMM)
     target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE rmm::rmm)
   endif()
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 2b7ced8de..5b936a875 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -33,8 +33,8 @@ function(ConfigureTest CMAKE_TEST_NAME)
     find_package(CUDAToolkit REQUIRED)
     target_link_libraries(${CMAKE_TEST_NAME} PRIVATE CUDA::cudart_static)
   endif()
-  if(UCXX_ENABLE_RMM)
-    target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_ENABLE_RMM)
+  if(UCXX_TESTS_ENABLE_RMM)
+    target_compile_definitions(${CMAKE_TEST_NAME} PRIVATE UCXX_TESTS_ENABLE_RMM)
     target_link_libraries(${CMAKE_TEST_NAME} PRIVATE rmm::rmm)
   endif()
   add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp
index afe047dd4..ac38a1852 100644
--- a/cpp/tests/request.cpp
+++ b/cpp/tests/request.cpp
@@ -22,11 +22,11 @@
 #include "ucxx/constructors.h"
 #include "ucxx/utils/ucx.h"
 
-#ifndef UCXX_ENABLE_RMM
-#define UCXX_ENABLE_RMM 0
+#ifndef UCXX_TESTS_ENABLE_RMM
+#define UCXX_TESTS_ENABLE_RMM 0
 #endif
 
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
 #include <rmm/device_buffer.hpp>
 #endif
 
@@ -50,7 +50,7 @@ enum class TestBufferType {
 
 bool isCudaBufferType(TestBufferType bufferType) { return bufferType != TestBufferType::Host; }
 
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
 class RMMTestBuffer : public ucxx::Buffer {
  private:
   std::unique_ptr<rmm::device_buffer> _buffer;
@@ -139,8 +139,8 @@ class RequestTest
              _messageLength) = GetParam();
 
     if (_bufferType == TestBufferType::RMM) {
-#if !UCXX_ENABLE_RMM
-      GTEST_SKIP() << "UCXX was not built with RMM support";
+#if !UCXX_TESTS_ENABLE_RMM
+      GTEST_SKIP() << "UCXX tests were not built with RMM support";
 #endif
     }
 
@@ -187,7 +187,7 @@ class RequestTest
       if (_bufferType == TestBufferType::Host) {
         _sendBuffer[i] = std::make_unique<ucxx::HostBuffer>(_messageSize);
         if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<ucxx::HostBuffer>(_messageSize);
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
       } else if (_bufferType == TestBufferType::RMM) {
         _sendBuffer[i] = std::make_unique<RMMTestBuffer>(_messageSize);
         if (allocateRecvBuffer) _recvBuffer[i] = std::make_unique<RMMTestBuffer>(_messageSize);
@@ -204,7 +204,7 @@ class RequestTest
       _sendPtr[i] = _sendBuffer[i]->data();
       if (allocateRecvBuffer) _recvPtr[i] = _recvBuffer[i]->data();
     }
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
     if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
 #endif
     if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); }
@@ -214,7 +214,7 @@ class RequestTest
   {
     for (size_t i = 0; i < _numBuffers; ++i)
       copyMemoryTypeAware(_recv[i].data(), _recvPtr[i], _messageSize, false);
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
     if (_bufferType == TestBufferType::RMM) { rmm::cuda_stream_default.synchronize(); }
 #endif
     if (_bufferType == TestBufferType::CCCL) { cudaStreamSynchronize(nullptr); }
@@ -224,7 +224,7 @@ class RequestTest
   {
     if (_memoryType == UCS_MEMORY_TYPE_HOST) {
       memcpy(dst, src, size);
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
     } else if (_memoryType == UCS_MEMORY_TYPE_CUDA && _bufferType == TestBufferType::RMM) {
       RMM_CUDA_TRY(
         cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, rmm::cuda_stream_default.value()));
@@ -1146,7 +1146,7 @@ INSTANTIATE_TEST_SUITE_P(DelayedSubmission,
                                  Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking),
                                  Values(0, 1, 1024, 2048, 1048576)));
 
-#if UCXX_ENABLE_RMM
+#if UCXX_TESTS_ENABLE_RMM
 INSTANTIATE_TEST_SUITE_P(RMMProgressModes,
                          RequestTest,
                          Combine(Values(TestBufferType::RMM),

From 7afe52eddf50135d7c192adf4cb7ccc55470851d Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 20 May 2026 23:54:14 -0700
Subject: [PATCH 6/6] More cudart_static linkage

---
 python/ucxx/ucxx/_lib/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/ucxx/ucxx/_lib/CMakeLists.txt b/python/ucxx/ucxx/_lib/CMakeLists.txt
index 68922afce..21c033b9b 100644
--- a/python/ucxx/ucxx/_lib/CMakeLists.txt
+++ b/python/ucxx/ucxx/_lib/CMakeLists.txt
@@ -1,12 +1,13 @@
 # =================================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 # cmake-format: on
 # =================================================================================
 
 set(cython_sources arr.pyx libucxx.pyx)
-set(linked_libraries ucxx::ucxx ucxx::python)
+find_package(CUDAToolkit REQUIRED)
+set(linked_libraries ucxx::ucxx ucxx::python CUDA::cudart_static)
 
 rapids_cython_create_modules(
   CXX