From d4b67948d99becc5aa81f7242193c9069b40bedc Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 18:17:15 +0000 Subject: [PATCH 1/6] cuda: compatibility with cuda >= 13.0.0 --- src/libpsc/cuda/cuda_base.cxx | 60 ++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/src/libpsc/cuda/cuda_base.cxx b/src/libpsc/cuda/cuda_base.cxx index 93f28c6b7..50e54e619 100644 --- a/src/libpsc/cuda/cuda_base.cxx +++ b/src/libpsc/cuda/cuda_base.cxx @@ -33,6 +33,49 @@ static track_mr_type* track_mr; static pool_mr_type* pool_mr; #endif +namespace +{ +struct DevicePropsCompat +{ + int clock_rate; + int concurrent_kernels; + int kernel_exec_timeout; + int integrated; + int can_map_host_memory; + int compute_mode; +}; + +int cuda_device_attr(int dev, cudaDeviceAttr attr) +{ + int value = 0; + cudaError_t ierr = cudaDeviceGetAttribute(&value, attr, dev); + if (ierr != cudaSuccess) { + return 0; + } + return value; +} + +DevicePropsCompat get_device_props_compat(const hipDeviceProp_t& deviceProp, + int dev) +{ +#if CUDART_VERSION >= 13000 + return {cuda_device_attr(dev, cudaDevAttrClockRate), + cuda_device_attr(dev, cudaDevAttrConcurrentKernels), + cuda_device_attr(dev, cudaDevAttrKernelExecTimeout), + cuda_device_attr(dev, cudaDevAttrIntegrated), + cuda_device_attr(dev, cudaDevAttrCanMapHostMemory), + cuda_device_attr(dev, cudaDevAttrComputeMode)}; +#else + return {deviceProp.clockRate, + deviceProp.deviceOverlap, + deviceProp.kernelExecTimeoutEnabled, + deviceProp.integrated, + deviceProp.canMapHostMemory, + deviceProp.computeMode}; +#endif +} +} // namespace + void cuda_base_init(void) { static bool first_time = true; @@ -79,6 +122,7 @@ void cuda_base_init(void) for (int dev = 0; dev < deviceCount; ++dev) { hipDeviceProp_t deviceProp; hipGetDeviceProperties(&deviceProp, dev); + auto compat = get_device_props_compat(deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no @@ -124,25 +168,25 @@ void cuda_base_init(void) printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment); printf(" Clock rate: %.2f GHz\n", - deviceProp.clockRate * 1e-6f); + compat.clock_rate * 1e-6f); #if CUDART_VERSION >= 2000 printf(" Concurrent copy and execution: %s\n", - deviceProp.deviceOverlap ? "Yes" : "No"); + compat.concurrent_kernels ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); + compat.kernel_exec_timeout ? "Yes" : "No"); printf(" Integrated: %s\n", - deviceProp.integrated ? "Yes" : "No"); + compat.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", - deviceProp.canMapHostMemory ? "Yes" : "No"); + compat.can_map_host_memory ? "Yes" : "No"); printf( " Compute mode: %s\n", - deviceProp.computeMode == hipComputeModeDefault + compat.compute_mode == hipComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" - : deviceProp.computeMode == hipComputeModeExclusive + : compat.compute_mode == hipComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" - : deviceProp.computeMode == hipComputeModeProhibited + : compat.compute_mode == hipComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif From eb54b111e842190734c49677d57712a34aa23c50 Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 18:19:52 +0000 Subject: [PATCH 2/6] cuda: get rid of unused cuda_base.cu --- src/libpsc/cuda/cuda_base.cu | 178 ----------------------------------- 1 file changed, 178 deletions(-) delete mode 100644 src/libpsc/cuda/cuda_base.cu diff --git a/src/libpsc/cuda/cuda_base.cu b/src/libpsc/cuda/cuda_base.cu deleted file mode 100644 index 9d2e4a83b..000000000 --- a/src/libpsc/cuda/cuda_base.cu +++ /dev/null @@ -1,178 +0,0 @@ - -#include "PscConfig.h" -#include "cuda_base.cuh" - -#ifdef PSC_HAVE_RMM -#include -#include -#include -#include -#endif - -#include -#include -#include -#include - -std::size_t mem_particles; -std::size_t mem_randomize_sort; -std::size_t mem_sort_by_block; -std::size_t mem_bnd; -std::size_t mem_heating; -std::size_t mem_collisions; -std::size_t mem_bndp; -std::size_t mem_rnd; - -#ifdef PSC_HAVE_RMM -using device_mr_type = rmm::mr::device_memory_resource; -using pool_mr_type = rmm::mr::pool_memory_resource; -using track_mr_type = rmm::mr::tracking_resource_adaptor; -using log_mr_type = rmm::mr::logging_resource_adaptor; - -static track_mr_type* track_mr; -static pool_mr_type* pool_mr; -#endif - -void cuda_base_init(void) -{ - static bool first_time = true; - if (!first_time) - return; - - first_time = false; - -#ifdef PSC_HAVE_RMM - rmm::logger().set_level(spdlog::level::trace); - - device_mr_type* mr = - rmm::mr::get_current_device_resource(); // Points to `cuda_memory_resource` - static log_mr_type _log_mr{mr, std::cout, true}; - static pool_mr_type pool_mr{&_log_mr, 15000000000}; - static track_mr_type track_mr{&pool_mr}; -#if 0 - static log_mr_type log_mr{&track_mr, std::cout, true}; - rmm::mr::set_current_device_resource(&log_mr); -#else - rmm::mr::set_current_device_resource(&track_mr); -#endif - ::pool_mr = &pool_mr; - ::track_mr = &track_mr; -#endif - - int deviceCount; - cudaGetDeviceCount(&deviceCount); - - // This function call returns 0 if there are no CUDA capable devices. - if (deviceCount == 0) { - printf("There is no device supporting CUDA\n"); - return; - } - - get_rng_state().resize(131072); - - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank != 0) { - return; - } - - for (int dev = 0; dev < deviceCount; ++dev) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - - if (dev == 0) { - // This function call returns 9999 for both major & minor fields, if no - // CUDA capable devices are present - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - printf("There is no device supporting CUDA.\n"); - else if (deviceCount == 1) - printf("There is 1 device supporting CUDA\n"); - else - printf("There are %d devices supporting CUDA\n", deviceCount); - } - printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); - printf(" CUDA Capability Major revision number: %d\n", - deviceProp.major); - printf(" CUDA Capability Minor revision number: %d\n", - deviceProp.minor); - printf(" Total amount of global memory: %lu bytes\n", - deviceProp.totalGlobalMem); -#if CUDART_VERSION >= 2000 - printf(" Number of multiprocessors: %d\n", - deviceProp.multiProcessorCount); - printf(" Number of cores: %d\n", - 8 * deviceProp.multiProcessorCount); -#endif - printf(" Total amount of constant memory: %lu bytes\n", - deviceProp.totalConstMem); - printf(" Total amount of shared memory per block: %lu bytes\n", - deviceProp.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", - deviceProp.regsPerBlock); - printf(" Warp size: %d\n", - deviceProp.warpSize); - printf(" Maximum number of threads per block: %d\n", - deviceProp.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", - deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], - deviceProp.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", - deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], - deviceProp.maxGridSize[2]); - printf(" Maximum memory pitch: %lu bytes\n", - deviceProp.memPitch); - printf(" Texture alignment: %lu bytes\n", - deviceProp.textureAlignment); - printf(" Clock rate: %.2f GHz\n", - deviceProp.clockRate * 1e-6f); -#if CUDART_VERSION >= 2000 - printf(" Concurrent copy and execution: %s\n", - deviceProp.deviceOverlap ? "Yes" : "No"); -#endif -#if CUDART_VERSION >= 2020 - printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated: %s\n", - deviceProp.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", - deviceProp.canMapHostMemory ? "Yes" : "No"); - printf( - " Compute mode: %s\n", - deviceProp.computeMode == cudaComputeModeDefault - ? "Default (multiple host threads can use this device simultaneously)" - : deviceProp.computeMode == cudaComputeModeExclusive - ? "Exclusive (only one host thread at a time can use this device)" - : deviceProp.computeMode == cudaComputeModeProhibited - ? "Prohibited (no host thread can use this device)" - : "Unknown"); -#endif - } -} - -std::size_t mem_cuda_allocated() -{ -#ifdef PSC_HAVE_RMM - if (track_mr) { - return track_mr->get_allocated_bytes(); - } else { - return 0; - } -#else - return 0; -#endif -} - -RngStateCuda& get_rng_state() -{ - static RngStateCuda rng_state; - return rng_state; -} - -void mem_pool_print() -{ -#if 0 // needs hacked RMM to make print() accessible - if (pool_mr) { - pool_mr->print(); - } -#endif -} From 5595cc53ac86b136c6f04e9c29c12c51e56aaa2e Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 19:03:55 +0000 Subject: [PATCH 3/6] cuda: fix copy&paste error in marder_impl.hxx --- src/libpsc/psc_push_fields/marder_impl.hxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libpsc/psc_push_fields/marder_impl.hxx b/src/libpsc/psc_push_fields/marder_impl.hxx index 2dc4fb50a..a65dc1df9 100644 --- a/src/libpsc/psc_push_fields/marder_impl.hxx +++ b/src/libpsc/psc_push_fields/marder_impl.hxx @@ -64,7 +64,7 @@ inline void correct(const Grid_t& grid, E1& efield, const Int3& efield_ib, template inline void cuda_marder_correct_yz(E1& efield, E2& res, Float3 fac, Int3 l, - Int3 r, Int3 l, Int3 r) + Int3 r) { auto k_efield = efield.to_kernel(); auto k_res = res.to_kernel(); From 577d355059978787678117be8d7ae0c98d9bd50c Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 19:06:56 +0000 Subject: [PATCH 4/6] cuda: don't build psc_shock with cuda --- src/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3c42b99f7..c0af10095 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ endmacro(add_psc_executable) if(NOT USE_CUDA) add_psc_executable(psc_bgk) + add_psc_executable(psc_shock) endif() add_psc_executable(psc_bubble_yz) @@ -19,16 +20,15 @@ add_psc_executable(psc_whistler) add_psc_executable(psc_harris_yz) add_psc_executable(psc_2d_shock) add_psc_executable(psc_radiation) -add_psc_executable(psc_shock) if(NOT USE_CUDA) install( - TARGETS psc_bgk + TARGETS psc_bgk psc_shock RUNTIME DESTINATION bin ) endif() install( - TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler psc_shock + TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler RUNTIME DESTINATION bin ) From c240675992f20a6b7f97648ccd34593fdb444baf Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 19:26:53 +0000 Subject: [PATCH 5/6] cuda: thrust::zip_iterator fix for cuda13 --- src/libpsc/cuda/cuda_mparticles.hxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libpsc/cuda/cuda_mparticles.hxx b/src/libpsc/cuda/cuda_mparticles.hxx index 38c4ba128..59be06ae0 100644 --- a/src/libpsc/cuda/cuda_mparticles.hxx +++ b/src/libpsc/cuda/cuda_mparticles.hxx @@ -138,12 +138,12 @@ public: __host__ __device__ iterator begin() { - return iterator({xi4.begin(), pxi4.begin()}); + return iterator(thrust::make_tuple(xi4.begin(), pxi4.begin())); } __host__ __device__ iterator end() { - return iterator({xi4.end(), pxi4.end()}); + return iterator(thrust::make_tuple(xi4.end(), pxi4.end())); } __host__ void resize(size_t n) From 261eec30b3cc8be450096ba7b0cb92838ee0345f Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 26 May 2026 19:40:24 +0000 Subject: [PATCH 6/6] cuda: include missing header --- src/libpsc/psc_output_fields/fields_item_moments_1st.hxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx index bb0492c16..73ab4859f 100644 --- a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx +++ b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx @@ -1,6 +1,7 @@ #pragma once +#include #include #include "fields_item.hxx"