From d4b67948d99becc5aa81f7242193c9069b40bedc Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 18:17:15 +0000
Subject: [PATCH 1/6] cuda: compatibility with cuda >= 13.0.0

---
 src/libpsc/cuda/cuda_base.cxx | 60 ++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/libpsc/cuda/cuda_base.cxx b/src/libpsc/cuda/cuda_base.cxx
index 93f28c6b7..50e54e619 100644
--- a/src/libpsc/cuda/cuda_base.cxx
+++ b/src/libpsc/cuda/cuda_base.cxx
@@ -33,6 +33,49 @@ static track_mr_type* track_mr;
 static pool_mr_type* pool_mr;
 #endif
 
+namespace
+{
+struct DevicePropsCompat
+{
+  int clock_rate;
+  int concurrent_kernels;
+  int kernel_exec_timeout;
+  int integrated;
+  int can_map_host_memory;
+  int compute_mode;
+};
+
+int cuda_device_attr(int dev, cudaDeviceAttr attr)
+{
+  int value = 0;
+  cudaError_t ierr = cudaDeviceGetAttribute(&value, attr, dev);
+  if (ierr != cudaSuccess) {
+    return 0;
+  }
+  return value;
+}
+
+DevicePropsCompat get_device_props_compat(const hipDeviceProp_t& deviceProp,
+                                          int dev)
+{
+#if CUDART_VERSION >= 13000
+  return {cuda_device_attr(dev, cudaDevAttrClockRate),
+          cuda_device_attr(dev, cudaDevAttrConcurrentKernels),
+          cuda_device_attr(dev, cudaDevAttrKernelExecTimeout),
+          cuda_device_attr(dev, cudaDevAttrIntegrated),
+          cuda_device_attr(dev, cudaDevAttrCanMapHostMemory),
+          cuda_device_attr(dev, cudaDevAttrComputeMode)};
+#else
+  return {deviceProp.clockRate,
+          deviceProp.deviceOverlap,
+          deviceProp.kernelExecTimeoutEnabled,
+          deviceProp.integrated,
+          deviceProp.canMapHostMemory,
+          deviceProp.computeMode};
+#endif
+}
+} // namespace
+
 void cuda_base_init(void)
 {
   static bool first_time = true;
@@ -79,6 +122,7 @@ void cuda_base_init(void)
   for (int dev = 0; dev < deviceCount; ++dev) {
     hipDeviceProp_t deviceProp;
     hipGetDeviceProperties(&deviceProp, dev);
+    auto compat = get_device_props_compat(deviceProp, dev);
 
     if (dev == 0) {
       // This function call returns 9999 for both major & minor fields, if no
@@ -124,25 +168,25 @@ void cuda_base_init(void)
     printf("  Texture alignment:                             %lu bytes\n",
            deviceProp.textureAlignment);
     printf("  Clock rate:                                    %.2f GHz\n",
-           deviceProp.clockRate * 1e-6f);
+           compat.clock_rate * 1e-6f);
 #if CUDART_VERSION >= 2000
     printf("  Concurrent copy and execution:                 %s\n",
-           deviceProp.deviceOverlap ? "Yes" : "No");
+           compat.concurrent_kernels ? "Yes" : "No");
 #endif
 #if CUDART_VERSION >= 2020
     printf("  Run time limit on kernels:                     %s\n",
-           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
+           compat.kernel_exec_timeout ? "Yes" : "No");
     printf("  Integrated:                                    %s\n",
-           deviceProp.integrated ? "Yes" : "No");
+           compat.integrated ? "Yes" : "No");
     printf("  Support host page-locked memory mapping:       %s\n",
-           deviceProp.canMapHostMemory ? "Yes" : "No");
+           compat.can_map_host_memory ? "Yes" : "No");
     printf(
       "  Compute mode:                                  %s\n",
-      deviceProp.computeMode == hipComputeModeDefault
+      compat.compute_mode == hipComputeModeDefault
         ? "Default (multiple host threads can use this device simultaneously)"
-      : deviceProp.computeMode == hipComputeModeExclusive
+      : compat.compute_mode == hipComputeModeExclusive
         ? "Exclusive (only one host thread at a time can use this device)"
-      : deviceProp.computeMode == hipComputeModeProhibited
+      : compat.compute_mode == hipComputeModeProhibited
         ? "Prohibited (no host thread can use this device)"
         : "Unknown");
 #endif

From eb54b111e842190734c49677d57712a34aa23c50 Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 18:19:52 +0000
Subject: [PATCH 2/6] cuda: get rid of unused cuda_base.cu

---
 src/libpsc/cuda/cuda_base.cu | 178 -----------------------------------
 1 file changed, 178 deletions(-)
 delete mode 100644 src/libpsc/cuda/cuda_base.cu

diff --git a/src/libpsc/cuda/cuda_base.cu b/src/libpsc/cuda/cuda_base.cu
deleted file mode 100644
index 9d2e4a83b..000000000
--- a/src/libpsc/cuda/cuda_base.cu
+++ /dev/null
@@ -1,178 +0,0 @@
-
-#include "PscConfig.h"
-#include "cuda_base.cuh"
-
-#ifdef PSC_HAVE_RMM
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/logging_resource_adaptor.hpp>
-#include <rmm/mr/device/tracking_resource_adaptor.hpp>
-#endif
-
-#include <cstdio>
-#include <cassert>
-#include <cuda_bits.h>
-#include <mrc_common.h>
-
-std::size_t mem_particles;
-std::size_t mem_randomize_sort;
-std::size_t mem_sort_by_block;
-std::size_t mem_bnd;
-std::size_t mem_heating;
-std::size_t mem_collisions;
-std::size_t mem_bndp;
-std::size_t mem_rnd;
-
-#ifdef PSC_HAVE_RMM
-using device_mr_type = rmm::mr::device_memory_resource;
-using pool_mr_type = rmm::mr::pool_memory_resource<device_mr_type>;
-using track_mr_type = rmm::mr::tracking_resource_adaptor<device_mr_type>;
-using log_mr_type = rmm::mr::logging_resource_adaptor<device_mr_type>;
-
-static track_mr_type* track_mr;
-static pool_mr_type* pool_mr;
-#endif
-
-void cuda_base_init(void)
-{
-  static bool first_time = true;
-  if (!first_time)
-    return;
-
-  first_time = false;
-
-#ifdef PSC_HAVE_RMM
-  rmm::logger().set_level(spdlog::level::trace);
-
-  device_mr_type* mr =
-    rmm::mr::get_current_device_resource(); // Points to `cuda_memory_resource`
-  static log_mr_type _log_mr{mr, std::cout, true};
-  static pool_mr_type pool_mr{&_log_mr, 15000000000};
-  static track_mr_type track_mr{&pool_mr};
-#if 0
-  static log_mr_type log_mr{&track_mr, std::cout, true};
-  rmm::mr::set_current_device_resource(&log_mr);
-#else
-  rmm::mr::set_current_device_resource(&track_mr);
-#endif
-  ::pool_mr = &pool_mr;
-  ::track_mr = &track_mr;
-#endif
-
-  int deviceCount;
-  cudaGetDeviceCount(&deviceCount);
-
-  // This function call returns 0 if there are no CUDA capable devices.
-  if (deviceCount == 0) {
-    printf("There is no device supporting CUDA\n");
-    return;
-  }
-
-  get_rng_state().resize(131072);
-
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  if (rank != 0) {
-    return;
-  }
-
-  for (int dev = 0; dev < deviceCount; ++dev) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, dev);
-
-    if (dev == 0) {
-      // This function call returns 9999 for both major & minor fields, if no
-      // CUDA capable devices are present
-      if (deviceProp.major == 9999 && deviceProp.minor == 9999)
-        printf("There is no device supporting CUDA.\n");
-      else if (deviceCount == 1)
-        printf("There is 1 device supporting CUDA\n");
-      else
-        printf("There are %d devices supporting CUDA\n", deviceCount);
-    }
-    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
-    printf("  CUDA Capability Major revision number:         %d\n",
-           deviceProp.major);
-    printf("  CUDA Capability Minor revision number:         %d\n",
-           deviceProp.minor);
-    printf("  Total amount of global memory:                 %lu bytes\n",
-           deviceProp.totalGlobalMem);
-#if CUDART_VERSION >= 2000
-    printf("  Number of multiprocessors:                     %d\n",
-           deviceProp.multiProcessorCount);
-    printf("  Number of cores:                               %d\n",
-           8 * deviceProp.multiProcessorCount);
-#endif
-    printf("  Total amount of constant memory:               %lu bytes\n",
-           deviceProp.totalConstMem);
-    printf("  Total amount of shared memory per block:       %lu bytes\n",
-           deviceProp.sharedMemPerBlock);
-    printf("  Total number of registers available per block: %d\n",
-           deviceProp.regsPerBlock);
-    printf("  Warp size:                                     %d\n",
-           deviceProp.warpSize);
-    printf("  Maximum number of threads per block:           %d\n",
-           deviceProp.maxThreadsPerBlock);
-    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
-           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
-           deviceProp.maxThreadsDim[2]);
-    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
-           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
-           deviceProp.maxGridSize[2]);
-    printf("  Maximum memory pitch:                          %lu bytes\n",
-           deviceProp.memPitch);
-    printf("  Texture alignment:                             %lu bytes\n",
-           deviceProp.textureAlignment);
-    printf("  Clock rate:                                    %.2f GHz\n",
-           deviceProp.clockRate * 1e-6f);
-#if CUDART_VERSION >= 2000
-    printf("  Concurrent copy and execution:                 %s\n",
-           deviceProp.deviceOverlap ? "Yes" : "No");
-#endif
-#if CUDART_VERSION >= 2020
-    printf("  Run time limit on kernels:                     %s\n",
-           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
-    printf("  Integrated:                                    %s\n",
-           deviceProp.integrated ? "Yes" : "No");
-    printf("  Support host page-locked memory mapping:       %s\n",
-           deviceProp.canMapHostMemory ? "Yes" : "No");
-    printf(
-      "  Compute mode:                                  %s\n",
-      deviceProp.computeMode == cudaComputeModeDefault
-        ? "Default (multiple host threads can use this device simultaneously)"
-      : deviceProp.computeMode == cudaComputeModeExclusive
-        ? "Exclusive (only one host thread at a time can use this device)"
-      : deviceProp.computeMode == cudaComputeModeProhibited
-        ? "Prohibited (no host thread can use this device)"
-        : "Unknown");
-#endif
-  }
-}
-
-std::size_t mem_cuda_allocated()
-{
-#ifdef PSC_HAVE_RMM
-  if (track_mr) {
-    return track_mr->get_allocated_bytes();
-  } else {
-    return 0;
-  }
-#else
-  return 0;
-#endif
-}
-
-RngStateCuda& get_rng_state()
-{
-  static RngStateCuda rng_state;
-  return rng_state;
-}
-
-void mem_pool_print()
-{
-#if 0 // needs hacked RMM to make print() accessible
-  if (pool_mr) {
-    pool_mr->print();
-  }
-#endif
-}

From 5595cc53ac86b136c6f04e9c29c12c51e56aaa2e Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 19:03:55 +0000
Subject: [PATCH 3/6] cuda: fix copy&paste error in marder_impl.hxx

---
 src/libpsc/psc_push_fields/marder_impl.hxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libpsc/psc_push_fields/marder_impl.hxx b/src/libpsc/psc_push_fields/marder_impl.hxx
index 2dc4fb50a..a65dc1df9 100644
--- a/src/libpsc/psc_push_fields/marder_impl.hxx
+++ b/src/libpsc/psc_push_fields/marder_impl.hxx
@@ -64,7 +64,7 @@ inline void correct(const Grid_t& grid, E1& efield, const Int3& efield_ib,
 
 template <typename E1, typename E2>
 inline void cuda_marder_correct_yz(E1& efield, E2& res, Float3 fac, Int3 l,
-                                   Int3 r, Int3 l, Int3 r)
+                                   Int3 r)
 {
   auto k_efield = efield.to_kernel();
   auto k_res = res.to_kernel();

From 577d355059978787678117be8d7ae0c98d9bd50c Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 19:06:56 +0000
Subject: [PATCH 4/6] cuda: don't build psc_shock with cuda

---
 src/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3c42b99f7..c0af10095 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ endmacro(add_psc_executable)
 
 if(NOT USE_CUDA)
   add_psc_executable(psc_bgk)
+  add_psc_executable(psc_shock)
 endif()
 
 add_psc_executable(psc_bubble_yz)
@@ -19,16 +20,15 @@ add_psc_executable(psc_whistler)
 add_psc_executable(psc_harris_yz)
 add_psc_executable(psc_2d_shock)
 add_psc_executable(psc_radiation)
-add_psc_executable(psc_shock)
 
 if(NOT USE_CUDA)
   install(
-    TARGETS psc_bgk
+    TARGETS psc_bgk psc_shock
     RUNTIME DESTINATION bin
   )
 endif()
 
 install(
-  TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler psc_shock
+  TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler
   RUNTIME DESTINATION bin
 )

From c240675992f20a6b7f97648ccd34593fdb444baf Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 19:26:53 +0000
Subject: [PATCH 5/6] cuda: thrust::zip_iterator fix for cuda13

---
 src/libpsc/cuda/cuda_mparticles.hxx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libpsc/cuda/cuda_mparticles.hxx b/src/libpsc/cuda/cuda_mparticles.hxx
index 38c4ba128..59be06ae0 100644
--- a/src/libpsc/cuda/cuda_mparticles.hxx
+++ b/src/libpsc/cuda/cuda_mparticles.hxx
@@ -138,12 +138,12 @@ public:
 
   __host__ __device__ iterator begin()
   {
-    return iterator({xi4.begin(), pxi4.begin()});
+    return iterator(thrust::make_tuple(xi4.begin(), pxi4.begin()));
   }
 
   __host__ __device__ iterator end()
   {
-    return iterator({xi4.end(), pxi4.end()});
+    return iterator(thrust::make_tuple(xi4.end(), pxi4.end()));
   }
 
   __host__ void resize(size_t n)

From 261eec30b3cc8be450096ba7b0cb92838ee0345f Mon Sep 17 00:00:00 2001
From: Kai Germaschewski <kai.germaschewski@gmail.com>
Date: Tue, 26 May 2026 19:40:24 +0000
Subject: [PATCH 6/6] cuda: include missing header

---
 src/libpsc/psc_output_fields/fields_item_moments_1st.hxx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx
index bb0492c16..73ab4859f 100644
--- a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx
+++ b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx
@@ -1,6 +1,7 @@
 
 #pragma once
 
+#include <psc_fields_single.h>
 #include <psc/moment.hxx>
 #include "fields_item.hxx"