From 97c5055345af1e18cc37b396af3704034535b55a Mon Sep 17 00:00:00 2001
From: Ruochun <ruochunz@gmail.com>
Date: Wed, 14 Jan 2026 22:46:45 +0800
Subject: [PATCH 01/17] Support CUDA13

---
 cmake/CudaSupportedArchitectures.cmake |  7 ++++++-
 src/algorithms/DEMCubWrappers.cu       | 28 +++++++++++++-------------
 2 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/cmake/CudaSupportedArchitectures.cmake b/cmake/CudaSupportedArchitectures.cmake
index b3853ed0..4d163870 100644
--- a/cmake/CudaSupportedArchitectures.cmake
+++ b/cmake/CudaSupportedArchitectures.cmake
@@ -31,7 +31,8 @@ function(cuda_supported_architectures)
 	set(cu10 30 35 50 52 60 61 70 72 75)
 	set(cu11 35 50 52 60 61 70 72 75 80)
 	set(cu11_x 35 50 52 60 61 70 72 75 80 86)
-	set(cu12_x 50 52 60 61 70 72 75 80 86)
+	set(cu12_x 50 52 60 61 70 72 75 80 86 89 120)
+	set(cu13_x 75 80 86 89 90 100 120 121)
 
 	if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 7)
 		set(CUDASUP_ARCHITECTURES ${cu7} CACHE INTERNAL "")
@@ -60,6 +61,10 @@ function(cuda_supported_architectures)
 	if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
 		set(CUDASUP_ARCHITECTURES ${cu12_x} CACHE INTERNAL "")
 	endif()
+	
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
+        set(CUDASUP_ARCHITECTURES ${cu13_x} CACHE INTERNAL "")
+    endif()
 
 	if (NOT DEFINED CUDASUP_ARCHITECTURES)
 		message(SEND_ERROR "[CUDASUP] Could not determine device architectures supported by the CUDA toolkit!")
diff --git a/src/algorithms/DEMCubWrappers.cu b/src/algorithms/DEMCubWrappers.cu
index 4efade16..4ecfcbe5 100644
--- a/src/algorithms/DEMCubWrappers.cu
+++ b/src/algorithms/DEMCubWrappers.cu
@@ -8,6 +8,14 @@
 #include <DEM/Structs.h>
 #include <core/utils/GpuError.h>
 
+#if CUDART_VERSION >= 13000
+    #define DEME_CUB_SUM_OP(T) \
+        cuda::std::plus<T> {}
+#else
+    #define DEME_CUB_SUM_OP(T) \
+        cub::Sum {}
+#endif
+
 namespace deme {
 
 // Functor type for selecting values less than some criteria
@@ -75,10 +83,11 @@ inline void cubDEMPrefixScan(T1* d_in,
     // let you know when it happens. I made a trick: use ExclusiveScan and (T2)0 as the initial value, and this forces
     // cub to store results as T2 type.
     size_t cub_scratch_bytes = 0;
-    cub::DeviceScan::ExclusiveScan(NULL, cub_scratch_bytes, d_in, d_out, cub::Sum(), (T2)0, n, this_stream);
+    cub::DeviceScan::ExclusiveScan(NULL, cub_scratch_bytes, d_in, d_out, DEME_CUB_SUM_OP(T2), (T2)0, n, this_stream);
     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
     void* d_scratch_space = (void*)scratchPad.allocateScratchSpace(cub_scratch_bytes);
-    cub::DeviceScan::ExclusiveScan(d_scratch_space, cub_scratch_bytes, d_in, d_out, cub::Sum(), (T2)0, n, this_stream);
+    cub::DeviceScan::ExclusiveScan(d_scratch_space, cub_scratch_bytes, d_in, d_out, DEME_CUB_SUM_OP(T2), (T2)0, n,
+                                   this_stream);
     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
 }
 
@@ -156,23 +165,14 @@ inline void cubDEMReduceByKeys(T1* d_keys_in,
 template <typename T1, typename T2>
 void cubDEMSum(T1* d_in, T2* d_out, size_t n, cudaStream_t& this_stream, DEMSolverScratchData& scratchPad) {
     size_t cub_scratch_bytes = 0;
-    cub::DeviceReduce::Reduce(NULL, cub_scratch_bytes, d_in, d_out, n, cub::Sum(), (T2)0, this_stream);
+    cub::DeviceReduce::Reduce(NULL, cub_scratch_bytes, d_in, d_out, n, DEME_CUB_SUM_OP(T2), (T2)0, this_stream);
     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
     void* d_scratch_space = (void*)scratchPad.allocateScratchSpace(cub_scratch_bytes);
-    cub::DeviceReduce::Reduce(d_scratch_space, cub_scratch_bytes, d_in, d_out, n, cub::Sum(), (T2)0, this_stream);
+    cub::DeviceReduce::Reduce(d_scratch_space, cub_scratch_bytes, d_in, d_out, n, DEME_CUB_SUM_OP(T2), (T2)0,
+                              this_stream);
     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
 }
 
-// template <typename T1>
-// void cubDEMSum(T1* d_in, T1* d_out, size_t n, cudaStream_t& this_stream, DEMSolverScratchData& scratchPad) {
-//     size_t cub_scratch_bytes = 0;
-//     cub::DeviceReduce::Sum(NULL, cub_scratch_bytes, d_in, d_out, n, this_stream);
-//     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
-//     void* d_scratch_space = (void*)scratchPad.allocateScratchSpace(cub_scratch_bytes);
-//     cub::DeviceReduce::Sum(d_scratch_space, cub_scratch_bytes, d_in, d_out, n, this_stream);
-//     DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
-// }
-
 template <typename T1>
 void cubDEMMax(T1* d_in, T1* d_out, size_t n, cudaStream_t& this_stream, DEMSolverScratchData& scratchPad) {
     size_t cub_scratch_bytes = 0;

From 16a31288c35e74cbd0fda1246d461216a7fc6e99 Mon Sep 17 00:00:00 2001
From: Ruochun Zhang <ruochunz@gmail.com>
Date: Fri, 16 Jan 2026 16:23:29 +0800
Subject: [PATCH 02/17] Fix CUDA 12.8-related issue

---
 cmake/CudaSupportedArchitectures.cmake | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cmake/CudaSupportedArchitectures.cmake b/cmake/CudaSupportedArchitectures.cmake
index 4d163870..54cf7f0c 100644
--- a/cmake/CudaSupportedArchitectures.cmake
+++ b/cmake/CudaSupportedArchitectures.cmake
@@ -20,7 +20,6 @@
 # version of the CUDA Toolkit
 #
 # Minimum CUDA version: 7.0
-# Maximum CUDA version: 11.6
 
 function(cuda_supported_architectures)
 
@@ -31,7 +30,8 @@ function(cuda_supported_architectures)
 	set(cu10 30 35 50 52 60 61 70 72 75)
 	set(cu11 35 50 52 60 61 70 72 75 80)
 	set(cu11_x 35 50 52 60 61 70 72 75 80 86)
-	set(cu12_x 50 52 60 61 70 72 75 80 86 89 120)
+	set(cu12_x 50 52 60 61 70 72 75 80 86 89)
+	set(cu12_8 50 52 60 61 70 72 75 80 86 89 120)
 	set(cu13_x 75 80 86 89 90 100 120 121)
 
 	if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 7)
@@ -61,6 +61,10 @@ function(cuda_supported_architectures)
 	if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
 		set(CUDASUP_ARCHITECTURES ${cu12_x} CACHE INTERNAL "")
 	endif()
+
+	if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+		set(CUDASUP_ARCHITECTURES ${cu12_8} CACHE INTERNAL "")
+	endif()
 	
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
         set(CUDASUP_ARCHITECTURES ${cu13_x} CACHE INTERNAL "")

From 958bfa9f451da0509fb338cb778c4964236dcce9 Mon Sep 17 00:00:00 2001
From: Ruochun Zhang <ruochunz@gmail.com>
Date: Fri, 16 Jan 2026 19:17:32 +0800
Subject: [PATCH 03/17] Add a demo and slightly improve CD

---
 src/DEM/dT.cpp                              |  37 ++---
 src/algorithms/DEMDynamicMisc.cu            |  31 ++--
 src/algorithms/DEMStaticDeviceSubroutines.h |   1 -
 src/demo/CMakeLists.txt                     |   1 +
 src/demo/DEMdemo_DrumCubes.cpp              | 158 ++++++++++++++++++++
 src/kernel/DEMKinematicMisc.cu              |  12 +-
 6 files changed, 199 insertions(+), 41 deletions(-)
 create mode 100644 src/demo/DEMdemo_DrumCubes.cpp

diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 89f71619..f8c16464 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2337,8 +2337,6 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     "uniqueKeys", countPrimitive * sizeof(contactPairs_t));
                 float3* votedWeightedNormals = (float3*)solverScratchSpace.allocateTempVector(
                     "votedWeightedNormals", countPrimitive * sizeof(float3));
-                double* totalAreas =
-                    (double*)solverScratchSpace.allocateTempVector("totalAreas", countPrimitive * sizeof(double));
                 solverScratchSpace.allocateDualStruct("numUniqueKeys");
                 size_t* numUniqueKeys = solverScratchSpace.getDualStructDevice("numUniqueKeys");
 
@@ -2367,22 +2365,14 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                         numUniqueKeysHost, countPatch, contact_type);
                 }
 
-                // Step 3: Reduce-by-key for areas (sum)
-                // Note: CUB's ReduceByKey requires an output array for unique keys, and the keys
-                // are the same as in Step 2.
-                cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, areas, totalAreas, numUniqueKeys,
-                                                          countPrimitive, streamInfo.stream, solverScratchSpace);
-
-                // Step 4: Normalize the voted normals by total area and scatter back to a temp array.
+                // Step 3: Normalize the voted normals by total area and scatter back to a temp array.
                 float3* votedNormals =
                     (float3*)solverScratchSpace.allocateTempVector("votedNormals", countPatch * sizeof(float3));
-                normalizeAndScatterVotedNormals(votedWeightedNormals, totalAreas, votedNormals, countPatch,
-                                                streamInfo.stream);
+                normalizeAndScatterVotedNormals(votedWeightedNormals, votedNormals, countPatch, streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("votedWeightedNormals");
-                solverScratchSpace.finishUsingTempVector("totalAreas");
                 // displayDeviceFloat3(votedNormals, countPatch);
 
-                // Step 5: Compute projected penetration and area for each primitive contact
+                // Step 4: Compute projected penetration and area for each primitive contact
                 // Both the penetration and area are projected onto the voted normal
                 // If the projected penetration becomes negative, both are set to 0
                 // Reuse keys array for the reduce-by-key operation
@@ -2395,14 +2385,14 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                  streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("areas");
 
-                // Step 6: Reduce-by-key to get total projected area per patch pair (sum)
+                // Step 5: Reduce-by-key to get total projected area per patch pair (sum)
                 double* totalProjectedAreas =
                     (double*)solverScratchSpace.allocateTempVector("totalProjectedAreas", countPatch * sizeof(double));
                 cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, projectedAreas, totalProjectedAreas,
                                                           numUniqueKeys, countPrimitive, streamInfo.stream,
                                                           solverScratchSpace);
 
-                // Step 7: Reduce-by-key to get max projected penetration per patch pair (max).
+                // Step 6: Reduce-by-key to get max projected penetration per patch pair (max).
                 // This result, maxProjectedPenetrations, is the max of projected penetration, aka the max pen in the
                 // physical overlap case, and it's not the same as maxPenetrations in step 9 which is a fallback
                 // primitive-derived penetration.
@@ -2412,7 +2402,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                           maxProjectedPenetrations, numUniqueKeys, countPrimitive,
                                                           streamInfo.stream, solverScratchSpace);
 
-                // Step 8: Compute weighted contact points for each primitive (normal case)
+                // Step 7: Compute weighted contact points for each primitive (normal case)
                 // The weight is: projected_penetration * projected_area
                 // Reuse keys, uniqueKeys, and numUniqueKeys that are still allocated
                 double3* weightedContactPoints = (double3*)solverScratchSpace.allocateTempVector(
@@ -2444,16 +2434,16 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 solverScratchSpace.finishUsingTempVector("totalWeightedContactPoints");
                 solverScratchSpace.finishUsingTempVector("totalContactWeights");
 
-                // Step 9: Handle zero-area patches (all primitive areas are 0)
+                // Step 8: Handle zero-area patches (all primitive areas are 0)
                 // For these patches, we need to find the max penetration primitive and use its normal/penetration
 
-                // 9a: Extract primitive penetrations for max-reduce
+                // 8a: Extract primitive penetrations for max-reduce
                 double* primitivePenetrations = (double*)solverScratchSpace.allocateTempVector(
                     "primitivePenetrations", countPrimitive * sizeof(double));
                 extractPrimitivePenetrations(&granData, primitivePenetrations, startOffsetPrimitive, countPrimitive,
                                              streamInfo.stream);
 
-                // 9b: Max-negative-reduce-by-key to get max negative penetration per patch
+                // 8b: Max-negative-reduce-by-key to get max negative penetration per patch
                 // This finds the largest negative value (smallest absolute value among negatives)
                 // Positive values are treated as very negative to indicate invalid/non-physical state
                 double* maxPenetrations =
@@ -2463,7 +2453,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                                   streamInfo.stream, solverScratchSpace);
                 solverScratchSpace.finishUsingTempVector("primitivePenetrations");
 
-                // 9c: Find max-penetration primitives for zero-area patches and extract their normals, penetrations,
+                // 8c: Find max-penetration primitives for zero-area patches and extract their normals, penetrations,
                 // and contact points
                 float3* zeroAreaNormals =
                     (float3*)solverScratchSpace.allocateTempVector("zeroAreaNormals", countPatch * sizeof(float3));
@@ -2476,7 +2466,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     startOffsetPrimitive, startOffsetPatch, countPrimitive, streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("maxPenetrations");
 
-                // Step 9d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
+                // Step 8d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
                 // If no primitive satisfies SAT, the patch contact is non-physical and should use Step 9 fallback
                 notStupidBool_t* patchHasSAT = nullptr;
                 if (contact_type == TRIANGLE_TRIANGLE_CONTACT) {
@@ -2492,7 +2482,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 solverScratchSpace.finishUsingTempVector("uniqueKeys");
                 solverScratchSpace.finishUsingDualStruct("numUniqueKeys");
 
-                // Step 10: Finalize patch results by combining voting with zero-area handling.
+                // Step 9: Finalize patch results by combining voting with zero-area handling.
                 // If patch-based projected area is 0 (or this patch pair consists of no SAT pair), meaning no physical
                 // contact, we use the fallback estimations (zeroArea*) of CP, penetration and areas.
                 double* finalAreas =
@@ -2557,6 +2547,9 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     cubMaxReduce<double>(finalPenetrations.data(), &maxTriTriPenetration, countPatch, streamInfo.stream,
                                          solverScratchSpace);
                     // No toHost() here - keep on device since host never needs it
+                    // maxTriTriPenetration.toHost();
+                    // std::cout << "Max tri-tri penetration after patch-based correction: " << *maxTriTriPenetration
+                    //           << std::endl;
                 }
 
                 // Final clean up
diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index f584b1b2..8179fe47 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -138,9 +138,15 @@ __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
         // Extract the area (double) from contactPointGeometryB (stored as float3)
         float3 areaStorage = granData->contactPointGeometryB[myContactID];
         double area = float3StorageToDouble(areaStorage);
+        float3 penStorage = granData->contactPointGeometryA[myContactID];
+        double penetration = float3StorageToDouble(penStorage);
+        penetration = (penetration > DEME_TINY_FLOAT) ? penetration : DEME_TINY_FLOAT;
+        double recipPen = 1.0 / penetration;
 
         // Compute weighted normal (normal * area)
-        weightedNormals[idx] = make_float3(normal.x * area, normal.y * area, normal.z * area);
+        // Note that fake contacts do not affect as their area is 0
+        weightedNormals[idx] = make_float3((double)normal.x * area * recipPen, (double)normal.y * area * recipPen,
+                                           (double)normal.z * area * recipPen);
 
         // Store area for reduction
         areas[idx] = area;
@@ -170,23 +176,19 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
 // Assumes uniqueKeys are sorted (CUB's ReduceByKey maintains sort order)
 // Uses contactPairs_t keys (geomToPatchMap values)
 __global__ void normalizeAndScatterVotedNormals_impl(float3* votedWeightedNormals,
-                                                     double* totalAreas,
                                                      float3* output,
                                                      contactPairs_t count) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
-        float3 votedNormal = make_float3(0, 0, 0);
-        double totalArea = totalAreas[idx];
-        if (totalArea > 0.0) {
-            // Normalize by dividing by total area (use reciprocal multiplication for efficiency)
-            double invTotalArea = 1.0 / totalArea;
-            votedNormal.x = votedWeightedNormals[idx].x * invTotalArea;
-            votedNormal.y = votedWeightedNormals[idx].y * invTotalArea;
-            votedNormal.z = votedWeightedNormals[idx].z * invTotalArea;
-            // Normalization is needed, as voting by area can destroy unit length
-            votedNormal = normalize(votedNormal);
+        float3 votedNormal = votedWeightedNormals[idx];
+        float len2 = length2(votedNormal);
+        if (len2 > 0.f) {
+            // Normalize votedNormal
+            votedNormal *= rsqrtf(len2);
+        } else {
+            // If total area is 0, set to (0,0,0) to mark no real contact
+            votedNormal = make_float3(0.0f, 0.0f, 0.0f);
         }
-        // else: votedNormal remains (0,0,0)
 
         // Write to output at the correct position
         output[idx] = votedNormal;
@@ -194,14 +196,13 @@ __global__ void normalizeAndScatterVotedNormals_impl(float3* votedWeightedNormal
 }
 
 void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
-                                     double* totalAreas,
                                      float3* output,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
         normalizeAndScatterVotedNormals_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            votedWeightedNormals, totalAreas, output, count);
+            votedWeightedNormals, output, count);
         DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
     }
 }
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 081a2b02..747e74c9 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -184,7 +184,6 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
 // Normalizes voted normals by total area and scatters to output
 // If total area is 0, output is (0,0,0) indicating no contact
 void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
-                                     double* totalAreas,
                                      float3* output,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream);
diff --git a/src/demo/CMakeLists.txt b/src/demo/CMakeLists.txt
index fd6576b2..f99189c9 100644
--- a/src/demo/CMakeLists.txt
+++ b/src/demo/CMakeLists.txt
@@ -23,6 +23,7 @@ SET(DEMOS
 		DEMdemo_MeshFalling
 		DEMdemo_TestPack
 		DEMdemo_RotatingDrum
+		DEMdemo_DrumCubes
 		DEMdemo_Centrifuge
 		DEMdemo_GameOfLife
 		DEMdemo_BallDrop
diff --git a/src/demo/DEMdemo_DrumCubes.cpp b/src/demo/DEMdemo_DrumCubes.cpp
new file mode 100644
index 00000000..836c4679
--- /dev/null
+++ b/src/demo/DEMdemo_DrumCubes.cpp
@@ -0,0 +1,158 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//
+//	SPDX-License-Identifier: BSD-3-Clause
+
+// =============================================================================
+// Rotating drum centrifuge demo with only cube mesh particles.
+// Matches the output style of DEMdemo_Centrifuge but uses 10 mm cubes
+// (12-triangle mesh) inside an analytically defined cylinder and lids.
+// =============================================================================
+
+#include <core/ApiVersion.h>
+#include <core/utils/ThreadManager.h>
+#include <DEM/API.h>
+#include <DEM/utils/Samplers.hpp>
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <filesystem>
+#include <random>
+
+using namespace deme;
+using namespace std::filesystem;
+
+int main() {
+    DEMSolver DEMSim;
+    DEMSim.SetOutputFormat(OUTPUT_FORMAT::CSV);
+    DEMSim.SetOutputContent(OUTPUT_CONTENT::FAMILY);
+    DEMSim.SetNoForceRecord();
+    DEMSim.SetMeshUniversalContact(true);
+
+    auto mat_type_cube = DEMSim.LoadMaterial({{"E", 1e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.01}});
+    auto mat_type_drum = DEMSim.LoadMaterial({{"E", 2e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.01}});
+    DEMSim.SetMaterialPropertyPair("mu", mat_type_cube, mat_type_drum, 0.5);
+
+    const float cube_size = 0.01f;
+    const float cube_density = 2600.0f;
+    const float cube_mass = cube_density * cube_size * cube_size * cube_size;
+    const float cube_moi = cube_mass * cube_size * cube_size / 6.0f;
+    const float half_diag = 0.5f * cube_size * std::sqrt(3.0f);
+
+    // Load cube mesh template (12 triangles) and scale to 10 mm
+    auto cube_template = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type_cube, true, false);
+    cube_template->Scale(cube_size);
+
+    // Drum definition
+    float3 CylCenter = make_float3(0, 0, 0);
+    float3 CylAxis = make_float3(0, 0, 1);
+    float CylRad = 0.08f;
+    float CylHeight = 0.2f;
+    float CylMass = 1.0f;
+    float safe_delta = 0.003f;
+    float IZZ = CylMass * CylRad * CylRad / 2;
+    float IYY = (CylMass / 12) * (3 * CylRad * CylRad + CylHeight * CylHeight);
+    auto Drum = DEMSim.AddExternalObject();
+    // Drum->AddCylinder(CylCenter, CylAxis, CylRad, mat_type_drum, 0);
+    Drum->AddPlane(make_float3(CylRad, 0, 0), make_float3(-1, 0, 0), mat_type_drum);
+    Drum->AddPlane(make_float3(-CylRad, 0, 0), make_float3(1, 0, 0), mat_type_drum);
+    Drum->AddPlane(make_float3(0, CylRad, 0), make_float3(0, -1, 0), mat_type_drum);
+    Drum->AddPlane(make_float3(0, -CylRad, 0), make_float3(0, 1, 0), mat_type_drum);
+    Drum->SetMass(CylMass);
+    Drum->SetMOI(make_float3(IYY, IYY, IZZ));
+    auto Drum_tracker = DEMSim.Track(Drum);
+    unsigned int drum_family = 100;
+    Drum->SetFamily(drum_family);
+    const float rpm = 200.0f;
+    const float drum_ang_vel = rpm * 2.0f * PI / 60.0f;
+    DEMSim.SetFamilyPrescribedAngVel(drum_family, "0", "0", to_string_with_precision(drum_ang_vel));
+    auto top_bot_planes = DEMSim.AddExternalObject();
+    top_bot_planes->AddPlane(make_float3(0, 0, CylHeight / 2. - safe_delta), make_float3(0, 0, -1), mat_type_drum);
+    top_bot_planes->AddPlane(make_float3(0, 0, -CylHeight / 2. + safe_delta), make_float3(0, 0, 1), mat_type_drum);
+    top_bot_planes->SetFamily(drum_family);
+    auto planes_tracker = DEMSim.Track(top_bot_planes);
+
+    // Place 1000 cubes on a grid inside the drum
+    const unsigned int target_cubes = 1000;
+    float sample_radius = CylRad - half_diag - safe_delta;
+    float sample_halfheight = CylHeight / 2.0f - half_diag - safe_delta;
+    float fill_spacing = cube_size * 1.25f;  // leave gap so meshes don't start in contact
+    std::mt19937 rng(42);
+    unsigned int created = 0;
+    for (float z = -sample_halfheight; z <= sample_halfheight && created < target_cubes; z += fill_spacing) {
+        for (float y = -sample_radius; y <= sample_radius && created < target_cubes; y += fill_spacing) {
+            for (float x = -sample_radius; x <= sample_radius && created < target_cubes; x += fill_spacing) {
+                if (x * x + y * y > sample_radius * sample_radius) {
+                    continue;
+                }
+                auto cube = DEMSim.AddMeshFromTemplate(cube_template, make_float3(x, y, z));
+                cube->SetFamily(1);
+                cube->SetMass(cube_mass);
+                cube->SetMOI(make_float3(cube_moi, cube_moi, cube_moi));
+                cube->SetInitQuat(make_float4(0.f, 0.f, 0.f, 1.0f));
+                created++;
+            }
+        }
+    }
+    std::cout << "Placed " << created << " cubes inside the drum." << std::endl;
+
+    auto max_v_finder = DEMSim.CreateInspector("max_absv");
+    float max_v;
+
+    DEMSim.InstructBoxDomainDimension(0.4, 0.4, 0.4);
+    float step_size = 1e-4f;
+    DEMSim.SetInitTimeStep(step_size);
+    DEMSim.SetGravitationalAcceleration(make_float3(0, 0, -9.81));
+    DEMSim.SetExpandSafetyType("auto");
+    DEMSim.SetExpandSafetyAdder(drum_ang_vel * CylRad);
+    DEMSim.Initialize();
+
+    path out_dir = current_path();
+    out_dir /= "DemoOutput_DrumCubes";
+    create_directory(out_dir);
+
+    float time_end = 3.0f;
+    unsigned int fps = 20;
+    float frame_time = 1.0f / fps;
+
+    std::cout << "Output at " << fps << " FPS" << std::endl;
+    unsigned int currframe = 0;
+    unsigned int curr_step = 0;
+    std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+    for (double t = 0; t < (double)time_end; t += frame_time, curr_step++) {
+        std::cout << "Frame: " << currframe << std::endl;
+        DEMSim.ShowThreadCollaborationStats();
+        char filename[100];
+        sprintf(filename, "DEMdemo_output_%04d.vtk", currframe);
+        DEMSim.WriteMeshFile(out_dir / filename);
+        currframe++;
+        max_v = max_v_finder->GetValue();
+        std::cout << "Max velocity of any point in simulation is " << max_v << std::endl;
+
+        float3 drum_moi = Drum_tracker->MOI();
+        float3 drum_acc = Drum_tracker->ContactAngAccLocal();
+        float3 drum_torque = drum_acc * drum_moi;
+        std::cout << "Contact torque on the side walls is " << drum_torque.x << ", " << drum_torque.y << ", "
+                  << drum_torque.z << std::endl;
+
+        float3 force_on_BC = planes_tracker->ContactAcc() * planes_tracker->Mass();
+        std::cout << "Contact force on bottom plane is " << std::abs(force_on_BC.z) << std::endl;
+
+        DEMSim.DoDynamics(frame_time);
+    }
+    std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+    std::cout << (time_sec.count()) / time_end * 10.0 << " seconds (wall time) to finish 10 seconds' simulation"
+              << std::endl;
+    DEMSim.ShowThreadCollaborationStats();
+    DEMSim.ClearThreadCollaborationStats();
+
+    DEMSim.ShowTimingStats();
+    std::cout << "----------------------------------------" << std::endl;
+    DEMSim.ShowMemStats();
+    std::cout << "----------------------------------------" << std::endl;
+
+    std::cout << "DEMdemo_DrumCubes exiting..." << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/src/kernel/DEMKinematicMisc.cu b/src/kernel/DEMKinematicMisc.cu
index b73833f0..82c62723 100644
--- a/src/kernel/DEMKinematicMisc.cu
+++ b/src/kernel/DEMKinematicMisc.cu
@@ -84,10 +84,16 @@ __global__ void computeMarginFromAbsv_implTri(deme::DEMSimParams* simParams,
         if (penetrationMargin > simParams->capTriTriPenetration) {
             penetrationMargin = simParams->capTriTriPenetration;
         }
-
-        granData->marginSizeTriangle[triID] =
+        // We hope that penetrationMargin is small, so it's absorbed into the velocity-induce margin.
+        // But if not, it should prevail to avoid losing contacts involving triangles inside another mesh.
+        double finalMargin =
             (double)(vel * simParams->expSafetyMulti + simParams->expSafetyAdder) * (*ts) * (*maxDrift) +
-            penetrationMargin + granData->familyExtraMarginSize[my_family];
+            granData->familyExtraMarginSize[my_family];
+        // if (finalMargin < penetrationMargin) {
+        //     finalMargin = penetrationMargin;
+        // }
+
+        granData->marginSizeTriangle[triID] = finalMargin;
     }
 }
 

From 05ce0a2a60b01be17b47c949ca8c1cf0da3a558e Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Sat, 17 Jan 2026 02:30:28 +0100
Subject: [PATCH 04/17] Fix of 2 major dT bugs - revert a critical change from
 "Mesh_Particles_Json" merge 6a0357c - revert a dT Force patch calc
 optimization to be improved later on

---
 src/DEM/dT.cpp                                | 164 +++++++++----
 src/algorithms/DEMCubInstantiations.cu        |  18 +-
 src/algorithms/DEMDynamicMisc.cu              | 232 ++++++++----------
 src/algorithms/DEMStaticDeviceSubroutines.h   | 140 ++++-------
 src/kernel/DEMCalcForceKernels_Primitive.cu   |   5 -
 src/kernel/DEMContactKernels_SphTri_TriTri.cu | 156 ++----------
 6 files changed, 295 insertions(+), 420 deletions(-)

diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index ed2c3294..2eb50295 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2645,35 +2645,45 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
             contactPairs_t startOffsetPatch = start_count_patch.first;
             contactPairs_t countPatch = start_count_patch.second;
 
+            // Vote for the contact direction; voting power depends on the contact area
+            // This reduce-by-key operation reduces primitive-recorded force pairs into patch/convex part-based
+            // force pairs. All elements that share the same geomToPatchMap value vote together.
             if (countPrimitive > 0) {
-                contactPairs_t* keys = granData->geomToPatchMap + startOffsetPrimitive;
+                // Allocate temporary arrays for the voting process
+                float3* weightedNormals =
+                    (float3*)solverScratchSpace.allocateTempVector("weightedNormals", countPrimitive * sizeof(float3));
+                double* areas =
+                    (double*)solverScratchSpace.allocateTempVector("areas", countPrimitive * sizeof(double));
+                // Keys extracted from geomToPatchMap - these map primitives to patch pairs
+                contactPairs_t* keys = (contactPairs_t*)solverScratchSpace.allocateTempVector(
+                    "votingKeys", countPrimitive * sizeof(contactPairs_t));
+
                 // Allocate arrays for reduce-by-key results (uniqueKeys uses contactPairs_t, not patchIDPair_t)
                 contactPairs_t* uniqueKeys = (contactPairs_t*)solverScratchSpace.allocateTempVector(
                     "uniqueKeys", countPrimitive * sizeof(contactPairs_t));
+                float3* votedWeightedNormals = (float3*)solverScratchSpace.allocateTempVector(
+                    "votedWeightedNormals", countPrimitive * sizeof(float3));
                 solverScratchSpace.allocateDualStruct("numUniqueKeys");
                 size_t* numUniqueKeys = solverScratchSpace.getDualStructDevice("numUniqueKeys");
 
-                // Step 1: Area-weighted normals for voting
-                float3* weightedNormals =
-                    (float3*)solverScratchSpace.allocateTempVector("weightedNormals", countPrimitive * sizeof(float3));
-                prepareWeightedNormalsForVoting(&granData, weightedNormals, startOffsetPrimitive, countPrimitive,
-                                                streamInfo.stream);
+                // Step 1: Prepare weighted normals, areas, and keys
+                // The kernel extracts keys from geomToPatchMap, computes weighted normals, and stores areas
+                prepareWeightedNormalsForVoting(&granData, weightedNormals, areas, keys, startOffsetPrimitive,
+                                                countPrimitive, streamInfo.stream);
 
-                float3* votedWeightedNormals = (float3*)solverScratchSpace.allocateTempVector(
-                    "votedWeightedNormals", countPrimitive * sizeof(float3));
+                // Step 2: Reduce-by-key for weighted normals (sum)
+                // The keys are geomToPatchMap values (contactPairs_t), which group primitives by patch pair
                 cubSumReduceByKey<contactPairs_t, float3>(keys, uniqueKeys, weightedNormals, votedWeightedNormals,
                                                           numUniqueKeys, countPrimitive, streamInfo.stream,
                                                           solverScratchSpace);
                 solverScratchSpace.finishUsingTempVector("weightedNormals");
-
-                // Normalize the voted normals using unique keys and scatter to patch-local storage.
-                float3* votedNormals =
-                    (float3*)solverScratchSpace.allocateTempVector("votedNormals", countPatch * sizeof(float3));
+                // For extra safety
                 solverScratchSpace.syncDualStructDeviceToHost("numUniqueKeys");
                 size_t numUniqueKeysHost = *(solverScratchSpace.getDualStructHost("numUniqueKeys"));
-                normalizeAndScatterVotedNormalsFromUniqueKeys(votedWeightedNormals, uniqueKeys, votedNormals,
-                                                              startOffsetPatch, numUniqueKeysHost, streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("votedWeightedNormals");
+                // std::cout << "Keys:" << std::endl;
+                // displayDeviceArray<contactPairs_t>(keys, countPrimitive);
+                // std::cout << "Unique Keys:" << std::endl;
+                // displayDeviceArray<contactPairs_t>(uniqueKeys, numUniqueKeysHost);
                 if (numUniqueKeysHost != countPatch) {
                     DEME_ERROR(
                         "Patch-based contact voting produced %zu unique patch pairs, but expected %zu pairs for "
@@ -2681,40 +2691,108 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                         numUniqueKeysHost, countPatch, contact_type);
                 }
 
-                // Step 2: Fused accumulation (sum + max) in a single reduce-by-key.
-                FusedPatchAccum* primitiveAccums = (FusedPatchAccum*)solverScratchSpace.allocateTempVector(
-                    "fusedPrimitiveAccums", countPrimitive * sizeof(FusedPatchAccum));
-                computeFusedPatchContactAccumulators(&granData, votedNormals, keys, primitiveAccums,
-                                                     startOffsetPrimitive, startOffsetPatch, countPrimitive,
-                                                     streamInfo.stream);
-
-                FusedPatchAccum* patchAccums = (FusedPatchAccum*)solverScratchSpace.allocateTempVector(
-                    "fusedPatchAccums", numUniqueKeysHost * sizeof(FusedPatchAccum));
-                cubSumReduceByKey<contactPairs_t, FusedPatchAccum>(keys, uniqueKeys, primitiveAccums, patchAccums,
-                                                                   numUniqueKeys, countPrimitive, streamInfo.stream,
-                                                                   solverScratchSpace);
-                solverScratchSpace.finishUsingTempVector("fusedPrimitiveAccums");
-
+                // Step 3: Normalize the voted normals by total area and scatter back to a temp array.
+                float3* votedNormals =
+                    (float3*)solverScratchSpace.allocateTempVector("votedNormals", countPatch * sizeof(float3));
+                normalizeAndScatterVotedNormals(votedWeightedNormals, votedNormals, countPatch, streamInfo.stream);
+                solverScratchSpace.finishUsingTempVector("votedWeightedNormals");
+                // displayDeviceFloat3(votedNormals, countPatch);
+
+                // Step 4: Compute projected penetration and area for each primitive contact
+                // Both the penetration and area are projected onto the voted normal
+                // If the projected penetration becomes negative, both are set to 0
+                // Reuse keys array for the reduce-by-key operation
+                double* projectedPenetrations = (double*)solverScratchSpace.allocateTempVector(
+                    "projectedPenetrations", countPrimitive * sizeof(double));
+                double* projectedAreas =
+                    (double*)solverScratchSpace.allocateTempVector("projectedAreas", countPrimitive * sizeof(double));
+                computeWeightedUsefulPenetration(&granData, votedNormals, keys, areas, projectedPenetrations,
+                                                 projectedAreas, startOffsetPrimitive, startOffsetPatch, countPrimitive,
+                                                 streamInfo.stream);
+                solverScratchSpace.finishUsingTempVector("areas");
+
+                // Step 5: Reduce-by-key to get total projected area per patch pair (sum)
                 double* totalProjectedAreas =
                     (double*)solverScratchSpace.allocateTempVector("totalProjectedAreas", countPatch * sizeof(double));
+                cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, projectedAreas, totalProjectedAreas,
+                                                          numUniqueKeys, countPrimitive, streamInfo.stream,
+                                                          solverScratchSpace);
+
+                // Step 6: Reduce-by-key to get max projected penetration per patch pair (max).
+                // This result, maxProjectedPenetrations, is the max of projected penetration, aka the max pen in the
+                // physical overlap case, and it's not the same as maxPenetrations in step 8 which is a fallback
+                // primitive-derived penetration.
                 double* maxProjectedPenetrations = (double*)solverScratchSpace.allocateTempVector(
                     "maxProjectedPenetrations", countPatch * sizeof(double));
+                cubMaxReduceByKey<contactPairs_t, double>(keys, uniqueKeys, projectedPenetrations,
+                                                          maxProjectedPenetrations, numUniqueKeys, countPrimitive,
+                                                          streamInfo.stream, solverScratchSpace);
+
+                // Step 7: Compute weighted contact points for each primitive (normal case)
+                // The weight is: projected_penetration * projected_area
+                // Reuse keys, uniqueKeys, and numUniqueKeys that are still allocated
+                double3* weightedContactPoints = (double3*)solverScratchSpace.allocateTempVector(
+                    "weightedContactPoints", countPrimitive * sizeof(double3));
+                double* contactWeights =
+                    (double*)solverScratchSpace.allocateTempVector("contactWeights", countPrimitive * sizeof(double));
+                computeWeightedContactPoints(&granData, weightedContactPoints, contactWeights, projectedPenetrations,
+                                             projectedAreas, startOffsetPrimitive, countPrimitive, streamInfo.stream);
+                solverScratchSpace.finishUsingTempVector("projectedPenetrations");
+                solverScratchSpace.finishUsingTempVector("projectedAreas");
+                // Reduce-by-key to get total weighted contact points per patch pair
+                double3* totalWeightedContactPoints = (double3*)solverScratchSpace.allocateTempVector(
+                    "totalWeightedContactPoints", countPatch * sizeof(double3));
+                double* totalContactWeights =
+                    (double*)solverScratchSpace.allocateTempVector("totalContactWeights", countPatch * sizeof(double));
+                cubSumReduceByKey<contactPairs_t, double3>(keys, uniqueKeys, weightedContactPoints,
+                                                           totalWeightedContactPoints, numUniqueKeys, countPrimitive,
+                                                           streamInfo.stream, solverScratchSpace);
+                cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, contactWeights, totalContactWeights,
+                                                          numUniqueKeys, countPrimitive, streamInfo.stream,
+                                                          solverScratchSpace);
+                solverScratchSpace.finishUsingTempVector("weightedContactPoints");
+                solverScratchSpace.finishUsingTempVector("contactWeights");
+                // Compute voted contact points per patch pair by dividing by total weight
                 double3* votedContactPoints =
                     (double3*)solverScratchSpace.allocateTempVector("votedContactPoints", countPatch * sizeof(double3));
+                computeFinalContactPointsPerPatch(totalWeightedContactPoints, totalContactWeights, votedContactPoints,
+                                                  countPatch, streamInfo.stream);
+                solverScratchSpace.finishUsingTempVector("totalWeightedContactPoints");
+                solverScratchSpace.finishUsingTempVector("totalContactWeights");
+
+                // Step 8: Handle zero-area patches (all primitive areas are 0)
+                // For these patches, we need to find the max penetration primitive and use its normal/penetration
+
+                // 8a: Extract primitive penetrations for max-reduce
+                double* primitivePenetrations = (double*)solverScratchSpace.allocateTempVector(
+                    "primitivePenetrations", countPrimitive * sizeof(double));
+                extractPrimitivePenetrations(&granData, primitivePenetrations, startOffsetPrimitive, countPrimitive,
+                                             streamInfo.stream);
+
+                // 8b: Max-negative-reduce-by-key to get max negative penetration per patch
+                // This finds the largest negative value (smallest absolute value among negatives)
+                // Positive values are treated as very negative to indicate invalid/non-physical state
+                double* maxPenetrations =
+                    (double*)solverScratchSpace.allocateTempVector("maxPenetrations", countPatch * sizeof(double));
+                cubMaxNegativeReduceByKey<contactPairs_t, double>(keys, uniqueKeys, primitivePenetrations,
+                                                                  maxPenetrations, numUniqueKeys, countPrimitive,
+                                                                  streamInfo.stream, solverScratchSpace);
+                solverScratchSpace.finishUsingTempVector("primitivePenetrations");
+
+                // 8c: Find max-penetration primitives for zero-area patches and extract their normals, penetrations,
+                // and contact points
                 float3* zeroAreaNormals =
                     (float3*)solverScratchSpace.allocateTempVector("zeroAreaNormals", countPatch * sizeof(float3));
                 double* zeroAreaPenetrations =
                     (double*)solverScratchSpace.allocateTempVector("zeroAreaPenetrations", countPatch * sizeof(double));
                 double3* zeroAreaContactPoints = (double3*)solverScratchSpace.allocateTempVector(
                     "zeroAreaContactPoints", countPatch * sizeof(double3));
+                findMaxPenetrationPrimitiveForZeroAreaPatches(
+                    &granData, maxPenetrations, zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, keys,
+                    startOffsetPrimitive, startOffsetPatch, countPrimitive, streamInfo.stream);
+                solverScratchSpace.finishUsingTempVector("maxPenetrations");
 
-                scatterFusedPatchAccumulators(patchAccums, uniqueKeys, totalProjectedAreas, maxProjectedPenetrations,
-                                              votedContactPoints, votedNormals, zeroAreaNormals, zeroAreaPenetrations,
-                                              zeroAreaContactPoints, startOffsetPatch, numUniqueKeysHost,
-                                              streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("fusedPatchAccums");
-
-                // Step 9d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
+                // Step 8d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
                 // If no primitive satisfies SAT, the patch contact is non-physical and should use Step 9 fallback
                 notStupidBool_t* patchHasSAT = nullptr;
                 if (contact_type == TRIANGLE_TRIANGLE_CONTACT) {
@@ -2725,11 +2803,12 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                         streamInfo.stream);
                 }
 
-                // Clean up key bookkeeping now that we're done with reductions
+                // Clean up keys arrays now that we're done with reductions
+                solverScratchSpace.finishUsingTempVector("votingKeys");
                 solverScratchSpace.finishUsingTempVector("uniqueKeys");
                 solverScratchSpace.finishUsingDualStruct("numUniqueKeys");
 
-                // Step 10: Finalize patch results by combining voting with zero-area handling.
+                // Step 9: Finalize patch results by combining voting with zero-area handling.
                 // If patch-based projected area is 0 (or this patch pair consists of no SAT pair), meaning no physical
                 // contact, we use the fallback estimations (zeroArea*) of CP, penetration and areas.
                 double* finalAreas =
@@ -2754,9 +2833,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 solverScratchSpace.finishUsingTempVector("zeroAreaPenetrations");
                 solverScratchSpace.finishUsingTempVector("votedContactPoints");
                 solverScratchSpace.finishUsingTempVector("zeroAreaContactPoints");
-                if (patchHasSAT != nullptr) {
-                    solverScratchSpace.finishUsingTempVector("patchHasSAT");
-                }
+                solverScratchSpace.finishUsingTempVector("patchHasSAT");
 
                 // Now we have:
                 // - finalAreas: final contact area per patch pair (countPatch elements)
@@ -2786,6 +2863,8 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                         }
                     }
                 }
+                DEME_GPU_CALL(cudaStreamSynchronize(streamInfo.stream));
+
                 // If this is a tri-tri contact, compute max penetration for kT
                 // The max value stays on device until sendToTheirBuffer transfers it
                 if (contact_type == TRIANGLE_TRIANGLE_CONTACT && countPatch > 0) {
@@ -2794,6 +2873,9 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     cubMaxReduce<double>(finalPenetrations.data(), &maxTriTriPenetration, countPatch, streamInfo.stream,
                                          solverScratchSpace);
                     // No toHost() here - keep on device since host never needs it
+                    // maxTriTriPenetration.toHost();
+                    // std::cout << "Max tri-tri penetration after patch-based correction: " << *maxTriTriPenetration
+                    //           << std::endl;
                 }
 
                 // Final clean up
diff --git a/src/algorithms/DEMCubInstantiations.cu b/src/algorithms/DEMCubInstantiations.cu
index 86380234..3c93680d 100644
--- a/src/algorithms/DEMCubInstantiations.cu
+++ b/src/algorithms/DEMCubInstantiations.cu
@@ -97,23 +97,7 @@ template void cubSumReduceByKey<contactPairs_t, double>(contactPairs_t* d_keys_i
                                                         size_t n,
                                                         cudaStream_t& this_stream,
                                                         DEMSolverScratchData& scratchPad);
-// Patch contact accumulators (sum + max) with contactPairs_t keys
-template void cubSumReduceByKey<contactPairs_t, PatchContactAccum>(contactPairs_t* d_keys_in,
-                                                                   contactPairs_t* d_unique_out,
-                                                                   PatchContactAccum* d_vals_in,
-                                                                   PatchContactAccum* d_aggregates_out,
-                                                                   size_t* d_num_out,
-                                                                   size_t n,
-                                                                   cudaStream_t& this_stream,
-                                                                   DEMSolverScratchData& scratchPad);
-template void cubSumReduceByKey<contactPairs_t, FusedPatchAccum>(contactPairs_t* d_keys_in,
-                                                                 contactPairs_t* d_unique_out,
-                                                                 FusedPatchAccum* d_vals_in,
-                                                                 FusedPatchAccum* d_aggregates_out,
-                                                                 size_t* d_num_out,
-                                                                 size_t n,
-                                                                 cudaStream_t& this_stream,
-                                                                 DEMSolverScratchData& scratchPad);
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // Reduce::Max
diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index 007cb3e8..43143cbe 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -119,10 +119,12 @@ void getContactForcesConcerningOwners(float3* d_points,
 // Patch-based voting kernels for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
 
-// Kernel to compute weighted normals (normal * area) for voting.
-// Keys are read directly from geomToPatchMap on the fly, so only weightedNormals need to be written here.
+// Kernel to compute weighted normals (normal * area) for voting
+// Also prepares the area values for reduction and extracts the keys (geomToPatchMap values)
 __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
                                                      float3* weightedNormals,
+                                                     double* areas,
+                                                     contactPairs_t* keys,
                                                      contactPairs_t startOffset,
                                                      contactPairs_t count) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,54 +137,70 @@ __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
         // Extract the area (double) from contactPointGeometryB (stored as float3)
         float3 areaStorage = granData->contactPointGeometryB[myContactID];
         double area = float3StorageToDouble(areaStorage);
+        float3 penStorage = granData->contactPointGeometryA[myContactID];
+        double penetration = float3StorageToDouble(penStorage);
+        penetration = (penetration > DEME_TINY_FLOAT) ? penetration : DEME_TINY_FLOAT;
+        double recipPen = 1.0 / penetration;
 
         // Compute weighted normal (normal * area)
-        weightedNormals[idx] = make_float3(normal.x * area, normal.y * area, normal.z * area);
+        // Note that fake contacts do not affect the vote as their area is 0
+        weightedNormals[idx] = make_float3((double)normal.x * area * recipPen, (double)normal.y * area * recipPen,
+                                           (double)normal.z * area * recipPen);
+
+        // Store area for reduction
+        areas[idx] = area;
+
+        // Extract key from geomToPatchMap
+        keys[idx] = granData->geomToPatchMap[myContactID];
     }
 }
 
 void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      float3* weightedNormals,
+                                     double* areas,
+                                     contactPairs_t* keys,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
         prepareWeightedNormalsForVoting_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            granData, weightedNormals, startOffset, count);
+            granData, weightedNormals, areas, keys, startOffset, count);
     }
 }
 
-// Kernel to normalize voted normals and scatter them based on unique keys.
-// Uses uniqueKeys (geomToPatchMap) to locate the patch slot, removing the need for total area arrays.
-__global__ void normalizeAndScatterVotedNormalsFromUniqueKeys_impl(float3* votedWeightedNormals,
-                                                                   contactPairs_t* uniqueKeys,
-                                                                   float3* output,
-                                                                   contactPairs_t startOffsetPatch,
-                                                                   contactPairs_t count) {
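+// Kernel to normalize the voted (summed, weighted) normals to unit length and write them to output
+// If a voted normal has zero length (e.g. total area is 0), the result is set to (0,0,0)
+// Output slot idx is input slot idx; this assumes uniqueKeys are sorted (CUB's ReduceByKey maintains sort order)
+// Keys are contactPairs_t (geomToPatchMap values), although they are not read by this kernel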
+__global__ void normalizeAndScatterVotedNormals_impl(float3* votedWeightedNormals,
+                                                     float3* output,
+                                                     contactPairs_t count) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
-        contactPairs_t patchIdx = uniqueKeys[idx];
-        contactPairs_t localIdx = patchIdx - startOffsetPatch;
-
         float3 votedNormal = votedWeightedNormals[idx];
-        float len2 = votedNormal.x * votedNormal.x + votedNormal.y * votedNormal.y + votedNormal.z * votedNormal.z;
-        // normalize when length is non-zero; otherwise leave zero vector
-        output[localIdx] = (len2 > 0.f) ? normalize(votedNormal) : make_float3(0, 0, 0);
+        float len2 = length2(votedNormal);
+        if (len2 > 0.f) {
+            // Normalize votedNormal
+            votedNormal *= rsqrtf(len2);
+        } else {
+            // If total area is 0, set to (0,0,0) to mark no real contact
+            votedNormal = make_float3(0.0f, 0.0f, 0.0f);
+        }
+
+        // Write to output at the correct position
+        output[idx] = votedNormal;
     }
 }
 
-void normalizeAndScatterVotedNormalsFromUniqueKeys(float3* votedWeightedNormals,
-                                                   contactPairs_t* uniqueKeys,
-                                                   float3* output,
-                                                   contactPairs_t startOffsetPatch,
-                                                   contactPairs_t count,
-                                                   cudaStream_t& this_stream) {
+void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
+                                     float3* output,
+                                     contactPairs_t count,
+                                     cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
-        normalizeAndScatterVotedNormalsFromUniqueKeys_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0,
-                                                            this_stream>>>(votedWeightedNormals, uniqueKeys, output,
-                                                                           startOffsetPatch, count);
+        normalizeAndScatterVotedNormals_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
+            votedWeightedNormals, output, count);
     }
 }
 
@@ -190,125 +208,91 @@ void normalizeAndScatterVotedNormalsFromUniqueKeys(float3* votedWeightedNormals,
 // Penetration depth computation kernels for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
 
-// Kernel to compute per-primitive patch accumulators (projected area, max projected penetration, weighted CP sum).
-__global__ void computeFusedPatchContactAccumulators_impl(DEMDataDT* granData,
-                                                          float3* votedNormals,
-                                                          const contactPairs_t* keys,
-                                                          FusedPatchAccum* accumulators,
-                                                          contactPairs_t startOffsetPrimitive,
-                                                          contactPairs_t startOffsetPatch,
-                                                          contactPairs_t count) {
+// Kernel to compute the projected ("useful") penetration and projected area for each primitive contact.
+// The "useful" penetration is the original penetration projected onto the voted normal.
+// If the projection makes a value negative (normal opposing the voted normal), it is clamped to 0.
+// The primitive's contact area is projected the same way; the two results are written to separate arrays.
+__global__ void computeWeightedUsefulPenetration_impl(DEMDataDT* granData,
+                                                      float3* votedNormals,
+                                                      contactPairs_t* keys,
+                                                      double* areas,
+                                                      double* projectedPenetrations,
+                                                      double* projectedAreas,
+                                                      contactPairs_t startOffsetPrimitive,
+                                                      contactPairs_t startOffsetPatch,
+                                                      contactPairs_t count) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
         contactPairs_t myContactID = startOffsetPrimitive + idx;
 
+        // Get the patch pair index for this primitive (absolute index)
         contactPairs_t patchIdx = keys[idx];
+
+        // Get the voted normalized normal for this patch pair
+        // Subtract startOffsetPatch to get the local index into votedNormals
         contactPairs_t localPatchIdx = patchIdx - startOffsetPatch;
         float3 votedNormal = votedNormals[localPatchIdx];
+        // If voted normal is (0,0,0), meaning all primitive contacts agree on no contact, then the end result must be
+        // 0, no special handling needed
 
+        // Get the original contact normal (stored in contactForces during primitive force calc)
         float3 originalNormal = granData->contactForces[myContactID];
+
+        // Get the original penetration depth from contactPointGeometryA (stored as double in float3)
         float3 penetrationStorage = granData->contactPointGeometryA[myContactID];
-        double rawPenetration = float3StorageToDouble(penetrationStorage);
-        double clampedPenetration = (rawPenetration > 0.0) ? rawPenetration : 0.0;
+        double originalPenetration = float3StorageToDouble(penetrationStorage);
+        // Negative penetration does not participate
+        if (originalPenetration <= 0.0) {
+            originalPenetration = 0.0;
+        }
 
-        double area = float3StorageToDouble(granData->contactPointGeometryB[myContactID]);
+        // Get the contact area from storage that is not yet freed. Note the index is idx not myContactID, as areas is a
+        // type-specific vector.
+        double area = areas[idx];
 
+        // Compute the projected penetration and area by projecting onto the voted normal
+        // Projected penetration: originalPenetration * dot(originalNormal, votedNormal)
+        // Projected area: area * dot(originalNormal, votedNormal)
+        // If dot product is negative (opposite directions), set both to 0
         float dotProduct = dot(originalNormal, votedNormal);
-        double cospos = (dotProduct > 0.f) ? (double)dotProduct : 0.0;
+        double projectedPenetration = originalPenetration * (double)dotProduct;
+        double projectedArea = area * (double)dotProduct;
 
-        double projectedPenetration = clampedPenetration * cospos;
-        double projectedArea = area * cospos;
-        double weight = projectedPenetration * projectedArea;
-
-        double3 contactPoint = to_double3(granData->contactTorque_convToForce[myContactID]);
-        double3 weightedCP = make_double3(contactPoint.x * weight, contactPoint.y * weight, contactPoint.z * weight);
-
-        FusedPatchAccum acc;
-        acc.sumProjArea = projectedArea;
-        acc.maxProjPen = projectedPenetration;
-        acc.sumWeight = weight;
-        acc.sumWeightedCP = weightedCP;
-        acc.sumWeightedNormal =
-            make_float3(originalNormal.x * area, originalNormal.y * area, originalNormal.z * area);
-        acc.maxPenRaw = rawPenetration;
-        acc.maxPenNormal = originalNormal;
-        acc.maxPenCP = contactPoint;
-        accumulators[idx] = acc;
-    }
-}
-
-void computeFusedPatchContactAccumulators(DEMDataDT* granData,
-                                          float3* votedNormals,
-                                          const contactPairs_t* keys,
-                                          FusedPatchAccum* accumulators,
-                                          contactPairs_t startOffsetPrimitive,
-                                          contactPairs_t startOffsetPatch,
-                                          contactPairs_t count,
-                                          cudaStream_t& this_stream) {
-    size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
-    if (blocks_needed > 0) {
-        computeFusedPatchContactAccumulators_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            granData, votedNormals, keys, accumulators, startOffsetPrimitive, startOffsetPatch, count);
-    }
-}
-
-// Kernel to scatter reduced patch accumulators to the final arrays expected by patch-based correction.
-__global__ void scatterFusedPatchAccumulators_impl(const FusedPatchAccum* accumulators,
-                                                   const contactPairs_t* uniqueKeys,
-                                                   double* totalProjectedAreas,
-                                                   double* maxProjectedPenetrations,
-                                                   double3* votedContactPoints,
-                                                   float3* votedNormals,
-                                                   float3* zeroAreaNormals,
-                                                   double* zeroAreaPenetrations,
-                                                   double3* zeroAreaContactPoints,
-                                                   contactPairs_t startOffsetPatch,
-                                                   contactPairs_t count) {
-    contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < count) {
-        contactPairs_t patchIdx = uniqueKeys[idx];
-        contactPairs_t localIdx = patchIdx - startOffsetPatch;
-
-        FusedPatchAccum acc = accumulators[idx];
-        totalProjectedAreas[localIdx] = acc.sumProjArea;
-        maxProjectedPenetrations[localIdx] = acc.maxProjPen;
-
-        float3 summedNormal = acc.sumWeightedNormal;
-        float len2 = summedNormal.x * summedNormal.x + summedNormal.y * summedNormal.y + summedNormal.z * summedNormal.z;
-        votedNormals[localIdx] = (len2 > 0.f) ? normalize(summedNormal) : make_float3(0, 0, 0);
-
-        if (acc.sumWeight > 0.0) {
-            double invWeight = 1.0 / acc.sumWeight;
-            votedContactPoints[localIdx] =
-                make_double3(acc.sumWeightedCP.x * invWeight, acc.sumWeightedCP.y * invWeight,
-                             acc.sumWeightedCP.z * invWeight);
-        } else {
-            votedContactPoints[localIdx] = make_double3(0.0, 0.0, 0.0);
+        // Clamp the projected penetration and projected area to be non-negative (each is checked independently)
+        if (projectedPenetration <= 0.0) {
+            projectedPenetration = 0.0;
         }
+        if (projectedArea <= 0.0) {
+            projectedArea = 0.0;
+        }
+
+        projectedPenetrations[idx] = projectedPenetration;
+        projectedAreas[idx] = projectedArea;
 
-        zeroAreaNormals[localIdx] = acc.maxPenNormal;
-        zeroAreaPenetrations[localIdx] = (acc.maxPenRaw < 0.0) ? acc.maxPenRaw : -DEME_HUGE_FLOAT;
-        zeroAreaContactPoints[localIdx] = acc.maxPenCP;
+        // printf(
+        //     "voted normal: (%f, %f, %f), original normal: (%f, %f, %f), original pen: %f, dot: %f, projected pen: %f,
+        //     " "area: %f, projected area: %f\n", votedNormal.x, votedNormal.y, votedNormal.z, originalNormal.x,
+        //     originalNormal.y, originalNormal.z, originalPenetration, dotProduct, projectedPenetration, area,
+        //     projectedArea);
     }
 }
 
-void scatterFusedPatchAccumulators(const FusedPatchAccum* accumulators,
-                                   const contactPairs_t* uniqueKeys,
-                                   double* totalProjectedAreas,
-                                   double* maxProjectedPenetrations,
-                                   double3* votedContactPoints,
-                                   float3* votedNormals,
-                                   float3* zeroAreaNormals,
-                                   double* zeroAreaPenetrations,
-                                   double3* zeroAreaContactPoints,
-                                   contactPairs_t startOffsetPatch,
-                                   contactPairs_t count,
-                                   cudaStream_t& this_stream) {
+void computeWeightedUsefulPenetration(DEMDataDT* granData,
+                                      float3* votedNormals,
+                                      contactPairs_t* keys,
+                                      double* areas,
+                                      double* projectedPenetrations,
+                                      double* projectedAreas,
+                                      contactPairs_t startOffsetPrimitive,
+                                      contactPairs_t startOffsetPatch,
+                                      contactPairs_t count,
+                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
-        scatterFusedPatchAccumulators_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            accumulators, uniqueKeys, totalProjectedAreas, maxProjectedPenetrations, votedContactPoints, votedNormals,
-            zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, startOffsetPatch, count);
+        computeWeightedUsefulPenetration_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
+            granData, votedNormals, keys, areas, projectedPenetrations, projectedAreas, startOffsetPrimitive,
+            startOffsetPatch, count);
+
     }
 }
 
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 9748f4e8..87bac353 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -172,114 +172,56 @@ void getContactForcesConcerningOwners(float3* d_points,
 // Patch-based voting wrappers for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
 
-// Prepares weighted normals (normal * area) for voting
+// Prepares weighted normals (normal * area), areas, and keys from geomToPatchMap for voting
 void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      float3* weightedNormals,
+                                     double* areas,
+                                     contactPairs_t* keys,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream);
 
-// Normalize voted normals using unique patch keys and scatter to the local patch array
-void normalizeAndScatterVotedNormalsFromUniqueKeys(float3* votedWeightedNormals,
-                                                   contactPairs_t* uniqueKeys,
-                                                   float3* output,
+// Normalizes each voted (area-weighted, summed) normal to unit length and writes it to output
+// If the summed normal has zero length (e.g. total area is 0), output is (0,0,0) indicating no contact
+void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
+                                     float3* output,
+                                     contactPairs_t count,
+                                     cudaStream_t& this_stream);
+
+// Computes projected penetration and area for each primitive contact
+// Both the penetration and area are projected onto the voted normal
+// Negative projected values (primitive normal opposing the voted normal) are clamped to 0
+void computeWeightedUsefulPenetration(DEMDataDT* granData,
+                                      float3* votedNormals,
+                                      contactPairs_t* keys,
+                                      double* areas,
+                                      double* projectedPenetrations,
+                                      double* projectedAreas,
+                                      contactPairs_t startOffsetPrimitive,
+                                      contactPairs_t startOffsetPatch,
+                                      contactPairs_t count,
+                                      cudaStream_t& this_stream);
+
+// Extracts primitive penetrations from contactPointGeometryA for max-reduce operation
+void extractPrimitivePenetrations(DEMDataDT* granData,
+                                  double* penetrations,
+                                  contactPairs_t startOffset,
+                                  contactPairs_t count,
+                                  cudaStream_t& this_stream);
+
+// Finds the primitive with max penetration for zero-area patches and extracts its normal, penetration, and contact
+// point
+void findMaxPenetrationPrimitiveForZeroAreaPatches(DEMDataDT* granData,
+                                                   double* maxPenetrations,
+                                                   float3* zeroAreaNormals,
+                                                   double* zeroAreaPenetrations,
+                                                   double3* zeroAreaContactPoints,
+                                                   contactPairs_t* keys,
+                                                   contactPairs_t startOffsetPrimitive,
                                                    contactPairs_t startOffsetPatch,
-                                                   contactPairs_t count,
+                                                   contactPairs_t countPrimitive,
                                                    cudaStream_t& this_stream);
 
-// Fused accumulator carrying area-weighted normals, projected metrics, and max-penetration data.
-struct FusedPatchAccum {
-    double sumProjArea;      // sum of projected areas (>=0)
-    double maxProjPen;       // max projected penetration (>=0)
-    double sumWeight;        // sum of projectedPenetration*projectedArea (>=0)
-    double3 sumWeightedCP;   // weighted contact point accumulator
-    float3 sumWeightedNormal;  // area-weighted normal (normal*area), used for voted normal
-    double maxPenRaw;        // raw penetration (can be negative)
-    float3 maxPenNormal;     // normal associated with maxPenRaw
-    double3 maxPenCP;        // contact point associated with maxPenRaw
-
-    __host__ __device__ __forceinline__ FusedPatchAccum operator+(const FusedPatchAccum& other) const {
-        FusedPatchAccum out;
-        out.sumProjArea = sumProjArea + other.sumProjArea;
-        out.maxProjPen = (maxProjPen > other.maxProjPen) ? maxProjPen : other.maxProjPen;
-        out.sumWeight = sumWeight + other.sumWeight;
-        out.sumWeightedCP =
-            make_double3(sumWeightedCP.x + other.sumWeightedCP.x, sumWeightedCP.y + other.sumWeightedCP.y,
-                         sumWeightedCP.z + other.sumWeightedCP.z);
-        out.sumWeightedNormal =
-            make_float3(sumWeightedNormal.x + other.sumWeightedNormal.x, sumWeightedNormal.y + other.sumWeightedNormal.y,
-                        sumWeightedNormal.z + other.sumWeightedNormal.z);
-
-        // Max-negative preference (equivalent to CubOpMaxNegative): prefer negatives closest to zero; otherwise most
-        // negative positive (smallest positive)
-        double a = maxPenRaw;
-        double b = other.maxPenRaw;
-        bool pick_other = false;
-        if (a < 0 && b < 0) {
-            pick_other = (b > a);  // closer to zero negative
-        } else if (a < 0) {
-            pick_other = false;
-        } else if (b < 0) {
-            pick_other = true;
-        } else {
-            pick_other = (b < a);  // both non-negative: pick smaller one
-        }
-        if (pick_other) {
-            out.maxPenRaw = b;
-            out.maxPenNormal = other.maxPenNormal;
-            out.maxPenCP = other.maxPenCP;
-        } else {
-            out.maxPenRaw = a;
-            out.maxPenNormal = maxPenNormal;
-            out.maxPenCP = maxPenCP;
-        }
-        return out;
-    }
-};
-
-// Compute fused per-primitive accumulators (projected metrics + max-penetration + area-weighted normal)
-void computeFusedPatchContactAccumulators(DEMDataDT* granData,
-                                          float3* votedNormals,
-                                          const contactPairs_t* keys,
-                                          FusedPatchAccum* accumulators,
-                                          contactPairs_t startOffsetPrimitive,
-                                          contactPairs_t startOffsetPatch,
-                                          contactPairs_t count,
-                                          cudaStream_t& this_stream);
-
-// Scatter fused accumulators to patch-local arrays expected by finalizePatchResults.
-void scatterFusedPatchAccumulators(const FusedPatchAccum* accumulators,
-                                   const contactPairs_t* uniqueKeys,
-                                   double* totalProjectedAreas,
-                                   double* maxProjectedPenetrations,
-                                   double3* votedContactPoints,
-                                   float3* votedNormals,
-                                   float3* zeroAreaNormals,
-                                   double* zeroAreaPenetrations,
-                                   double3* zeroAreaContactPoints,
-                                   contactPairs_t startOffsetPatch,
-                                   contactPairs_t count,
-                                   cudaStream_t& this_stream);
-
-struct PatchContactAccum {
-    double sumProjArea;
-    double maxProjPen;
-    double sumWeight;
-    double3 sumWeightedCP;
-
-    __host__ __device__ __forceinline__ PatchContactAccum operator+(const PatchContactAccum& other) const {
-        PatchContactAccum out;
-        out.sumProjArea = sumProjArea + other.sumProjArea;
-        out.maxProjPen = (maxProjPen > other.maxProjPen) ? maxProjPen : other.maxProjPen;
-        out.sumWeight = sumWeight + other.sumWeight;
-        out.sumWeightedCP =
-            make_double3(sumWeightedCP.x + other.sumWeightedCP.x, sumWeightedCP.y + other.sumWeightedCP.y,
-                         sumWeightedCP.z + other.sumWeightedCP.z);
-        return out;
-    }
-};
-
-// Computes projected penetration/area and weighted contact point accumulators per primitive
 // Checks if any primitive in each patch satisfies SAT (for tri-tri contacts)
 // Outputs a flag per patch: 1 if at least one SAT-satisfying primitive exists, 0 otherwise
 void checkPatchHasSATSatisfyingPrimitive(DEMDataDT* granData,
diff --git a/src/kernel/DEMCalcForceKernels_Primitive.cu b/src/kernel/DEMCalcForceKernels_Primitive.cu
index ce0e8851..a70286c6 100644
--- a/src/kernel/DEMCalcForceKernels_Primitive.cu
+++ b/src/kernel/DEMCalcForceKernels_Primitive.cu
@@ -37,9 +37,6 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
     float AOwnerMass, ARadius, BOwnerMass, BRadius;
     float4 AOriQ, BOriQ;
     deme::materialsOffset_t bodyAMatType, bodyBMatType;
-    // Cache analytic entity info when B is analytical (used for on-the-fly area calc)
-    deme::objType_t analyticalType = deme::ANAL_OBJ_TYPE_PLANE;
-    float analyticalSize1 = 0.f;
     // The user-specified extra margin size (how much we should be lenient in determining `in-contact')
     float extraMarginSize = 0.;
     // Triangle A's three points are defined outside, as may be reused in B's acquisition and penetration calc.
@@ -278,8 +275,6 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
         // For analytical entity, its patch ID is just its own component ID (but myPatchID is hardly used in this
         // analytical case)
         deme::bodyID_t myPatchID = analyticalID;
-        analyticalType = objType[analyticalID];
-        analyticalSize1 = objSize1[analyticalID];
         // If B is analytical entity, its owner, relative location, material info is jitified.
         bodyBMatType = objMaterial[analyticalID];
         BOwnerMass = objMass[analyticalID];
diff --git a/src/kernel/DEMContactKernels_SphTri_TriTri.cu b/src/kernel/DEMContactKernels_SphTri_TriTri.cu
index aeca48ad..104cc6f8 100644
--- a/src/kernel/DEMContactKernels_SphTri_TriTri.cu
+++ b/src/kernel/DEMContactKernels_SphTri_TriTri.cu
@@ -114,97 +114,6 @@ inline __device__ void fillSharedMemSpheres(deme::DEMSimParams* simParams,
     radii[myThreadID] = myRadius;
 }
 
-// Combined AABB overlap check and canonical bin assignment for tri-tri contacts.
-// This function does TWO things in one pass to avoid redundant AABB computation:
-// 1. Checks if the two prisms' AABBs overlap (early rejection if not)
-// 2. Determines if this bin is the canonical bin for this triangle pair
-//
-// Returns: true if AABBs overlap AND this is the canonical bin to process this pair
-//          false otherwise (either no overlap or should be processed in another bin)
-//
-// CANONICAL BIN ASSIGNMENT:
-// PROBLEM: Two triangles can be in many bins simultaneously. If we count this pair in
-// every bin where both are present, we get massive duplication.
-//
-// SOLUTION: For each unique triangle pair, assign it to EXACTLY ONE bin using a deterministic
-// rule that can be computed locally (without knowing all bins both triangles touch).
-//
-// APPROACH: Compute the AABB intersection of both prisms. The MINIMUM bin ID that touches
-// this intersection is the canonical bin. Since both triangles must touch this intersection
-// region, this bin is guaranteed to contain both triangles.
-inline __device__ bool shouldProcessTriTriInThisBin(deme::DEMSimParams* simParams,
-                                                    deme::binID_t currentBinID,
-                                                    deme::bodyID_t triID_A,
-                                                    deme::bodyID_t triID_B,
-                                                    const float3& triANode1,
-                                                    const float3& triANode2,
-                                                    const float3& triANode3,
-                                                    const float3& triBNode1,
-                                                    const float3& triBNode2,
-                                                    const float3& triBNode3,
-                                                    const float3& triANode1_other,
-                                                    const float3& triANode2_other,
-                                                    const float3& triANode3_other,
-                                                    const float3& triBNode1_other,
-                                                    const float3& triBNode2_other,
-                                                    const float3& triBNode3_other) {
-    (void)triID_A;
-    (void)triID_B;
-    
-    // Compute AABB of first prism (6 vertices)
-    float minX1 = fminf(fminf(fminf(triANode1.x, triANode2.x), fminf(triANode3.x, triBNode1.x)), fminf(triBNode2.x, triBNode3.x));
-    float maxX1 = fmaxf(fmaxf(fmaxf(triANode1.x, triANode2.x), fmaxf(triANode3.x, triBNode1.x)), fmaxf(triBNode2.x, triBNode3.x));
-    float minY1 = fminf(fminf(fminf(triANode1.y, triANode2.y), fminf(triANode3.y, triBNode1.y)), fminf(triBNode2.y, triBNode3.y));
-    float maxY1 = fmaxf(fmaxf(fmaxf(triANode1.y, triANode2.y), fmaxf(triANode3.y, triBNode1.y)), fmaxf(triBNode2.y, triBNode3.y));
-    float minZ1 = fminf(fminf(fminf(triANode1.z, triANode2.z), fminf(triANode3.z, triBNode1.z)), fminf(triBNode2.z, triBNode3.z));
-    float maxZ1 = fmaxf(fmaxf(fmaxf(triANode1.z, triANode2.z), fmaxf(triANode3.z, triBNode1.z)), fmaxf(triBNode2.z, triBNode3.z));
-    
-    // Compute AABB of second prism (6 vertices)
-    float minX2 = fminf(fminf(fminf(triANode1_other.x, triANode2_other.x), fminf(triANode3_other.x, triBNode1_other.x)), fminf(triBNode2_other.x, triBNode3_other.x));
-    float maxX2 = fmaxf(fmaxf(fmaxf(triANode1_other.x, triANode2_other.x), fmaxf(triANode3_other.x, triBNode1_other.x)), fmaxf(triBNode2_other.x, triBNode3_other.x));
-    float minY2 = fminf(fminf(fminf(triANode1_other.y, triANode2_other.y), fminf(triANode3_other.y, triBNode1_other.y)), fminf(triBNode2_other.y, triBNode3_other.y));
-    float maxY2 = fmaxf(fmaxf(fmaxf(triANode1_other.y, triANode2_other.y), fmaxf(triANode3_other.y, triBNode1_other.y)), fmaxf(triBNode2_other.y, triBNode3_other.y));
-    float minZ2 = fminf(fminf(fminf(triANode1_other.z, triANode2_other.z), fminf(triANode3_other.z, triBNode1_other.z)), fminf(triBNode2_other.z, triBNode3_other.z));
-    float maxZ2 = fmaxf(fmaxf(fmaxf(triANode1_other.z, triANode2_other.z), fmaxf(triANode3_other.z, triBNode1_other.z)), fmaxf(triBNode2_other.z, triBNode3_other.z));
-    
-    // EARLY REJECTION: Check AABB overlap first (avoids expensive SAT if no overlap)
-    const float margin = 1e-6f;
-    if (minX1 > maxX2 + margin || maxX1 < minX2 - margin ||
-        minY1 > maxY2 + margin || maxY1 < minY2 - margin ||
-        minZ1 > maxZ2 + margin || maxZ1 < minZ2 - margin) {
-        return false;  // AABBs don't overlap, no contact possible
-    }
-    
-    // AABBs overlap - now check if this is the canonical bin for this pair
-    const float inv_binSize = (float)simParams->dyn.inv_binSize;
-    
-    // Compute AABB intersection minimum corner
-    float intMinX = fmaxf(minX1, minX2);
-    float intMinY = fmaxf(minY1, minY2);
-    float intMinZ = fmaxf(minZ1, minZ2);
-    
-    // Find the minimum bin ID that touches this intersection
-    // This is the bin containing the minimum corner of the intersection
-    int binIdxX = (int)floorf(intMinX * inv_binSize);
-    int binIdxY = (int)floorf(intMinY * inv_binSize);
-    int binIdxZ = (int)floorf(intMinZ * inv_binSize);
-    
-    // Clamp to valid range
-    binIdxX = (binIdxX >= 0) ? ((binIdxX < (int)simParams->nbX) ? binIdxX : (int)simParams->nbX - 1) : 0;
-    binIdxY = (binIdxY >= 0) ? ((binIdxY < (int)simParams->nbY) ? binIdxY : (int)simParams->nbY - 1) : 0;
-    binIdxZ = (binIdxZ >= 0) ? ((binIdxZ < (int)simParams->nbZ) ? binIdxZ : (int)simParams->nbZ - 1) : 0;
-    
-    deme::binID_t canonicalBin = binIDFrom3Indices<deme::binID_t>(
-        (deme::binID_t)binIdxX, (deme::binID_t)binIdxY, (deme::binID_t)binIdxZ,
-        simParams->nbX, simParams->nbY, simParams->nbZ);
-    
-    // Process only if current bin is the canonical bin for this pair
-    return (currentBinID == canonicalBin);
-}
-
-// Full prism-prism contact check using SAT (Separating Axis Theorem).
-// NOTE: AABB overlap check is already done in shouldProcessTriTriInThisBin(),
-// so we skip it here and go directly to the full SAT test.
 inline __device__ bool checkPrismPrismContact(deme::DEMSimParams* simParams,
                                               const float3& triANode1,
                                               const float3& triANode2,
@@ -218,10 +127,7 @@ inline __device__ bool checkPrismPrismContact(deme::DEMSimParams* simParams,
                                               const float3& triBNode1_other,
                                               const float3& triBNode2_other,
                                               const float3& triBNode3_other) {
-    (void)simParams;  // simParams not needed since AABB check moved to shouldProcessTriTriInThisBin
-    
-    // Calculate the contact point between 2 prisms using full SAT check
-    // AABB pre-check already done in shouldProcessTriTriInThisBin
+    // Calculate the contact point between 2 prisms, and return whether they are in contact
     bool in_contact =
         calc_prism_contact(triANode1, triANode2, triANode3, triBNode1, triBNode2, triBNode3, triANode1_other,
                            triANode2_other, triANode3_other, triBNode1_other, triBNode2_other, triBNode3_other);
@@ -272,7 +178,6 @@ __global__ void getNumberOfTriangleContactsEachBin(deme::DEMSimParams* simParams
             "run despite this, set allowance higher via SetMaxTriangleInBin before simulation starts.",
             blockIdx.x, nTriInBin, simParams->errOutBinTriNum);
     }
-    
     const deme::spheresBinTouches_t myThreadID = threadIdx.x;
     // But what is the index of the same binID in array activeBinIDs? Well, mapTriActBinToSphActBin comes to rescure.
     const deme::binID_t indForAcqSphInfo = mapTriActBinToSphActBin[blockIdx.x];
@@ -418,20 +323,25 @@ __global__ void getNumberOfTriangleContactsEachBin(deme::DEMSimParams* simParams
                     continue;
                 }
 
-                // Use canonical bin assignment to avoid duplicate tri-tri contacts across bins.
-                // Pass full prism (both triA and triB faces) for correct AABB computation.
-                if (!shouldProcessTriTriInThisBin(simParams, binID, triIDs[bodyA], triIDs[bodyB],
-                                                   triANode1[bodyA], triANode2[bodyA], triANode3[bodyA],
-                                                   triBNode1[bodyA], triBNode2[bodyA], triBNode3[bodyA],
-                                                   triANode1[bodyB], triANode2[bodyB], triANode3[bodyB],
-                                                   triBNode1[bodyB], triBNode2[bodyB], triBNode3[bodyB]))
-                    continue;
-
+                // Tri--tri contact does not take into account bins, as duplicates will be removed in the end
                 bool in_contact = checkPrismPrismContact(
                     simParams, triANode1[bodyA], triANode2[bodyA], triANode3[bodyA], triBNode1[bodyA], triBNode2[bodyA],
                     triBNode3[bodyA], triANode1[bodyB], triANode2[bodyB], triANode3[bodyB], triBNode1[bodyB],
                     triBNode2[bodyB], triBNode3[bodyB]);
 
+                /*
+                if (in_contact && (contactPntBin != binID)) {
+                    unsigned int ZZ = binID/(simParams->nbX*simParams->nbY);
+                    unsigned int YY = binID%(simParams->nbX*simParams->nbY)/simParams->nbX;
+                    unsigned int XX = binID%(simParams->nbX*simParams->nbY)%simParams->nbX;
+                    double binLocX = (XX + 0.5) * simParams->binSize;
+                    double binLocY = (YY + 0.5) * simParams->binSize;
+                    double binLocZ = (ZZ + 0.5) * simParams->binSize;
+                    printf("binLoc: %f, %f, %f\n", binLocX, binLocY, binLocZ);
+                    printf("triANode1A: %f, %f, %f\n", triANode1[bodyA].x, triANode1[bodyA].y, triANode1[bodyA].z);
+                }
+                */
+
                 if (in_contact) {
                     atomicAdd(&blockTriTriPairCnt, 1);
                 }
@@ -468,15 +378,7 @@ __global__ void getNumberOfTriangleContactsEachBin(deme::DEMSimParams* simParams
                         continue;
                     }
 
-                    // Use canonical bin assignment to avoid duplicate tri-tri contacts across bins.
-                    // Pass full prism (both triA and triB faces) for correct AABB computation.
-                    if (!shouldProcessTriTriInThisBin(simParams, binID, triIDs[myThreadID], cur_bodyID,
-                                                       triANode1[myThreadID], triANode2[myThreadID], triANode3[myThreadID],
-                                                       triBNode1[myThreadID], triBNode2[myThreadID], triBNode3[myThreadID],
-                                                       cur_triANode1, cur_triANode2, cur_triANode3,
-                                                       cur_triBNode1, cur_triBNode2, cur_triBNode3))
-                        continue;
-
+                    // Tri--tri contact does not take into account bins, as duplicates will be removed in the end
                     bool in_contact = checkPrismPrismContact(
                         simParams, triANode1[myThreadID], triANode2[myThreadID], triANode3[myThreadID],
                         triBNode1[myThreadID], triBNode2[myThreadID], triBNode3[myThreadID], cur_triANode1,
@@ -696,15 +598,7 @@ __global__ void populateTriangleContactsEachBin(deme::DEMSimParams* simParams,
                     continue;
                 }
 
-                // Use canonical bin assignment to avoid duplicate tri-tri contacts across bins.
-                // Pass full prism (both triA and triB faces) for correct AABB computation.
-                if (!shouldProcessTriTriInThisBin(simParams, binID, triIDs[bodyA], triIDs[bodyB],
-                                                   triANode1[bodyA], triANode2[bodyA], triANode3[bodyA],
-                                                   triBNode1[bodyA], triBNode2[bodyA], triBNode3[bodyA],
-                                                   triANode1[bodyB], triANode2[bodyB], triANode3[bodyB],
-                                                   triBNode1[bodyB], triBNode2[bodyB], triBNode3[bodyB]))
-                    continue;
-
+                // Tri--tri contact does not take into account bins, as duplicates will be removed in the end
                 bool in_contact = checkPrismPrismContact(
                     simParams, triANode1[bodyA], triANode2[bodyA], triANode3[bodyA], triBNode1[bodyA], triBNode2[bodyA],
                     triBNode3[bodyA], triANode1[bodyB], triANode2[bodyB], triANode3[bodyB], triBNode1[bodyB],
@@ -712,7 +606,8 @@ __global__ void populateTriangleContactsEachBin(deme::DEMSimParams* simParams,
 
                 if (in_contact) {
                     deme::contactPairs_t inBlockOffset = mmReportOffset + atomicAdd(&blockTriTriPairCnt, 1);
-                    // Respect the budget-limited offset range from the scaled counts
+                    // The chance of offset going out-of-bound is very low, lower than sph--bin CD step, but I put it
+                    // here anyway
                     if (inBlockOffset < mmReportOffset_end) {
                         // ----------------------------------------------------------------------------
                         // IMPORTANT NOTE: Here, we don't need to adjust A and B ids to ensure A < B, and it's
@@ -773,15 +668,7 @@ __global__ void populateTriangleContactsEachBin(deme::DEMSimParams* simParams,
                         continue;
                     }
 
-                    // Use canonical bin assignment to avoid duplicate tri-tri contacts across bins.
-                    // Pass full prism (both triA and triB faces) for correct AABB computation.
-                    if (!shouldProcessTriTriInThisBin(simParams, binID, triIDs[myThreadID], cur_bodyID,
-                                                       triANode1[myThreadID], triANode2[myThreadID], triANode3[myThreadID],
-                                                       triBNode1[myThreadID], triBNode2[myThreadID], triBNode3[myThreadID],
-                                                       cur_triANode1, cur_triANode2, cur_triANode3,
-                                                       cur_triBNode1, cur_triBNode2, cur_triBNode3))
-                        continue;
-
+                    // Tri--tri contact does not take into account bins, as duplicates will be removed in the end
                     bool in_contact = checkPrismPrismContact(
                         simParams, triANode1[myThreadID], triANode2[myThreadID], triANode3[myThreadID],
                         triBNode1[myThreadID], triBNode2[myThreadID], triBNode3[myThreadID], cur_triANode1,
@@ -789,7 +676,8 @@ __global__ void populateTriangleContactsEachBin(deme::DEMSimParams* simParams,
 
                     if (in_contact) {
                         deme::contactPairs_t inBlockOffset = mmReportOffset + atomicAdd(&blockTriTriPairCnt, 1);
-                        // Respect the budget-limited offset range from the scaled counts
+                        // The chance of offset going out-of-bound is very low, lower than sph--bin CD step, but I put
+                        // it here anyway
                         if (inBlockOffset < mmReportOffset_end) {
                             deme::bodyID_t triA_ID, triB_ID;
                             if (triIDs[myThreadID] <= cur_bodyID) {

From da99f8eff1606fff5817fafbcd22f72a45c0acf0 Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Mon, 19 Jan 2026 10:30:28 +0100
Subject: [PATCH 05/17] regain some kT and dT performance - kT improved with
 an additional precalc kernel to avoid calculating some data twice - dT improved
 by simplifying the force calculation and adding some earlier exits

---
 src/DEM/dT.cpp                                | 175 ++--
 src/DEM/kT.cpp                                |   1 +
 src/algorithms/DEMContactDetection.cu         |  58 +-
 src/algorithms/DEMCubInstantiations.cu        |  10 +-
 src/algorithms/DEMDynamicMisc.cu              | 185 ++++-
 src/algorithms/DEMStaticDeviceSubroutines.h   |  69 +-
 src/kernel/DEMBinTriangleKernels.cu           | 757 ++++++++++--------
 src/kernel/DEMCalcForceKernels_Primitive.cu   |  21 +-
 .../DEMCollisionKernels_SphTri_TriTri.cuh     |  34 +-
 src/kernel/DEMKinematicMisc.cu                |  15 +-
 10 files changed, 814 insertions(+), 511 deletions(-)

diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 2eb50295..6f2a1622 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2649,127 +2649,77 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
             // This reduce-by-key operation reduces primitive-recorded force pairs into patch/convex part-based
             // force pairs. All elements that share the same geomToPatchMap value vote together.
             if (countPrimitive > 0) {
-                // Allocate temporary arrays for the voting process
-                float3* weightedNormals =
-                    (float3*)solverScratchSpace.allocateTempVector("weightedNormals", countPrimitive * sizeof(float3));
-                double* areas =
-                    (double*)solverScratchSpace.allocateTempVector("areas", countPrimitive * sizeof(double));
-                // Keys extracted from geomToPatchMap - these map primitives to patch pairs
-                contactPairs_t* keys = (contactPairs_t*)solverScratchSpace.allocateTempVector(
-                    "votingKeys", countPrimitive * sizeof(contactPairs_t));
+                // Keys are already available on device: geomToPatchMap maps each primitive contact to its patch pair.
+                // This avoids materializing an extra temporary key buffer.
+                contactPairs_t* keys = granData->geomToPatchMap + startOffsetPrimitive;
 
                 // Allocate arrays for reduce-by-key results (uniqueKeys uses contactPairs_t, not patchIDPair_t)
                 contactPairs_t* uniqueKeys = (contactPairs_t*)solverScratchSpace.allocateTempVector(
                     "uniqueKeys", countPrimitive * sizeof(contactPairs_t));
-                float3* votedWeightedNormals = (float3*)solverScratchSpace.allocateTempVector(
-                    "votedWeightedNormals", countPrimitive * sizeof(float3));
                 solverScratchSpace.allocateDualStruct("numUniqueKeys");
                 size_t* numUniqueKeys = solverScratchSpace.getDualStructDevice("numUniqueKeys");
 
-                // Step 1: Prepare weighted normals, areas, and keys
-                // The kernel extracts keys from geomToPatchMap, computes weighted normals, and stores areas
-                prepareWeightedNormalsForVoting(&granData, weightedNormals, areas, keys, startOffsetPrimitive,
-                                                countPrimitive, streamInfo.stream);
+                // Step 1: Prepare weighted normals for voting.
+                // Note: the validated legacy semantics uses area/penetration weighting.
+                float3* weightedNormals =
+                    (float3*)solverScratchSpace.allocateTempVector("weightedNormals", countPrimitive * sizeof(float3));
+                prepareWeightedNormalsForVoting(&granData, weightedNormals, startOffsetPrimitive, countPrimitive,
+                                                streamInfo.stream);
 
                 // Step 2: Reduce-by-key for weighted normals (sum)
-                // The keys are geomToPatchMap values (contactPairs_t), which group primitives by patch pair
+                // Output is sized by countPatch, the expected number of unique keys (patch pairs); this saves scratch
+                // memory but assumes the count is exact, which only the debug-only check below verifies.
+                float3* votedWeightedNormals = (float3*)solverScratchSpace.allocateTempVector(
+                    "votedWeightedNormals", countPatch * sizeof(float3));
                 cubSumReduceByKey<contactPairs_t, float3>(keys, uniqueKeys, weightedNormals, votedWeightedNormals,
                                                           numUniqueKeys, countPrimitive, streamInfo.stream,
                                                           solverScratchSpace);
                 solverScratchSpace.finishUsingTempVector("weightedNormals");
-                // For extra safety
-                solverScratchSpace.syncDualStructDeviceToHost("numUniqueKeys");
-                size_t numUniqueKeysHost = *(solverScratchSpace.getDualStructHost("numUniqueKeys"));
-                // std::cout << "Keys:" << std::endl;
-                // displayDeviceArray<contactPairs_t>(keys, countPrimitive);
-                // std::cout << "Unique Keys:" << std::endl;
-                // displayDeviceArray<contactPairs_t>(uniqueKeys, numUniqueKeysHost);
-                if (numUniqueKeysHost != countPatch) {
-                    DEME_ERROR(
-                        "Patch-based contact voting produced %zu unique patch pairs, but expected %zu pairs for "
-                        "contact type %d!",
-                        numUniqueKeysHost, countPatch, contact_type);
-                }
 
-                // Step 3: Normalize the voted normals by total area and scatter back to a temp array.
+                // Optional debug-only safety check (kept out of the release path to avoid a device-to-host sync).
+                DEME_DEBUG_EXEC({
+                    solverScratchSpace.syncDualStructDeviceToHost("numUniqueKeys");
+                    size_t numUniqueKeysHost = *(solverScratchSpace.getDualStructHost("numUniqueKeys"));
+                    if (numUniqueKeysHost != countPatch) {
+                        DEME_ERROR(
+                            "Patch-based contact voting produced %zu unique patch pairs, but expected %zu pairs for "
+                            "contact type %d!",
+                            numUniqueKeysHost, countPatch, contact_type);
+                    }
+                });
+
+                // Step 3: Normalize voted normals.
                 float3* votedNormals =
                     (float3*)solverScratchSpace.allocateTempVector("votedNormals", countPatch * sizeof(float3));
                 normalizeAndScatterVotedNormals(votedWeightedNormals, votedNormals, countPatch, streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("votedWeightedNormals");
-                // displayDeviceFloat3(votedNormals, countPatch);
-
-                // Step 4: Compute projected penetration and area for each primitive contact
-                // Both the penetration and area are projected onto the voted normal
-                // If the projected penetration becomes negative, both are set to 0
-                // Reuse keys array for the reduce-by-key operation
-                double* projectedPenetrations = (double*)solverScratchSpace.allocateTempVector(
-                    "projectedPenetrations", countPrimitive * sizeof(double));
-                double* projectedAreas =
-                    (double*)solverScratchSpace.allocateTempVector("projectedAreas", countPrimitive * sizeof(double));
-                computeWeightedUsefulPenetration(&granData, votedNormals, keys, areas, projectedPenetrations,
-                                                 projectedAreas, startOffsetPrimitive, startOffsetPatch, countPrimitive,
-                                                 streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("areas");
-
-                // Step 5: Reduce-by-key to get total projected area per patch pair (sum)
-                double* totalProjectedAreas =
-                    (double*)solverScratchSpace.allocateTempVector("totalProjectedAreas", countPatch * sizeof(double));
-                cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, projectedAreas, totalProjectedAreas,
-                                                          numUniqueKeys, countPrimitive, streamInfo.stream,
-                                                          solverScratchSpace);
 
-                // Step 6: Reduce-by-key to get max projected penetration per patch pair (max).
-                // This result, maxProjectedPenetrations, is the max of projected penetration, aka the max pen in the
-                // physical overlap case, and it's not the same as maxPenetrations in step 9 which is a fallback
-                // primitive-derived penetration.
-                double* maxProjectedPenetrations = (double*)solverScratchSpace.allocateTempVector(
-                    "maxProjectedPenetrations", countPatch * sizeof(double));
-                cubMaxReduceByKey<contactPairs_t, double>(keys, uniqueKeys, projectedPenetrations,
-                                                          maxProjectedPenetrations, numUniqueKeys, countPrimitive,
-                                                          streamInfo.stream, solverScratchSpace);
-
-                // Step 7: Compute weighted contact points for each primitive (normal case)
-                // The weight is: projected_penetration * projected_area
-                // Reuse keys, uniqueKeys, and numUniqueKeys that are still allocated
-                double3* weightedContactPoints = (double3*)solverScratchSpace.allocateTempVector(
-                    "weightedContactPoints", countPrimitive * sizeof(double3));
-                double* contactWeights =
-                    (double*)solverScratchSpace.allocateTempVector("contactWeights", countPrimitive * sizeof(double));
-                computeWeightedContactPoints(&granData, weightedContactPoints, contactWeights, projectedPenetrations,
-                                             projectedAreas, startOffsetPrimitive, countPrimitive, streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("projectedPenetrations");
-                solverScratchSpace.finishUsingTempVector("projectedAreas");
-                // Reduce-by-key to get total weighted contact points per patch pair
-                double3* totalWeightedContactPoints = (double3*)solverScratchSpace.allocateTempVector(
-                    "totalWeightedContactPoints", countPatch * sizeof(double3));
-                double* totalContactWeights =
-                    (double*)solverScratchSpace.allocateTempVector("totalContactWeights", countPatch * sizeof(double));
-                cubSumReduceByKey<contactPairs_t, double3>(keys, uniqueKeys, weightedContactPoints,
-                                                           totalWeightedContactPoints, numUniqueKeys, countPrimitive,
-                                                           streamInfo.stream, solverScratchSpace);
-                cubSumReduceByKey<contactPairs_t, double>(keys, uniqueKeys, contactWeights, totalContactWeights,
-                                                          numUniqueKeys, countPrimitive, streamInfo.stream,
-                                                          solverScratchSpace);
-                solverScratchSpace.finishUsingTempVector("weightedContactPoints");
-                solverScratchSpace.finishUsingTempVector("contactWeights");
-                // Compute voted contact points per patch pair by dividing by total weight
-                double3* votedContactPoints =
-                    (double3*)solverScratchSpace.allocateTempVector("votedContactPoints", countPatch * sizeof(double3));
-                computeFinalContactPointsPerPatch(totalWeightedContactPoints, totalContactWeights, votedContactPoints,
-                                                  countPatch, streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("totalWeightedContactPoints");
-                solverScratchSpace.finishUsingTempVector("totalContactWeights");
-
-                // Step 8: Handle zero-area patches (all primitive areas are 0)
+                // Step 4: Compute per-primitive patch accumulators (projected area, max projected penetration,
+                // and weighted contact-point sums) in one pass.
+                PatchContactAccum* primitivePatchAccumulators = (PatchContactAccum*)solverScratchSpace.allocateTempVector(
+                    "primitivePatchAccumulators", countPrimitive * sizeof(PatchContactAccum));
+                computePatchContactAccumulators(&granData, votedNormals, keys, primitivePatchAccumulators,
+                                                startOffsetPrimitive, startOffsetPatch, countPrimitive,
+                                                streamInfo.stream);
+
+                // Step 5: Reduce-by-key accumulators to patch level (sum + max).
+                PatchContactAccum* patchContactAccumulators = (PatchContactAccum*)solverScratchSpace.allocateTempVector(
+                    "patchContactAccumulators", countPatch * sizeof(PatchContactAccum));
+                cubSumReduceByKey<contactPairs_t, PatchContactAccum>(
+                    keys, uniqueKeys, primitivePatchAccumulators, patchContactAccumulators, numUniqueKeys, countPrimitive,
+                    streamInfo.stream, solverScratchSpace);
+                solverScratchSpace.finishUsingTempVector("primitivePatchAccumulators");
+
+                // Step 6: Handle zero-area patches (all primitive areas are 0)
                 // For these patches, we need to find the max penetration primitive and use its normal/penetration
 
-                // 8a: Extract primitive penetrations for max-reduce
+                // 6a: Extract primitive penetrations for max-reduce
                 double* primitivePenetrations = (double*)solverScratchSpace.allocateTempVector(
                     "primitivePenetrations", countPrimitive * sizeof(double));
                 extractPrimitivePenetrations(&granData, primitivePenetrations, startOffsetPrimitive, countPrimitive,
                                              streamInfo.stream);
 
-                // 8b: Max-negative-reduce-by-key to get max negative penetration per patch
+                // 6b: Max-negative-reduce-by-key to get max negative penetration per patch
                 // This finds the largest negative value (smallest absolute value among negatives)
                 // Positive values are treated as very negative to indicate invalid/non-physical state
                 double* maxPenetrations =
@@ -2779,7 +2729,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                                   streamInfo.stream, solverScratchSpace);
                 solverScratchSpace.finishUsingTempVector("primitivePenetrations");
 
-                // 8c: Find max-penetration primitives for zero-area patches and extract their normals, penetrations,
+                // 6c: Find max-penetration primitives for zero-area patches and extract their normals, penetrations,
                 // and contact points
                 float3* zeroAreaNormals =
                     (float3*)solverScratchSpace.allocateTempVector("zeroAreaNormals", countPatch * sizeof(float3));
@@ -2792,8 +2742,8 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     startOffsetPrimitive, startOffsetPatch, countPrimitive, streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("maxPenetrations");
 
-                // Step 8d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
-                // If no primitive satisfies SAT, the patch contact is non-physical and should use Step 9 fallback
+                // Step 6d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
+                // If no primitive satisfies SAT, the patch contact is non-physical and should use Step 7 fallback
                 notStupidBool_t* patchHasSAT = nullptr;
                 if (contact_type == TRIANGLE_TRIANGLE_CONTACT) {
                     patchHasSAT = (notStupidBool_t*)solverScratchSpace.allocateTempVector(
@@ -2803,12 +2753,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                                                         streamInfo.stream);
                 }
 
-                // Clean up keys arrays now that we're done with reductions
-                solverScratchSpace.finishUsingTempVector("votingKeys");
-                solverScratchSpace.finishUsingTempVector("uniqueKeys");
-                solverScratchSpace.finishUsingDualStruct("numUniqueKeys");
-
-                // Step 9: Finalize patch results by combining voting with zero-area handling.
+                // Step 7: Finalize patch results by combining voting with zero-area handling.
                 // If patch-based projected area is 0 (or this patch pair consists of no SAT pair), meaning no physical
                 // contact, we use the fallback estimations (zeroArea*) of CP, penetration and areas.
                 double* finalAreas =
@@ -2822,18 +2767,24 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
 
                 double3* finalContactPoints =
                     (double3*)solverScratchSpace.allocateTempVector("finalContactPoints", countPatch * sizeof(double3));
-                finalizePatchResults(totalProjectedAreas, votedNormals, maxProjectedPenetrations, votedContactPoints,
-                                     zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, patchHasSAT,
-                                     finalAreas, finalNormals, finalPenetrations.data(), finalContactPoints, countPatch,
-                                     streamInfo.stream);
-                solverScratchSpace.finishUsingTempVector("totalProjectedAreas");
+                finalizePatchResultsFromAccumulators(patchContactAccumulators, votedNormals, zeroAreaNormals,
+                                                     zeroAreaPenetrations, zeroAreaContactPoints, patchHasSAT,
+                                                     finalAreas, finalNormals, finalPenetrations.data(),
+                                                     finalContactPoints, countPatch, streamInfo.stream);
+
+                // Clean up temporaries no longer needed past this point.
+                solverScratchSpace.finishUsingTempVector("patchContactAccumulators");
                 solverScratchSpace.finishUsingTempVector("votedNormals");
-                solverScratchSpace.finishUsingTempVector("maxProjectedPenetrations");
                 solverScratchSpace.finishUsingTempVector("zeroAreaNormals");
                 solverScratchSpace.finishUsingTempVector("zeroAreaPenetrations");
-                solverScratchSpace.finishUsingTempVector("votedContactPoints");
                 solverScratchSpace.finishUsingTempVector("zeroAreaContactPoints");
-                solverScratchSpace.finishUsingTempVector("patchHasSAT");
+                if (patchHasSAT != nullptr) {
+                    solverScratchSpace.finishUsingTempVector("patchHasSAT");
+                }
+
+                // Clean up CUB bookkeeping buffers.
+                solverScratchSpace.finishUsingTempVector("uniqueKeys");
+                solverScratchSpace.finishUsingDualStruct("numUniqueKeys");
 
                 // Now we have:
                 // - finalAreas: final contact area per patch pair (countPatch elements)
diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp
index 3c0b7aa1..ab8c824b 100644
--- a/src/DEM/kT.cpp
+++ b/src/DEM/kT.cpp
@@ -1271,6 +1271,7 @@ void DEMKinematicThread::prewarmKernels() {
         sphere_contact_kernels->kernel("getNumberOfSphereContactsEachBin").instantiate();
     }
     if (bin_triangle_kernels) {
+        bin_triangle_kernels->kernel("precomputeTriangleSandwichData").instantiate();
         bin_triangle_kernels->kernel("getNumberOfBinsEachTriangleTouches").instantiate();
         bin_triangle_kernels->kernel("populateBinTriangleTouchingPairs").instantiate();
     }
diff --git a/src/algorithms/DEMContactDetection.cu b/src/algorithms/DEMContactDetection.cu
index 0666c35b..5565e0ee 100644
--- a/src/algorithms/DEMContactDetection.cu
+++ b/src/algorithms/DEMContactDetection.cu
@@ -574,11 +574,42 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                 numAnalGeoTriTouches =
                     (objID_t*)scratchPad.allocateTempVector("numAnalGeoTriTouches", CD_temp_arr_bytes);
             }
+            
+            // Triangle prepass: compute world vertices/bounds/shift once, reuse in both sweeps.
+            CD_temp_arr_bytes = simParams->nTriGM * sizeof(float3);
+            float3* tri_vA1 = (float3*)scratchPad.allocateTempVector("tri_vA1", CD_temp_arr_bytes);
+            float3* tri_vB1 = (float3*)scratchPad.allocateTempVector("tri_vB1", CD_temp_arr_bytes);
+            float3* tri_vC1 = (float3*)scratchPad.allocateTempVector("tri_vC1", CD_temp_arr_bytes);
+            float3* tri_vA2 = (float3*)scratchPad.allocateTempVector("tri_vA2", CD_temp_arr_bytes);
+            float3* tri_vB2 = (float3*)scratchPad.allocateTempVector("tri_vB2", CD_temp_arr_bytes);
+            float3* tri_vC2 = (float3*)scratchPad.allocateTempVector("tri_vC2", CD_temp_arr_bytes);
+            float3* tri_shift = (float3*)scratchPad.allocateTempVector("tri_shift", CD_temp_arr_bytes);
+
+            CD_temp_arr_bytes = simParams->nTriGM * sizeof(int3);
+            int3* tri_L1 = (int3*)scratchPad.allocateTempVector("tri_L1", CD_temp_arr_bytes);
+            int3* tri_U1 = (int3*)scratchPad.allocateTempVector("tri_U1", CD_temp_arr_bytes);
+            int3* tri_L2 = (int3*)scratchPad.allocateTempVector("tri_L2", CD_temp_arr_bytes);
+            int3* tri_U2 = (int3*)scratchPad.allocateTempVector("tri_U2", CD_temp_arr_bytes);
+
+            CD_temp_arr_bytes = simParams->nTriGM * sizeof(uint8_t);
+            uint8_t* tri_ok1 = (uint8_t*)scratchPad.allocateTempVector("tri_ok1", CD_temp_arr_bytes);
+            uint8_t* tri_ok2 = (uint8_t*)scratchPad.allocateTempVector("tri_ok2", CD_temp_arr_bytes);
+
+            bin_triangle_kernels->kernel("precomputeTriangleSandwichData")
+                .instantiate()
+                .configure(dim3(blocks_needed_for_tri), dim3(DEME_NUM_TRIANGLE_PER_BLOCK), 0, this_stream)
+                .launch(&simParams, &granData,
+                        tri_vA1, tri_vB1, tri_vC1, tri_vA2, tri_vB2, tri_vC2,
+                        tri_shift, tri_L1, tri_U1, tri_L2, tri_U2, tri_ok1, tri_ok2,
+                        sandwichANode1, sandwichANode2, sandwichANode3,
+                        sandwichBNode1, sandwichBNode2, sandwichBNode3);
+
             bin_triangle_kernels->kernel("getNumberOfBinsEachTriangleTouches")
                 .instantiate()
                 .configure(dim3(blocks_needed_for_tri), dim3(DEME_NUM_TRIANGLE_PER_BLOCK), 0, this_stream)
-                .launch(&simParams, &granData, numBinsTriTouches, numAnalGeoTriTouches, sandwichANode1, sandwichANode2,
-                        sandwichANode3, sandwichBNode1, sandwichBNode2, sandwichBNode3,
+                .launch(&simParams, &granData, numBinsTriTouches, numAnalGeoTriTouches,
+                        tri_vA1, tri_vB1, tri_vC1, tri_vA2, tri_vB2, tri_vC2,
+                        tri_shift, tri_L1, tri_U1, tri_L2, tri_U2, tri_ok1, tri_ok2,
                         solverFlags.meshUniversalContact);
             // std::cout << "numBinsTriTouches: " << std::endl;
             // displayDeviceArray<binsTriangleTouches_t>(numBinsTriTouches, simParams->nTriGM);
@@ -651,9 +682,26 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
             bin_triangle_kernels->kernel("populateBinTriangleTouchingPairs")
                 .instantiate()
                 .configure(dim3(blocks_needed_for_tri), dim3(DEME_NUM_TRIANGLE_PER_BLOCK), 0, this_stream)
-                .launch(&simParams, &granData, numBinsTriTouchesScan, numAnalGeoTriTouchesScan, binIDsEachTriTouches,
-                        triIDsEachBinTouches, sandwichANode1, sandwichANode2, sandwichANode3, sandwichBNode1,
-                        sandwichBNode2, sandwichBNode3, idTriA, idGeoB, dType, solverFlags.meshUniversalContact);
+                .launch(&simParams, &granData, numBinsTriTouchesScan, numAnalGeoTriTouchesScan,
+                        binIDsEachTriTouches, triIDsEachBinTouches,
+                        tri_vA1, tri_vB1, tri_vC1, tri_vA2, tri_vB2, tri_vC2,
+                        tri_shift, tri_L1, tri_U1, tri_L2, tri_U2, tri_ok1, tri_ok2,
+                        idTriA, idGeoB, dType, solverFlags.meshUniversalContact);
+
+            scratchPad.finishUsingTempVector("tri_vA1");
+            scratchPad.finishUsingTempVector("tri_vB1");
+            scratchPad.finishUsingTempVector("tri_vC1");
+            scratchPad.finishUsingTempVector("tri_vA2");
+            scratchPad.finishUsingTempVector("tri_vB2");
+            scratchPad.finishUsingTempVector("tri_vC2");
+            scratchPad.finishUsingTempVector("tri_shift");
+            scratchPad.finishUsingTempVector("tri_L1");
+            scratchPad.finishUsingTempVector("tri_U1");
+            scratchPad.finishUsingTempVector("tri_L2");
+            scratchPad.finishUsingTempVector("tri_U2");
+            scratchPad.finishUsingTempVector("tri_ok1");
+            scratchPad.finishUsingTempVector("tri_ok2");
+
             // std::cout << "binIDsEachTriTouches: " << std::endl;
             // displayDeviceArray<binsTriangleTouches_t>(binIDsEachTriTouches, *pNumBinTriTouchPairs);
             // std::cout << "dType: " << std::endl;
diff --git a/src/algorithms/DEMCubInstantiations.cu b/src/algorithms/DEMCubInstantiations.cu
index 3c93680d..7e8d5c7f 100644
--- a/src/algorithms/DEMCubInstantiations.cu
+++ b/src/algorithms/DEMCubInstantiations.cu
@@ -97,7 +97,15 @@ template void cubSumReduceByKey<contactPairs_t, double>(contactPairs_t* d_keys_i
                                                         size_t n,
                                                         cudaStream_t& this_stream,
                                                         DEMSolverScratchData& scratchPad);
-
+// Patch contact accumulators (sum + max) with contactPairs_t keys
+template void cubSumReduceByKey<contactPairs_t, PatchContactAccum>(contactPairs_t* d_keys_in,
+                                                                   contactPairs_t* d_unique_out,
+                                                                   PatchContactAccum* d_vals_in,
+                                                                   PatchContactAccum* d_aggregates_out,
+                                                                   size_t* d_num_out,
+                                                                   size_t n,
+                                                                   cudaStream_t& this_stream,
+                                                                   DEMSolverScratchData& scratchPad);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Reduce::Max
diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index 43143cbe..1e55ca7d 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -119,53 +119,43 @@ void getContactForcesConcerningOwners(float3* d_points,
 // Patch-based voting kernels for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
 
-// Kernel to compute weighted normals (normal * area) for voting
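+// Kernel to compute weighted normals (normal * area / penetration) for voting.
-// Also prepares the area values for reduction and extracts the keys (geomToPatchMap values)
+// Only the weighted normals are written; no temporary areas/keys arrays are materialized
+// (keys can be read directly from granData->geomToPatchMap by the caller).
+
+// One thread handles the primitive contact at index startOffset + idx, for idx in [0, count).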
 __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
-                                                     float3* weightedNormals,
-                                                     double* areas,
-                                                     contactPairs_t* keys,
-                                                     contactPairs_t startOffset,
-                                                     contactPairs_t count) {
+                                                     float3* weightedNormals,
+                                                     contactPairs_t startOffset,
+                                                     contactPairs_t count) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
         contactPairs_t myContactID = startOffset + idx;
 
-        // Get the contact normal from contactForces
-        float3 normal = granData->contactForces[myContactID];
+        // Normal and area come from the primitive contact kernels; the area is narrowed to FP32 on purpose.
+        const float3 normal = granData->contactForces[myContactID];
+        const float3 areaStorage = granData->contactPointGeometryB[myContactID];
+        const float area = (float)float3StorageToDouble(areaStorage);
 
-        // Extract the area (double) from contactPointGeometryB (stored as float3)
-        float3 areaStorage = granData->contactPointGeometryB[myContactID];
-        double area = float3StorageToDouble(areaStorage);
-        float3 penStorage = granData->contactPointGeometryA[myContactID];
-        double penetration = float3StorageToDouble(penStorage);
+        // Penetration weights the vote; it is floored at DEME_TINY_FLOAT before dividing.
+        const float3 penStorage = granData->contactPointGeometryA[myContactID];
+        float penetration = (float)float3StorageToDouble(penStorage);
         penetration = (penetration > DEME_TINY_FLOAT) ? penetration : DEME_TINY_FLOAT;
-        double recipPen = 1.0 / penetration;
-
-        // Compute weighted normal (normal * area)
-        // Note that fake contacts do not affect as their area is 0
-        weightedNormals[idx] = make_float3((double)normal.x * area * recipPen, (double)normal.y * area * recipPen,
-                                           (double)normal.z * area * recipPen);
-
-        // Store area for reduction
-        areas[idx] = area;
+        const float weight = area / penetration;
 
-        // Extract key from geomToPatchMap
-        keys[idx] = granData->geomToPatchMap[myContactID];
+        // weight is FP32 and the result is stored as float3, so multiply in FP32 (no double promotion).
+        weightedNormals[idx] = make_float3(normal.x * weight, normal.y * weight, normal.z * weight);
     }
 }
 
 void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      float3* weightedNormals,
-                                     double* areas,
-                                     contactPairs_t* keys,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
         prepareWeightedNormalsForVoting_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            granData, weightedNormals, areas, keys, startOffset, count);
+            granData, weightedNormals, startOffset, count);
     }
 }
 
@@ -204,6 +194,147 @@ void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
     }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Fused patch aggregation kernels (projected area, penetration, contact point)
+////////////////////////////////////////////////////////////////////////////////
+
+// Per-primitive accumulator generation.
+//
+// This replaces the former pipeline:
+//   computeWeightedUsefulPenetration -> ReduceByKey(sum projArea)
+//   ReduceByKey(max projPen)
+//   computeWeightedContactPoints -> ReduceByKey(sum weightedCP) -> ReduceByKey(sum weight)
+//
+// It produces the same patch-level quantities, but materializes only one array
+// (PatchContactAccum) and performs a single ReduceByKey.
+__global__ void computePatchContactAccumulators_impl(DEMDataDT* granData,
+                                                     const float3* votedNormals,
+                                                     const contactPairs_t* keys,
+                                                     PatchContactAccum* accumulators,
+                                                     contactPairs_t startOffsetPrimitive,
+                                                     contactPairs_t startOffsetPatch,
+                                                     contactPairs_t count) {
+    // One thread per primitive contact; threads at or beyond `count` have nothing to do.
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= count) {
+        return;
+    }
+    const contactPairs_t contactID = startOffsetPrimitive + tid;
+
+    // keys[] gives the patch-pair index of this primitive; rebase it so it indexes votedNormals.
+    const contactPairs_t patchSlot = keys[tid] - startOffsetPatch;
+    const float3 patchNormal = votedNormals[patchSlot];
+    const float3 primNormal = granData->contactForces[contactID];
+
+    // Penetration depth: only positive values (overlap) contribute, anything else counts as 0.
+    const float3 penStorage = granData->contactPointGeometryA[contactID];
+    const double rawPen = float3StorageToDouble(penStorage);
+    const double pen = (rawPen > 0.0) ? rawPen : 0.0;
+
+    // Contact area of this primitive (fake contacts carry 0 area and thus contribute 0).
+    const float3 areaStorage = granData->contactPointGeometryB[contactID];
+    const double primArea = float3StorageToDouble(areaStorage);
+
+    // Projection factor: dot product of the primitive and voted normals, clamped at 0.
+    const float cosRaw = dot(primNormal, patchNormal);
+    const double cosClamped = (cosRaw > 0.f) ? (double)cosRaw : 0.0;
+
+    // Project both quantities onto the voted normal; their product weights the contact point.
+    const double penProj = pen * cosClamped;
+    const double areaProj = primArea * cosClamped;
+    const double w = penProj * areaProj;
+    // Contact point recorded for this primitive (read from contactTorque_convToForce).
+    const double3 cp = to_double3(granData->contactTorque_convToForce[contactID]);
+
+    // Assemble this primitive's contribution to its patch accumulator.
+    PatchContactAccum contribution;
+    contribution.sumProjArea = areaProj;
+    contribution.maxProjPen = penProj;
+    contribution.sumWeight = w;
+    contribution.sumWeightedCP = make_double3(cp.x * w, cp.y * w, cp.z * w);
+    accumulators[tid] = contribution;
+}
+
+void computePatchContactAccumulators(DEMDataDT* granData,
+                                     const float3* votedNormals,
+                                     const contactPairs_t* keys,
+                                     PatchContactAccum* accumulators,
+                                     contactPairs_t startOffsetPrimitive,
+                                     contactPairs_t startOffsetPatch,
+                                     contactPairs_t count,
+                                     cudaStream_t& this_stream) {
+    // Ceil-divide the primitives over fixed-size blocks; an empty range launches nothing.
+    const size_t numBlocks = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+    if (numBlocks == 0) { return; }
+    computePatchContactAccumulators_impl<<<numBlocks, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
+        granData, votedNormals, keys, accumulators, startOffsetPrimitive, startOffsetPatch, count);
+}
+
+// Finalization from patch accumulators (no intermediate per-patch arrays).
+__global__ void finalizePatchResultsFromAccumulators_impl(const PatchContactAccum* patchAccumulators,
+                                                          const float3* votedNormals,
+                                                          const float3* zeroAreaNormals,
+                                                          const double* zeroAreaPenetrations,
+                                                          const double3* zeroAreaContactPoints,
+                                                          const notStupidBool_t* patchHasSAT,
+                                                          double* finalAreas,
+                                                          float3* finalNormals,
+                                                          double* finalPenetrations,
+                                                          double3* finalContactPoints,
+                                                          contactPairs_t count) {
+    // One thread per patch entry; threads at or beyond `count` have nothing to do.
+    const contactPairs_t patch = blockIdx.x * blockDim.x + threadIdx.x;
+    if (patch >= count) {
+        return;
+    }
+    const PatchContactAccum acc = patchAccumulators[patch];
+
+    // A null patchHasSAT (non-triangle-triangle contacts) is treated as "SAT satisfied".
+    const notStupidBool_t satOK = (patchHasSAT != nullptr) ? patchHasSAT[patch] : 1;
+
+    // Fallback: no projected area, or no SAT-satisfying primitive -> copy the zeroArea* inputs.
+    if (!(acc.sumProjArea > 0.0) || !satOK) {
+        finalAreas[patch] = 0.0;
+        finalNormals[patch] = zeroAreaNormals[patch];
+        finalPenetrations[patch] = zeroAreaPenetrations[patch];
+        finalContactPoints[patch] = zeroAreaContactPoints[patch];
+        return;
+    }
+
+    // Voted path: area and penetration come straight from the reduced accumulator.
+    finalAreas[patch] = acc.sumProjArea;
+    finalNormals[patch] = votedNormals[patch];
+    finalPenetrations[patch] = acc.maxProjPen;
+
+    // Contact point is the weight-normalized sum; with zero total weight it is set to (0,0,0).
+    double3 cp = make_double3(0.0, 0.0, 0.0);
+    if (acc.sumWeight > 0.0) {
+        const double invW = 1.0 / acc.sumWeight;
+        cp = make_double3(acc.sumWeightedCP.x * invW, acc.sumWeightedCP.y * invW, acc.sumWeightedCP.z * invW);
+    }
+    finalContactPoints[patch] = cp;
+}
+
+void finalizePatchResultsFromAccumulators(const PatchContactAccum* patchAccumulators,
+                                          const float3* votedNormals,
+                                          const float3* zeroAreaNormals,
+                                          const double* zeroAreaPenetrations,
+                                          const double3* zeroAreaContactPoints,
+                                          const notStupidBool_t* patchHasSAT,
+                                          double* finalAreas,
+                                          float3* finalNormals,
+                                          double* finalPenetrations,
+                                          double3* finalContactPoints,
+                                          contactPairs_t count,
+                                          cudaStream_t& this_stream) {
+    // Ceil-divide the patches over fixed-size blocks; an empty range launches nothing.
+    const size_t numBlocks = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+    if (numBlocks == 0) { return; }
+    finalizePatchResultsFromAccumulators_impl<<<numBlocks, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
+        patchAccumulators, votedNormals, zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, patchHasSAT,
+        finalAreas, finalNormals, finalPenetrations, finalContactPoints, count);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Penetration depth computation kernels for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 87bac353..d81697b1 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -172,7 +172,11 @@ void getContactForcesConcerningOwners(float3* d_points,
 // Patch-based voting wrappers for mesh contact correction
 ////////////////////////////////////////////////////////////////////////////////
 
-// Prepares weighted normals (normal * area), areas, and keys from geomToPatchMap for voting
+// Prepares weighted normals (normal * area / penetration) for voting.
+// The magnitude of a weighted normal is its voting power. The subsequent normalization step only
+// needs the *direction*, so scaling all weights by one positive factor leaves the voted direction
+// unchanged. The current implementation follows the existing, validated semantics.
+// NOTE(review): DEMDynamicMisc.cu no longer defines this areas/keys signature -- confirm or remove.
 void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      float3* weightedNormals,
                                      double* areas,
@@ -181,6 +185,16 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream);
 
+// Optimized overload: prepares weighted normals only.
+//
+// This avoids materializing temporary areas/keys buffers. Keys can be sourced directly from
+// granData->geomToPatchMap + startOffsetPrimitive in the caller.
+void prepareWeightedNormalsForVoting(DEMDataDT* granData,
+                                     float3* weightedNormals,
+                                     contactPairs_t startOffset,
+                                     contactPairs_t count,
+                                     cudaStream_t& this_stream);
+
 // Normalizes voted normals by total area and scatters to output
 // If total area is 0, output is (0,0,0) indicating no contact
 void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
@@ -188,6 +202,59 @@ void normalizeAndScatterVotedNormals(float3* votedWeightedNormals,
                                      contactPairs_t count,
                                      cudaStream_t& this_stream);
 
+// Patch-level accumulator used to fuse multiple ReduceByKey passes.
+//
+// The reduction operator is component-wise associative (sum + max), therefore it can safely be used
+// with CUB ReduceByKey.
+struct PatchContactAccum {
+    double sumProjArea;     ///< Per patch: projected contact areas, combined by addition
+    double maxProjPen;      ///< Per patch: projected penetration, combined by taking the larger value
+    double sumWeight;       ///< Per patch: weights w = projectedPenetration * projectedArea, combined by addition
+    double3 sumWeightedCP;  ///< Per patch: (contactPoint * w), combined component-wise by addition
+
+    __host__ __device__ __forceinline__ PatchContactAccum operator+(const PatchContactAccum& other) const {
+        PatchContactAccum merged = *this;  // start from the left operand, then fold in `other`
+        merged.sumProjArea += other.sumProjArea;
+        merged.sumWeight += other.sumWeight;
+        merged.sumWeightedCP.x += other.sumWeightedCP.x;
+        merged.sumWeightedCP.y += other.sumWeightedCP.y;
+        merged.sumWeightedCP.z += other.sumWeightedCP.z;
+        merged.maxProjPen = (maxProjPen > other.maxProjPen) ? maxProjPen : other.maxProjPen;
+        return merged;
+    }
+};
+
+// Computes per-primitive patch accumulators:
+//   - sumProjArea: projected area contribution
+//   - maxProjPen:  projected penetration contribution (to be reduced by max)
+//   - sumWeight:   weight contribution (for contact point averaging)
+//   - sumWeightedCP: weighted contact point contribution
+void computePatchContactAccumulators(DEMDataDT* granData,
+                                     const float3* votedNormals,
+                                     const contactPairs_t* keys,
+                                     PatchContactAccum* accumulators,
+                                     contactPairs_t startOffsetPrimitive,
+                                     contactPairs_t startOffsetPatch,
+                                     contactPairs_t count,
+                                     cudaStream_t& this_stream);
+
+// Finalizes patch results by combining patch-accumulator voting with zero-area / SAT-fail fallback.
+//
+// Semantics match finalizePatchResults(), but avoids materializing intermediate arrays
+// (totalProjectedAreas, votedPenetrations, votedContactPoints).
+void finalizePatchResultsFromAccumulators(const PatchContactAccum* patchAccumulators,
+                                          const float3* votedNormals,
+                                          const float3* zeroAreaNormals,
+                                          const double* zeroAreaPenetrations,
+                                          const double3* zeroAreaContactPoints,
+                                          const notStupidBool_t* patchHasSAT,
+                                          double* finalAreas,
+                                          float3* finalNormals,
+                                          double* finalPenetrations,
+                                          double3* finalContactPoints,
+                                          contactPairs_t count,
+                                          cudaStream_t& this_stream);
+
 // Computes projected penetration and area for each primitive contact
 // Both the penetration and area are projected onto the voted normal
 // If the projected penetration becomes negative, both are set to 0
diff --git a/src/kernel/DEMBinTriangleKernels.cu b/src/kernel/DEMBinTriangleKernels.cu
index faf24ed6..beba2d36 100644
--- a/src/kernel/DEMBinTriangleKernels.cu
+++ b/src/kernel/DEMBinTriangleKernels.cu
@@ -145,382 +145,447 @@ inline __device__ bool figureOutNodeAndBoundingBox(deme::DEMSimParams* simParams
     return boundingBoxIntersectBinAxisBounds(L, U, vA, vB, vC, simParams);
 }
 
+// Prepass, one thread per mesh triangle: caches sandwich vertices, bin bounds, ok flags and the A->B shift.
+__global__ void precomputeTriangleSandwichData(deme::DEMSimParams* simParams,
+                                              deme::DEMDataKT* granData,
+                                              // Outputs: world-space vertices of both sandwich triangles
+                                              float3* vA1_all,
+                                              float3* vB1_all,
+                                              float3* vC1_all,
+                                              float3* vA2_all,
+                                              float3* vB2_all,
+                                              float3* vC2_all,
+                                              // Output: per-triangle translation, B = A + shift_world
+                                              float3* shift_world_all,
+                                              // Outputs: bounds for A and B (written only if the ok flag is true)
+                                              int3* LA_all,
+                                              int3* UA_all,
+                                              int3* LB_all,
+                                              int3* UB_all,
+                                              // Outputs: ok flags (1 or 0) for sandwich 1 and sandwich 2
+                                              unsigned char* ok1_all,
+                                              unsigned char* ok2_all,
+                                              // Inputs: sandwich nodes (local, as produced by makeTriangleSandwich)
+                                              float3* nodeA1,
+                                              float3* nodeB1,
+                                              float3* nodeC1,
+                                              float3* nodeA2,
+                                              float3* nodeB2,
+                                              float3* nodeC2) {
+    deme::bodyID_t triID = blockIdx.x * blockDim.x + threadIdx.x;
+    if (triID >= simParams->nTriGM) {
+        return;
+    }
+
+    float3 vA1, vB1, vC1, vA2, vB2, vC2;
+    deme::binID_t L1[3], L2[3], U1[3], U2[3];
+
+    const bool ok1 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA1, vB1, vC1, L1, U1,
+                                                 nodeA1[triID], nodeB1[triID], nodeC1[triID]);
+    const bool ok2 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA2, vB2, vC2, L2, U2,
+                                                 nodeA2[triID], nodeB2[triID], nodeC2[triID]);
+
+    vA1_all[triID] = vA1;
+    vB1_all[triID] = vB1;
+    vC1_all[triID] = vC1;
+    vA2_all[triID] = vA2;
+    vB2_all[triID] = vB2;
+    vC2_all[triID] = vC2;
+
+    ok1_all[triID] = (unsigned char)(ok1 ? 1 : 0);
+    ok2_all[triID] = (unsigned char)(ok2 ? 1 : 0);
+
+    if (ok1) {
+        LA_all[triID] = make_int3(L1[0], L1[1], L1[2]);
+        UA_all[triID] = make_int3(U1[0], U1[1], U1[2]);
+    }
+    if (ok2) {
+        LB_all[triID] = make_int3(L2[0], L2[1], L2[2]);
+        UB_all[triID] = make_int3(U2[0], U2[1], U2[2]);
+    }
+
+    // Sandwich translation (B = A + shift_world): subtract in local coordinates, then rotate by the owner's quaternion.
+    float3 shift_world = make_float3(0.f, 0.f, 0.f);
+    if (ok2) {
+        const deme::bodyID_t owner = granData->ownerTriMesh[triID];
+        const float qw = granData->oriQw[owner];
+        const float qx = granData->oriQx[owner];
+        const float qy = granData->oriQy[owner];
+        const float qz = granData->oriQz[owner];
+        float3 shift_local = make_float3(nodeA2[triID].x - nodeA1[triID].x,
+                                         nodeA2[triID].y - nodeA1[triID].y,
+                                         nodeA2[triID].z - nodeA1[triID].z);
+        applyOriQToVector3<float, deme::oriQ_t>(shift_local.x, shift_local.y, shift_local.z, qw, qx, qy, qz);
+        shift_world = shift_local;
+    }
+    shift_world_all[triID] = shift_world;
+}
+
+// Prepass versions of the existing kernels (signature includes precomputed arrays).
 __global__ void getNumberOfBinsEachTriangleTouches(deme::DEMSimParams* simParams,
-                                                   deme::DEMDataKT* granData,
-                                                   deme::binsTriangleTouches_t* numBinsTriTouches,
-                                                   deme::objID_t* numAnalGeoTriTouches,
-                                                   float3* nodeA1,
-                                                   float3* nodeB1,
-                                                   float3* nodeC1,
-                                                   float3* nodeA2,
-                                                   float3* nodeB2,
-                                                   float3* nodeC2,
-                                                   bool meshUniversalContact) {
+                                                           deme::DEMDataKT* granData,
+                                                           deme::binsTriangleTouches_t* numBinsTriTouches,
+                                                           deme::objID_t* numAnalGeoTriTouches,
+                                                           // precomputed
+                                                           const float3* vA1_all,
+                                                           const float3* vB1_all,
+                                                           const float3* vC1_all,
+                                                           const float3* vA2_all,
+                                                           const float3* vB2_all,
+                                                           const float3* vC2_all,
+                                                           const float3* shift_world_all,
+                                                           const int3* LA_all,
+                                                           const int3* UA_all,
+                                                           const int3* LB_all,
+                                                           const int3* UB_all,
+                                                           const unsigned char* ok1_all,
+                                                           const unsigned char* ok2_all,
+                                                           bool meshUniversalContact) {
     deme::bodyID_t triID = blockIdx.x * blockDim.x + threadIdx.x;
+    if (triID >= simParams->nTriGM) {
+        return;
+    }
 
-    if (triID < simParams->nTriGM) {
-        // 3 vertices of the triangle, in true space location but without adding LBF point (since purely voxel- and
-        // bin-based locations don't need that)
-        float3 vA1, vB1, vC1, vA2, vB2, vC2;
-        deme::binID_t L1[3], L2[3], U1[3], U2[3];
-        const bool ok1 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA1, vB1, vC1, L1, U1, nodeA1[triID],
-                                                     nodeB1[triID], nodeC1[triID]);
-        const bool ok2 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA2, vB2, vC2, L2, U2, nodeA2[triID],
-                                                     nodeB2[triID], nodeC2[triID]);
-
-        // Precompute triangle edges/normal once per triangle (translation-invariant).
-        // We translate vertices per-bin (v - boxCenter) before calling triBoxOverlapBinFastLocalEdgesUnionShiftFP32().
-
-        // If neither triangle sandwich intersects the bin grid, it cannot touch any bin.
-        if (!ok1 && !ok2) {
-            numBinsTriTouches[triID] = 0;
-            if (meshUniversalContact) {
-                numAnalGeoTriTouches[triID] = 0;
-            }
-            return;
-        }
+    const bool ok1 = (ok1_all[triID] != 0);
+    const bool ok2 = (ok2_all[triID] != 0);
 
-        // Preserve per-triangle bounds for cheap gating inside the union sweep (Option C).
-        // (We will overwrite L1/U1 when we merge into the union bounds.)
-        deme::binID_t LA[3], UA[3];
-        if (ok1) {
-            LA[0] = L1[0];
-            LA[1] = L1[1];
-            LA[2] = L1[2];
-            UA[0] = U1[0];
-            UA[1] = U1[1];
-            UA[2] = U1[2];
+    if (!ok1 && !ok2) {
+        numBinsTriTouches[triID] = 0;
+        if (meshUniversalContact) {
+            numAnalGeoTriTouches[triID] = 0;
         }
+        return;
+    }
 
-        // Precompute the sandwich translation (B = A + shift_world) once per triangle, in a numerically safe way:
-        // compute in local coords (small numbers), then rotate into world.
-        float3 shift_world = make_float3(0.f, 0.f, 0.f);
-        if (ok2) {
-            const deme::bodyID_t myOwnerID_shift = granData->ownerTriMesh[triID];
-            const float myOriQw_shift = granData->oriQw[myOwnerID_shift];
-            const float myOriQx_shift = granData->oriQx[myOwnerID_shift];
-            const float myOriQy_shift = granData->oriQy[myOwnerID_shift];
-            const float myOriQz_shift = granData->oriQz[myOwnerID_shift];
-
-            // Use any vertex pair; for a sandwich it is a constant translation for all vertices.
-            float3 shift_local = make_float3(nodeA2[triID].x - nodeA1[triID].x, nodeA2[triID].y - nodeA1[triID].y,
-                                             nodeA2[triID].z - nodeA1[triID].z);
-            applyOriQToVector3<float, deme::oriQ_t>(shift_local.x, shift_local.y, shift_local.z, myOriQw_shift,
-                                                    myOriQx_shift, myOriQy_shift, myOriQz_shift);
-            shift_world = shift_local;
-        }
+    const float3 vA1 = vA1_all[triID];
+    const float3 vB1 = vB1_all[triID];
+    const float3 vC1 = vC1_all[triID];
+    const float3 vA2 = vA2_all[triID];
+    const float3 vB2 = vB2_all[triID];
+    const float3 vC2 = vC2_all[triID];
+    const float3 shift_world = shift_world_all[triID];
+
+    int3 LA = make_int3(0, 0, 0), UA = make_int3(-1, -1, -1);
+    int3 LB = make_int3(0, 0, 0), UB = make_int3(-1, -1, -1);
+    if (ok1) {
+        LA = LA_all[triID];
+        UA = UA_all[triID];
+    }
+    if (ok2) {
+        LB = LB_all[triID];
+        UB = UB_all[triID];
+    }
 
-        // Merge bounds (or take the valid one, if only one is valid).
-        if (ok1 && ok2) {
-            L1[0] = DEME_MIN(L1[0], L2[0]);
-            L1[1] = DEME_MIN(L1[1], L2[1]);
-            L1[2] = DEME_MIN(L1[2], L2[2]);
-            U1[0] = DEME_MAX(U1[0], U2[0]);
-            U1[1] = DEME_MAX(U1[1], U2[1]);
-            U1[2] = DEME_MAX(U1[2], U2[2]);
-        } else if (!ok1) {
-            L1[0] = L2[0];
-            L1[1] = L2[1];
-            L1[2] = L2[2];
-            U1[0] = U2[0];
-            U1[1] = U2[1];
-            U1[2] = U2[2];
-        }
+    // Union bounds
+    deme::binID_t Lx, Ly, Lz, Ux, Uy, Uz;
+    if (ok1 && ok2) {
+        Lx = (deme::binID_t)DEME_MIN(LA.x, LB.x);
+        Ly = (deme::binID_t)DEME_MIN(LA.y, LB.y);
+        Lz = (deme::binID_t)DEME_MIN(LA.z, LB.z);
+        Ux = (deme::binID_t)DEME_MAX(UA.x, UB.x);
+        Uy = (deme::binID_t)DEME_MAX(UA.y, UB.y);
+        Uz = (deme::binID_t)DEME_MAX(UA.z, UB.z);
+    } else if (ok1) {
+        Lx = (deme::binID_t)LA.x;
+        Ly = (deme::binID_t)LA.y;
+        Lz = (deme::binID_t)LA.z;
+        Ux = (deme::binID_t)UA.x;
+        Uy = (deme::binID_t)UA.y;
+        Uz = (deme::binID_t)UA.z;
+    } else {
+        Lx = (deme::binID_t)LB.x;
+        Ly = (deme::binID_t)LB.y;
+        Lz = (deme::binID_t)LB.z;
+        Ux = (deme::binID_t)UB.x;
+        Uy = (deme::binID_t)UB.y;
+        Uz = (deme::binID_t)UB.z;
+    }
 
-        unsigned int numSDsTouched = 0;
-        // Triangle may span a collection of bins...
-        // BTW, I don't know why Chrono::GPU had to check the so-called 3 cases, and create thread divergence like that.
-        // Just sweep through all potential bins and you are fine.
-        float BinCenter[3];
-        const float binSizeF = (float)simParams->dyn.binSize;
-        const float binHalfSpan = binSizeF * (0.5f + (float)DEME_BIN_ENLARGE_RATIO_FOR_FACETS);
-        float BinHalfSizes[3] = {binHalfSpan, binHalfSpan, binHalfSpan};
-        const float startX = binSizeF * (float)L1[0] + 0.5f * binSizeF;
-        const float startY = binSizeF * (float)L1[1] + 0.5f * binSizeF;
-        const float startZ = binSizeF * (float)L1[2] + 0.5f * binSizeF;
-        for (deme::binID_t i = L1[0], ix = 0; i <= U1[0]; i++, ix++) {
-            float cy0 = startY;
-            BinCenter[0] = startX + ix * binSizeF;
-            for (deme::binID_t j = L1[1]; j <= U1[1]; j++) {
-                float cz = startZ;
-                BinCenter[1] = cy0;
-                for (deme::binID_t k = L1[2]; k <= U1[2]; k++) {
-                    BinCenter[2] = cz;
-                    const float3 c = make_float3(BinCenter[0], BinCenter[1], BinCenter[2]);
-                    // Bounds-gating, only test the triangle(s) that can possibly touch this bin.
-                    const bool inA =
-                        ok1 && (i >= LA[0] && i <= UA[0] && j >= LA[1] && j <= UA[1] && k >= LA[2] && k <= UA[2]);
-                    const bool inB =
-                        ok2 && (i >= L2[0] && i <= U2[0] && j >= L2[1] && j <= U2[1] && k >= L2[2] && k <= U2[2]);
-                    if (!inA && !inB) {
-                        continue;
-                    }
-                    const float3 a0 = make_float3(vA1.x - c.x, vA1.y - c.y, vA1.z - c.z);
-                    const float3 a1 = make_float3(vB1.x - c.x, vB1.y - c.y, vB1.z - c.z);
-                    const float3 a2 = make_float3(vC1.x - c.x, vC1.y - c.y, vC1.z - c.z);
-                    const bool hitFast =
-                        triBoxOverlapBinLocalEdgesUnionShiftFP32(a0, a1, a2, shift_world, binHalfSpan, inA, inB);
-                    if (hitFast) {
-                        numSDsTouched++;
-                    }
+    unsigned int numSDsTouched = 0;
+    const float binSizeF = (float)simParams->dyn.binSize;
+    const float binHalfSpan = binSizeF * (0.5f + (float)DEME_BIN_ENLARGE_RATIO_FOR_FACETS);
+    const float startX = binSizeF * (float)Lx + 0.5f * binSizeF;
+    const float startY = binSizeF * (float)Ly + 0.5f * binSizeF;
+    const float startZ = binSizeF * (float)Lz + 0.5f * binSizeF;
+
+    float BinCenter[3];
+    for (deme::binID_t i = Lx, ix = 0; i <= Ux; i++, ix++) {
+        float cy0 = startY;
+        BinCenter[0] = startX + ix * binSizeF;
+        for (deme::binID_t j = Ly; j <= Uy; j++) {
+            float cz = startZ;
+            BinCenter[1] = cy0;
+            for (deme::binID_t k = Lz; k <= Uz; k++) {
+                BinCenter[2] = cz;
+                const float3 c = make_float3(BinCenter[0], BinCenter[1], BinCenter[2]);
+
+                const bool inA = ok1 && (i >= (deme::binID_t)LA.x && i <= (deme::binID_t)UA.x &&
+                                        j >= (deme::binID_t)LA.y && j <= (deme::binID_t)UA.y &&
+                                        k >= (deme::binID_t)LA.z && k <= (deme::binID_t)UA.z);
+                const bool inB = ok2 && (i >= (deme::binID_t)LB.x && i <= (deme::binID_t)UB.x &&
+                                        j >= (deme::binID_t)LB.y && j <= (deme::binID_t)UB.y &&
+                                        k >= (deme::binID_t)LB.z && k <= (deme::binID_t)UB.z);
+                if (!inA && !inB) {
                     cz += binSizeF;
+                    continue;
                 }
-                cy0 += binSizeF;
+
+                const float3 a0 = make_float3(vA1.x - c.x, vA1.y - c.y, vA1.z - c.z);
+                const float3 a1 = make_float3(vB1.x - c.x, vB1.y - c.y, vB1.z - c.z);
+                const float3 a2 = make_float3(vC1.x - c.x, vC1.y - c.y, vC1.z - c.z);
+                const bool hit = triBoxOverlapBinLocalEdgesUnionShiftFP32(a0, a1, a2, shift_world, binHalfSpan, inA, inB);
+                if (hit) {
+                    numSDsTouched++;
+                }
+                cz += binSizeF;
             }
+            cy0 += binSizeF;
         }
-        numBinsTriTouches[triID] = numSDsTouched;
+    }
 
-        // No need to do the following if meshUniversalContact is false
-        if (meshUniversalContact) {
-            // Register sphere--analytical geometry contacts
-            deme::objID_t contact_count = 0;
-            // Each triangle should also check if it overlaps with an analytical boundary-type geometry
-            for (deme::objID_t objB = 0; objB < simParams->nAnalGM; objB++) {
-                deme::bodyID_t objBOwner = objOwner[objB];
-                // Grab family number from memory (not jitified: b/c family number can change frequently in a sim)
-                unsigned int objFamilyNum = granData->familyID[objBOwner];
-                deme::bodyID_t triOwnerID = granData->ownerTriMesh[triID];
-                unsigned int triFamilyNum = granData->familyID[triOwnerID];
-                unsigned int maskMatID = locateMaskPair<unsigned int>(triFamilyNum, objFamilyNum);
-                // If marked no contact, skip ths iteration
-                if (granData->familyMasks[maskMatID] != deme::DONT_PREVENT_CONTACT) {
-                    continue;
-                }
-                float3 ownerXYZ;
-                voxelIDToPosition<float, deme::voxelID_t, deme::subVoxelPos_t>(
-                    ownerXYZ.x, ownerXYZ.y, ownerXYZ.z, granData->voxelID[objBOwner], granData->locX[objBOwner],
-                    granData->locY[objBOwner], granData->locZ[objBOwner], _nvXp2_, _nvYp2_, _voxelSize_, _l_);
-                const float ownerOriQw = granData->oriQw[objBOwner];
-                const float ownerOriQx = granData->oriQx[objBOwner];
-                const float ownerOriQy = granData->oriQy[objBOwner];
-                const float ownerOriQz = granData->oriQz[objBOwner];
-                float objBRelPosX = objRelPosX[objB];
-                float objBRelPosY = objRelPosY[objB];
-                float objBRelPosZ = objRelPosZ[objB];
-                float objBRotX = objRotX[objB];
-                float objBRotY = objRotY[objB];
-                float objBRotZ = objRotZ[objB];
-                applyOriQToVector3<float, deme::oriQ_t>(objBRelPosX, objBRelPosY, objBRelPosZ, ownerOriQw, ownerOriQx,
-                                                        ownerOriQy, ownerOriQz);
-                applyOriQToVector3<float, deme::oriQ_t>(objBRotX, objBRotY, objBRotZ, ownerOriQw, ownerOriQx,
-                                                        ownerOriQy, ownerOriQz);
-                float3 objBPosXYZ = ownerXYZ + make_float3(objBRelPosX, objBRelPosY, objBRelPosZ);
-
-                deme::contact_t contact_type = checkTriEntityOverlapFP32(
-                    vA1, vB1, vC1, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
-                    objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
-                if (contact_type == deme::NOT_A_CONTACT) {
-                    contact_type = checkTriEntityOverlapFP32(vA2, vB2, vC2, objType[objB], objBPosXYZ,
-                                                             make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
-                                                             objSize2[objB], objSize3[objB], objNormal[objB],
-                                                             granData->marginSizeAnalytical[objB]);
-                }
-                // Unlike the sphere-X contact case, we do not test against family extra margin here. This may result in
-                // more fake contact pairs, but the efficiency in the mesh-based particle case is not our top priority
-                // yet.
-                if (contact_type == deme::TRIANGLE_ANALYTICAL_CONTACT) {
-                    contact_count++;
-                }
+    numBinsTriTouches[triID] = numSDsTouched;
+
+    if (meshUniversalContact) {
+        deme::objID_t contact_count = 0;
+        for (deme::objID_t objB = 0; objB < simParams->nAnalGM; objB++) {
+            deme::bodyID_t objBOwner = objOwner[objB];
+            unsigned int objFamilyNum = granData->familyID[objBOwner];
+            deme::bodyID_t triOwnerID = granData->ownerTriMesh[triID];
+            unsigned int triFamilyNum = granData->familyID[triOwnerID];
+            unsigned int maskMatID = locateMaskPair<unsigned int>(triFamilyNum, objFamilyNum);
+            if (granData->familyMasks[maskMatID] != deme::DONT_PREVENT_CONTACT) {
+                continue;
+            }
+
+            float3 ownerXYZ;
+            voxelIDToPosition<float, deme::voxelID_t, deme::subVoxelPos_t>(
+                ownerXYZ.x, ownerXYZ.y, ownerXYZ.z, granData->voxelID[objBOwner], granData->locX[objBOwner],
+                granData->locY[objBOwner], granData->locZ[objBOwner], _nvXp2_, _nvYp2_, _voxelSize_, _l_);
+
+            const float ownerOriQw = granData->oriQw[objBOwner];
+            const float ownerOriQx = granData->oriQx[objBOwner];
+            const float ownerOriQy = granData->oriQy[objBOwner];
+            const float ownerOriQz = granData->oriQz[objBOwner];
+
+            float objBRelPosX = objRelPosX[objB];
+            float objBRelPosY = objRelPosY[objB];
+            float objBRelPosZ = objRelPosZ[objB];
+            float objBRotX = objRotX[objB];
+            float objBRotY = objRotY[objB];
+            float objBRotZ = objRotZ[objB];
+
+            applyOriQToVector3<float, deme::oriQ_t>(objBRelPosX, objBRelPosY, objBRelPosZ,
+                                                   ownerOriQw, ownerOriQx, ownerOriQy, ownerOriQz);
+            applyOriQToVector3<float, deme::oriQ_t>(objBRotX, objBRotY, objBRotZ,
+                                                   ownerOriQw, ownerOriQx, ownerOriQy, ownerOriQz);
+
+            float3 objBPosXYZ = ownerXYZ + make_float3(objBRelPosX, objBRelPosY, objBRelPosZ);
+
+            deme::contact_t contact_type = checkTriEntityOverlapFP32(
+                vA1, vB1, vC1, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
+                objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
+
+            if (contact_type == deme::NOT_A_CONTACT) {
+                contact_type = checkTriEntityOverlapFP32(
+                    vA2, vB2, vC2, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ),
+                    objSize1[objB], objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
+            }
+
+            if (contact_type == deme::TRIANGLE_ANALYTICAL_CONTACT) {
+                contact_count++;
             }
-            numAnalGeoTriTouches[triID] = contact_count;
         }
+        numAnalGeoTriTouches[triID] = contact_count;
     }
 }
 
 __global__ void populateBinTriangleTouchingPairs(deme::DEMSimParams* simParams,
-                                                 deme::DEMDataKT* granData,
-                                                 deme::binsTriangleTouchPairs_t* numBinsTriTouchesScan,
-                                                 deme::binsTriangleTouchPairs_t* numAnalGeoTriTouchesScan,
-                                                 deme::binID_t* binIDsEachTriTouches,
-                                                 deme::bodyID_t* triIDsEachBinTouches,
-                                                 float3* nodeA1,
-                                                 float3* nodeB1,
-                                                 float3* nodeC1,
-                                                 float3* nodeA2,
-                                                 float3* nodeB2,
-                                                 float3* nodeC2,
-                                                 deme::bodyID_t* idGeoA,
-                                                 deme::bodyID_t* idGeoB,
-                                                 deme::contact_t* contactTypePrimitive,
-                                                 bool meshUniversalContact) {
+                                                         deme::DEMDataKT* granData,
+                                                         deme::binsTriangleTouchPairs_t* numBinsTriTouchesScan,
+                                                         deme::binsTriangleTouchPairs_t* numAnalGeoTriTouchesScan,
+                                                         deme::binID_t* binIDsEachTriTouches,
+                                                         deme::bodyID_t* triIDsEachBinTouches,
+                                                         // precomputed
+                                                         const float3* vA1_all,
+                                                         const float3* vB1_all,
+                                                         const float3* vC1_all,
+                                                         const float3* vA2_all,
+                                                         const float3* vB2_all,
+                                                         const float3* vC2_all,
+                                                         const float3* shift_world_all,
+                                                         const int3* LA_all,
+                                                         const int3* UA_all,
+                                                         const int3* LB_all,
+                                                         const int3* UB_all,
+                                                         const unsigned char* ok1_all,
+                                                         const unsigned char* ok2_all,
+                                                         // tri-anal output
+                                                         deme::bodyID_t* idGeoA,
+                                                         deme::bodyID_t* idGeoB,
+                                                         deme::contact_t* contactTypePrimitive,
+                                                         bool meshUniversalContact) {
     deme::bodyID_t triID = blockIdx.x * blockDim.x + threadIdx.x;
-    if (triID < simParams->nTriGM) {
-        // 3 vertices of the triangle
-        float3 vA1, vB1, vC1, vA2, vB2, vC2;
-        deme::binID_t L1[3], L2[3], U1[3], U2[3];
-        const bool ok1 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA1, vB1, vC1, L1, U1, nodeA1[triID],
-                                                     nodeB1[triID], nodeC1[triID]);
-        const bool ok2 = figureOutNodeAndBoundingBox(simParams, granData, triID, vA2, vB2, vC2, L2, U2, nodeA2[triID],
-                                                     nodeB2[triID], nodeC2[triID]);
-
-        // Precompute triangle edges/normal once per triangle (translation-invariant).
-        // We translate vertices per-bin (v - boxCenter) before calling triBoxOverlapBinFastLocalEdgesUnionShiftFP32().
-
-        // If neither triangle sandwich intersects the bin grid, it cannot touch any bin.
-        if (!ok1 && !ok2) {
-            return;
-        }
+    if (triID >= simParams->nTriGM) {
+        return;
+    }
 
-        // Preserve per-triangle bounds for cheap gating inside the union sweep (Option C).
-        // (We will overwrite L1/U1 when we merge into the union bounds.)
-        deme::binID_t LA[3], UA[3];
-        if (ok1) {
-            LA[0] = L1[0];
-            LA[1] = L1[1];
-            LA[2] = L1[2];
-            UA[0] = U1[0];
-            UA[1] = U1[1];
-            UA[2] = U1[2];
-        }
+    const bool ok1 = (ok1_all[triID] != 0);
+    const bool ok2 = (ok2_all[triID] != 0);
 
-        // Precompute the sandwich translation (B = A + shift_world) once per triangle, in a numerically safe way:
-        // compute in local coords (small numbers), then rotate into world.
-        float3 shift_world = make_float3(0.f, 0.f, 0.f);
-        if (ok2) {
-            const deme::bodyID_t myOwnerID_shift = granData->ownerTriMesh[triID];
-            const float myOriQw_shift = granData->oriQw[myOwnerID_shift];
-            const float myOriQx_shift = granData->oriQx[myOwnerID_shift];
-            const float myOriQy_shift = granData->oriQy[myOwnerID_shift];
-            const float myOriQz_shift = granData->oriQz[myOwnerID_shift];
-
-            // Use any vertex pair; for a sandwich it is a constant translation for all vertices.
-            float3 shift_local = make_float3(nodeA2[triID].x - nodeA1[triID].x, nodeA2[triID].y - nodeA1[triID].y,
-                                             nodeA2[triID].z - nodeA1[triID].z);
-            applyOriQToVector3<float, deme::oriQ_t>(shift_local.x, shift_local.y, shift_local.z, myOriQw_shift,
-                                                    myOriQx_shift, myOriQy_shift, myOriQz_shift);
-            shift_world = shift_local;
-        }
+    if (!ok1 && !ok2) {
+        return;
+    }
 
-        // Merge bounds (or take the valid one, if only one is valid).
-        if (ok1 && ok2) {
-            L1[0] = DEME_MIN(L1[0], L2[0]);
-            L1[1] = DEME_MIN(L1[1], L2[1]);
-            L1[2] = DEME_MIN(L1[2], L2[2]);
-            U1[0] = DEME_MAX(U1[0], U2[0]);
-            U1[1] = DEME_MAX(U1[1], U2[1]);
-            U1[2] = DEME_MAX(U1[2], U2[2]);
-        } else if (!ok1) {
-            L1[0] = L2[0];
-            L1[1] = L2[1];
-            L1[2] = L2[2];
-            U1[0] = U2[0];
-            U1[1] = U2[1];
-            U1[2] = U2[2];
-        }
+    const float3 vA1 = vA1_all[triID];
+    const float3 vB1 = vB1_all[triID];
+    const float3 vC1 = vC1_all[triID];
+    const float3 vA2 = vA2_all[triID];
+    const float3 vB2 = vB2_all[triID];
+    const float3 vC2 = vC2_all[triID];
+    const float3 shift_world = shift_world_all[triID];
+
+    int3 LA = make_int3(0, 0, 0), UA = make_int3(-1, -1, -1);
+    int3 LB = make_int3(0, 0, 0), UB = make_int3(-1, -1, -1);
+    if (ok1) {
+        LA = LA_all[triID];
+        UA = UA_all[triID];
+    }
+    if (ok2) {
+        LB = LB_all[triID];
+        UB = UB_all[triID];
+    }
 
-        deme::binsTriangleTouchPairs_t myReportOffset = numBinsTriTouchesScan[triID];
-        // In case this sweep does not agree with the previous one, we need to intercept such potential segfaults
-        const deme::binsTriangleTouchPairs_t myReportOffset_end = numBinsTriTouchesScan[triID + 1];
-
-        // Triangle may span a collection of bins...
-        float BinCenter[3];
-        const float binSizeF = (float)simParams->dyn.binSize;
-        const float binHalfSpan = binSizeF * (0.5f + (float)DEME_BIN_ENLARGE_RATIO_FOR_FACETS);
-        float BinHalfSizes[3] = {binHalfSpan, binHalfSpan, binHalfSpan};
-        const float startX = binSizeF * (float)L1[0] + 0.5f * binSizeF;
-        const float startY = binSizeF * (float)L1[1] + 0.5f * binSizeF;
-        const float startZ = binSizeF * (float)L1[2] + 0.5f * binSizeF;
-        for (deme::binID_t i = L1[0], ix = 0; i <= U1[0]; i++, ix++) {
-            BinCenter[0] = startX + ix * binSizeF;
-            float cy0 = startY;
-            for (deme::binID_t j = L1[1]; j <= U1[1]; j++) {
-                BinCenter[1] = cy0;
-                float cz = startZ;
-                for (deme::binID_t k = L1[2]; k <= U1[2]; k++) {
-                    if (myReportOffset >= myReportOffset_end) {
-                        continue;  // Don't step on the next triangle's domain
-                    }
-                    BinCenter[2] = cz;
-                    const float3 c = make_float3(BinCenter[0], BinCenter[1], BinCenter[2]);
-                    // Bounds-gating, only test the triangle(s) that can possibly touch this bin.
-                    const bool inA =
-                        ok1 && (i >= LA[0] && i <= UA[0] && j >= LA[1] && j <= UA[1] && k >= LA[2] && k <= UA[2]);
-                    const bool inB =
-                        ok2 && (i >= L2[0] && i <= U2[0] && j >= L2[1] && j <= U2[1] && k >= L2[2] && k <= U2[2]);
-                    if (!inA && !inB) {
-                        continue;
-                    }
-                    const float3 a0 = make_float3(vA1.x - c.x, vA1.y - c.y, vA1.z - c.z);
-                    const float3 a1 = make_float3(vB1.x - c.x, vB1.y - c.y, vB1.z - c.z);
-                    const float3 a2 = make_float3(vC1.x - c.x, vC1.y - c.y, vC1.z - c.z);
-                    const bool hitFast =
-                        triBoxOverlapBinLocalEdgesUnionShiftFP32(a0, a1, a2, shift_world, binHalfSpan, inA, inB);
-                    if (hitFast) {
-                        binIDsEachTriTouches[myReportOffset] =
-                            binIDFrom3Indices<deme::binID_t>(i, j, k, simParams->nbX, simParams->nbY, simParams->nbZ);
-                        triIDsEachBinTouches[myReportOffset] = triID;
-                        myReportOffset++;
-                    }
-                    cz += binSizeF;
-                }
-                cy0 += binSizeF;
-            }
-        }
-        // This can happen for like 1 in 10^9 chance, for the tri--bin contact algorithm has stochasticity on GPU
-        for (; myReportOffset < myReportOffset_end; myReportOffset++) {
-            binIDsEachTriTouches[myReportOffset] = deme::NULL_BINID;
-            triIDsEachBinTouches[myReportOffset] = triID;
-        }
+    // Union bounds
+    deme::binID_t Lx, Ly, Lz, Ux, Uy, Uz;
+    if (ok1 && ok2) {
+        Lx = (deme::binID_t)DEME_MIN(LA.x, LB.x);
+        Ly = (deme::binID_t)DEME_MIN(LA.y, LB.y);
+        Lz = (deme::binID_t)DEME_MIN(LA.z, LB.z);
+        Ux = (deme::binID_t)DEME_MAX(UA.x, UB.x);
+        Uy = (deme::binID_t)DEME_MAX(UA.y, UB.y);
+        Uz = (deme::binID_t)DEME_MAX(UA.z, UB.z);
+    } else if (ok1) {
+        Lx = (deme::binID_t)LA.x;
+        Ly = (deme::binID_t)LA.y;
+        Lz = (deme::binID_t)LA.z;
+        Ux = (deme::binID_t)UA.x;
+        Uy = (deme::binID_t)UA.y;
+        Uz = (deme::binID_t)UA.z;
+    } else {
+        Lx = (deme::binID_t)LB.x;
+        Ly = (deme::binID_t)LB.y;
+        Lz = (deme::binID_t)LB.z;
+        Ux = (deme::binID_t)UB.x;
+        Uy = (deme::binID_t)UB.y;
+        Uz = (deme::binID_t)UB.z;
+    }
 
-        // No need to do the following if meshUniversalContact is false
-        if (meshUniversalContact) {
-            deme::binsTriangleTouchPairs_t myTriGeoReportOffset = numAnalGeoTriTouchesScan[triID];
-            deme::binsTriangleTouchPairs_t myTriGeoReportOffset_end = numAnalGeoTriTouchesScan[triID + 1];
-            for (deme::objID_t objB = 0; objB < simParams->nAnalGM; objB++) {
-                deme::bodyID_t objBOwner = objOwner[objB];
-                // Grab family number from memory (not jitified: b/c family number can change frequently in a sim)
-                unsigned int objFamilyNum = granData->familyID[objBOwner];
-                deme::bodyID_t triOwnerID = granData->ownerTriMesh[triID];
-                unsigned int triFamilyNum = granData->familyID[triOwnerID];
-                unsigned int maskMatID = locateMaskPair<unsigned int>(triFamilyNum, objFamilyNum);
-                // If marked no contact, skip ths iteration
-                if (granData->familyMasks[maskMatID] != deme::DONT_PREVENT_CONTACT) {
+    // Write tri-bin pairs
+    const deme::binsTriangleTouchPairs_t myReportOffset = numBinsTriTouchesScan[triID];
+    const deme::binsTriangleTouchPairs_t myUpperBound = numBinsTriTouchesScan[triID + 1];
+
+    deme::binsTriangleTouchPairs_t count = 0;
+    const float binSizeF = (float)simParams->dyn.binSize;
+    const float binHalfSpan = binSizeF * (0.5f + (float)DEME_BIN_ENLARGE_RATIO_FOR_FACETS);
+    const float startX = binSizeF * (float)Lx + 0.5f * binSizeF;
+    const float startY = binSizeF * (float)Ly + 0.5f * binSizeF;
+    const float startZ = binSizeF * (float)Lz + 0.5f * binSizeF;
+
+    float BinCenter[3];
+    for (deme::binID_t i = Lx, ix = 0; i <= Ux; i++, ix++) {
+        float cy0 = startY;
+        BinCenter[0] = startX + ix * binSizeF;
+        for (deme::binID_t j = Ly; j <= Uy; j++) {
+            float cz = startZ;
+            BinCenter[1] = cy0;
+            for (deme::binID_t k = Lz; k <= Uz; k++) {
+                BinCenter[2] = cz;
+                const float3 c = make_float3(BinCenter[0], BinCenter[1], BinCenter[2]);
+
+                const bool inA = ok1 && (i >= (deme::binID_t)LA.x && i <= (deme::binID_t)UA.x &&
+                                        j >= (deme::binID_t)LA.y && j <= (deme::binID_t)UA.y &&
+                                        k >= (deme::binID_t)LA.z && k <= (deme::binID_t)UA.z);
+                const bool inB = ok2 && (i >= (deme::binID_t)LB.x && i <= (deme::binID_t)UB.x &&
+                                        j >= (deme::binID_t)LB.y && j <= (deme::binID_t)UB.y &&
+                                        k >= (deme::binID_t)LB.z && k <= (deme::binID_t)UB.z);
+                if (!inA && !inB) {
+                    cz += binSizeF;
                     continue;
                 }
-                float3 ownerXYZ;
-                voxelIDToPosition<float, deme::voxelID_t, deme::subVoxelPos_t>(
-                    ownerXYZ.x, ownerXYZ.y, ownerXYZ.z, granData->voxelID[objBOwner], granData->locX[objBOwner],
-                    granData->locY[objBOwner], granData->locZ[objBOwner], _nvXp2_, _nvYp2_, _voxelSize_, _l_);
-                const float ownerOriQw = granData->oriQw[objBOwner];
-                const float ownerOriQx = granData->oriQx[objBOwner];
-                const float ownerOriQy = granData->oriQy[objBOwner];
-                const float ownerOriQz = granData->oriQz[objBOwner];
-                float objBRelPosX = objRelPosX[objB];
-                float objBRelPosY = objRelPosY[objB];
-                float objBRelPosZ = objRelPosZ[objB];
-                float objBRotX = objRotX[objB];
-                float objBRotY = objRotY[objB];
-                float objBRotZ = objRotZ[objB];
-                applyOriQToVector3<float, deme::oriQ_t>(objBRelPosX, objBRelPosY, objBRelPosZ, ownerOriQw, ownerOriQx,
-                                                        ownerOriQy, ownerOriQz);
-                applyOriQToVector3<float, deme::oriQ_t>(objBRotX, objBRotY, objBRotZ, ownerOriQw, ownerOriQx,
-                                                        ownerOriQy, ownerOriQz);
-                float3 objBPosXYZ = ownerXYZ + make_float3(objBRelPosX, objBRelPosY, objBRelPosZ);
-
-                deme::contact_t contact_type = checkTriEntityOverlapFP32(
-                    vA1, vB1, vC1, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
-                    objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
-                if (contact_type == deme::NOT_A_CONTACT) {
-                    contact_type = checkTriEntityOverlapFP32(vA2, vB2, vC2, objType[objB], objBPosXYZ,
-                                                             make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
-                                                             objSize2[objB], objSize3[objB], objNormal[objB],
-                                                             granData->marginSizeAnalytical[objB]);
-                }
-                // Unlike the sphere-X contact case, we do not test against family extra margin here, which is more
-                // lenient and perhaps makes more fake contacts.
-                if (contact_type == deme::TRIANGLE_ANALYTICAL_CONTACT) {
-                    idGeoA[myTriGeoReportOffset] = triID;
-                    idGeoB[myTriGeoReportOffset] = (deme::bodyID_t)objB;
-                    contactTypePrimitive[myTriGeoReportOffset] = contact_type;
-                    myTriGeoReportOffset++;
-                    if (myTriGeoReportOffset >= myTriGeoReportOffset_end) {
-                        return;  // Don't step on the next triangle's domain
+
+                const float3 a0 = make_float3(vA1.x - c.x, vA1.y - c.y, vA1.z - c.z);
+                const float3 a1 = make_float3(vB1.x - c.x, vB1.y - c.y, vB1.z - c.z);
+                const float3 a2 = make_float3(vC1.x - c.x, vC1.y - c.y, vC1.z - c.z);
+                const bool hit = triBoxOverlapBinLocalEdgesUnionShiftFP32(a0, a1, a2, shift_world, binHalfSpan, inA, inB);
+                if (hit) {
+                    const deme::binsTriangleTouchPairs_t outIdx = myReportOffset + count;
+                    if (outIdx < myUpperBound) {
+                        binIDsEachTriTouches[outIdx] = binIDFrom3Indices<deme::binID_t>(i, j, k, simParams->nbX, simParams->nbY,
+                                                                                       simParams->nbZ);
+                        triIDsEachBinTouches[outIdx] = triID;
                     }
+                    count++;
                 }
+
+                cz += binSizeF;
+            }
+            cy0 += binSizeF;
+        }
+    }
+
+    // Tri-anal contacts: same tests as the counting pass, but (unlike the replaced code) no bounds check or slot padding
+    if (meshUniversalContact) {
+        const deme::binsTriangleTouchPairs_t myAnalOffset = numAnalGeoTriTouchesScan[triID];
+        deme::binsTriangleTouchPairs_t analCount = 0;
+        for (deme::objID_t objB = 0; objB < simParams->nAnalGM; objB++) {
+            deme::bodyID_t objBOwner = objOwner[objB];
+            unsigned int objFamilyNum = granData->familyID[objBOwner];
+            deme::bodyID_t triOwnerID = granData->ownerTriMesh[triID];
+            unsigned int triFamilyNum = granData->familyID[triOwnerID];
+            unsigned int maskMatID = locateMaskPair<unsigned int>(triFamilyNum, objFamilyNum);
+            if (granData->familyMasks[maskMatID] != deme::DONT_PREVENT_CONTACT) {
+                continue;
+            }
+
+            float3 ownerXYZ;
+            voxelIDToPosition<float, deme::voxelID_t, deme::subVoxelPos_t>(
+                ownerXYZ.x, ownerXYZ.y, ownerXYZ.z, granData->voxelID[objBOwner], granData->locX[objBOwner],
+                granData->locY[objBOwner], granData->locZ[objBOwner], _nvXp2_, _nvYp2_, _voxelSize_, _l_);
+
+            const float ownerOriQw = granData->oriQw[objBOwner];
+            const float ownerOriQx = granData->oriQx[objBOwner];
+            const float ownerOriQy = granData->oriQy[objBOwner];
+            const float ownerOriQz = granData->oriQz[objBOwner];
+
+            float objBRelPosX = objRelPosX[objB];
+            float objBRelPosY = objRelPosY[objB];
+            float objBRelPosZ = objRelPosZ[objB];
+            float objBRotX = objRotX[objB];
+            float objBRotY = objRotY[objB];
+            float objBRotZ = objRotZ[objB];
+
+            applyOriQToVector3<float, deme::oriQ_t>(objBRelPosX, objBRelPosY, objBRelPosZ,
+                                                   ownerOriQw, ownerOriQx, ownerOriQy, ownerOriQz);
+            applyOriQToVector3<float, deme::oriQ_t>(objBRotX, objBRotY, objBRotZ,
+                                                   ownerOriQw, ownerOriQx, ownerOriQy, ownerOriQz);
+
+            float3 objBPosXYZ = ownerXYZ + make_float3(objBRelPosX, objBRelPosY, objBRelPosZ);
+
+            deme::contact_t contact_type = checkTriEntityOverlapFP32(
+                vA1, vB1, vC1, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ), objSize1[objB],
+                objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
+            if (contact_type == deme::NOT_A_CONTACT) {
+                contact_type = checkTriEntityOverlapFP32(
+                    vA2, vB2, vC2, objType[objB], objBPosXYZ, make_float3(objBRotX, objBRotY, objBRotZ),
+                    objSize1[objB], objSize2[objB], objSize3[objB], objNormal[objB], granData->marginSizeAnalytical[objB]);
             }
-            // Take care of potentially unfilled slots in the report
-            for (; myTriGeoReportOffset < myTriGeoReportOffset_end; myTriGeoReportOffset++) {
-                contactTypePrimitive[myTriGeoReportOffset] = deme::NOT_A_CONTACT;
+
+            if (contact_type == deme::TRIANGLE_ANALYTICAL_CONTACT) {
+                const deme::binsTriangleTouchPairs_t outIdx = myAnalOffset + analCount;
+                idGeoA[outIdx] = triID;
+                idGeoB[outIdx] = (deme::bodyID_t)objB;
+                contactTypePrimitive[outIdx] = contact_type;
+                analCount++;
             }
         }
     }
diff --git a/src/kernel/DEMCalcForceKernels_Primitive.cu b/src/kernel/DEMCalcForceKernels_Primitive.cu
index a70286c6..39754ab4 100644
--- a/src/kernel/DEMCalcForceKernels_Primitive.cu
+++ b/src/kernel/DEMCalcForceKernels_Primitive.cu
@@ -250,14 +250,23 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
             // Use the dedicated SAT check function to determine if triangles are truly in physical contact
             // Note: checkTriangleTriangleOverlap uses projection which can report contact even for non-physical
             // "submerged" cases, so we need the actual SAT test for accurate physical contact determination
-            bool satisfiesSAT = checkTriangleTriangleSAT<double3, double>(triANode1, triANode2, triANode3, triBNode1,
-                                                                          triBNode2, triBNode3);
-            granData->contactSATSatisfied[myPrimitiveContactID] = satisfiesSAT ? 1 : 0;
-
-            // If SAT says no physical contact potential, drop this pair (projection can report non-physical overlaps)
-            if (!satisfiesSAT) {
+            bool check_sat = true;
+            if (!in_contact && overlapDepth <= -extraMarginSize) {
+                // This pair is already beyond the extra margin, SAT cannot make it a valid contact
+                check_sat = false;
+                granData->contactSATSatisfied[myPrimitiveContactID] = 0;
                 ContactType = deme::NOT_A_CONTACT;
             }
+            if (check_sat) {
+                bool satisfiesSAT = checkTriangleTriangleSAT<double3, double>(triANode1, triANode2, triANode3, triBNode1,
+                                                                              triBNode2, triBNode3);
+                granData->contactSATSatisfied[myPrimitiveContactID] = satisfiesSAT ? 1 : 0;
+
+                // If SAT says no physical contact potential, drop this pair (projection can report non-physical overlaps)
+                if (!satisfiesSAT) {
+                    ContactType = deme::NOT_A_CONTACT;
+                }
+            }
 
             // Fix ContactType if needed
             // If the solver says in contact, we do not question it
diff --git a/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh b/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
index 83dfaab3..f9d55412 100644
--- a/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
+++ b/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
@@ -1149,9 +1149,8 @@ __device__ bool checkTriangleTriangleOverlap(
     // Triangle B vertices (tri2)
     const T1 triB[3] = {A2, B2, C2};
 
-    // Compute face normals
+    // Compute face normal for triangle A first; triangle B normal is only needed if B->A projection hits.
     T1 nA = normalize(cross(B1 - A1, C1 - A1));
-    T1 nB = normalize(cross(B2 - A2, C2 - A2));
 
     //// TODO: And degenerated triangles?
 
@@ -1163,17 +1162,36 @@ __device__ bool checkTriangleTriangleOverlap(
     // Project triangle B onto triangle A's plane and clip against A
     T2 depthBA, areaBA;
     T1 centroidBA;
-    bool contactBA = projectTriangleOntoTriangle<T1, T2>(triB, triA, nA, depthBA, areaBA, centroidBA);
+    const bool contactBA = projectTriangleOntoTriangle<T1, T2>(triB, triA, nA, depthBA, areaBA, centroidBA);
+
+    if (!contactBA) {
+        // No contact detected, Provide separation info
+        T1 centA = (triA[0] + triA[1] + triA[2]) / 3.0;
+        T1 centB = (triB[0] + triB[1] + triB[2]) / 3.0;
+        T1 sep = centA - centB;
+        T2 sepLen2 = dot(sep, sep);
+
+        if (sepLen2 > (DEME_TINY_FLOAT * DEME_TINY_FLOAT)) {
+            T2 sepLen = sqrt(sepLen2);
+            normal = sep / sepLen;
+            depth = -sepLen;  // Negative for separation
+            point = (centA + centB) * 0.5;
+        } else {
+            normal = nA;
+            depth = -DEME_TINY_FLOAT;
+            point = centA;
+        }
+        projectedArea = 0.0;
+        return false;
+    }
 
     // Project triangle A onto triangle B's plane and clip against B
+    T1 nB = normalize(cross(B2 - A2, C2 - A2));
     T2 depthAB, areaAB;
     T1 centroidAB;
-    bool contactAB = projectTriangleOntoTriangle<T1, T2>(triA, triB, nB, depthAB, areaAB, centroidAB);
-
-    // Determine if there is contact
-    bool inContact = contactBA && contactAB;
+    const bool contactAB = projectTriangleOntoTriangle<T1, T2>(triA, triB, nB, depthAB, areaAB, centroidAB);
 
-    if (!inContact) {
+    if (!contactAB) {
         // No contact detected, Provide separation info
         T1 centA = (triA[0] + triA[1] + triA[2]) / 3.0;
         T1 centB = (triB[0] + triB[1] + triB[2]) / 3.0;
diff --git a/src/kernel/DEMKinematicMisc.cu b/src/kernel/DEMKinematicMisc.cu
index 8bb55e46..f0ffc9d9 100644
--- a/src/kernel/DEMKinematicMisc.cu
+++ b/src/kernel/DEMKinematicMisc.cu
@@ -82,16 +82,21 @@ __global__ void computeMarginFromAbsv_implTri(deme::DEMSimParams* simParams,
         // as our meshed particle representation is surface only, so we need to account for existing penetration length
         // in our future-proof contact detection, always.
         double penetrationMargin = *maxTriTriPenetration;
-        //// TODO: Temporary measure
-        penetrationMargin = 0.;  // (meshUniversalContact && penetrationMargin > 0.0) ? penetrationMargin : 0.0;
+        penetrationMargin = (meshUniversalContact && penetrationMargin > 0.0) ? penetrationMargin : 0.0;
         // Clamp penetration margin to the maximum allowed value to prevent super large margins
         if (penetrationMargin > simParams->capTriTriPenetration) {
             penetrationMargin = simParams->capTriTriPenetration;
         }
-
-        granData->marginSizeTriangle[triID] =
+        // We hope that penetrationMargin is small, so it's absorbed into the velocity-induce margin.
+        // But if not, it should prevail to avoid losing contacts involving triangles inside another mesh.
+        double finalMargin =
             (double)(vel * simParams->dyn.expSafetyMulti + simParams->dyn.expSafetyAdder) * (*ts) * (*maxDrift) +
-            penetrationMargin + granData->familyExtraMarginSize[my_family];
+            granData->familyExtraMarginSize[my_family];
+        // if (finalMargin < penetrationMargin) {
+        //     finalMargin = penetrationMargin;
+        // }
+
+        granData->marginSizeTriangle[triID] = finalMargin;
     }
 }
 

From e07e44b595fe5b7bbe34a303cb77b0206f11013a Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Mon, 19 Jan 2026 23:58:44 +0100
Subject: [PATCH 06/17] Feature Add: Planar Contact Cylinder - a simple/fast
 plane contact but in cylindrical direction

---
 src/DEM/APIPrivate.cpp                        |  5 ++
 src/DEM/BdrsAndObjs.h                         | 50 +++++++++++++-
 src/DEM/Defines.h                             |  1 +
 src/demo/DEMdemo_DrumCubes.cpp                |  6 +-
 src/kernel/DEMCollisionKernels_SphSph.cuh     | 20 ++++++
 .../DEMCollisionKernels_SphTri_TriTri.cuh     | 69 +++++++++++++++++++
 6 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp
index 7e858194..5148f130 100644
--- a/src/DEM/APIPrivate.cpp
+++ b/src/DEM/APIPrivate.cpp
@@ -717,6 +717,11 @@ void DEMSolver::preprocessAnalyticalObjs() {
                     addAnalCompTemplate(ANAL_OBJ_TYPE_CYL_INF, comp_mat.at(i), thisLoadExtObj, param.cyl.center,
                                         param.cyl.dir, param.cyl.radius, 0, 0, param.cyl.normal);
                     break;
+                case OBJ_COMPONENT::PLANAR_CYL:
+                    addAnalCompTemplate(ANAL_OBJ_TYPE_PLANAR_CYL, comp_mat.at(i), thisLoadExtObj, param.cyl.center,
+                                        param.cyl.dir, param.cyl.radius, 0, 0,
+                                        param.cyl.normal);
+                    break;
                 default:
                     DEME_ERROR(std::string("There is at least one analytical boundary that has a type not supported."));
             }
diff --git a/src/DEM/BdrsAndObjs.h b/src/DEM/BdrsAndObjs.h
index 89ac1620..3e8b0b34 100644
--- a/src/DEM/BdrsAndObjs.h
+++ b/src/DEM/BdrsAndObjs.h
@@ -25,7 +25,7 @@ namespace deme {
 /// External object type
 /// Note all of them are `shell', not solid objects. If you need a solid cylinder for example, then use one CYLINDER as
 /// the side plus 2 CIRCLE as the ends to emulate it. Please be sure to set OUTWARD CYLINDER normal in this case.
-enum class OBJ_COMPONENT { PLANE, SPHERE, PLATE, CIRCLE, CYL, CYL_INF, CONE, CONE_INF, TRIANGLE };
+enum class OBJ_COMPONENT { PLANE, SPHERE, PLATE, CIRCLE, CYL, CYL_INF, PLANAR_CYL, CONE, CONE_INF, TRIANGLE };
 
 /// Sphere
 struct DEMSphereParams_t {
@@ -224,6 +224,54 @@ class DEMExternObj : public DEMInitializer {
         assertThreeElements(axis, "AddCylinder", "axis");
         AddCylinder(make_float3(pos[0], pos[1], pos[2]), make_float3(axis[0], axis[1], axis[2]), rad, material, normal);
     }
+
+    /// Add a z-axis-aligned cylinder of infinite length with planar contact approximation
+    void AddZPlanarContactCylinder(const float3 pos,
+                                   const float rad,
+                                   const std::shared_ptr<DEMMaterial>& material,
+                                   const objNormal_t normal = ENTITY_NORMAL_INWARD) {
+        types.push_back(OBJ_COMPONENT::PLANAR_CYL);
+        materials.push_back(material);
+        DEMAnalEntParams params;
+        params.cyl.center = pos;
+        params.cyl.radius = rad;
+        params.cyl.dir = make_float3(0, 0, 1);
+        params.cyl.normal = normal;
+        entity_params.push_back(params);
+    }
+    void AddZPlanarContactCylinder(const std::vector<float>& pos,
+                                   const float rad,
+                                   const std::shared_ptr<DEMMaterial>& material,
+                                   const objNormal_t normal = ENTITY_NORMAL_INWARD) {
+        assertThreeElements(pos, "AddZPlanarContactCylinder", "pos");
+        AddZPlanarContactCylinder(make_float3(pos[0], pos[1], pos[2]), rad, material, normal);
+    }
+
+    /// Add a cylinder of infinite length with planar contact approximation, along a user-specific axis
+    void AddPlanarContactCylinder(const float3 pos,
+                                  const float3 axis,
+                                  const float rad,
+                                  const std::shared_ptr<DEMMaterial>& material,
+                                  const objNormal_t normal = ENTITY_NORMAL_INWARD) {
+        types.push_back(OBJ_COMPONENT::PLANAR_CYL);
+        materials.push_back(material);
+        DEMAnalEntParams params;
+        params.cyl.center = pos;
+        params.cyl.radius = rad;
+        params.cyl.dir = normalize(axis);
+        params.cyl.normal = normal;
+        entity_params.push_back(params);
+    }
+    void AddPlanarContactCylinder(const std::vector<float>& pos,
+                                  const std::vector<float>& axis,
+                                  const float rad,
+                                  const std::shared_ptr<DEMMaterial>& material,
+                                  const objNormal_t normal = ENTITY_NORMAL_INWARD) {
+        assertThreeElements(pos, "AddPlanarContactCylinder", "pos");
+        assertThreeElements(axis, "AddPlanarContactCylinder", "axis");
+        AddPlanarContactCylinder(make_float3(pos[0], pos[1], pos[2]), make_float3(axis[0], axis[1], axis[2]), rad,
+                                 material, normal);
+    }
 };
 
 // DEM mesh object
diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index ec20710b..2a7593fa 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -134,6 +134,7 @@ constexpr contact_t ALL_CONTACT_TYPES[NUM_SUPPORTED_CONTACT_TYPES] = {
 const objType_t ANAL_OBJ_TYPE_PLANE = 0;
 const objType_t ANAL_OBJ_TYPE_PLATE = 1;
 const objType_t ANAL_OBJ_TYPE_CYL_INF = 2;
+const objType_t ANAL_OBJ_TYPE_PLANAR_CYL = 3;
 const objNormal_t ENTITY_NORMAL_INWARD = 0;
 const objNormal_t ENTITY_NORMAL_OUTWARD = 1;
 
diff --git a/src/demo/DEMdemo_DrumCubes.cpp b/src/demo/DEMdemo_DrumCubes.cpp
index 8c680f3f..8e28b3af 100644
--- a/src/demo/DEMdemo_DrumCubes.cpp
+++ b/src/demo/DEMdemo_DrumCubes.cpp
@@ -54,11 +54,7 @@ int main() {
     float IZZ = CylMass * CylRad * CylRad / 2;
     float IYY = (CylMass / 12) * (3 * CylRad * CylRad + CylHeight * CylHeight);
     auto Drum = DEMSim.AddExternalObject();
-    // Drum->AddCylinder(CylCenter, CylAxis, CylRad, mat_type_drum, 0);
-    Drum->AddPlane(make_float3(CylRad, 0, 0), make_float3(-1, 0, 0), mat_type_drum);
-    Drum->AddPlane(make_float3(-CylRad, 0, 0), make_float3(1, 0, 0), mat_type_drum);
-    Drum->AddPlane(make_float3(0, CylRad, 0), make_float3(0, -1, 0), mat_type_drum);
-    Drum->AddPlane(make_float3(0, -CylRad, 0), make_float3(0, 1, 0), mat_type_drum);
+    Drum->AddPlanarContactCylinder(CylCenter, CylAxis, CylRad, mat_type_drum, ENTITY_NORMAL_INWARD);
     Drum->SetMass(CylMass);
     Drum->SetMOI(make_float3(IYY, IYY, IZZ));
     auto Drum_tracker = DEMSim.Track(Drum);
diff --git a/src/kernel/DEMCollisionKernels_SphSph.cuh b/src/kernel/DEMCollisionKernels_SphSph.cuh
index e3c47365..0798af5c 100644
--- a/src/kernel/DEMCollisionKernels_SphSph.cuh
+++ b/src/kernel/DEMCollisionKernels_SphSph.cuh
@@ -130,6 +130,26 @@ __host__ __device__ deme::contact_t checkSphereEntityOverlap(const T1& A,
             CP = A - to_real3<float3, T1>(cntNormal * (radA - overlapDepth / 2.0));
             return contactTypePrimitive;
         }
+        case (deme::ANAL_OBJ_TYPE_PLANAR_CYL): {
+            T1 cyl2sph = cylRadialDistanceVec<T1>(A, B, dirB);
+            const T3 dist_delta_r = length(cyl2sph);
+            if (dist_delta_r <= (T3)DEME_TINY_FLOAT) {
+                return deme::NOT_A_CONTACT;
+            }
+            const T3 dist_plane = normal_sign * ((T3)size1B - dist_delta_r);
+            if (dist_plane < 0) {
+                return deme::NOT_A_CONTACT;
+            }
+            cntNormal = to_real3<T1, float3>(-normal_sign / dist_delta_r * cyl2sph);
+            overlapDepth = (T3)(radA + beta4Entity) - dist_plane;
+            if (overlapDepth <= DEME_TINY_FLOAT) {
+                contactTypePrimitive = deme::NOT_A_CONTACT;
+            } else {
+                contactTypePrimitive = deme::SPHERE_ANALYTICAL_CONTACT;
+            }
+            CP = A - to_real3<float3, T1>(cntNormal * (dist_plane + overlapDepth / 2.0));
+            return contactTypePrimitive;
+        }
         default:
             return deme::NOT_A_CONTACT;
     }
diff --git a/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh b/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
index f9d55412..0861eb99 100644
--- a/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
+++ b/src/kernel/DEMCollisionKernels_SphTri_TriTri.cuh
@@ -118,6 +118,30 @@ bool __device__ tri_plane_penetration(const T1** tri,
     return in_contact;
 }
 
+template <typename T1>
+inline __host__ __device__ bool planar_cyl_plane_from_ref(const T1& ref,
+                                                          const T1& entityLoc,
+                                                          const float3& entityDir,
+                                                          const float& radius,
+                                                          const float& normal_sign,
+                                                          T1& plane_point,
+                                                          float3& plane_normal) {
+    T1 radial_vec = cylRadialDistanceVec<T1>(ref, entityLoc, entityDir);
+    const auto dist = length(radial_vec);
+    if (dist <= (decltype(dist))DEME_TINY_FLOAT) {
+        return false;
+    }
+    const T1 radial_dir = radial_vec / dist;
+    const float dist_plane = normal_sign * (radius - (float)dist);
+    if (dist_plane < 0) {
+        return false;
+    }
+    plane_normal = to_real3<T1, float3>(-normal_sign * radial_dir);
+    const T1 axis_point = ref - radial_vec;
+    plane_point = axis_point + radial_dir * radius;
+    return true;
+}
+
 template <typename T1, typename T2>
 bool __device__ tri_cyl_penetration(const T1** tri,
                                     const T1& entityLoc,
@@ -174,6 +198,22 @@ __host__ __device__ deme::contact_t checkTriEntityOverlap(const T1& A,
             }
             return deme::NOT_A_CONTACT;
         }
+        case (deme::ANAL_OBJ_TYPE_PLANAR_CYL): {
+            T1 centroid = (A + B + C) / 3.0;
+            T1 plane_point;
+            float3 plane_normal;
+            if (!planar_cyl_plane_from_ref(centroid, entityLoc, entityDir, entitySize1, normal_sign, plane_point,
+                                           plane_normal)) {
+                return deme::NOT_A_CONTACT;
+            }
+            for (const T1*& v : tri) {
+                double d = planeSignedDistance<double>(*v, plane_point, plane_normal);
+                double overlapDepth = beta4Entity - d;
+                if (overlapDepth >= 0.0)
+                    return deme::TRIANGLE_ANALYTICAL_CONTACT;
+            }
+            return deme::NOT_A_CONTACT;
+        }
         default:
             return deme::NOT_A_CONTACT;
     }
@@ -215,6 +255,22 @@ __host__ __device__ deme::contact_t checkTriEntityOverlapFP32(const T1& A,
             }
             return deme::NOT_A_CONTACT;
         }
+        case (deme::ANAL_OBJ_TYPE_PLANAR_CYL): {
+            T1 centroid = (A + B + C) / 3.0f;
+            T1 plane_point;
+            float3 plane_normal;
+            if (!planar_cyl_plane_from_ref(centroid, entityLoc, entityDir, entitySize1, normal_sign, plane_point,
+                                           plane_normal)) {
+                return deme::NOT_A_CONTACT;
+            }
+            for (const T1*& v : tri) {
+                const float d = planeSignedDistance<float>(*v, plane_point, plane_normal);
+                const float overlapDepth = beta4Entity - d;
+                if (overlapDepth >= 0.0f)
+                    return deme::TRIANGLE_ANALYTICAL_CONTACT;
+            }
+            return deme::NOT_A_CONTACT;
+        }
         default:
             return deme::NOT_A_CONTACT;
     }
@@ -253,6 +309,19 @@ bool __device__ calcTriEntityOverlap(const T1& A,
             in_contact = tri_cyl_penetration<T1, T2>(tri, entityLoc, entityDir, entitySize1, entitySize2, normal_sign,
                                                      contact_normal, overlapDepth, overlapArea, contactPnt);
             return in_contact;
+        case deme::ANAL_OBJ_TYPE_PLANAR_CYL: {
+            T1 centroid = (A + B + C) / 3.0;
+            T1 plane_point;
+            float3 plane_normal;
+            if (!planar_cyl_plane_from_ref(centroid, entityLoc, entityDir, entitySize1, normal_sign, plane_point,
+                                           plane_normal)) {
+                return false;
+            }
+            in_contact =
+                tri_plane_penetration<T1, T2>(tri, plane_point, plane_normal, overlapDepth, overlapArea, contactPnt);
+            contact_normal = plane_normal;
+            return in_contact;
+        }
         default:
             return false;
     }

From e016668ba483a1759ee575ebd93ab0d0f20b767c Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Tue, 20 Jan 2026 11:56:55 +0100
Subject: [PATCH 07/17] Async writeout no GPU wait speeds up simulation

---
 src/DEM/API.h                |   5 +
 src/DEM/APIPublic.cpp        | 124 +++++++++++++++------
 src/DEM/dT.cpp               | 207 +++++++++++++++++++++++++++++++----
 src/DEM/dT.h                 |   9 ++
 src/core/utils/JitHelper.cpp |   4 -
 src/core/utils/JitHelper.h   |   7 --
 6 files changed, 293 insertions(+), 63 deletions(-)

diff --git a/src/DEM/API.h b/src/DEM/API.h
index da1b1752..631f3e64 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -10,6 +10,7 @@
 #include <set>
 #include <cfloat>
 #include <functional>
+#include <thread>
 
 #include "kT.h"
 #include "dT.h"
@@ -1203,6 +1204,7 @@ class DEMSolver {
     /// Remove host-side cached vectors (so you can re-define them, and then re-initialize system)
     void ClearCache();
 
+    /// Output methods enqueue asynchronous writes; call WaitForPendingOutput() to block for completion.
     /// Write the current status of clumps to a file
     void WriteClumpFile(const std::string& outfilename, unsigned int accuracy = 10) const;
     void WriteClumpFile(const std::filesystem::path& outfilename, unsigned int accuracy = 10) const {
@@ -1231,6 +1233,8 @@ class DEMSolver {
     /// Write the current status of all meshes to a file.
     void WriteMeshFile(const std::string& outfilename) const;
     void WriteMeshFile(const std::filesystem::path& outfilename) const { WriteMeshFile(outfilename.string()); }
+    /// Wait for any in-flight async output to finish.
+    void WaitForPendingOutput() const;
 
     /// @brief Read 3 columns of your choice from a CSV filem and group them by clump_header.
     /// @param infilename CSV filename.
@@ -1570,6 +1574,7 @@ class DEMSolver {
     bool m_is_out_owner_wildcards = false;
     bool m_is_out_cnt_wildcards = false;
     bool m_is_out_geo_wildcards = false;
+    mutable std::thread m_output_thread;
 
     // User-instructed simulation `world' size. Note it is an approximate of the true size and we will generate a world
     // not smaller than this. This is useful if the user want to automatically add BCs enclosing this user-defined
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index 24b9c5c6..d93358f3 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -56,6 +56,7 @@ DEMSolver::DEMSolver(unsigned int nGPUs) {
 }
 
 DEMSolver::~DEMSolver() {
+    WaitForPendingOutput();
     if (sys_initialized)
         DoDynamicsThenSync(0.0);
     delete kT;
@@ -2028,28 +2029,45 @@ std::shared_ptr<DEMInspector> DEMSolver::CreateInspector(const std::string& quan
 }
 
 void DEMSolver::WriteSphereFile(const std::string& outfilename) const {
+    WaitForPendingOutput();
     switch (m_out_format) {
 #ifdef DEME_USE_CHPF
         case (OUTPUT_FORMAT::CHPF): {
-            std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
-            dT->writeSpheresAsChpf(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
+                dT->writeSpheresAsChpfFromHost(ptFile);
+            });
             break;
         }
 #endif
         case (OUTPUT_FORMAT::CSV): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeSpheresAsCsv(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            dT->migrateOwnerWildcardToHost();
+            dT->migrateSphGeoWildcardToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeSpheresAsCsvFromHost(ptFile);
+            });
             break;
         }
         case (OUTPUT_FORMAT::BINARY): {
             // std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
             //// TODO: Implement it
-            std::ofstream ptFile(outfilename, std::ios::out);
             DEME_WARNING(std::string("Binary sphere output is not implemented yet, using CSV..."));
-            dT->writeSpheresAsCsv(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            dT->migrateOwnerWildcardToHost();
+            dT->migrateSphGeoWildcardToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeSpheresAsCsvFromHost(ptFile);
+            });
             break;
         }
         default:
@@ -2058,28 +2076,43 @@ void DEMSolver::WriteSphereFile(const std::string& outfilename) const {
 }
 
 void DEMSolver::WriteClumpFile(const std::string& outfilename, unsigned int accuracy) const {
+    WaitForPendingOutput();
     switch (m_out_format) {
 #ifdef DEME_USE_CHPF
         case (OUTPUT_FORMAT::CHPF): {
-            std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
-            dT->writeClumpsAsChpf(ptFile, accuracy);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            m_output_thread = std::thread([this, outfilename, accuracy]() {
+                std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
+                dT->writeClumpsAsChpfFromHost(ptFile, accuracy);
+            });
             break;
         }
 #endif
         case (OUTPUT_FORMAT::CSV): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeClumpsAsCsv(ptFile, accuracy);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            dT->migrateOwnerWildcardToHost();
+            m_output_thread = std::thread([this, outfilename, accuracy]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeClumpsAsCsvFromHost(ptFile, accuracy);
+            });
             break;
         }
         case (OUTPUT_FORMAT::BINARY): {
             // std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
             //// TODO: Implement it
-            std::ofstream ptFile(outfilename, std::ios::out);
             DEME_WARNING(std::string("Binary clump output is not implemented yet, using CSV..."));
-            dT->writeClumpsAsCsv(ptFile, accuracy);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateClumpHighOrderInfoToHost();
+            dT->migrateOwnerWildcardToHost();
+            m_output_thread = std::thread([this, outfilename, accuracy]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeClumpsAsCsvFromHost(ptFile, accuracy);
+            });
             break;
         }
         default:
@@ -2088,6 +2121,7 @@ void DEMSolver::WriteClumpFile(const std::string& outfilename, unsigned int accu
 }
 
 void DEMSolver::WriteContactFile(const std::string& outfilename, float force_thres) const {
+    WaitForPendingOutput();
     if (no_recording_contact_forces) {
         DEME_WARNING(std::string(
             "The solver is instructed to not record contact force info, so no work is done in a WriteContactFile "
@@ -2096,18 +2130,26 @@ void DEMSolver::WriteContactFile(const std::string& outfilename, float force_thr
     }
     switch (m_cnt_out_format) {
         case (OUTPUT_FORMAT::CSV): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeContactsAsCsv(ptFile, force_thres);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateContactInfoToHost();
+            m_output_thread = std::thread([this, outfilename, force_thres]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeContactsAsCsvFromHost(ptFile, force_thres);
+            });
             break;
         }
         case (OUTPUT_FORMAT::BINARY): {
             // std::ofstream ptFile(outfilename, std::ios::out | std::ios::binary);
             //// TODO: Implement it
             DEME_WARNING(std::string("Binary contact pair output is not implemented yet, using CSV..."));
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeContactsAsCsv(ptFile, force_thres);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            dT->migrateContactInfoToHost();
+            m_output_thread = std::thread([this, outfilename, force_thres]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeContactsAsCsvFromHost(ptFile, force_thres);
+            });
             break;
         }
         default:
@@ -2118,23 +2160,33 @@ void DEMSolver::WriteContactFile(const std::string& outfilename, float force_thr
 }
 
 void DEMSolver::WriteMeshFile(const std::string& outfilename) const {
+    WaitForPendingOutput();
     switch (m_mesh_out_format) {
         case (MESH_FORMAT::VTK): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeMeshesAsVtk(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeMeshesAsVtkFromHost(ptFile);
+            });
             break;
         }
         case (MESH_FORMAT::STL): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeMeshesAsStl(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeMeshesAsStlFromHost(ptFile);
+            });
             break;
         }
         case (MESH_FORMAT::PLY): {
-            std::ofstream ptFile(outfilename, std::ios::out);
-            dT->writeMeshesAsPly(ptFile);
-            ptFile.close();
+            dT->migrateFamilyToHost();
+            dT->migrateClumpPosInfoToHost();
+            m_output_thread = std::thread([this, outfilename]() {
+                std::ofstream ptFile(outfilename, std::ios::out);
+                dT->writeMeshesAsPlyFromHost(ptFile);
+            });
             break;
         }
         default:
@@ -2143,6 +2195,12 @@ void DEMSolver::WriteMeshFile(const std::string& outfilename) const {
     }
 }
 
+void DEMSolver::WaitForPendingOutput() const {
+    if (m_output_thread.joinable()) {
+        m_output_thread.join();
+    }
+}
+
 size_t DEMSolver::ChangeClumpFamily(unsigned int fam_num,
                                     const std::pair<double, double>& X,
                                     const std::pair<double, double>& Y,
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 6f2a1622..a72d3197 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -1378,11 +1378,15 @@ void DEMDynamicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<D
 
 #ifdef DEME_USE_CHPF
 void DEMDynamicThread::writeSpheresAsChpf(std::ofstream& ptFile) {
-    chpf::Writer pw;
-    // pw.write(ptFile, chpf::Compressor::Type::USE_DEFAULT, mass);
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
     migrateClumpHighOrderInfoToHost();
+    writeSpheresAsChpfFromHost(ptFile);
+}
+
+void DEMDynamicThread::writeSpheresAsChpfFromHost(std::ofstream& ptFile) {
+    chpf::Writer pw;
+    // pw.write(ptFile, chpf::Compressor::Type::USE_DEFAULT, mass);
 
     // simParams host version should not be different from device version, so no need to update
     std::vector<float> posX(simParams->nSpheresGM);
@@ -1458,13 +1462,16 @@ void DEMDynamicThread::writeSpheresAsChpf(std::ofstream& ptFile) {
 #endif
 
 void DEMDynamicThread::writeSpheresAsCsv(std::ofstream& ptFile) {
-    std::ostringstream outstrstream;
-
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
     migrateClumpHighOrderInfoToHost();
     migrateOwnerWildcardToHost();
     migrateSphGeoWildcardToHost();
+    writeSpheresAsCsvFromHost(ptFile);
+}
+
+void DEMDynamicThread::writeSpheresAsCsvFromHost(std::ofstream& ptFile) {
+    std::ostringstream outstrstream;
 
     outstrstream << OUTPUT_FILE_X_COL_NAME + "," + OUTPUT_FILE_Y_COL_NAME + "," + OUTPUT_FILE_Z_COL_NAME + "," +
                         OUTPUT_FILE_R_COL_NAME;
@@ -1611,11 +1618,15 @@ void DEMDynamicThread::writeSpheresAsCsv(std::ofstream& ptFile) {
 
 #ifdef DEME_USE_CHPF
 void DEMDynamicThread::writeClumpsAsChpf(std::ofstream& ptFile, unsigned int accuracy) {
-    //// TODO: Note using accuracy
-    chpf::Writer pw;
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
     migrateClumpHighOrderInfoToHost();
+    writeClumpsAsChpfFromHost(ptFile, accuracy);
+}
+
+void DEMDynamicThread::writeClumpsAsChpfFromHost(std::ofstream& ptFile, unsigned int accuracy) {
+    //// TODO: Note using accuracy
+    chpf::Writer pw;
 
     // simParams host version should not be different from device version, so no need to update
     std::vector<float> posX(simParams->nOwnerBodies);
@@ -1695,13 +1706,16 @@ void DEMDynamicThread::writeClumpsAsChpf(std::ofstream& ptFile, unsigned int acc
 #endif
 
 void DEMDynamicThread::writeClumpsAsCsv(std::ofstream& ptFile, unsigned int accuracy) {
-    std::ostringstream outstrstream;
-    outstrstream.precision(accuracy);
-
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
     migrateClumpHighOrderInfoToHost();
     migrateOwnerWildcardToHost();
+    writeClumpsAsCsvFromHost(ptFile, accuracy);
+}
+
+void DEMDynamicThread::writeClumpsAsCsvFromHost(std::ofstream& ptFile, unsigned int accuracy) {
+    std::ostringstream outstrstream;
+    outstrstream.precision(accuracy);
 
     // xyz and quaternion are always there
     outstrstream << OUTPUT_FILE_X_COL_NAME + "," + OUTPUT_FILE_Y_COL_NAME + "," + OUTPUT_FILE_Z_COL_NAME +
@@ -1823,11 +1837,13 @@ void DEMDynamicThread::writeClumpsAsCsv(std::ofstream& ptFile, unsigned int accu
 }
 
 std::shared_ptr<ContactInfoContainer> DEMDynamicThread::generateContactInfo(float force_thres) {
-    // Migrate contact info to host
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
     migrateContactInfoToHost();
+    return generateContactInfoFromHost(force_thres);
+}
 
+std::shared_ptr<ContactInfoContainer> DEMDynamicThread::generateContactInfoFromHost(float force_thres) {
     size_t total_contacts = *(solverScratchSpace.numContacts);
     // Wildcards supports only floats now
     std::vector<std::pair<std::string, std::string>> existing_wildcards(m_contact_wildcard_names.size());
@@ -2050,9 +2066,122 @@ void DEMDynamicThread::writeContactsAsCsv(std::ofstream& ptFile, float force_thr
     ptFile << outstrstream.str();
 }
 
+void DEMDynamicThread::writeContactsAsCsvFromHost(std::ofstream& ptFile, float force_thres) {
+    std::ostringstream outstrstream;
+
+    std::shared_ptr<ContactInfoContainer> contactInfo = generateContactInfoFromHost(force_thres);
+
+    outstrstream << OUTPUT_FILE_CNT_TYPE_NAME;
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::OWNER) {
+        outstrstream << "," + OUTPUT_FILE_OWNER_1_NAME + "," + OUTPUT_FILE_OWNER_2_NAME;
+    }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::GEO_ID) {
+        outstrstream << "," + OUTPUT_FILE_GEO_ID_1_NAME + "," + OUTPUT_FILE_GEO_ID_2_NAME;
+    }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::FORCE) {
+        outstrstream << "," + OUTPUT_FILE_FORCE_X_NAME + "," + OUTPUT_FILE_FORCE_Y_NAME + "," +
+                            OUTPUT_FILE_FORCE_Z_NAME;
+    }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::CNT_POINT) {
+        outstrstream << "," + OUTPUT_FILE_X_COL_NAME + "," + OUTPUT_FILE_Y_COL_NAME + "," + OUTPUT_FILE_Z_COL_NAME;
+    }
+    // if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::COMPONENT) {
+    //     outstrstream << ","+OUTPUT_FILE_COMP_1_NAME+","+OUTPUT_FILE_COMP_2_NAME;
+    // }
+    // if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::NICKNAME) {
+    //     outstrstream << ","+OUTPUT_FILE_OWNER_NICKNAME_1_NAME+","+OUTPUT_FILE_OWNER_NICKNAME_2_NAME;
+    // }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::NORMAL) {
+        outstrstream << "," + OUTPUT_FILE_NORMAL_X_NAME + "," + OUTPUT_FILE_NORMAL_Y_NAME + "," +
+                            OUTPUT_FILE_NORMAL_Z_NAME;
+    }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::TORQUE) {
+        outstrstream << "," + OUTPUT_FILE_TORQUE_X_NAME + "," + OUTPUT_FILE_TORQUE_Y_NAME + "," +
+                            OUTPUT_FILE_TORQUE_Z_NAME;
+    }
+    if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::CNT_WILDCARD) {
+        // Write all wildcard names as header
+        for (const auto& w_name : m_contact_wildcard_names) {
+            outstrstream << "," + w_name;
+        }
+    }
+    outstrstream << "\n";
+
+    for (size_t i = 0; i < contactInfo->Size(); i++) {
+        outstrstream << contactInfo->Get<std::string>("ContactType")[i];
+
+        // (Internal) ownerID and/or geometry ID
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::OWNER) {
+            outstrstream << "," << contactInfo->Get<bodyID_t>("AOwner")[i] << ","
+                         << contactInfo->Get<bodyID_t>("BOwner")[i];
+        }
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::GEO_ID) {
+            outstrstream << "," << contactInfo->Get<bodyID_t>("AGeo")[i] << ","
+                         << contactInfo->Get<bodyID_t>("BGeo")[i];
+        }
+
+        // Force is already in global...
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::FORCE) {
+            outstrstream << "," << contactInfo->Get<float3>("Force")[i].x << ","
+                         << contactInfo->Get<float3>("Force")[i].y << "," << contactInfo->Get<float3>("Force")[i].z;
+        }
+
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::CNT_POINT) {
+            // oriQ is updated already... whereas the contact point is effectively last step's... That's unfortunate.
+            // Should we do something about it?
+            outstrstream << "," << contactInfo->Get<float3>("Point")[i].x << ","
+                         << contactInfo->Get<float3>("Point")[i].y << "," << contactInfo->Get<float3>("Point")[i].z;
+        }
+
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::NORMAL) {
+            outstrstream << "," << contactInfo->Get<float3>("Normal")[i].x << ","
+                         << contactInfo->Get<float3>("Normal")[i].y << "," << contactInfo->Get<float3>("Normal")[i].z;
+        }
+
+        // Torque is in global already...
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::TORQUE) {
+            outstrstream << "," << contactInfo->Get<float3>("Torque")[i].x << ","
+                         << contactInfo->Get<float3>("Torque")[i].y << "," << contactInfo->Get<float3>("Torque")[i].z;
+        }
+
+        // Contact wildcards
+        if (solverFlags.cntOutFlags & CNT_OUTPUT_CONTENT::CNT_WILDCARD) {
+            // The order shouldn't be an issue... the same set is being processed here and in equip_contact_wildcards,
+            // see Model.h
+            for (const auto& name : m_contact_wildcard_names) {
+                outstrstream << "," << contactInfo->Get<float>(name)[i];
+            }
+        }
+
+        outstrstream << "\n";
+    }
+
+    ptFile << outstrstream.str();
+}
+
 void DEMDynamicThread::writeMeshesAsVtk(std::ofstream& ptFile) {
-    std::ostringstream ostream;
     migrateFamilyToHost();
+    migrateClumpPosInfoToHost();
+    writeMeshesAsVtkFromHost(ptFile);
+}
+
+void DEMDynamicThread::writeMeshesAsVtkFromHost(std::ofstream& ptFile) {
+    std::ostringstream ostream;
+
+    auto ownerPosFromHost = [this](bodyID_t owner) {
+        double X, Y, Z;
+        voxelID_t voxel = voxelID[owner];
+        subVoxelPos_t subVoxX = locX[owner];
+        subVoxelPos_t subVoxY = locY[owner];
+        subVoxelPos_t subVoxZ = locZ[owner];
+        voxelIDToPosition<double, voxelID_t, subVoxelPos_t>(X, Y, Z, voxel, subVoxX, subVoxY, subVoxZ,
+                                                            simParams->nvXp2, simParams->nvYp2, simParams->voxelSize,
+                                                            simParams->l);
+        return make_float3(X + simParams->LBFX, Y + simParams->LBFY, Z + simParams->LBFZ);
+    };
+    auto ownerOriQFromHost = [this](bodyID_t owner) {
+        return make_float4(oriQx[owner], oriQy[owner], oriQz[owner], oriQw[owner]);
+    };
 
     std::vector<size_t> vertexOffset(m_meshes.size() + 1, 0);
     size_t total_f = 0;
@@ -2097,8 +2226,8 @@ void DEMDynamicThread::writeMeshesAsVtk(std::ofstream& ptFile) {
     for (const auto& mmesh : m_meshes) {
         if (!thisMeshSkip[mesh_num]) {
             bodyID_t mowner = mmesh->owner;
-            float3 ownerPos = this->getOwnerPos(mowner)[0];
-            float4 ownerOriQ = this->getOwnerOriQ(mowner)[0];
+            float3 ownerPos = ownerPosFromHost(mowner);
+            float4 ownerOriQ = ownerOriQFromHost(mowner);
             for (const auto& v : mmesh->GetCoordsVertices()) {
                 float3 point = v;
                 applyFrameTransformLocalToGlobal(point, ownerPos, ownerOriQ);
@@ -2139,8 +2268,28 @@ void DEMDynamicThread::writeMeshesAsVtk(std::ofstream& ptFile) {
 }
 
 void DEMDynamicThread::writeMeshesAsStl(std::ofstream& ptFile) {
-    std::ostringstream ostream;
     migrateFamilyToHost();
+    migrateClumpPosInfoToHost();
+    writeMeshesAsStlFromHost(ptFile);
+}
+
+void DEMDynamicThread::writeMeshesAsStlFromHost(std::ofstream& ptFile) {
+    std::ostringstream ostream;
+
+    auto ownerPosFromHost = [this](bodyID_t owner) {
+        double X, Y, Z;
+        voxelID_t voxel = voxelID[owner];
+        subVoxelPos_t subVoxX = locX[owner];
+        subVoxelPos_t subVoxY = locY[owner];
+        subVoxelPos_t subVoxZ = locZ[owner];
+        voxelIDToPosition<double, voxelID_t, subVoxelPos_t>(X, Y, Z, voxel, subVoxX, subVoxY, subVoxZ,
+                                                            simParams->nvXp2, simParams->nvYp2, simParams->voxelSize,
+                                                            simParams->l);
+        return make_float3(X + simParams->LBFX, Y + simParams->LBFY, Z + simParams->LBFZ);
+    };
+    auto ownerOriQFromHost = [this](bodyID_t owner) {
+        return make_float4(oriQx[owner], oriQy[owner], oriQz[owner], oriQw[owner]);
+    };
 
     std::vector<notStupidBool_t> thisMeshSkip(m_meshes.size(), 0);
     unsigned int mesh_num = 0;
@@ -2158,8 +2307,8 @@ void DEMDynamicThread::writeMeshesAsStl(std::ofstream& ptFile) {
     for (const auto& mmesh : m_meshes) {
         if (!thisMeshSkip[mesh_num]) {
             bodyID_t mowner = mmesh->owner;
-            float3 ownerPos = this->getOwnerPos(mowner)[0];
-            float4 ownerOriQ = this->getOwnerOriQ(mowner)[0];
+            float3 ownerPos = ownerPosFromHost(mowner);
+            float4 ownerOriQ = ownerOriQFromHost(mowner);
             const auto& vertices = mmesh->GetCoordsVertices();
             const auto& faces = mmesh->GetIndicesVertexes();
 
@@ -2189,8 +2338,28 @@ void DEMDynamicThread::writeMeshesAsStl(std::ofstream& ptFile) {
 }
 
 void DEMDynamicThread::writeMeshesAsPly(std::ofstream& ptFile) {
-    std::ostringstream ostream;
     migrateFamilyToHost();
+    migrateClumpPosInfoToHost();
+    writeMeshesAsPlyFromHost(ptFile);
+}
+
+void DEMDynamicThread::writeMeshesAsPlyFromHost(std::ofstream& ptFile) {
+    std::ostringstream ostream;
+
+    auto ownerPosFromHost = [this](bodyID_t owner) {
+        double X, Y, Z;
+        voxelID_t voxel = voxelID[owner];
+        subVoxelPos_t subVoxX = locX[owner];
+        subVoxelPos_t subVoxY = locY[owner];
+        subVoxelPos_t subVoxZ = locZ[owner];
+        voxelIDToPosition<double, voxelID_t, subVoxelPos_t>(X, Y, Z, voxel, subVoxX, subVoxY, subVoxZ,
+                                                            simParams->nvXp2, simParams->nvYp2, simParams->voxelSize,
+                                                            simParams->l);
+        return make_float3(X + simParams->LBFX, Y + simParams->LBFY, Z + simParams->LBFZ);
+    };
+    auto ownerOriQFromHost = [this](bodyID_t owner) {
+        return make_float4(oriQx[owner], oriQy[owner], oriQz[owner], oriQw[owner]);
+    };
 
     std::vector<size_t> vertexOffset(m_meshes.size() + 1, 0);
     size_t total_f = 0;
@@ -2230,8 +2399,8 @@ void DEMDynamicThread::writeMeshesAsPly(std::ofstream& ptFile) {
     for (const auto& mmesh : m_meshes) {
         if (!thisMeshSkip[mesh_num]) {
             bodyID_t mowner = mmesh->owner;
-            float3 ownerPos = this->getOwnerPos(mowner)[0];
-            float4 ownerOriQ = this->getOwnerOriQ(mowner)[0];
+            float3 ownerPos = ownerPosFromHost(mowner);
+            float4 ownerOriQ = ownerOriQFromHost(mowner);
             for (const auto& v : mmesh->GetCoordsVertices()) {
                 float3 point = v;
                 applyFrameTransformLocalToGlobal(point, ownerPos, ownerOriQ);
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index 96e1df94..a6dd86f8 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -854,10 +854,13 @@ class DEMDynamicThread {
 
     // Generate contact info container based on the current contact array, and return it.
     std::shared_ptr<ContactInfoContainer> generateContactInfo(float force_thres);
+    std::shared_ptr<ContactInfoContainer> generateContactInfoFromHost(float force_thres);
 
 #ifdef DEME_USE_CHPF
     void writeSpheresAsChpf(std::ofstream& ptFile);
     void writeClumpsAsChpf(std::ofstream& ptFile, unsigned int accuracy = 10);
+    void writeSpheresAsChpfFromHost(std::ofstream& ptFile);
+    void writeClumpsAsChpfFromHost(std::ofstream& ptFile, unsigned int accuracy = 10);
 #endif
     void writeSpheresAsCsv(std::ofstream& ptFile);
     void writeClumpsAsCsv(std::ofstream& ptFile, unsigned int accuracy = 10);
@@ -865,6 +868,12 @@ class DEMDynamicThread {
     void writeMeshesAsVtk(std::ofstream& ptFile);
     void writeMeshesAsStl(std::ofstream& ptFile);
     void writeMeshesAsPly(std::ofstream& ptFile);
+    void writeSpheresAsCsvFromHost(std::ofstream& ptFile);
+    void writeClumpsAsCsvFromHost(std::ofstream& ptFile, unsigned int accuracy = 10);
+    void writeContactsAsCsvFromHost(std::ofstream& ptFile, float force_thres = DEME_TINY_FLOAT);
+    void writeMeshesAsVtkFromHost(std::ofstream& ptFile);
+    void writeMeshesAsStlFromHost(std::ofstream& ptFile);
+    void writeMeshesAsPlyFromHost(std::ofstream& ptFile);
 
     /// Called each time when the user calls DoDynamicsThenSync.
     void startThread();
diff --git a/src/core/utils/JitHelper.cpp b/src/core/utils/JitHelper.cpp
index 874b9232..7d68abaa 100644
--- a/src/core/utils/JitHelper.cpp
+++ b/src/core/utils/JitHelper.cpp
@@ -78,10 +78,6 @@ JitHelper::CachedProgram JitHelper::buildProgram(const std::string& name,
     for (auto& subst : ordered_subs) {
         code = std::regex_replace(code, std::regex(subst.first), subst.second);
     }
-
-    if (std::find(flags.begin(), flags.end(), "-std=c++17") == flags.end()) {
-        flags.push_back("-std=c++17");
-    }
     {
         // Collect CUDA include paths from CMake and common fallbacks
         std::vector<std::filesystem::path> include_paths;
diff --git a/src/core/utils/JitHelper.h b/src/core/utils/JitHelper.h
index d631312d..610c375c 100644
--- a/src/core/utils/JitHelper.h
+++ b/src/core/utils/JitHelper.h
@@ -47,13 +47,6 @@ class JitHelper {
         std::unordered_map<std::string, std::string> substitutions = std::unordered_map<std::string, std::string>(),
         std::vector<std::string> flags = std::vector<std::string>());
 
-    //// I'm pretty sure C++17 auto-converts this
-    // static CachedProgram buildProgram(
-    // 	const std::string& name, const std::string& code,
-    // 	std::vector<Header> headers = 0,
-    // 	std::vector<std::string> flags = 0
-    // );
-
     static const std::filesystem::path KERNEL_DIR;
     static const std::filesystem::path KERNEL_INCLUDE_DIR;
     static const std::filesystem::path CACHE_DIR;

From 881278aa1995981773f92b553d29bc877b48af0d Mon Sep 17 00:00:00 2001
From: Ruochun <ruochunz@gmail.com>
Date: Tue, 27 Jan 2026 01:30:38 +0800
Subject: [PATCH 08/17] (Supposedly) fix fake-remote-contact-induced big
 penetration problems

- Done by requiring tri--tri primitive contacts to respect the general
  relative direction of the mesh patches involved in order to have a say
  in contact normal voting. So this is still done by finding a good
  voting strategy, which I believe is the key to ruling out remote fake
  contacts
---
 src/DEM/Defines.h                           |  4 +-
 src/DEM/dT.cpp                              | 22 ++-----
 src/DEM/dT.h                                |  4 +-
 src/algorithms/DEMDynamicMisc.cu            | 73 ++++-----------------
 src/algorithms/DEMStaticDeviceSubroutines.h | 13 ----
 src/demo/DEMdemo_DrumCubes.cpp              |  2 +-
 src/kernel/DEMCalcForceKernels_Primitive.cu | 28 ++++++--
 src/kernel/DEMKinematicMisc.cu              |  6 +-
 8 files changed, 45 insertions(+), 107 deletions(-)

diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index 8cc61e26..3e300742 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -360,8 +360,8 @@ struct DEMDataDT {
     float3* contactTorque_convToForce;
     float3* contactPointGeometryA;
     float3* contactPointGeometryB;
-    // Array to record whether a triangle-triangle primitive contact satisfies SAT (is in physical contact)
-    notStupidBool_t* contactSATSatisfied;
+    // Array to record whether a triangle-triangle primitive contact is valid (respects patch--patch general direction)
+    notStupidBool_t* contactPatchDirectionRespected;
     // float3* contactHistory;
     // float* contactDuration;
 
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index f8c16464..47496ce0 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -67,7 +67,7 @@ void DEMDynamicThread::packDataPointers() {
     contactTorque_convToForce.bindDevicePointer(&(granData->contactTorque_convToForce));
     contactPointGeometryA.bindDevicePointer(&(granData->contactPointGeometryA));
     contactPointGeometryB.bindDevicePointer(&(granData->contactPointGeometryB));
-    contactSATSatisfied.bindDevicePointer(&(granData->contactSATSatisfied));
+    contactPatchDirectionRespected.bindDevicePointer(&(granData->contactPatchDirectionRespected));
     // granData->contactHistory = contactHistory.data();
     // granData->contactDuration = contactDuration.data();
 
@@ -576,7 +576,7 @@ void DEMDynamicThread::allocateGPUArrays(size_t nOwnerBodies,
         DEME_DUAL_ARRAY_RESIZE(idPrimitiveB, cnt_arr_size, 0);
         DEME_DUAL_ARRAY_RESIZE(contactTypePrimitive, cnt_arr_size, NOT_A_CONTACT);
         DEME_DUAL_ARRAY_RESIZE(geomToPatchMap, cnt_arr_size, 0);
-        DEME_DUAL_ARRAY_RESIZE(contactSATSatisfied, cnt_arr_size, 0);
+        DEME_DUAL_ARRAY_RESIZE(contactPatchDirectionRespected, cnt_arr_size, 0);
 
         DEME_DUAL_ARRAY_RESIZE(idPatchA, cnt_arr_size, 0);
         DEME_DUAL_ARRAY_RESIZE(idPatchB, cnt_arr_size, 0);
@@ -2042,7 +2042,7 @@ inline void DEMDynamicThread::contactPrimitivesArraysResize(size_t nContactPairs
         DEME_DUAL_ARRAY_RESIZE(contactPointGeometryA, nContactPairs, make_float3(0));
         DEME_DUAL_ARRAY_RESIZE(contactPointGeometryB, nContactPairs, make_float3(0));
         // NEW: Resize SAT satisfaction array for tracking tri-tri physical contact
-        DEME_DUAL_ARRAY_RESIZE(contactSATSatisfied, nContactPairs, 0);
+        DEME_DUAL_ARRAY_RESIZE(contactPatchDirectionRespected, nContactPairs, 0);
     }
 
     // Re-packing pointers now is automatic
@@ -2466,17 +2466,6 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                     startOffsetPrimitive, startOffsetPatch, countPrimitive, streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("maxPenetrations");
 
-                // Step 8d: Check if each patch has any SAT-satisfying primitive (for tri-tri contacts)
-                // If no primitive satisfies SAT, the patch contact is non-physical and should use Step 9 fallback
-                notStupidBool_t* patchHasSAT = nullptr;
-                if (contact_type == TRIANGLE_TRIANGLE_CONTACT) {
-                    patchHasSAT = (notStupidBool_t*)solverScratchSpace.allocateTempVector(
-                        "patchHasSAT", countPatch * sizeof(notStupidBool_t));
-                    checkPatchHasSATSatisfyingPrimitive(&granData, patchHasSAT, keys, startOffsetPrimitive,
-                                                        startOffsetPatch, countPrimitive, countPatch,
-                                                        streamInfo.stream);
-                }
-
                 // Clean up keys arrays now that we're done with reductions
                 solverScratchSpace.finishUsingTempVector("votingKeys");
                 solverScratchSpace.finishUsingTempVector("uniqueKeys");
@@ -2497,8 +2486,8 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 double3* finalContactPoints =
                     (double3*)solverScratchSpace.allocateTempVector("finalContactPoints", countPatch * sizeof(double3));
                 finalizePatchResults(totalProjectedAreas, votedNormals, maxProjectedPenetrations, votedContactPoints,
-                                     zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, patchHasSAT,
-                                     finalAreas, finalNormals, finalPenetrations.data(), finalContactPoints, countPatch,
+                                     zeroAreaNormals, zeroAreaPenetrations, zeroAreaContactPoints, finalAreas,
+                                     finalNormals, finalPenetrations.data(), finalContactPoints, countPatch,
                                      streamInfo.stream);
                 solverScratchSpace.finishUsingTempVector("totalProjectedAreas");
                 solverScratchSpace.finishUsingTempVector("votedNormals");
@@ -2507,7 +2496,6 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 solverScratchSpace.finishUsingTempVector("zeroAreaPenetrations");
                 solverScratchSpace.finishUsingTempVector("votedContactPoints");
                 solverScratchSpace.finishUsingTempVector("zeroAreaContactPoints");
-                solverScratchSpace.finishUsingTempVector("patchHasSAT");
 
                 // Now we have:
                 // - finalAreas: final contact area per patch pair (countPatch elements)
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index c95908b8..77720d99 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -236,8 +236,8 @@ class DEMDynamicThread {
     // Local position of contact point of contact w.r.t. the reference frame of body A and B
     DualArray<float3> contactPointGeometryA = DualArray<float3>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<float3> contactPointGeometryB = DualArray<float3>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
-    // Array to record whether a triangle-triangle primitive contact satisfies SAT (is in physical contact)
-    DualArray<notStupidBool_t> contactSATSatisfied =
+    // Array to record whether a triangle-triangle primitive contact respects patch--patch general direction
+    DualArray<notStupidBool_t> contactPatchDirectionRespected =
         DualArray<notStupidBool_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     // Wildcard (extra property) arrays associated with contacts and owners
     std::vector<std::unique_ptr<DualArray<float>>> contactWildcards;
diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index 8179fe47..92c9344c 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -138,15 +138,16 @@ __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
         // Extract the area (double) from contactPointGeometryB (stored as float3)
         float3 areaStorage = granData->contactPointGeometryB[myContactID];
         double area = float3StorageToDouble(areaStorage);
-        float3 penStorage = granData->contactPointGeometryA[myContactID];
-        double penetration = float3StorageToDouble(penStorage);
-        penetration = (penetration > DEME_TINY_FLOAT) ? penetration : DEME_TINY_FLOAT;
-        double recipPen = 1.0 / penetration;
+        // But primitive contacts that do not respect the patch general direction have no say in deciding the contact
+        // normal, so their area (voting weight) is zeroed
+        notStupidBool_t directionRespected = granData->contactPatchDirectionRespected[myContactID];
+        if (!directionRespected) {
+            area = 0.0;
+        }
 
         // Compute weighted normal (normal * area)
         // Note that fake contacts do not affect as their area is 0
-        weightedNormals[idx] = make_float3((double)normal.x * area * recipPen, (double)normal.y * area * recipPen,
-                                           (double)normal.z * area * recipPen);
+        weightedNormals[idx] = make_float3((double)normal.x * area, (double)normal.y * area, (double)normal.z * area);
 
         // Store area for reduction
         areas[idx] = area;
@@ -405,55 +406,7 @@ void findMaxPenetrationPrimitiveForZeroAreaPatches(DEMDataDT* granData,
     }
 }
 
-// Kernel to check if any primitive in each patch satisfies SAT (for tri-tri contacts)
-// Uses simple idempotent writes to set patchHasSAT[patchIdx] = 1 if any primitive has contactSATSatisfied = 1
-// Since we only transition from 0 to 1, and the array is pre-initialized to 0, multiple threads writing 1 is safe
-__global__ void checkPatchHasSATSatisfyingPrimitive_impl(DEMDataDT* granData,
-                                                         notStupidBool_t* patchHasSAT,
-                                                         contactPairs_t* keys,
-                                                         contactPairs_t startOffsetPrimitive,
-                                                         contactPairs_t startOffsetPatch,
-                                                         contactPairs_t countPrimitive) {
-    contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < countPrimitive) {
-        contactPairs_t myContactID = startOffsetPrimitive + idx;
-        contactPairs_t patchIdx = keys[idx];
-        contactPairs_t localPatchIdx = patchIdx - startOffsetPatch;
-
-        // Check if this primitive satisfies SAT
-        notStupidBool_t satisfiesSAT = granData->contactSATSatisfied[myContactID];
-
-        // If this primitive satisfies SAT, mark the patch as having at least one SAT-satisfying primitive
-        // Since we only need to set 0 -> 1, a simple write is safe (multiple threads writing 1 is idempotent)
-        if (satisfiesSAT) {
-            patchHasSAT[localPatchIdx] = 1;
-        }
-    }
-}
-
-void checkPatchHasSATSatisfyingPrimitive(DEMDataDT* granData,
-                                         notStupidBool_t* patchHasSAT,
-                                         contactPairs_t* keys,
-                                         contactPairs_t startOffsetPrimitive,
-                                         contactPairs_t startOffsetPatch,
-                                         contactPairs_t countPrimitive,
-                                         contactPairs_t countPatch,
-                                         cudaStream_t& this_stream) {
-    // Initialize patchHasSAT to 0
-    DEME_GPU_CALL(cudaMemsetAsync(patchHasSAT, 0, countPatch * sizeof(notStupidBool_t), this_stream));
-
-    size_t blocks_needed = (countPrimitive + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
-    if (blocks_needed > 0) {
-        checkPatchHasSATSatisfyingPrimitive_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            granData, patchHasSAT, keys, startOffsetPrimitive, startOffsetPatch, countPrimitive);
-        DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
-    }
-}
-
 // Kernel to finalize patch results by combining normal voting results with zero-area case handling
-// For patches with totalArea > 0 AND patchHasSAT = 1: use voted normal and weighted penetration
-// For patches with totalArea == 0 OR patchHasSAT = 0: use max-penetration primitive's normal and penetration (Step 8
-// fallback)
 __global__ void finalizePatchResults_impl(double* totalProjectedAreas,
                                           float3* votedNormals,
                                           double* votedPenetrations,
@@ -461,7 +414,6 @@ __global__ void finalizePatchResults_impl(double* totalProjectedAreas,
                                           float3* zeroAreaNormals,
                                           double* zeroAreaPenetrations,
                                           double3* zeroAreaContactPoints,
-                                          notStupidBool_t* patchHasSAT,
                                           double* finalAreas,
                                           float3* finalNormals,
                                           double* finalPenetrations,
@@ -470,18 +422,16 @@ __global__ void finalizePatchResults_impl(double* totalProjectedAreas,
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
         double projectedArea = totalProjectedAreas[idx];
-        // Default to 1 (SAT satisfied) for non-triangle-triangle contacts where patchHasSAT is null
-        notStupidBool_t hasSAT = (patchHasSAT != nullptr) ? patchHasSAT[idx] : 1;
 
-        // Use voted results only if projectedArea > 0 AND at least one primitive satisfies SAT
-        if (projectedArea > 0.0 && hasSAT) {
+        // Use voted results only if projectedArea > 0
+        if (projectedArea > 0.0) {
             // Normal case: use voted results
             finalAreas[idx] = projectedArea;
             finalNormals[idx] = votedNormals[idx];
             finalPenetrations[idx] = votedPenetrations[idx];
             finalContactPoints[idx] = votedContactPoints[idx];
         } else {
-            // Zero-area case OR no SAT-satisfying primitives: use max-penetration primitive's results (Step 8 fallback)
+            // Zero-area case: use max-penetration primitive's results (Step 8 fallback)
             // Set finalArea to 0 for these cases
             finalAreas[idx] = 0.0;
             finalNormals[idx] = zeroAreaNormals[idx];
@@ -498,7 +448,6 @@ void finalizePatchResults(double* totalProjectedAreas,
                           float3* zeroAreaNormals,
                           double* zeroAreaPenetrations,
                           double3* zeroAreaContactPoints,
-                          notStupidBool_t* patchHasSAT,
                           double* finalAreas,
                           float3* finalNormals,
                           double* finalPenetrations,
@@ -509,7 +458,7 @@ void finalizePatchResults(double* totalProjectedAreas,
     if (blocks_needed > 0) {
         finalizePatchResults_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
             totalProjectedAreas, votedNormals, votedPenetrations, votedContactPoints, zeroAreaNormals,
-            zeroAreaPenetrations, zeroAreaContactPoints, patchHasSAT, finalAreas, finalNormals, finalPenetrations,
+            zeroAreaPenetrations, zeroAreaContactPoints, finalAreas, finalNormals, finalPenetrations,
             finalContactPoints, count);
         DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
     }
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 747e74c9..edcaf6e5 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -222,17 +222,6 @@ void findMaxPenetrationPrimitiveForZeroAreaPatches(DEMDataDT* granData,
                                                    contactPairs_t countPrimitive,
                                                    cudaStream_t& this_stream);
 
-// Checks if any primitive in each patch satisfies SAT (for tri-tri contacts)
-// Outputs a flag per patch: 1 if at least one SAT-satisfying primitive exists, 0 otherwise
-void checkPatchHasSATSatisfyingPrimitive(DEMDataDT* granData,
-                                         notStupidBool_t* patchHasSAT,
-                                         contactPairs_t* keys,
-                                         contactPairs_t startOffsetPrimitive,
-                                         contactPairs_t startOffsetPatch,
-                                         contactPairs_t countPrimitive,
-                                         contactPairs_t countPatch,
-                                         cudaStream_t& this_stream);
-
 // Finalizes patch results by combining normal voting with zero-area case handling
 void finalizePatchResults(double* totalProjectedAreas,
                           float3* votedNormals,
@@ -241,7 +230,6 @@ void finalizePatchResults(double* totalProjectedAreas,
                           float3* zeroAreaNormals,
                           double* zeroAreaPenetrations,
                           double3* zeroAreaContactPoints,
-                          notStupidBool_t* patchHasSAT,
                           double* finalAreas,
                           float3* finalNormals,
                           double* finalPenetrations,
@@ -253,7 +241,6 @@ void finalizePatchResults(double* totalProjectedAreas,
 void finalizePatchContactPoints(double* totalAreas,
                                 double3* votedContactPoints,
                                 double3* zeroAreaContactPoints,
-                                notStupidBool_t* patchHasSAT,
                                 double3* finalContactPoints,
                                 contactPairs_t count,
                                 cudaStream_t& this_stream);
diff --git a/src/demo/DEMdemo_DrumCubes.cpp b/src/demo/DEMdemo_DrumCubes.cpp
index 836c4679..55b1842d 100644
--- a/src/demo/DEMdemo_DrumCubes.cpp
+++ b/src/demo/DEMdemo_DrumCubes.cpp
@@ -113,7 +113,7 @@ int main() {
     create_directory(out_dir);
 
     float time_end = 3.0f;
-    unsigned int fps = 20;
+    unsigned int fps = 100;
     float frame_time = 1.0f / fps;
 
     std::cout << "Output at " << fps << " FPS" << std::endl;
diff --git a/src/kernel/DEMCalcForceKernels_Primitive.cu b/src/kernel/DEMCalcForceKernels_Primitive.cu
index b9c0614e..47d231d7 100644
--- a/src/kernel/DEMCalcForceKernels_Primitive.cu
+++ b/src/kernel/DEMCalcForceKernels_Primitive.cu
@@ -33,7 +33,10 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
     double overlapDepth = 0.0;
     // Area of the contact surface, or in the mesh--mesh case, area of the clipping polygon projection
     double overlapArea = 0.0;
+    // `Body pos' in the primitive contact kernel means the position of the primitive itself, e.g., sphere center or
+    // triangle nodes
     double3 AOwnerPos, bodyAPos, BOwnerPos, bodyBPos;
+    // Radius always means radius of curvature; for triangle and analytical entity, it's set to a huge number
     float AOwnerMass, ARadius, BOwnerMass, BRadius;
     float4 AOriQ, BOriQ;
     deme::materialsOffset_t bodyAMatType, bodyBMatType;
@@ -41,6 +44,9 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
     float extraMarginSize = 0.;
     // Triangle A's three points are defined outside, as may be reused in B's acquisition and penetration calc.
     double3 triANode1, triANode2, triANode3;
+    // Mesh's patch location may be needed for testing if this primitive contact respects the patch's general spatial
+    // direction
+    float3 triPatchPosA;
     // Then allocate the optional quantities that will be needed in the force model (note: this one can't be in a
     // curly bracket, obviously...)
     _forceModelIngredientDefinition_;
@@ -97,6 +103,7 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
         deme::bodyID_t myPatchID = granData->triPatchID[triID];
         bodyAMatType = granData->patchMaterialOffset[myPatchID];
         extraMarginSize = granData->familyExtraMarginSize[AOwnerFamily];
+        float3 relPosPatch = granData->relPosPatch[myPatchID];
 
         triANode1 = to_double3(granData->relPosNode1[triID]);
         triANode2 = to_double3(granData->relPosNode2[triID]);
@@ -123,6 +130,10 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
         triANode3 += AOwnerPos;
         // Assign the correct bodyAPos
         bodyAPos = triangleCentroid<double3>(triANode1, triANode2, triANode3);
+
+        // Get triPatchPosA ready
+        applyOriQToVector3(relPosPatch.x, relPosPatch.y, relPosPatch.z, AOriQ.w, AOriQ.x, AOriQ.y, AOriQ.z);
+        triPatchPosA = relPosPatch + to_float3(AOwnerPos);
     } else {
         // Currently, we only support sphere and mesh for body A
         ContactType = deme::NOT_A_CONTACT;
@@ -185,6 +196,7 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
         // If this is a triangle then it has a patch ID
         deme::bodyID_t myPatchID = granData->triPatchID[triID];
         bodyBMatType = granData->patchMaterialOffset[myPatchID];
+        float3 relPosPatch = granData->relPosPatch[myPatchID];
 
         // As the grace margin, the distance (negative overlap) just needs to be within the grace margin. So we pick
         // the larger of the 2 familyExtraMarginSize.
@@ -217,6 +229,9 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
         triBNode3 += BOwnerPos;
         // Assign the correct bodyBPos
         bodyBPos = triangleCentroid<double3>(triBNode1, triBNode2, triBNode3);
+        // Get triPatchPosB ready
+        applyOriQToVector3(relPosPatch.x, relPosPatch.y, relPosPatch.z, BOriQ.w, BOriQ.x, BOriQ.y, BOriQ.z);
+        float3 triPatchPosB = relPosPatch + to_float3(BOwnerPos);
 
         // If B is a triangle, then A can be a sphere or a triangle.
         if constexpr (AType == deme::GEO_T_SPHERE) {
@@ -246,13 +261,12 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
                                                                             overlapDepth, overlapArea, contactPnt);
             B2A = to_float3(contact_normal);
 
-            // Record whether this tri-tri primitive contact satisfies SAT (is in physical contact)
-            // Use the dedicated SAT check function to determine if triangles are truly in physical contact
-            // Note: checkTriangleTriangleOverlap uses projection which can report contact even for non-physical
-            // "submerged" cases, so we need the actual SAT test for accurate physical contact determination
-            bool satisfiesSAT = checkTriangleTriangleSAT<double3, double>(triANode1, triANode2, triANode3, triBNode1,
-                                                                          triBNode2, triBNode3);
-            granData->contactSATSatisfied[myPrimitiveContactID] = satisfiesSAT ? 1 : 0;
+            // We require that in the tri--tri case, the contact also respects the patch--patch general direction. This
+            // is because if the contact margin is very large, the algorithm can detect remote fake `submerge' cases
+            // which involve the triangles of the wrong sides of the mesh particles. But in this case, the direction of
+            // this contact is almost always opposite to the general direction of the 2 patches (in terms of B2A).
+            float dotProd = dot(B2A, triPatchPosA - triPatchPosB);
+            granData->contactPatchDirectionRespected[myPrimitiveContactID] = (dotProd > 0.f) ? 1 : 0;
 
             // Fix ContactType if needed
             // If the solver says in contact, we do not question it
diff --git a/src/kernel/DEMKinematicMisc.cu b/src/kernel/DEMKinematicMisc.cu
index 82c62723..9793906b 100644
--- a/src/kernel/DEMKinematicMisc.cu
+++ b/src/kernel/DEMKinematicMisc.cu
@@ -89,9 +89,9 @@ __global__ void computeMarginFromAbsv_implTri(deme::DEMSimParams* simParams,
         double finalMargin =
             (double)(vel * simParams->expSafetyMulti + simParams->expSafetyAdder) * (*ts) * (*maxDrift) +
             granData->familyExtraMarginSize[my_family];
-        // if (finalMargin < penetrationMargin) {
-        //     finalMargin = penetrationMargin;
-        // }
+        if (finalMargin < penetrationMargin) {
+            finalMargin = penetrationMargin;
+        }
 
         granData->marginSizeTriangle[triID] = finalMargin;
     }

From 0068b68e1fcf7c83fef39e45305e275cfc09c55b Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Mon, 26 Jan 2026 19:55:03 +0100
Subject: [PATCH 09/17] AutoPatchSplit early mock up + colored PLY output

---
 src/DEM/API.h                               |   4 +
 src/DEM/APIPublic.cpp                       |   6 +-
 src/DEM/BdrsAndObjs.h                       | 122 ++-
 src/DEM/MeshUtils.cpp                       | 878 ++++++++++++++++++--
 src/DEM/dT.cpp                              |  39 +-
 src/DEM/dT.h                                |   4 +-
 src/demo/ModularTests/DEMTest_MeshPatch.cpp | 123 ++-
 7 files changed, 1104 insertions(+), 72 deletions(-)

diff --git a/src/DEM/API.h b/src/DEM/API.h
index 631f3e64..0a2aa32c 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -1475,6 +1475,8 @@ class DEMSolver {
     /// @brief Specify the output file format of meshes.
     /// @param format A choice between "VTK", "OBJ", "STL", "PLY".
     void SetMeshOutputFormat(const std::string& format);
+    /// @brief Enable/disable per-patch face colors in PLY mesh output (for testing auto patch splitting only).
+    void EnableMeshPatchColorOutput(bool enable = true);
     /// @brief Clear stored solver logs (errors, warnings, messages).
     void ClearLog() { Logger::GetInstance().Clear(); }
     /// @brief Show error and warnings.
@@ -1570,6 +1572,8 @@ class DEMSolver {
                                      CNT_OUTPUT_CONTENT::CNT_WILDCARD;
     // The output file format for meshes
     MESH_FORMAT m_mesh_out_format = MESH_FORMAT::VTK;
+    // If PLY mesh output should include per-patch face colors
+    bool m_mesh_out_ply_patch_colors = false;
     // If the solver should output wildcards to file
     bool m_is_out_owner_wildcards = false;
     bool m_is_out_cnt_wildcards = false;
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index d93358f3..de4d0b8c 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -172,6 +172,10 @@ void DEMSolver::SetMeshOutputFormat(const std::string& format) {
     }
 }
 
+void DEMSolver::EnableMeshPatchColorOutput(bool enable) {
+    m_mesh_out_ply_patch_colors = enable;
+}
+
 void DEMSolver::SetOutputContent(const std::vector<std::string>& content) {
     std::vector<std::string> u_content(content.size());
     for (unsigned int i = 0; i < content.size(); i++) {
@@ -2185,7 +2189,7 @@ void DEMSolver::WriteMeshFile(const std::string& outfilename) const {
             dT->migrateClumpPosInfoToHost();
             m_output_thread = std::thread([this, outfilename]() {
                 std::ofstream ptFile(outfilename, std::ios::out);
-                dT->writeMeshesAsPlyFromHost(ptFile);
+                dT->writeMeshesAsPlyFromHost(ptFile, m_mesh_out_ply_patch_colors);
             });
             break;
         }
diff --git a/src/DEM/BdrsAndObjs.h b/src/DEM/BdrsAndObjs.h
index 3e8b0b34..7bb747ae 100644
--- a/src/DEM/BdrsAndObjs.h
+++ b/src/DEM/BdrsAndObjs.h
@@ -606,16 +606,6 @@ class DEMMesh : public DEMInitializer {
     // Whether patch locations have been explicitly set
     bool patch_locations_explicitly_set = false;
 
-    /// @brief Split the mesh into convex patches based on angle threshold.
-    /// @details Uses a region-growing algorithm to group adjacent triangles whose face normals differ by less than
-    /// the specified angle threshold. Each patch represents a locally convex region of the mesh. Patches are
-    /// non-overlapping and cover the entire mesh. This is useful for contact force calculations.
-    /// @param angle_threshold_deg Maximum angle (in degrees) between adjacent face normals to be in same patch.
-    /// Default is 30.0 degrees. Lower values create more patches (stricter convexity), higher values create fewer
-    /// patches (relaxed convexity).
-    /// @return Number of patches created.
-    unsigned int SplitIntoConvexPatches(float angle_threshold_deg = 30.0f);
-
     /// @brief Manually set the patch IDs for each triangle.
     /// @details Allows user to manually specify which patch each triangle belongs to. This is useful when
     /// the user has pre-computed patch information or wants to define patches based on custom criteria.
@@ -660,6 +650,118 @@ class DEMMesh : public DEMInitializer {
     /// patch.
     /// @return Vector of locations (one per patch).
     std::vector<float3> ComputePatchLocations() const;
+    // ------------------------------------------------------------
+    // Advanced mesh patch splitting + quality reporting
+    // ------------------------------------------------------------
+    enum class PatchQualityLevel : uint8_t { SAFE = 0, WARN = 1, CRITICAL = 2 };
+
+    enum class PatchConstraintStatus : uint8_t {
+        SATISFIED = 0,
+        TOO_MANY_UNMERGEABLE = 1,   // patch_max could not be reached because of hard/concave barriers
+        TOO_FEW_UNSPLITTABLE = 2    // patch_min could not be reached (too little "splittable" structure)
+    };
+
+    struct PatchQualityPatch {
+        PatchQualityLevel level = PatchQualityLevel::SAFE;
+
+        // Normal statistics (area-weighted mean normal, area only for weighting)
+        float worst_angle_deg = 0.0f;   // max deviation from mean normal (largest triangle deviation)
+        float coherence_r = 1.0f;       // ||sum(A*n)|| / sum(A) in [0,1] (1 = perfectly aligned)
+
+        unsigned int n_tris = 0;
+
+        // Internal violations (should be 0 in a "clean" patching)
+        unsigned int hard_crossings = 0;     // internal edges whose triangle normals exceed hard_angle_deg
+        unsigned int concave_crossings = 0;  // internal concave edges (if concavity enabled and oriented edge is reliable)
+        unsigned int unoriented_edges = 0;   // internal edges where orientation test failed (sign dihedral unreliable)
+    };
+
+    struct PatchQualityReport {
+        PatchQualityLevel overall = PatchQualityLevel::SAFE;
+        PatchConstraintStatus constraint_status = PatchConstraintStatus::SATISFIED;
+
+        unsigned int achieved_patches = 0;
+        unsigned int requested_min = 1;
+        unsigned int requested_max = std::numeric_limits<unsigned int>::max();
+
+        std::vector<PatchQualityPatch> per_patch;
+    };
+
+    struct PatchQualityOptions {
+        // Coherence thresholds
+        float safe_r = 0.85f;
+        float warn_r = 0.65f;
+
+        // Worst-angle tolerance:
+        // - compare worst_angle_deg to the "reference" (patch_normal_max if enabled, else hard_angle)
+        float warn_worst_angle_margin_deg = 5.0f;
+
+        bool hard_crossings_are_critical = true;
+        bool concave_crossings_are_critical = false;
+
+        // If unoriented edges are many, concavity sign is unreliable; treat it at least as WARN if concavity is enabled.
+        unsigned int unoriented_warn_threshold = 10;
+    };
+
+    struct PatchSplitOptions {
+        // Hysteresis:
+        // - soft < hard  => easy merges below soft, cautious merges in (soft..hard)
+        // - soft < 0     => disable hysteresis (soft = hard)
+        float soft_angle_deg = -1.0f;
+
+        // Statistical criterion:
+        // Max allowed angle between candidate triangle normal and current PATCH mean normal.
+        // < 0 => disabled (legacy-like behavior).
+        float patch_normal_max_deg = -1.0f;
+
+        // Concavity filter using signed dihedral angle (reliable for consistently oriented manifold surfaces)
+        bool block_concave_edges = false;
+        float concave_allow_deg = 0.0f;  // 0 => block any concave edge; allow small negative dihedral if desired
+
+        // Patch count constraints (count-only; no area threshold)
+        unsigned int patch_min = 1;
+        unsigned int patch_max = std::numeric_limits<unsigned int>::max();
+
+        // Seeding strategy
+        bool seed_largest_first = true;
+
+        // Optional auto-tuning (OFF by default)
+        struct AutoTuneOptions {
+            bool enabled = false;
+
+            // Stop once overall quality is <= target_level (SAFE is strictest)
+            PatchQualityLevel target_level = PatchQualityLevel::WARN;
+
+            unsigned int max_iters = 6;
+
+            // Step sizes for tightening/loosening (deg)
+            float step_deg = 5.0f;
+
+            // Allow enabling concavity block automatically if it helps
+            bool allow_enable_concavity = true;
+        } auto_tune;
+    };
+
+    /// @brief Smart patch splitter with optional hysteresis, patch-normal statistics, dihedral concavity blocking,
+    ///        patch_min/patch_max enforcement, and optional quality report + auto-tuning.
+    /// @param hard_angle_deg Mandatory: edges above this are NEVER merged.
+    /// @param opt Advanced controls.
+    /// @param out_report Optional: returns SAFE/WARN/CRITICAL feedback + constraint status.
+    /// @param qopt Classification thresholds for feedback.
+    /// @return Number of patches created (achieved).
+    unsigned int SplitIntoConvexPatches(float hard_angle_deg,
+                                        const PatchSplitOptions& opt,
+                                        PatchQualityReport* out_report,
+                                        const PatchQualityOptions& qopt);
+    unsigned int SplitIntoConvexPatches(float hard_angle_deg) {
+        return SplitIntoConvexPatches(hard_angle_deg, PatchSplitOptions(), nullptr, PatchQualityOptions());
+    }
+    unsigned int SplitIntoConvexPatches(float hard_angle_deg, const PatchSplitOptions& opt) {
+        return SplitIntoConvexPatches(hard_angle_deg, opt, nullptr, PatchQualityOptions());
+    }
+    unsigned int SplitIntoConvexPatches(float hard_angle_deg, const PatchSplitOptions& opt, PatchQualityReport* out_report) {
+        return SplitIntoConvexPatches(hard_angle_deg, opt, out_report, PatchQualityOptions());
+    }
 
     ////////////////////////////////////////////////////////
     // Some geo wildcard-related stuff
diff --git a/src/DEM/MeshUtils.cpp b/src/DEM/MeshUtils.cpp
index f6ded434..80d35e96 100644
--- a/src/DEM/MeshUtils.cpp
+++ b/src/DEM/MeshUtils.cpp
@@ -673,89 +673,865 @@ static std::vector<std::vector<size_t>> buildAdjacencyMap(const std::vector<int3
     return adjacency;
 }
 
-// Split mesh into convex patches using region-growing algorithm.
-// The algorithm groups adjacent triangles (sharing an edge) if the angle between their
-// face normals is below the threshold. Each patch represents a locally convex region.
-unsigned int DEMMesh::SplitIntoConvexPatches(float angle_threshold_deg) {
+// ------------------------------------------------------------
+// Helpers for advanced patching
+// ------------------------------------------------------------
+struct EdgeAdjInfo {
+    size_t nbr = 0;           // index of the face on the other side of the shared edge
+    int va = -1;              // oriented edge vertex A (as appears in the current triangle)
+    int vb = -1;              // oriented edge vertex B (as appears in the current triangle)
+    bool oriented_ok = false; // true if the neighbor lists the shared edge reversed (vb->va), i.e. consistent winding
+};
+
+static inline float dot3(const float3& a, const float3& b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+static inline float3 cross3(const float3& a, const float3& b) {
+    return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+static inline float norm3(const float3& v) {
+    return std::sqrt(dot3(v, v));
+}
+static inline float3 normalize3(const float3& v) {
+    float n = norm3(v);
+    if (n > DEME_TINY_FLOAT)
+        return make_float3(v.x / n, v.y / n, v.z / n);
+    return make_float3(0, 0, 0);
+}
+static inline float3 add3(const float3& a, const float3& b) {
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+static inline float3 mul3(const float3& v, float s) {
+    return make_float3(v.x * s, v.y * s, v.z * s);
+}
+static inline float clamp11(float x) {
+    return std::max(-1.0f, std::min(1.0f, x));
+}
+static inline float deg2rad(float deg) {
+    return deg * (deme::PI / 180.0f);
+}
+static inline float rad2deg(float rad) {
+    return rad * (180.0f / deme::PI);
+}
+
+static float computeTriangleArea(const float3& v0, const float3& v1, const float3& v2) {
+    float3 e1 = make_float3(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z);
+    float3 e2 = make_float3(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z);
+    float3 c = cross3(e1, e2);
+    return 0.5f * norm3(c);
+}
+
+// Signed dihedral angle (deg) around oriented edge va->vb of the current triangle, returned in [-180, 180].
+// Sign is meaningful only when edge orientation is reliable (oriented_ok == true); negative is treated as concave.
+static float signedDihedralDeg(const float3& n_cur, const float3& n_nbr, const float3& vA, const float3& vB) {
+    float3 e = normalize3(make_float3(vB.x - vA.x, vB.y - vA.y, vB.z - vA.z));  // unit edge direction
+    float s = dot3(e, cross3(n_cur, n_nbr));  // sine-like term: component of n_cur x n_nbr along the edge
+    float c = clamp11(dot3(n_cur, n_nbr));  // cosine-like term, clamped to [-1, 1]
+    float theta = std::atan2(s, c);  // [-pi, pi]
+    return rad2deg(theta);
+}
+
+// Build triangle adjacency WITH oriented shared-edge info (one EdgeAdjInfo per neighbor, stored on both faces).
+// Only edges used by exactly 2 distinct faces link them; boundary/non-manifold/degenerate edges create no link.
+static std::vector<std::vector<EdgeAdjInfo>> buildAdjacencyWithEdgeInfo(const std::vector<int3>& face_v_indices) {
+    struct EdgeRec {
+        size_t f;  // index of the face that contributed this edge
+        int a;     // edge start vertex, in that face's own vertex order
+        int b;     // edge end vertex, in that face's own vertex order
+    };
+
+    const size_t num_faces = face_v_indices.size();
+    std::vector<std::vector<EdgeAdjInfo>> adj(num_faces);
+
+    std::map<std::pair<int, int>, std::vector<EdgeRec>> edge_map;  // key: undirected edge as (min, max) vertex index
+
+    auto add_edge = [&](size_t f, int a, int b) {
+        int lo = std::min(a, b);
+        int hi = std::max(a, b);
+        edge_map[{lo, hi}].push_back(EdgeRec{f, a, b});  // keep the directed (a, b) so orientation can be compared
+    };
+
+    for (size_t i = 0; i < num_faces; ++i) {
+        const int3& tri = face_v_indices[i];
+        add_edge(i, tri.x, tri.y);
+        add_edge(i, tri.y, tri.z);
+        add_edge(i, tri.z, tri.x);
+    }
+
+    for (const auto& kv : edge_map) {
+        const auto& recs = kv.second;
+        if (recs.size() != 2 || recs[0].f == recs[1].f) {
+            continue;  // boundary, non-manifold, or one degenerate face listing the same edge twice (no self-link)
+        }
+        const EdgeRec& r0 = recs[0];
+        const EdgeRec& r1 = recs[1];
+
+        bool oriented_ok_0 = (r0.a == r1.b && r0.b == r1.a);  // the two faces traverse the edge in opposite directions
+        bool oriented_ok_1 = oriented_ok_0;                   // the reversal test is symmetric in r0/r1
+
+        adj[r0.f].push_back(EdgeAdjInfo{r1.f, r0.a, r0.b, oriented_ok_0});
+        adj[r1.f].push_back(EdgeAdjInfo{r0.f, r1.a, r1.b, oriented_ok_1});
+    }
+
+    return adj;
+}
+
+// ------------------------------------------------------------
+// Smart patch splitter
+// ------------------------------------------------------------
+unsigned int DEMMesh::SplitIntoConvexPatches(float hard_angle_deg,
+                                             const PatchSplitOptions& opt_in,
+                                             PatchQualityReport* out_report,
+                                             const PatchQualityOptions& qopt) {
     if (nTri == 0) {
         patches_explicitly_set = false;
         nPatches = 1;
+        if (out_report) {
+            out_report->overall = PatchQualityLevel::SAFE;
+            out_report->constraint_status = PatchConstraintStatus::SATISFIED;
+            out_report->achieved_patches = 1;
+            out_report->requested_min = 1;
+            out_report->requested_max = 1;
+            out_report->per_patch.clear();
+        }
         return 0;
     }
 
-    // Initialize patch IDs (all -1 means unassigned)
-    m_patch_ids.clear();
-    m_patch_ids.resize(nTri, -1);
+    if (hard_angle_deg <= 0.0f) {
+        DEME_ERROR("SplitIntoConvexPatches: hard_angle_deg must be > 0.");
+    }
+    if (opt_in.patch_min == 0) {
+        DEME_ERROR("SplitIntoConvexPatches: patch_min must be >= 1.");
+    }
+    if (opt_in.patch_min > opt_in.patch_max) {
+        DEME_ERROR("SplitIntoConvexPatches: patch_min cannot be > patch_max.");
+    }
+
+    // Copy options (we may adjust defaults in a controlled way)
+    PatchSplitOptions opt = opt_in;
+
+    hard_angle_deg = std::min(180.0f, std::max(0.0f, hard_angle_deg));
+
+    // Resolve hysteresis
+    float soft_angle_deg = (opt.soft_angle_deg >= 0.0f) ? opt.soft_angle_deg : hard_angle_deg;
+    soft_angle_deg = std::min(hard_angle_deg, std::max(0.0f, soft_angle_deg));
+
+    // If user activates hysteresis (soft < hard) but didn't enable patch-normal gating, set a sensible default:
+    // otherwise the mid-band has no extra decision signal.
+    bool patch_gate_enabled = (opt.patch_normal_max_deg >= 0.0f);
+    if (!patch_gate_enabled && soft_angle_deg < hard_angle_deg) {
+        opt.patch_normal_max_deg = soft_angle_deg;
+        patch_gate_enabled = true;
+    }
+
+    float patch_normal_max_deg = opt.patch_normal_max_deg;  // may be <0 => disabled
+    if (patch_gate_enabled) {
+        patch_normal_max_deg = std::min(180.0f, std::max(0.0f, patch_normal_max_deg));
+    }
+
+    const float cos_hard = std::cos(deg2rad(hard_angle_deg));
+    const float cos_soft = std::cos(deg2rad(soft_angle_deg));
+    float cos_patch = -1.0f;
+    if (patch_gate_enabled) {
+        cos_patch = std::cos(deg2rad(patch_normal_max_deg));
+    }
 
-    // Compute face normals for all triangles
+    // Precompute face normals and areas
     std::vector<float3> face_normals(nTri);
+    std::vector<float> face_areas(nTri, 0.0f);
     for (size_t i = 0; i < nTri; ++i) {
-        const int3& face = m_face_v_indices[i];
-        const float3& v0 = m_vertices[face.x];
-        const float3& v1 = m_vertices[face.y];
-        const float3& v2 = m_vertices[face.z];
+        const int3& f = m_face_v_indices[i];
+        const float3& v0 = m_vertices[f.x];
+        const float3& v1 = m_vertices[f.y];
+        const float3& v2 = m_vertices[f.z];
         face_normals[i] = computeFaceNormal(v0, v1, v2);
+        face_areas[i] = computeTriangleArea(v0, v1, v2);
+        if (face_areas[i] <= DEME_TINY_FLOAT)
+            face_areas[i] = 0.0f;
+    }
+
+    // Adjacency with edge info
+    auto adjacency = buildAdjacencyWithEdgeInfo(m_face_v_indices);
+
+    // Seed order
+    std::vector<size_t> seeds(nTri);
+    for (size_t i = 0; i < nTri; ++i)
+        seeds[i] = i;
+    if (opt.seed_largest_first) {
+        std::stable_sort(seeds.begin(), seeds.end(), [&](size_t a, size_t b) { return face_areas[a] > face_areas[b]; });
     }
 
-    // Build adjacency map (which triangles share edges)
-    std::vector<std::vector<size_t>> adjacency = buildAdjacencyMap(m_face_v_indices);
+    // Core segmentation routine (no post-merge/split)
+    auto segment_once = [&](const PatchSplitOptions& o,
+                            float soft_deg,
+                            bool patch_gate,
+                            float cosPatch,
+                            std::vector<patchID_t>& out_ids,
+                            unsigned int& out_nP) {
+        out_ids.assign(nTri, (patchID_t)-1);
+
+        int current_patch_id = 0;
+        std::vector<size_t> queue;
+        queue.reserve(256);
+
+        for (size_t si = 0; si < nTri; ++si) {
+            size_t seed = seeds[si];
+            if (out_ids[seed] != (patchID_t)-1)
+                continue;
 
-    // Region growing algorithm to assign patches
-    int current_patch_id = 0;
-    std::vector<size_t> queue;
+            if (current_patch_id > std::numeric_limits<patchID_t>::max()) {
+                DEME_ERROR("SplitIntoPatches: too many patches for patchID_t.");
+            }
 
-    for (size_t seed = 0; seed < nTri; ++seed) {
-        // Skip if already assigned to a patch
-        if (m_patch_ids[seed] != -1) {
-            continue;
+            float3 sumN = mul3(face_normals[seed], face_areas[seed]);
+            float sumA = face_areas[seed];
+            float3 patchN = normalize3(sumN);
+
+            queue.clear();
+            queue.push_back(seed);
+            out_ids[seed] = (patchID_t)current_patch_id;
+
+            size_t qi = 0;
+            while (qi < queue.size()) {
+                size_t cur = queue[qi++];
+
+                for (const auto& e : adjacency[cur]) {
+                    size_t nb = e.nbr;
+                    if (out_ids[nb] != (patchID_t)-1)
+                        continue;
+
+                    const float3& n_cur = face_normals[cur];
+                    const float3& n_nb = face_normals[nb];
+
+                    // Hard barrier (mandatory)
+                    float d_cn = clamp11(dot3(n_cur, n_nb));
+                    if (d_cn < cos_hard)
+                        continue;
+
+                    // Optional concavity barrier
+                    if (o.block_concave_edges && e.oriented_ok) {
+                        const float3& vA = m_vertices[e.va];
+                        const float3& vB = m_vertices[e.vb];
+                        float dih = signedDihedralDeg(n_cur, n_nb, vA, vB);
+                        if (dih < -o.concave_allow_deg)
+                            continue;
+                    }
+
+                    // Hysteresis band:
+                    // - if below soft: we still require patch gate if enabled (otherwise accept)
+                    // - if between soft and hard: require patch gate if enabled; otherwise accept (legacy-like)
+                    bool in_soft = (d_cn >= cos_soft);
+
+                    if (patch_gate) {
+                        float d_pn = clamp11(dot3(patchN, n_nb));
+                        if (d_pn < cosPatch)
+                            continue;
+                        // pass patch gate => accept
+                    } else {
+                        // no patch gate => legacy-like behavior (soft only matters if patch gate is active)
+                        (void)in_soft;
+                    }
+
+                    out_ids[nb] = (patchID_t)current_patch_id;
+                    queue.push_back(nb);
+
+                    if (face_areas[nb] > 0.0f) {
+                        sumN = add3(sumN, mul3(n_nb, face_areas[nb]));
+                        sumA += face_areas[nb];
+                        patchN = normalize3(sumN);
+                    }
+                }
+            }
+
+            current_patch_id++;
+        }
+
+        out_nP = (unsigned int)current_patch_id;
+    };
+
+    // A small helper to compress patch IDs to [0..nP-1]
+    auto compress_ids = [&](std::vector<patchID_t>& ids, unsigned int& out_nP) {
+        auto res = rank_transform<patchID_t>(ids);
+        ids = std::move(res.first);
+        // recompute nP
+        patchID_t mx = 0;
+        for (auto v : ids)
+            if (v > mx) mx = v;
+        out_nP = (unsigned int)(mx + 1);
+    };
+
+    // Enforce opt.patch_max: greedily merge adjacent patches, smallest normal deviation first (hard/concave respected)
+    auto enforce_patch_max = [&](std::vector<patchID_t>& ids, unsigned int& pcount, PatchConstraintStatus& cstat) {
+        if (pcount <= opt.patch_max)
+            return;  // already within the limit; ids, pcount and cstat are left untouched
+
+        // Build patch mean normals (area-weighted); triangles with non-positive area are ignored
+        std::vector<float3> pSumN(pcount, make_float3(0, 0, 0));
+        std::vector<float> pSumA(pcount, 0.0f);
+
+        for (size_t t = 0; t < nTri; ++t) {
+            int p = (int)ids[t];  // indexes pSumN/pSumA, so ids must already lie in [0, pcount)
+            if (face_areas[t] > 0.0f) {
+                pSumN[p] = add3(pSumN[p], mul3(face_normals[t], face_areas[t]));
+                pSumA[p] += face_areas[t];
+            }
         }
 
-        // Start a new patch from this seed triangle
-        queue.clear();
-        queue.push_back(seed);
-        m_patch_ids[seed] = current_patch_id;
+        struct DSU {  // union-find over patch indices; unite() also folds the per-patch normal/area sums
+            std::vector<int> parent, rnk;
+            std::vector<float3>* sumN;  // non-owning; set to the pSumN passed to the constructor
+            std::vector<float>* sumA;  // non-owning; set to the pSumA passed to the constructor
+
+            DSU(int n, std::vector<float3>& sN, std::vector<float>& sA) : parent(n), rnk(n, 0), sumN(&sN), sumA(&sA) {
+                for (int i = 0; i < n; ++i) parent[i] = i;
+            }
+            int find(int x) {  // root lookup; each visited node is re-pointed to its grandparent on the way up
+                while (parent[x] != x) {
+                    parent[x] = parent[parent[x]];
+                    x = parent[x];
+                }
+                return x;
+            }
+            bool unite(int a, int b) {  // union by rank; returns false when a and b already share a root
+                a = find(a); b = find(b);
+                if (a == b) return false;
+                if (rnk[a] < rnk[b]) std::swap(a, b);
+                parent[b] = a;
+                if (rnk[a] == rnk[b]) rnk[a]++;
+                (*sumN)[a] = add3((*sumN)[a], (*sumN)[b]);  // surviving root a accumulates b's sums
+                (*sumA)[a] += (*sumA)[b];
+                return true;
+            }
+            float3 patchN(int x) {  // normalize3 of the summed normal stored at x's root
+                x = find(x);
+                return normalize3((*sumN)[x]);
+            }
+        };
+
+        DSU dsu((int)pcount, pSumN, pSumA);
+
+        struct Cand { float cost; int a; int b; };
+        struct Cmp { bool operator()(const Cand& x, const Cand& y) const { return x.cost > y.cost; } };  // lowest cost on top
+
+        auto cost_between = [&](int a, int b) {  // 1 - clamp11(dot) of the two sets' current mean normals
+            float3 na = dsu.patchN(a);
+            float3 nb = dsu.patchN(b);
+            float d = clamp11(dot3(na, nb));
+            return 1.0f - d;  // smaller is better (more parallel)
+        };
+
+        // Candidate patch adjacency across mergeable edges (hard + optional concavity); one cost kept per patch pair
+        std::map<std::pair<int, int>, float> best_cost;
+
+        for (size_t t = 0; t < nTri; ++t) {
+            int pt = (int)ids[t];
+            for (const auto& e : adjacency[t]) {
+                size_t nb = e.nbr;
+                int pn = (int)ids[nb];
+                if (pt == pn)
+                    continue;  // both faces already in the same patch
+
+                float d = clamp11(dot3(face_normals[t], face_normals[nb]));
+                if (d < cos_hard)
+                    continue;  // face normals' dot is below cos_hard: this edge is not a merge candidate
+
+                if (opt.block_concave_edges && e.oriented_ok) {  // concavity test only for edges flagged oriented_ok
+                    const float3& vA = m_vertices[e.va];
+                    const float3& vB = m_vertices[e.vb];
+                    float dih = signedDihedralDeg(face_normals[t], face_normals[nb], vA, vB);
+                    if (dih < -opt.concave_allow_deg)
+                        continue;  // signed dihedral below -concave_allow_deg: not a merge candidate
+                }
+
+                int a = std::min(pt, pn);  // (low, high) key so both traversal directions share one map entry
+                int b = std::max(pt, pn);
+                float c = cost_between(a, b);
+
+                auto key = std::make_pair(a, b);
+                auto it = best_cost.find(key);
+                if (it == best_cost.end() || c < it->second)
+                    best_cost[key] = c;
+            }
+        }
+
+        std::priority_queue<Cand, std::vector<Cand>, Cmp> pq;  // pops the lowest-cost pair first
+        for (const auto& kv : best_cost)
+            pq.push(Cand{kv.second, kv.first.first, kv.first.second});
+
+        unsigned int cur = pcount;
+        while (cur > opt.patch_max && !pq.empty()) {  // costs are the pre-merge values; they are not refreshed
+            auto c = pq.top(); pq.pop();
+            int ra = dsu.find(c.a);
+            int rb = dsu.find(c.b);
+            if (ra == rb)
+                continue;  // pair was already joined through earlier unions
+            if (dsu.unite(ra, rb))
+                cur--;
+        }
+
+        // If we couldn't merge enough, mark as unmergeable
+        if (cur > opt.patch_max)
+            cstat = PatchConstraintStatus::TOO_MANY_UNMERGEABLE;
+
+        // Write back merged ids and compress: roots are renumbered 0,1,2,... in order of first appearance
+        std::unordered_map<int, patchID_t> rep2new;
+        rep2new.reserve(pcount * 2);
+
+        patchID_t next = 0;
+        for (size_t i = 0; i < nTri; ++i) {
+            int p = (int)ids[i];
+            int r = dsu.find(p);
+            auto it = rep2new.find(r);
+            if (it == rep2new.end()) {
+                rep2new.emplace(r, next);
+                ids[i] = next;
+                next++;
+            } else {
+                ids[i] = it->second;
+            }
+        }
+        pcount = (unsigned int)next;  // number of distinct roots actually referenced by a triangle
+    };
+
+    // Enforce opt.patch_min (count-only): repeatedly split the patch with the widest normal spread into two
+    auto enforce_patch_min = [&](std::vector<patchID_t>& ids, unsigned int& pcount, PatchConstraintStatus& cstat) {
+        if (pcount >= opt.patch_min)
+            return;  // already enough patches; nothing is modified
 
-        // Grow the region
-        size_t queue_idx = 0;
-        while (queue_idx < queue.size()) {
-            size_t current = queue[queue_idx++];
+        auto rebuild_patch_lists = [&](std::vector<std::vector<size_t>>& pTris) {  // pTris[p] = triangles with ids == p
+            pTris.assign(pcount, {});
+            for (size_t i = 0; i < nTri; ++i) {
+                int p = (int)ids[i];
+                pTris[p].push_back(i);
+            }
+        };
+
+        std::vector<std::vector<size_t>> pTris;
+        rebuild_patch_lists(pTris);
+
+        auto patch_mean_normal = [&](int p) {  // normalize3 of the area-weighted sum of patch p's face normals
+            float3 sumN = make_float3(0, 0, 0);
+            float sumA = 0.0f;
+            for (size_t t : pTris[p]) {
+                if (face_areas[t] > 0.0f) {
+                    sumN = add3(sumN, mul3(face_normals[t], face_areas[t]));
+                    sumA += face_areas[t];
+                }
+            }
+            (void)sumA;  // sumA is accumulated but not otherwise used
+            return normalize3(sumN);
+        };
+
+        auto pick_patch_to_split = [&]() -> int {  // index of the patch to split, or -1 if none qualifies
+            float worst = 1.0f;  // a patch qualifies only if some face's dot with its mean normal is below 1
+            int worst_p = -1;
+            for (int p = 0; p < (int)pcount; ++p) {
+                if (pTris[p].size() < 2)
+                    continue;  // a single triangle cannot be split
+                float3 pn = patch_mean_normal(p);
+                float minDot = 1.0f;
+                for (size_t t : pTris[p]) {
+                    float d = clamp11(dot3(pn, face_normals[t]));
+                    minDot = std::min(minDot, d);
+                }
+                if (minDot < worst) {
+                    worst = minDot;
+                    worst_p = p;
+                }
+            }
+            return worst_p;
+        };
+
+        struct Node { float cost; size_t tri; int label; };
+        struct NodeCmp { bool operator()(const Node& a, const Node& b) const { return a.cost > b.cost; } };  // lowest cost on top
+
+        std::vector<int8_t> label(nTri, -2);  // -2: outside current patch, -1: in patch but unclaimed, 0/1: seed region
+        std::vector<size_t> touched; touched.reserve(2048);
+
+        while (pcount < opt.patch_min) {  // each pass either adds exactly one patch or breaks with TOO_FEW_UNSPLITTABLE
+            int p = pick_patch_to_split();
+            if (p < 0) {
+                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
+                break;
+            }
+            const auto& tris = pTris[p];
+            if (tris.size() < 2) {
+                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
+                break;
+            }
+
+            // choose 2 seeds with farthest normals (2-sweep)
+            size_t t0 = tris[0];
+            size_t sA = t0;  // sweep 1: face whose normal has the smallest dot with tris[0]'s normal
+            float best = 1.0f;
+            for (size_t t : tris) {
+                float d = clamp11(dot3(face_normals[t0], face_normals[t]));
+                if (d < best) { best = d; sA = t; }
+            }
+            size_t sB = sA;  // sweep 2: face whose normal has the smallest dot with sA's normal
+            best = 1.0f;
+            for (size_t t : tris) {
+                float d = clamp11(dot3(face_normals[sA], face_normals[t]));
+                if (d < best) { best = d; sB = t; }
+            }
+            if (sA == sB) {  // no face in the patch has a dot below 1 with sA: no second seed exists
+                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
+                break;
+            }
+
+            touched.clear();
+            for (size_t t : tris) {
+                label[t] = -1;
+                touched.push_back(t);
+            }
+
+            std::priority_queue<Node, std::vector<Node>, NodeCmp> pq;
+            label[sA] = 0; label[sB] = 1;
+            pq.push(Node{0.0f, sA, 0});
+            pq.push(Node{0.0f, sB, 1});
+
+            const float3 seedN[2] = {face_normals[sA], face_normals[sB]};
 
-            // Check all adjacent triangles
-            for (size_t neighbor : adjacency[current]) {
-                // Skip if already assigned
-                if (m_patch_ids[neighbor] != -1) {
+            while (!pq.empty()) {  // grow both seed regions over the patch's triangles
+                Node cur = pq.top(); pq.pop();
+                size_t t = cur.tri;
+                int lbl = cur.label;
+                if (label[t] != lbl)
                     continue;
+
+                for (const auto& e : adjacency[t]) {
+                    size_t nb = e.nbr;
+                    if (label[nb] != -1)
+                        continue;  // outside this patch (-2) or already claimed by a region (0/1)
+
+                    float d = clamp11(dot3(face_normals[t], face_normals[nb]));
+                    if (d < cos_hard)
+                        continue;  // growth never crosses an edge whose face normals' dot is below cos_hard
+
+                    float dn = clamp11(dot3(face_normals[nb], seedN[lbl]));  // priority is measured against the seed normal
+                    float cost = 1.0f - dn;
+
+                    label[nb] = (int8_t)lbl;  // claimed at push time, so the first region to reach nb keeps it
+                    pq.push(Node{cost, nb, lbl});
                 }
+            }
+
+            size_t c0 = 0, c1 = 0;
+            for (size_t t : tris) {
+                if (label[t] == 0) c0++;
+                else if (label[t] == 1) c1++;
+            }
+            if (c0 == 0 || c1 == 0) {  // defensive: both seeds were labelled above, so each count starts at >= 1
+                for (size_t t : touched) label[t] = -2;
+                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
+                break;
+            }
 
-                // Check angle between normals
-                float angle = computeAngleBetweenNormals(face_normals[current], face_normals[neighbor]);
+            patchID_t newP = (patchID_t)pcount;
+            pcount++;
+
+            for (size_t t : tris) {
+                ids[t] = (label[t] == 1) ? newP : (patchID_t)p;  // region 0 and unreached (-1) triangles stay in patch p
+            }
 
-                // If angle is below threshold, add to same patch
-                if (angle <= angle_threshold_deg) {
-                    m_patch_ids[neighbor] = current_patch_id;
-                    queue.push_back(neighbor);
+            for (size_t t : touched) label[t] = -2;  // restore the scratch labels for the next pass
+
+            // compress & rebuild
+            compress_ids(ids, pcount);
+            rebuild_patch_lists(pTris);  // NOTE: this reassigns pTris, so `tris` must not be used after this line
+        }
+    };
+
+    // Quality report: per-patch coherence, worst normal deviation and crossing counts, graded SAFE/WARN/CRITICAL
+    auto compute_report = [&](const std::vector<patchID_t>& ids,
+                              unsigned int pcount,
+                              PatchConstraintStatus cstat,
+                              PatchQualityReport& rep) {
+        rep.per_patch.assign(pcount, PatchQualityPatch{});
+        rep.overall = PatchQualityLevel::SAFE;  // raised below to the most severe per-patch level
+        rep.constraint_status = cstat;
+        rep.achieved_patches = pcount;
+        rep.requested_min = opt.patch_min;
+        rep.requested_max = opt.patch_max;
+
+        std::vector<std::vector<size_t>> pTris(pcount);  // pTris[p] = triangles whose id is p
+        for (size_t i = 0; i < nTri; ++i) {
+            int p = (int)ids[i];
+            pTris[p].push_back(i);
+        }
+
+        std::vector<float3> pSumN(pcount, make_float3(0, 0, 0));
+        std::vector<float>  pSumA(pcount, 0.0f);
+
+        for (int p = 0; p < (int)pcount; ++p) {
+            for (size_t t : pTris[p]) {
+                if (face_areas[t] > 0.0f) {
+                    pSumN[p] = add3(pSumN[p], mul3(face_normals[t], face_areas[t]));
+                    pSumA[p] += face_areas[t];
                 }
             }
         }
 
-        // Move to next patch
-        current_patch_id++;
+        // reference angle for classification (read from the enclosing scope, not from a per-run option set)
+        float ref_angle_deg = patch_gate_enabled ? patch_normal_max_deg : hard_angle_deg;
+
+        for (int p = 0; p < (int)pcount; ++p) {
+            PatchQualityPatch pq;
+            pq.n_tris = (unsigned int)pTris[p].size();
+
+            float3 meanN = normalize3(pSumN[p]);
+            float sumA = pSumA[p];
+            float r = (sumA > DEME_TINY_FLOAT) ? (norm3(pSumN[p]) / sumA) : 0.0f;  // norm3 of summed normal / summed area
+            pq.coherence_r = std::min(1.0f, std::max(0.0f, r));  // clamped to [0, 1]
+
+            float minDot = 1.0f;
+            for (size_t t : pTris[p]) {
+                float d = clamp11(dot3(meanN, face_normals[t]));
+                minDot = std::min(minDot, d);
+            }
+            pq.worst_angle_deg = rad2deg(std::acos(clamp11(minDot)));  // from the smallest dot between meanN and a face
+
+            unsigned int hard_cross = 0;
+            unsigned int conc_cross = 0;
+            unsigned int unoriented = 0;
+
+            for (size_t t : pTris[p]) {
+                for (const auto& e : adjacency[t]) {
+                    size_t nb = e.nbr;
+                    if ((int)ids[nb] != p)
+                        continue;  // only edges with both faces inside patch p are inspected
+
+                    float d = clamp11(dot3(face_normals[t], face_normals[nb]));
+                    if (d < cos_hard)
+                        hard_cross++;
+
+                    if (opt.block_concave_edges) {
+                        if (!e.oriented_ok) {
+                            unoriented++;
+                        } else {
+                            const float3& vA = m_vertices[e.va];
+                            const float3& vB = m_vertices[e.vb];
+                            float dih = signedDihedralDeg(face_normals[t], face_normals[nb], vA, vB);
+                            if (dih < -opt.concave_allow_deg)
+                                conc_cross++;
+                        }
+                    }
+                }
+            }
+
+            pq.hard_crossings = hard_cross / 2;  // halved: presumably each internal edge is listed by both of its faces
+            pq.concave_crossings = conc_cross / 2;
+            pq.unoriented_edges = unoriented / 2;
+
+            PatchQualityLevel lvl = PatchQualityLevel::SAFE;
+
+            if (qopt.hard_crossings_are_critical && pq.hard_crossings > 0) {
+                lvl = PatchQualityLevel::CRITICAL;
+            }
+
+            if (lvl != PatchQualityLevel::CRITICAL) {  // grade by coherence and worst angle
+                bool angle_ok = (pq.worst_angle_deg <= ref_angle_deg);
+                bool angle_warn = (pq.worst_angle_deg <= ref_angle_deg + qopt.warn_worst_angle_margin_deg);
+
+                if (pq.coherence_r < qopt.warn_r || !angle_warn) {
+                    lvl = PatchQualityLevel::CRITICAL;
+                } else if (pq.coherence_r < qopt.safe_r || !angle_ok) {
+                    lvl = PatchQualityLevel::WARN;
+                }
+            }
+
+            if (opt.block_concave_edges && pq.concave_crossings > 0) {  // concave crossings escalate, never downgrade
+                if (qopt.concave_crossings_are_critical)
+                    lvl = PatchQualityLevel::CRITICAL;
+                else if (lvl == PatchQualityLevel::SAFE)
+                    lvl = PatchQualityLevel::WARN;
+            }
+
+            if (opt.block_concave_edges && pq.unoriented_edges >= qopt.unoriented_warn_threshold && lvl == PatchQualityLevel::SAFE) {
+                lvl = PatchQualityLevel::WARN;
+            }
+
+            pq.level = lvl;
+            rep.per_patch[p] = pq;
+
+            if ((int)lvl > (int)rep.overall)  // overall = highest per-patch level by enum value
+                rep.overall = lvl;
+        }
+    };
+
+    // ------------------------------------------------------------
+    // Optional auto tuning (OFF unless opt.auto_tune.enabled == true)
+    // ------------------------------------------------------------
+    auto run_full = [&](PatchSplitOptions run_opt,  // taken by value: patch_normal_max_deg may be overwritten below
+                        std::vector<patchID_t>& ids_out,
+                        unsigned int& pcount_out,
+                        PatchConstraintStatus& cstat_out,
+                        PatchQualityReport* rep_out) {
+        cstat_out = PatchConstraintStatus::SATISFIED;  // reset; the enforce_* steps below may overwrite it
+
+        float run_soft = (run_opt.soft_angle_deg >= 0.0f) ? run_opt.soft_angle_deg : hard_angle_deg;  // negative => hard angle
+        run_soft = std::min(hard_angle_deg, std::max(0.0f, run_soft));  // clamp to [0, hard_angle_deg]
+
+        bool run_patch_gate = (run_opt.patch_normal_max_deg >= 0.0f);  // a negative value means no explicit gate
+        if (!run_patch_gate && run_soft < hard_angle_deg) {  // a soft angle below the hard angle turns the gate on
+            run_opt.patch_normal_max_deg = run_soft;
+            run_patch_gate = true;
+        }
+
+        float run_cos_patch = -1.0f;  // only recomputed when the gate is active
+        if (run_patch_gate) {
+            float run_patch_deg = std::min(180.0f, std::max(0.0f, run_opt.patch_normal_max_deg));
+            run_cos_patch = std::cos(deg2rad(run_patch_deg));
+        }
+
+        // segment, then compress the resulting IDs
+        segment_once(run_opt, run_soft, run_patch_gate, run_cos_patch, ids_out, pcount_out);
+        compress_ids(ids_out, pcount_out);
+
+        // enforce max, then min (count-only)
+        enforce_patch_max(ids_out, pcount_out, cstat_out);
+        enforce_patch_min(ids_out, pcount_out, cstat_out);
+
+        // final compress
+        compress_ids(ids_out, pcount_out);
+
+        if (rep_out) {  // the report is optional; nullptr skips it
+            PatchQualityReport tmp;
+            // NOTE: compute_report grades against the OUTER patch_gate_enabled / patch_normal_max_deg (or
+            // hard_angle_deg), not against run_opt, so its reference angle can differ from the gate used in
+            // this run. Deriving the reference angle from run_opt would be more accurate; kept simple here.
+            compute_report(ids_out, pcount_out, cstat_out, tmp);
+            *rep_out = std::move(tmp);
+        }
+    };
+
+    std::vector<patchID_t> best_ids;
+    unsigned int best_pcount = 0;
+    PatchConstraintStatus best_cstat = PatchConstraintStatus::SATISFIED;
+    PatchQualityReport best_rep;
+
+    if (!opt.auto_tune.enabled) {
+        run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
+    } else {
+        // Auto-tuning is conservative: it will not run if you hard-fix the count (patch_min == patch_max),
+        // because then your intention is explicit ("keep the cube a cube").
+        if (opt.patch_min == opt.patch_max) {
+            run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
+        } else {
+            // Start from user options; search by tightening/loosening patch_normal_max_deg (and soft if present)
+            PatchSplitOptions cur = opt;
+
+            auto severity_score = [&](PatchQualityLevel lvl) { return (int)lvl; };
+
+            bool have_best = false;
+
+            for (unsigned int it = 0; it < opt.auto_tune.max_iters; ++it) {
+                std::vector<patchID_t> ids;
+                unsigned int pc = 0;
+                PatchConstraintStatus cs = PatchConstraintStatus::SATISFIED;
+                PatchQualityReport rep;
+
+                run_full(cur, ids, pc, cs, &rep);
+
+                // candidate score: prioritize meeting constraints, then quality, then fewer patches
+                bool constraints_ok = (cs == PatchConstraintStatus::SATISFIED);
+                int sev = severity_score(rep.overall);
+
+                auto better_than = [&](bool ok, int s, unsigned int p) {
+                    if (!have_best) return true;
+                    bool best_ok = (best_cstat == PatchConstraintStatus::SATISFIED);
+                    int best_sev = severity_score(best_rep.overall);
+                    if (ok != best_ok) return ok;          // prefer satisfied
+                    if (s != best_sev) return s < best_sev; // prefer safer
+                    return p < best_pcount;                // prefer fewer patches
+                };
+
+                if (better_than(constraints_ok, sev, pc)) {
+                    best_ids = std::move(ids);
+                    best_pcount = pc;
+                    best_cstat = cs;
+                    best_rep = std::move(rep);
+                    have_best = true;
+                }
+
+                // stop if good enough
+                if (constraints_ok && (int)best_rep.overall <= (int)opt.auto_tune.target_level)
+                    break;
+
+                // Adjust rules:
+                // - If CRITICAL and we can afford more patches => tighten (smaller patch_normal_max, smaller soft)
+                // - If too many unmergeable patches => loosen (bigger patch_normal_max, bigger soft, disable concavity if needed)
+                // - If too few patches => tighten
+                if (cs == PatchConstraintStatus::TOO_MANY_UNMERGEABLE) {
+                    // loosen
+                    if (cur.patch_normal_max_deg >= 0.0f)
+                        cur.patch_normal_max_deg = std::min(180.0f, cur.patch_normal_max_deg + opt.auto_tune.step_deg);
+                    if (cur.soft_angle_deg >= 0.0f)
+                        cur.soft_angle_deg = std::min(hard_angle_deg, cur.soft_angle_deg + opt.auto_tune.step_deg);
+                    if (cur.block_concave_edges && opt.auto_tune.allow_enable_concavity) {
+                        // concavity block can prevent merging; relax it
+                        cur.block_concave_edges = false;
+                    }
+                } else if (pc < opt.patch_min || rep.overall == PatchQualityLevel::CRITICAL) {
+                    // tighten if possible
+                    if (cur.patch_normal_max_deg < 0.0f)
+                        cur.patch_normal_max_deg = std::min(hard_angle_deg, 45.0f);  // enable with a sane default
+                    else
+                        cur.patch_normal_max_deg = std::max(0.0f, cur.patch_normal_max_deg - opt.auto_tune.step_deg);
+
+                    if (cur.soft_angle_deg >= 0.0f)
+                        cur.soft_angle_deg = std::max(0.0f, cur.soft_angle_deg - opt.auto_tune.step_deg);
+
+                    if (!cur.block_concave_edges && opt.auto_tune.allow_enable_concavity) {
+                        cur.block_concave_edges = true;
+                        cur.concave_allow_deg = std::max(0.0f, cur.concave_allow_deg);
+                    }
+                } else if (pc > opt.patch_max) {
+                    // loosen (but note: enforce_patch_max already tries)
+                    if (cur.patch_normal_max_deg >= 0.0f)
+                        cur.patch_normal_max_deg = std::min(180.0f, cur.patch_normal_max_deg + opt.auto_tune.step_deg);
+                    if (cur.soft_angle_deg >= 0.0f)
+                        cur.soft_angle_deg = std::min(hard_angle_deg, cur.soft_angle_deg + opt.auto_tune.step_deg);
+                } else {
+                    // stable but not good enough; slightly tighten coherence if we have headroom under patch_max
+                    if (pc < opt.patch_max) {
+                        if (cur.patch_normal_max_deg < 0.0f)
+                            cur.patch_normal_max_deg = std::min(hard_angle_deg, 45.0f);
+                        else
+                            cur.patch_normal_max_deg = std::max(0.0f, cur.patch_normal_max_deg - opt.auto_tune.step_deg);
+                    } else {
+                        break;
+                    }
+                }
+            }
+
+            // If never found, fall back
+            if (!have_best) {
+                run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
+            }
+        }
     }
 
-    nPatches = current_patch_id;
+    // Commit to mesh state
+    m_patch_ids = std::move(best_ids);
+    nPatches = best_pcount;
     patches_explicitly_set = true;
 
-    // If material is set and we cannot broadcast it to all patches, we raise error
+    // Feedback output
+    if (out_report) {
+        *out_report = std::move(best_rep);
+    }
+
+    // Material broadcasting (same as existing behavior)
+    if (isMaterialSet && materials.size() == 1) {
+        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
+    }
     if (isMaterialSet && materials.size() != nPatches) {
         DEME_ERROR(
             "The number of materials set (%zu) does not match the number of patches (%u). Please set the "
             "material for each patch or use a single material for all patches.",
             materials.size(), nPatches);
     }
-    // If material is set and we can broadcast it to all patches, we do so
-    if (isMaterialSet && materials.size() == 1) {
-        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
-    }
 
     return nPatches;
 }
@@ -786,6 +1562,10 @@ void DEMMesh::SetPatchIDs(const std::vector<patchID_t>& patch_ids) {
 
     patches_explicitly_set = true;
 
+    // If material is set and we can broadcast it to all patches, we do so
+    if (isMaterialSet && materials.size() == 1) {
+        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
+    }
     // If material is set and we cannot broadcast it to all patches, we raise error
     if (isMaterialSet && materials.size() != nPatches) {
         DEME_ERROR(
@@ -793,10 +1573,6 @@ void DEMMesh::SetPatchIDs(const std::vector<patchID_t>& patch_ids) {
             "material for each patch or use a single material for all patches.",
             materials.size(), nPatches);
     }
-    // If material is set and we can broadcast it to all patches, we do so
-    if (isMaterialSet && materials.size() == 1) {
-        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
-    }
 }
 
 // Compute patch locations (relative to CoM, which is implicitly at 0,0,0)
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index a72d3197..6232694c 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2337,13 +2337,13 @@ void DEMDynamicThread::writeMeshesAsStlFromHost(std::ofstream& ptFile) {
     ptFile << ostream.str();
 }
 
-void DEMDynamicThread::writeMeshesAsPly(std::ofstream& ptFile) {
+void DEMDynamicThread::writeMeshesAsPly(std::ofstream& ptFile, bool patch_colors) {  // migrate to host, then write
     migrateFamilyToHost();
     migrateClumpPosInfoToHost();
-    writeMeshesAsPlyFromHost(ptFile);
+    writeMeshesAsPlyFromHost(ptFile, patch_colors);  // patch_colors is passed through unchanged to the host-side writer
 }
 
-void DEMDynamicThread::writeMeshesAsPlyFromHost(std::ofstream& ptFile) {
+void DEMDynamicThread::writeMeshesAsPlyFromHost(std::ofstream& ptFile, bool patch_colors) {
     std::ostringstream ostream;
 
     auto ownerPosFromHost = [this](bodyID_t owner) {
@@ -2393,6 +2393,11 @@ void DEMDynamicThread::writeMeshesAsPlyFromHost(std::ofstream& ptFile) {
     ostream << "property float z" << std::endl;
     ostream << "element face " << total_f << std::endl;
     ostream << "property list uchar int vertex_indices" << std::endl;
+    if (patch_colors) {
+        ostream << "property uchar red" << std::endl;
+        ostream << "property uchar green" << std::endl;
+        ostream << "property uchar blue" << std::endl;
+    }
     ostream << "end_header" << std::endl;
 
     mesh_num = 0;
@@ -2411,13 +2416,37 @@ void DEMDynamicThread::writeMeshesAsPlyFromHost(std::ofstream& ptFile) {
     }
 
     ostream << std::endl;
+    auto hash32 = [](uint32_t x) {  // bit mixer: xor-shift by 16, multiply, xor-shift by 15, multiply, xor-shift by 16
+        const uint32_t fold1 = x ^ (x >> 16);
+        const uint32_t mul1 = fold1 * 0x7feb352d;
+        const uint32_t fold2 = mul1 ^ (mul1 >> 15);
+        const uint32_t mul2 = fold2 * 0x846ca68b;
+        const uint32_t fold3 = mul2 ^ (mul2 >> 16);
+        return fold3;
+    };
+
     mesh_num = 0;
     for (const auto& mmesh : m_meshes) {
         if (!thisMeshSkip[mesh_num]) {
-            for (const auto& f : mmesh->GetIndicesVertexes()) {
+            const auto& faces = mmesh->GetIndicesVertexes();
+            const auto& patch_ids = mmesh->GetPatchIDs();
+            bool has_patch_ids = (patch_ids.size() == faces.size());
+
+            for (size_t fi = 0; fi < faces.size(); ++fi) {
+                const auto& f = faces[fi];
                 ostream << "3 " << (size_t)f.x + vertexOffset[mesh_num] << " "
                         << (size_t)f.y + vertexOffset[mesh_num] << " "
-                        << (size_t)f.z + vertexOffset[mesh_num] << std::endl;
+                        << (size_t)f.z + vertexOffset[mesh_num];
+                if (patch_colors) {
+                    uint32_t patch_id = has_patch_ids ? static_cast<uint32_t>(patch_ids[fi]) : 0u;
+                    uint32_t key = patch_id + 0x9e3779b9u * (mesh_num + 1u);
+                    uint32_t h = hash32(key);
+                    unsigned int r = (h >> 16) & 0xFFu;
+                    unsigned int g = (h >> 8) & 0xFFu;
+                    unsigned int b = h & 0xFFu;
+                    ostream << " " << r << " " << g << " " << b;
+                }
+                ostream << std::endl;
             }
         }
         mesh_num++;
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index a6dd86f8..cbb4f0c7 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -867,13 +867,13 @@ class DEMDynamicThread {
     void writeContactsAsCsv(std::ofstream& ptFile, float force_thres = DEME_TINY_FLOAT);
     void writeMeshesAsVtk(std::ofstream& ptFile);
     void writeMeshesAsStl(std::ofstream& ptFile);
-    void writeMeshesAsPly(std::ofstream& ptFile);
+    void writeMeshesAsPly(std::ofstream& ptFile, bool patch_colors = false);
     void writeSpheresAsCsvFromHost(std::ofstream& ptFile);
     void writeClumpsAsCsvFromHost(std::ofstream& ptFile, unsigned int accuracy = 10);
     void writeContactsAsCsvFromHost(std::ofstream& ptFile, float force_thres = DEME_TINY_FLOAT);
     void writeMeshesAsVtkFromHost(std::ofstream& ptFile);
     void writeMeshesAsStlFromHost(std::ofstream& ptFile);
-    void writeMeshesAsPlyFromHost(std::ofstream& ptFile);
+    void writeMeshesAsPlyFromHost(std::ofstream& ptFile, bool patch_colors = false);
 
     /// Called each time when the user calls DoDynamicsThenSync.
     void startThread();
diff --git a/src/demo/ModularTests/DEMTest_MeshPatch.cpp b/src/demo/ModularTests/DEMTest_MeshPatch.cpp
index cd98a258..06c9b5e2 100644
--- a/src/demo/ModularTests/DEMTest_MeshPatch.cpp
+++ b/src/demo/ModularTests/DEMTest_MeshPatch.cpp
@@ -18,6 +18,7 @@
 #include <cstdio>
 #include <iostream>
 #include <iomanip>
+#include <limits>
 
 using namespace deme;
 using namespace std::filesystem;
@@ -75,6 +76,23 @@ int main() {
             }
         }
 
+        // Optimized patch settings for convex-focused splitting (prefer single patch)
+        std::cout << "\n--- Test 2b: Optimized Convex Patch Splitting (Cube) ---" << std::endl;
+        DEMMesh::PatchSplitOptions opt;
+        opt.soft_angle_deg = -1.0f;
+        opt.patch_normal_max_deg = -1.0f;
+        opt.block_concave_edges = true;
+        opt.concave_allow_deg = 0.0f;
+        opt.patch_min = 1;
+        opt.patch_max = std::numeric_limits<unsigned int>::max();
+        opt.seed_largest_first = true;
+        opt.auto_tune.enabled = false;
+
+        DEMMesh::PatchQualityReport rep_cube;
+        size_t num_patches_opt = cube_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_cube);
+        std::cout << "Optimized patches: " << num_patches_opt << " (quality "
+                  << static_cast<int>(rep_cube.overall) << ")" << std::endl;
+
         // Test manual patch ID setting
         std::cout << "\n--- Test 3: Manual Patch ID Setting ---" << std::endl;
         size_t num_tris = cube_mesh->GetNumTriangles();
@@ -113,9 +131,21 @@ int main() {
         std::cout << "Number of triangles: " << sphere_mesh->GetNumTriangles() << std::endl;
         std::cout << "Number of vertices: " << sphere_mesh->GetNumNodes() << std::endl;
 
-        // Test with 30 degree threshold
-        size_t num_patches = sphere_mesh->SplitIntoConvexPatches(30.0f);
-        std::cout << "Split into " << num_patches << " patches (threshold: 30 degrees)" << std::endl;
+        // Optimized patch split (prefer single patch)
+        DEMMesh::PatchSplitOptions opt;
+        opt.soft_angle_deg = -1.0f;
+        opt.patch_normal_max_deg = -1.0f;
+        opt.block_concave_edges = true;
+        opt.concave_allow_deg = 0.0f;
+        opt.patch_min = 1;
+        opt.patch_max = std::numeric_limits<unsigned int>::max();
+        opt.seed_largest_first = true;
+        opt.auto_tune.enabled = false;
+
+        DEMMesh::PatchQualityReport rep_sphere;
+        size_t num_patches = sphere_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_sphere);
+        std::cout << "Split into " << num_patches << " patches (optimized, quality "
+                  << static_cast<int>(rep_sphere.overall) << ")" << std::endl;
 
         if (sphere_mesh->ArePatchesExplicitlySet()) {
             const auto& patch_ids = sphere_mesh->GetPatchIDs();
@@ -146,6 +176,93 @@ int main() {
     std::cout << "Patches explicitly set: " << (empty_mesh->ArePatchesExplicitlySet() ? "yes" : "no")
               << " (expected: no)" << std::endl;
 
+    // Test concave mesh (drum)
+    std::cout << "\n--- Test 6: Concave Drum Mesh (STL) ---" << std::endl;
+    auto drum_mesh = std::make_shared<DEMMesh>();
+    loaded = drum_mesh->LoadSTLMesh((GET_DATA_PATH() / "mesh/drum.stl").string());
+    if (loaded) {
+        std::cout << "Loaded drum mesh successfully" << std::endl;
+        std::cout << "Number of triangles: " << drum_mesh->GetNumTriangles() << std::endl;
+        std::cout << "Number of vertices: " << drum_mesh->GetNumNodes() << std::endl;
+
+        DEMMesh::PatchSplitOptions opt;
+        opt.soft_angle_deg = -1.0f;
+        opt.patch_normal_max_deg = -1.0f;
+        opt.block_concave_edges = true;
+        opt.concave_allow_deg = 0.0f;
+        opt.patch_min = 1;
+        opt.patch_max = std::numeric_limits<unsigned int>::max();
+        opt.seed_largest_first = true;
+        opt.auto_tune.enabled = false;
+
+        DEMMesh::PatchQualityReport rep_drum;
+        size_t num_patches = drum_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_drum);
+        std::cout << "Split into " << num_patches << " patches (concave, quality "
+                  << static_cast<int>(rep_drum.overall) << ")" << std::endl;
+    } else {
+        std::cout << "Drum mesh not available, skipping" << std::endl;
+    }
+
+    // Test PLY export with per-patch colors (debug view)
+    std::cout << "\n--- Test 7: PLY Export with Patch Colors (per mesh) ---" << std::endl;
+    {
+        path out_dir = current_path();
+        out_dir /= "DemoOutput_MeshPatch";
+        create_directory(out_dir);
+
+        // Build a throw-away solver around one mesh file, split the mesh into patches with the
+        // options below and write it as a PLY with per-patch colors into out_dir; `label` names
+        // the output file. OBJ and STL files take the same LoadMeshType call, so the third
+        // argument (is_stl) only documents the call sites and has no effect on loading.
+        auto export_mesh = [&](const std::string& label, const path& mesh_path, bool /*is_stl*/) {
+            DEMSolver DEMSim;
+            DEMSim.SetVerbosity("INFO");
+            DEMSim.SetMeshOutputFormat("PLY");
+            DEMSim.EnableMeshPatchColorOutput(true);
+            DEMSim.InstructBoxDomainDimension(10, 10, 10);
+            DEMSim.SetMeshUniversalContact(true);
+
+            auto mat_type = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}});
+
+            // Arguments three and four are load_normals = true and load_uv = false.
+            std::shared_ptr<DEMMesh> mesh_template =
+                DEMSim.LoadMeshType(mesh_path.string(), mat_type, true, false);
+            if (!mesh_template) {
+                std::cout << "Failed to load mesh template for " << label << std::endl;
+                return;
+            }
+
+            DEMMesh::PatchSplitOptions opt;
+            opt.soft_angle_deg = -1.0f;
+            opt.patch_normal_max_deg = -1.0f;
+            opt.block_concave_edges = true;
+            opt.concave_allow_deg = 0.0f;
+            opt.patch_min = 1;
+            opt.patch_max = std::numeric_limits<unsigned int>::max();
+            opt.seed_largest_first = true;
+            opt.auto_tune.enabled = false;
+
+            mesh_template->SplitIntoConvexPatches(120.0f, opt);
+            mesh_template->SetMaterial(mat_type);
+
+            auto mesh_instance = DEMSim.AddMeshFromTemplate(mesh_template, make_float3(0, 0, 0));
+            mesh_instance->SetFamily(0);
+            mesh_instance->SetMass(1000.);
+            mesh_instance->SetMOI(make_float3(200., 200., 200.));
+
+            DEMSim.Initialize();
+
+            path ply_file = out_dir / ("mesh_patch_colors_" + label + ".ply");
+            DEMSim.WriteMeshFile(ply_file);
+            DEMSim.WaitForPendingOutput();
+            std::cout << "Wrote patch-colored PLY to: " << ply_file << std::endl;
+        };
+
+        export_mesh("cube", GET_DATA_PATH() / "mesh/cube.obj", false);
+        export_mesh("sphere", GET_DATA_PATH() / "mesh/sphere.obj", false);
+        export_mesh("drum", GET_DATA_PATH() / "mesh/drum.stl", true);
+    }
+
     std::cout << "\n========================================" << std::endl;
     std::cout << "Demo completed successfully!" << std::endl;
     std::cout << "========================================" << std::endl;

From f9b7dab83f8be73040b29cbe7e843def4681cb6e Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Mon, 26 Jan 2026 22:18:43 +0100
Subject: [PATCH 10/17] Simple Collision Test V00 added

---
 src/demo/CMakeLists.txt                       |  10 +-
 src/demo/ModularTests/CMakeLists.txt          |  11 +-
 .../ModularTests/DEMTest_SimpleCollisions.cpp | 294 ++++++++++++++++++
 3 files changed, 307 insertions(+), 8 deletions(-)
 create mode 100644 src/demo/ModularTests/DEMTest_SimpleCollisions.cpp

diff --git a/src/demo/CMakeLists.txt b/src/demo/CMakeLists.txt
index 82771448..70f46bbc 100644
--- a/src/demo/CMakeLists.txt
+++ b/src/demo/CMakeLists.txt
@@ -89,10 +89,12 @@ FOREACH(PROGRAM ${DEMOS})
 		
 		add_dependencies(${PROGRAM} ${LIBRARIES})
 
-		set_target_properties(
-			${PROGRAM} PROPERTIES
-			CXX_STANDARD ${CXXSTD_SUPPORTED}
-		)
+		if (CXXSTD_SUPPORTED)
+			set_target_properties(
+				${PROGRAM} PROPERTIES
+				CXX_STANDARD ${CXXSTD_SUPPORTED}
+			)
+		endif()
 
 		# install(TARGETS ${PROGRAM} DESTINATION ${DEME_INSTALL_DEMO})
 
diff --git a/src/demo/ModularTests/CMakeLists.txt b/src/demo/ModularTests/CMakeLists.txt
index d6083976..8d10e1fd 100644
--- a/src/demo/ModularTests/CMakeLists.txt
+++ b/src/demo/ModularTests/CMakeLists.txt
@@ -14,6 +14,7 @@ SET(MODULAR_TESTS
 		DEMTest_MeshTemplate
 		DEMTest_PatchLocations
 		DEMTest_MeshPatch
+		DEMTest_SimpleCollisions
 )
 
 # ------------------------------------------------------------------------------
@@ -50,9 +51,11 @@ FOREACH(PROGRAM ${MODULAR_TESTS})
 		
 		add_dependencies(${PROGRAM} ${LIBRARIES})
 
-		set_target_properties(
-			${PROGRAM} PROPERTIES
-			CXX_STANDARD ${CXXSTD_SUPPORTED}
-		)
+		if (CXXSTD_SUPPORTED)
+			set_target_properties(
+				${PROGRAM} PROPERTIES
+				CXX_STANDARD ${CXXSTD_SUPPORTED}
+			)
+		endif()
 
 ENDFOREACH(PROGRAM)
diff --git a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
new file mode 100644
index 00000000..34d98d64
--- /dev/null
+++ b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
@@ -0,0 +1,294 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//
+//	SPDX-License-Identifier: BSD-3-Clause
+
+// =============================================================================
+// Simple collision test: a cube hits an analytical plane with no gravity.
+// Cases:
+// 1) Edge-first impact (45 deg rotation)
+// 2) Corner-first impact (45 deg around X and Y)
+// For each case, run with:
+//  a) Single patch cube
+//  b) 12-patch cube (one patch per triangle)
+// Each scenario is repeated 10 times. We log rebound speed, rebound direction,
+// and peak normal force on the plane, plus mean/min/max/std stats.
+// =============================================================================
+
+#include <core/ApiVersion.h>
+#include <core/utils/ThreadManager.h>
+#include <DEM/API.h>
+#include <DEM/utils/HostSideHelpers.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+
+using namespace deme;
+
+namespace {
+
+constexpr int kNumRuns = 10;  // repetitions per scenario
+constexpr double kGap = 0.01;        // initial clearance between the cube's lowest vertex and the plane (10 mm)
+constexpr double kSpeed = 1.0;       // approach speed along -Z, applied after Initialize() (1 m/s)
+constexpr double kTimeStep = 1e-5;   // fixed step size, in seconds
+constexpr int kMaxSteps = 200000;    // step budget per run: 200000 steps of 1e-5 s = 2 seconds max
+constexpr double kContactEps = 1e-6; // plane normal force above this value counts as contact
+
+struct RunResult {  // Outcome of one run_single_collision() call.
+    bool ok = false;  // set to true only when a rebound was captured
+    double rebound_speed = 0.0;  // length of the cube velocity at the captured rebound
+    double peak_normal_force = 0.0;  // largest plane normal force magnitude seen up to the rebound
+    float3 rebound_dir = make_float3(0, 0, 0);  // cube velocity divided by rebound_speed (zero vector if the speed is 0)
+};
+
+struct Stats {  // Summary of a sample as filled in by calc_stats(); all fields stay 0 for an empty sample.
+    double mean = 0.0;  // arithmetic mean
+    double min = 0.0;  // smallest value
+    double max = 0.0;  // largest value
+    double stddev = 0.0;  // population standard deviation (divisor N)
+};
+
+double vec_length(const float3& v) {  // Euclidean length of v
+    return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
+}
+
+double vec_dot(const float3& a, const float3& b) {  // dot product of a and b
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+float3 vec_scale(const float3& v, double s) {  // v with every component multiplied by s
+    return make_float3(v.x * s, v.y * s, v.z * s);
+}
+
+// Mean, extrema and population standard deviation (divisor N) of a sample.
+// An empty sample yields a default-constructed Stats.
+Stats calc_stats(const std::vector<double>& values) {
+    Stats summary;
+    if (values.empty()) {
+        return summary;
+    }
+    summary.min = summary.max = values.front();
+    double total = 0.0;
+    for (const double sample : values) {
+        summary.min = (sample < summary.min) ? sample : summary.min;
+        summary.max = (summary.max < sample) ? sample : summary.max;
+        total += sample;
+    }
+    summary.mean = total / values.size();
+    double sq_dev_total = 0.0;
+    for (const double sample : values) {
+        sq_dev_total += (sample - summary.mean) * (sample - summary.mean);
+    }
+    summary.stddev = std::sqrt(sq_dev_total / values.size());
+    return summary;
+}
+
+// Smallest z over all mesh vertices after applyFrameTransformLocalToGlobal with zero translation and rotQ.
+double compute_min_z_rotated(const std::shared_ptr<DEMMesh>& mesh, const float4& rotQ) {
+    double lowest_z = std::numeric_limits<double>::max();  // returned unchanged for a mesh without vertices
+    for (float3 vertex : mesh->m_vertices) {  // each vertex is copied; the helper is handed the copy
+        applyFrameTransformLocalToGlobal(vertex, make_float3(0, 0, 0), rotQ);
+        lowest_z = std::min(lowest_z, static_cast<double>(vertex.z));
+    }
+    return lowest_z;
+}
+
+// Load mesh/cube.obj (under GET_DATA_PATH()) as a mesh template and set its patch IDs: all
+// triangles in patch 0, or triangle i in patch i when per_triangle_patches is true.
+// Returns nullptr when the mesh cannot be loaded.
+std::shared_ptr<DEMMesh> load_cube_template(DEMSolver& DEMSim,
+                                            const std::shared_ptr<DEMMaterial>& mat_type,
+                                            bool per_triangle_patches) {
+    auto cube = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type,
+                                    /*load_normals=*/true, /*load_uv=*/false);
+    if (!cube) {
+        return nullptr;
+    }
+
+    const size_t tri_count = cube->GetNumTriangles();
+    std::vector<patchID_t> ids(tri_count);
+    for (size_t tri = 0; tri < tri_count; ++tri) {
+        ids[tri] = per_triangle_patches ? static_cast<patchID_t>(tri) : static_cast<patchID_t>(0);
+    }
+    cube->SetPatchIDs(ids);
+    // Set the material again so that it is sized for the patch count defined just above.
+    cube->SetMaterial(mat_type);
+    return cube;
+}
+
+// Run one cube-on-plane impact in a freshly constructed solver and report the rebound.
+// The cube starts kGap above the z = 0 plane with orientation init_rot, is given velocity
+// (0, 0, -kSpeed) after initialization, and is stepped until the plane force has dropped back
+// to kContactEps or below while the cube moves along +Z, or until kMaxSteps is exhausted.
+// label and run_id only tag diagnostic messages. The result's ok flag stays false if the
+// mesh fails to load or no rebound is seen.
+RunResult run_single_collision(const float4& init_rot,
+                               bool per_triangle_patches,
+                               const std::string& label,
+                               int run_id) {
+    RunResult outcome;
+
+    // Solver configuration: zero gravity, box domain of 5 x 5 x 5.
+    DEMSolver DEMSim;
+    DEMSim.SetOutputFormat(OUTPUT_FORMAT::CSV);
+    DEMSim.InstructBoxDomainDimension(5, 5, 5);
+    DEMSim.SetGravitationalAcceleration(make_float3(0, 0, 0));
+    DEMSim.SetCDUpdateFreq(0);
+    DEMSim.UseAdaptiveUpdateFreq(false);
+    DEMSim.SetMeshUniversalContact(true);
+    auto mat_type = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.00}});
+
+    // Plane through the origin with normal +Z; its tracker supplies the contact force.
+    float3 up = make_float3(0, 0, 1);
+    auto plane = DEMSim.AddBCPlane(make_float3(0, 0, 0), up, mat_type);
+    auto plane_tracker = DEMSim.Track(plane);
+
+    auto cube_template = load_cube_template(DEMSim, mat_type, per_triangle_patches);
+    if (!cube_template) {
+        std::cout << "[" << label << "] Run " << run_id << ": failed to load cube mesh" << std::endl;
+        return outcome;
+    }
+    // Lift the cube so that its lowest rotated vertex sits kGap above the plane.
+    const double start_z = kGap - compute_min_z_rotated(cube_template, init_rot);
+
+    auto cube = DEMSim.AddMeshFromTemplate(cube_template, make_float3(0, 0, 0));
+    cube->SetFamily(0);
+    cube->SetMass(1.0);
+    cube->SetMOI(make_float3(1.0 / 6.0, 1.0 / 6.0, 1.0 / 6.0));
+    cube->SetInitQuat(init_rot);
+    cube->SetInitPos(make_float3(0, 0, static_cast<float>(start_z)));
+    auto cube_tracker = DEMSim.Track(cube);
+
+    DEMSim.SetInitTimeStep(kTimeStep);
+    DEMSim.Initialize();
+    cube_tracker->SetVel(make_float3(0, 0, -static_cast<float>(kSpeed)));
+
+    bool touched = false;  // becomes true once the plane force first exceeds kContactEps
+    double max_force = 0.0;
+    for (int step = 0; step < kMaxSteps; ++step) {
+        DEMSim.DoStepDynamics();
+
+        // Plane force = tracker acceleration * tracker mass; keep its component along the normal.
+        const float3 plane_acc = plane_tracker->ContactAcc();
+        const float3 plane_force = vec_scale(plane_acc, plane_tracker->Mass());
+        const double normal_force = std::abs(vec_dot(plane_force, up));
+        max_force = std::max(max_force, normal_force);
+        touched = touched || (normal_force > kContactEps);
+
+        const float3 vel = cube_tracker->Vel();
+        const bool separated = touched && normal_force <= kContactEps && vec_dot(vel, up) > 0.0;
+        if (!separated) {
+            continue;
+        }
+
+        // First step after contact where the plane is unloaded and the cube moves away: record and stop.
+        const double speed = vec_length(vel);
+        outcome.ok = true;
+        outcome.rebound_speed = speed;
+        outcome.peak_normal_force = max_force;
+        outcome.rebound_dir = (speed > 0) ? vec_scale(vel, 1.0 / speed) : make_float3(0, 0, 0);
+        break;
+    }
+
+    if (!outcome.ok) {
+        std::cout << "[" << label << "] Run " << run_id << ": rebound not captured within max steps" << std::endl;
+    }
+    return outcome;
+}
+
+// Print summary statistics for one scenario: mean, min, max and population standard deviation
+// of the rebound speed, the peak normal force and each component of the rebound direction.
+// Only runs whose ok flag is set contribute; with no such run every statistic prints as 0.
+void print_stats_block(const std::string& label,
+                       const std::vector<RunResult>& results) {
+    std::vector<double> speeds;
+    std::vector<double> forces;
+    std::vector<double> dir_x;
+    std::vector<double> dir_y;
+    std::vector<double> dir_z;
+
+    // Gather the values of the successful runs into one series per quantity.
+    for (const auto& run : results) {
+        if (run.ok) {
+            speeds.push_back(run.rebound_speed);
+            forces.push_back(run.peak_normal_force);
+            dir_x.push_back(run.rebound_dir.x);
+            dir_y.push_back(run.rebound_dir.y);
+            dir_z.push_back(run.rebound_dir.z);
+        }
+    }
+
+    // Every quantity is printed in the same layout; `lead` is the text of the line up to and
+    // including "mean=".
+    const auto print_line = [](const char* lead, const std::vector<double>& samples) {
+        const Stats s = calc_stats(samples);
+        std::cout << lead << s.mean << " min=" << s.min << " max=" << s.max
+                  << " std=" << s.stddev << std::endl;
+    };
+
+    std::cout << "\n=== " << label << " stats (population stddev) ===" << std::endl;
+    print_line("Rebound speed [m/s]: mean=", speeds);
+    print_line("Peak normal force [N]: mean=", forces);
+    print_line("Rebound dir X: mean=", dir_x);
+    print_line("Rebound dir Y: mean=", dir_y);
+    print_line("Rebound dir Z: mean=", dir_z);
+}
+
+// Orientation for the edge-first case: (0, 0, 0, 1) passed through RotateQuat with the X axis and PI / 4.
+float4 edge_quat() {
+    float4 start = make_float4(0, 0, 0, 1);
+    return RotateQuat(start, make_float3(1, 0, 0), static_cast<float>(PI / 4.0));
+}
+
+// Orientation for the corner-first case: (0, 0, 0, 1) turned by PI / 4 about X, then by PI / 4 about Y.
+float4 corner_quat() {
+    float4 tilted = make_float4(0, 0, 0, 1);
+    tilted = RotateQuat(tilted, make_float3(1, 0, 0), static_cast<float>(PI / 4.0));
+    return RotateQuat(tilted, make_float3(0, 1, 0), static_cast<float>(PI / 4.0));
+}
+
+// Banner, kNumRuns collisions for one orientation/patch layout, one line per captured rebound, then the statistics.
+void run_scenario(const std::string& label, const float4& rot, bool per_triangle_patches) {
+    std::cout << "\n========================================" << std::endl;
+    std::cout << label << std::endl;
+    std::cout << "========================================" << std::endl;
+
+    std::vector<RunResult> results;
+    results.reserve(kNumRuns);
+    for (int run = 0; run < kNumRuns; ++run) {
+        results.push_back(run_single_collision(rot, per_triangle_patches, label, run));
+        const RunResult& latest = results.back();
+        if (!latest.ok) {
+            continue;
+        }
+        std::cout << "Run " << run << ": speed=" << latest.rebound_speed << " dir=(" << latest.rebound_dir.x
+                  << ", " << latest.rebound_dir.y << ", " << latest.rebound_dir.z
+                  << ") force=" << latest.peak_normal_force << std::endl;
+    }
+    print_stats_block(label, results);
+}
+
+}  // namespace
+
+// Program entry: runs the edge-first and corner-first impacts, each with a single-patch and a
+// one-patch-per-triangle cube. Always returns 0; failed runs are only reported on stdout.
+int main() {
+    std::cout << "========================================" << std::endl;
+    std::cout << "DEM Simple Collisions Test" << std::endl;
+    std::cout << "========================================" << std::endl;
+    const float4 edge_rot = edge_quat();
+    const float4 corner_rot = corner_quat();
+    run_scenario("Edge impact - single patch", edge_rot, false);
+    run_scenario("Edge impact - 12 patches", edge_rot, true);
+    run_scenario("Corner impact - single patch", corner_rot, false);
+    run_scenario("Corner impact - 12 patches", corner_rot, true);
+
+    std::cout << "\n========================================" << std::endl;
+    std::cout << "Test completed" << std::endl;
+    std::cout << "========================================" << std::endl;
+    return 0;
+}

From f9b87fa8640cd29f2d50aa330adec13576ece8a5 Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Tue, 27 Jan 2026 14:20:01 +0100
Subject: [PATCH 11/17] Added auto volume and MOI calc for detailed meshes -
 Added also a tri path for different tangential stiffness

---
 src/DEM/BdrsAndObjs.h                         |   7 +-
 src/DEM/MeshUtils.cpp                         | 155 ++++++++++++++++++
 src/demo/DEMdemo_ResponseAngleMesh.cpp        |  40 +++--
 .../FullHertzianForceModel.cu                 |   8 +-
 4 files changed, 187 insertions(+), 23 deletions(-)

diff --git a/src/DEM/BdrsAndObjs.h b/src/DEM/BdrsAndObjs.h
index 7bb747ae..68b2bfb0 100644
--- a/src/DEM/BdrsAndObjs.h
+++ b/src/DEM/BdrsAndObjs.h
@@ -441,6 +441,10 @@ class DEMMesh : public DEMInitializer {
         assertThreeElements(MOI, "SetMOI", "MOI");
         SetMOI(make_float3(MOI[0], MOI[1], MOI[2]));
     }
+    /// Compute volume, centroid and MOI in CoM frame (unit density).
+    void ComputeMassProperties(double& volume, float3& center, float3& inertia) const;
+    /// Check if mesh is watertight (closed, manifold). Returns true if no boundary/non-manifold edges.
+    bool IsWatertight(size_t* boundary_edges = nullptr, size_t* nonmanifold_edges = nullptr) const;
     /// Set mesh family number.
     void SetFamily(unsigned int num) { this->family_code = num; }
 
@@ -459,9 +463,6 @@ class DEMMesh : public DEMInitializer {
     }
 
     /*
-    /// Compute barycenter, mass and MOI in CoM frame
-    void ComputeMassProperties(double& mass, float3& center, float3& inertia);
-
     /// Create a map of neighboring triangles, vector of:
     /// [Ti TieA TieB TieC]
     /// (the free sides have triangle id = -1).
diff --git a/src/DEM/MeshUtils.cpp b/src/DEM/MeshUtils.cpp
index 80d35e96..f16b958c 100644
--- a/src/DEM/MeshUtils.cpp
+++ b/src/DEM/MeshUtils.cpp
@@ -185,6 +185,15 @@ bool DEMMesh::LoadSTLMesh(std::string input_file, bool load_normals) {
     m_face_uv_indices.clear();
 
     set_default_patch_info();
+    {
+        size_t boundary_edges = 0;
+        size_t nonmanifold_edges = 0;
+        if (!IsWatertight(&boundary_edges, &nonmanifold_edges)) {
+            DEME_WARNING(
+                "Mesh %s is not watertight (boundary edges: %zu, non-manifold edges: %zu). Auto Volume/MOI may be inaccurate.",
+                filename.c_str(), boundary_edges, nonmanifold_edges);
+        }
+    }
     return true;
 }
 
@@ -553,6 +562,16 @@ bool DEMMesh::LoadWavefrontMesh(std::string input_file, bool load_normals, bool
     this->nPatches = 1;
     this->patches_explicitly_set = false;
 
+    {
+        size_t boundary_edges = 0;
+        size_t nonmanifold_edges = 0;
+        if (!IsWatertight(&boundary_edges, &nonmanifold_edges)) {
+            DEME_WARNING(
+                "Mesh %s is not watertight (boundary edges: %zu, non-manifold edges: %zu). Volume/MOI may be inaccurate.",
+                filename.c_str(), boundary_edges, nonmanifold_edges);
+        }
+    }
+
     return true;
 }
 
@@ -1620,4 +1639,140 @@ std::vector<float3> DEMMesh::ComputePatchLocations() const {
     return patch_locations;
 }
 
+// Compute volume, centroid and MOI in CoM frame (unit density).
+// ATTENTION: Only correct for "watertight" meshes with consistently wound, non-degenerate triangles.
+//
+// Each face (a, b, c) forms a tetrahedron with the coordinate origin whose signed volume is
+// a . (b x c) / 6. Summing the signed volumes and the first and second moments of these
+// tetrahedra gives the integrals over the enclosed solid; a negative total (faces wound the
+// other way) is flipped so that the reported volume is non-negative.
+//
+//   volume  - out: enclosed volume, in the cubed length unit of m_vertices.
+//   center  - out: centroid, in the same coordinates as m_vertices.
+//   inertia - out: (Ixx, Iyy, Izz) at unit density about axes through the centroid that are
+//             parallel to the mesh axes. Products of inertia are not reported.
+// All three outputs are zero when the summed volume is exactly zero (e.g. a mesh without faces).
+void DEMMesh::ComputeMassProperties(double& volume, float3& center, float3& inertia) const {
+    // Accumulators over all origin-based tetrahedra: volume, integrals of x, y, z and of
+    // x^2, y^2, z^2. Products of inertia are not accumulated because they are not returned.
+    double vol = 0.0;
+    double mx = 0.0;
+    double my = 0.0;
+    double mz = 0.0;
+    double ix2 = 0.0;
+    double iy2 = 0.0;
+    double iz2 = 0.0;
+
+    for (const auto& face : m_face_v_indices) {
+        const float3& a = m_vertices[face.x];
+        const float3& b = m_vertices[face.y];
+        const float3& c = m_vertices[face.z];
+
+        // Promote the coordinates to double before forming any product. The per-face signed
+        // volumes largely cancel in the sum, so rounding the triple product in single precision
+        // can cost a noticeable share of the result, especially for meshes far from the origin.
+        const double ax = a.x, ay = a.y, az = a.z;
+        const double bx = b.x, by = b.y, bz = b.z;
+        const double cx = c.x, cy = c.y, cz = c.z;
+
+        // Signed volume of the tetrahedron (origin, a, b, c): a . (b x c) / 6.
+        const double triple = ax * (by * cz - bz * cy) +
+                              ay * (bz * cx - bx * cz) +
+                              az * (bx * cy - by * cx);
+        const double v = triple / 6.0;
+        vol += v;
+
+        // First moments: the centroid of the tetrahedron is (0 + a + b + c) / 4.
+        mx += v * (ax + bx + cx) / 4.0;
+        my += v * (ay + by + cy) / 4.0;
+        mz += v * (az + bz + cz) / 4.0;
+
+        // Second moments: over a tetrahedron with one vertex at the origin, the integral of x^2
+        // equals v / 10 * (ax^2 + bx^2 + cx^2 + ax*bx + bx*cx + cx*ax), and likewise for y and z.
+        const double f1x = ax * ax + bx * bx + cx * cx + ax * bx + bx * cx + cx * ax;
+        const double f1y = ay * ay + by * by + cy * cy + ay * by + by * cy + cy * ay;
+        const double f1z = az * az + bz * bz + cz * cz + az * bz + bz * cz + cz * az;
+        ix2 += v * f1x / 10.0;
+        iy2 += v * f1y / 10.0;
+        iz2 += v * f1z / 10.0;
+    }
+
+    if (vol == 0.0) {
+        volume = 0.0;
+        center = make_float3(0, 0, 0);
+        inertia = make_float3(0, 0, 0);
+        return;
+    }
+
+    // Faces wound the other way make every accumulated quantity negative; flip them together.
+    if (vol < 0.0) {
+        vol = -vol;
+        mx = -mx;
+        my = -my;
+        mz = -mz;
+        ix2 = -ix2;
+        iy2 = -iy2;
+        iz2 = -iz2;
+    }
+
+    const double com_x = mx / vol;
+    const double com_y = my / vol;
+    const double com_z = mz / vol;
+
+    // Diagonal inertia terms about the coordinate origin.
+    double Ixx = iy2 + iz2;
+    double Iyy = ix2 + iz2;
+    double Izz = ix2 + iy2;
+
+    // Parallel-axis shift to the centroid (unit density, so the mass equals vol).
+    Ixx -= vol * (com_y * com_y + com_z * com_z);
+    Iyy -= vol * (com_x * com_x + com_z * com_z);
+    Izz -= vol * (com_x * com_x + com_y * com_y);
+
+    volume = vol;
+    center = make_float3(static_cast<float>(com_x), static_cast<float>(com_y), static_cast<float>(com_z));
+    inertia = make_float3(static_cast<float>(Ixx), static_cast<float>(Iyy), static_cast<float>(Izz));
+}
+
+// Edge-count test for a closed manifold surface: passes when every undirected edge is used by
+// exactly two faces. If the pointers are non-null they receive the number of edges used by one
+// face (boundary) and by three or more faces (non-manifold). Face winding is not examined, and
+// a mesh without faces passes with both counts zero.
+bool DEMMesh::IsWatertight(size_t* boundary_edges, size_t* nonmanifold_edges) const {
+    const auto report = [boundary_edges, nonmanifold_edges](size_t n_boundary, size_t n_nonmanifold) {
+        if (boundary_edges) {
+            *boundary_edges = n_boundary;
+        }
+        if (nonmanifold_edges) {
+            *nonmanifold_edges = n_nonmanifold;
+        }
+    };
+    report(0, 0);
+    if (m_face_v_indices.empty()) {
+        return true;
+    }
+
+    // Number of faces using each undirected edge, keyed by (smaller index, larger index).
+    std::map<std::pair<int, int>, size_t> uses_per_edge;
+    const auto tally = [&uses_per_edge](int p, int q) {
+        ++uses_per_edge[std::pair<int, int>(std::min(p, q), std::max(p, q))];
+    };
+    for (const auto& face : m_face_v_indices) {
+        tally(face.x, face.y);
+        tally(face.y, face.z);
+        tally(face.z, face.x);
+    }
+
+    size_t n_boundary = 0;
+    size_t n_nonmanifold = 0;
+    for (const auto& entry : uses_per_edge) {
+        const size_t uses = entry.second;
+        n_boundary += (uses == 1) ? 1 : 0;
+        n_nonmanifold += (uses > 2) ? 1 : 0;
+    }
+
+    report(n_boundary, n_nonmanifold);
+    return n_boundary == 0 && n_nonmanifold == 0;
+}
+
 }  // end namespace deme
diff --git a/src/demo/DEMdemo_ResponseAngleMesh.cpp b/src/demo/DEMdemo_ResponseAngleMesh.cpp
index cabefbe7..112e5a64 100644
--- a/src/demo/DEMdemo_ResponseAngleMesh.cpp
+++ b/src/demo/DEMdemo_ResponseAngleMesh.cpp
@@ -57,14 +57,6 @@ std::shared_ptr<DEMMesh> LoadStlMesh(DEMSolver& sim,
     return sim.AddMesh(mesh);
 }
 
-float3 ComputeBoxMOI(const float3& dims, float mass) {
-    // MOI of a box about its center: Ixx = 1/12 m (b^2 + c^2), etc.
-    float ix = mass / 12.f * (dims.y * dims.y + dims.z * dims.z);
-    float iy = mass / 12.f * (dims.x * dims.x + dims.z * dims.z);
-    float iz = mass / 12.f * (dims.x * dims.x + dims.y * dims.y);
-    return make_float3(ix, iy, iz);
-}
-
 std::pair<float3, float3> ComputeBounds(const std::vector<float3>& vertices) {
     float3 vmin = make_float3(std::numeric_limits<float>::max());
     float3 vmax = make_float3(std::numeric_limits<float>::lowest());
@@ -107,9 +99,17 @@ int main() {
     const float tri_diag = std::sqrt(tri_dims.x * tri_dims.x + tri_dims.y * tri_dims.y + tri_dims.z * tri_dims.z);
     const float tri_radius = 0.5f * tri_diag;
     const float particle_density = 2600.0f;
-    const float particle_volume = tri_dims.x * tri_dims.y * tri_dims.z;
-    const float particle_mass = particle_density * particle_volume;
-    const float3 particle_moi = ComputeBoxMOI(tri_dims, particle_mass);
+    double tri_volume = 0.0;
+    float3 tri_center = make_float3(0, 0, 0);
+    float3 tri_inertia = make_float3(0, 0, 0);
+    tri_template->ComputeMassProperties(tri_volume, tri_center, tri_inertia);
+    const float particle_mass = static_cast<float>(tri_volume * particle_density);
+    const float3 particle_moi = tri_inertia * particle_density;
+    std::cout << "Particle STL volume (m^3): " << tri_volume << std::endl;
+    std::cout << "Particle STL MOI (unit density, CoM): " << tri_inertia.x << ", " << tri_inertia.y << ", "
+              << tri_inertia.z << std::endl;
+    const double cube_vol = std::pow(4.0e-3, 3);
+    std::cout << "Particle mass (kg): " << particle_mass << std::endl;
 
     // Load drum mantle from STL; STL units are mm with z in [0, 100]
     path drum_path = GET_DATA_PATH() / "mesh" / "drum.stl";
@@ -118,14 +118,18 @@ int main() {
     const float drum_height = drum_max.z - drum_min.z;
     unsigned int drum_family = 100;
     drum_mesh->SetFamily(drum_family);
-    const float drum_mass = 5.0f;
+    const float drum_density = 2600.0f;
+    double drum_volume = 0.0;
+    float3 drum_center = make_float3(0, 0, 0);
+    float3 drum_inertia = make_float3(0, 0, 0);
+    drum_mesh->ComputeMassProperties(drum_volume, drum_center, drum_inertia);
+    const float drum_mass = static_cast<float>(drum_volume * drum_density);
     drum_mesh->SetMass(drum_mass);
-    const float drum_outer_radius =
-        std::max(std::max(std::abs(drum_min.x), std::abs(drum_max.x)),
-                 std::max(std::abs(drum_min.y), std::abs(drum_max.y)));
-    float izz = 0.5f * drum_mass * drum_outer_radius * drum_outer_radius;
-    float ixx = (drum_mass / 12.0f) * (3 * drum_outer_radius * drum_outer_radius + drum_height * drum_height);
-    drum_mesh->SetMOI(make_float3(ixx, ixx, izz));
+    drum_mesh->SetMOI(drum_inertia * drum_density);
+    std::cout << "Drum STL volume (m^3): " << drum_volume << std::endl;
+    std::cout << "Drum STL MOI (unit density, CoM): " << drum_inertia.x << ", " << drum_inertia.y << ", "
+              << drum_inertia.z << std::endl;
+    std::cout << "Drum mass (kg): " << drum_mass << std::endl;
     DEMSim.SetFamilyPrescribedAngVel(drum_family, "0", "0", to_string_with_precision(drum_ang_vel));
 
     // Add top and bottom planes at z = 0 and z = 0.1 m. They rotate with the drum family (axis-aligned so rotation
diff --git a/src/kernel/DEMCustomizablePolicies/FullHertzianForceModel.cu b/src/kernel/DEMCustomizablePolicies/FullHertzianForceModel.cu
index e5b6d89e..ccd94b27 100644
--- a/src/kernel/DEMCustomizablePolicies/FullHertzianForceModel.cu
+++ b/src/kernel/DEMCustomizablePolicies/FullHertzianForceModel.cu
@@ -127,9 +127,13 @@ if (overlapDepth > 0) {
 
     // Tangential force part
     if (mu_cnt > 0.f) {
+        float gt;
         const float kt = 8.f * G_cnt * contact_radius;
-        const float gt =
-            -deme::TWO_TIMES_SQRT_FIVE_OVER_SIX * beta * sqrtf(mass_eff * kt);  // do we neen higher damping??
+        if (tri_involved) {
+            gt = -deme::TWO_TIMES_SQRT_FIVE_OVER_THREE * beta * sqrtf(mass_eff * kt);
+        } else {
+            gt = -deme::TWO_TIMES_SQRT_FIVE_OVER_SIX * beta * sqrtf(mass_eff * kt);
+        }
         float3 tangent_force = -kt * delta_tan - gt * vrel_tan;
         const float ft = length(tangent_force);
         if (ft > DEME_TINY_FLOAT) {

From 6745df9111997beeaee4d2659817113e7f81a870 Mon Sep 17 00:00:00 2001
From: Ruochun <ruochunz@gmail.com>
Date: Wed, 28 Jan 2026 00:27:26 +0800
Subject: [PATCH 12/17] Clarify that the patch loc metric works for MM contact
 only

---
 src/DEM/dT.cpp                              |   2 +-
 src/algorithms/DEMDynamicMisc.cu            |  16 +-
 src/algorithms/DEMStaticDeviceSubroutines.h |   1 +
 src/demo/DEMdemo_MeshFalling.cpp            | 153 ++++++++++----------
 4 files changed, 90 insertions(+), 82 deletions(-)

diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 47496ce0..312a9d09 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2343,7 +2343,7 @@ inline void DEMDynamicThread::dispatchPatchBasedForceCorrections(
                 // Step 1: Prepare weighted normals, areas, and keys
                 // The kernel extracts keys from geomToPatchMap, computes weighted normals, and stores areas
                 prepareWeightedNormalsForVoting(&granData, weightedNormals, areas, keys, startOffsetPrimitive,
-                                                countPrimitive, streamInfo.stream);
+                                                countPrimitive, contact_type, streamInfo.stream);
 
                 // Step 2: Reduce-by-key for weighted normals (sum)
                 // The keys are geomToPatchMap values (contactPairs_t), which group primitives by patch pair
diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index 92c9344c..d56a782a 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -127,7 +127,8 @@ __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
                                                      double* areas,
                                                      contactPairs_t* keys,
                                                      contactPairs_t startOffset,
-                                                     contactPairs_t count) {
+                                                     contactPairs_t count,
+                                                     contact_t contactType) {
     contactPairs_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count) {
         contactPairs_t myContactID = startOffset + idx;
@@ -139,10 +140,12 @@ __global__ void prepareWeightedNormalsForVoting_impl(DEMDataDT* granData,
         float3 areaStorage = granData->contactPointGeometryB[myContactID];
         double area = float3StorageToDouble(areaStorage);
         // But primitive contacts that do not respect the patch general direction have no right in deciding the contact
-        // normal
-        notStupidBool_t directionRespected = granData->contactPatchDirectionRespected[myContactID];
-        if (!directionRespected) {
-            area = 0.0;
+        // normal (in mesh--mesh contact)
+        if (contactType == TRIANGLE_TRIANGLE_CONTACT) {
+            notStupidBool_t directionRespected = granData->contactPatchDirectionRespected[myContactID];
+            if (!directionRespected) {
+                area = 0.0;
+            }
         }
 
         // Compute weighted normal (normal * area)
@@ -163,11 +166,12 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      contactPairs_t* keys,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
+                                     contact_t contactType,
                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
         prepareWeightedNormalsForVoting_impl<<<blocks_needed, DEME_MAX_THREADS_PER_BLOCK, 0, this_stream>>>(
-            granData, weightedNormals, areas, keys, startOffset, count);
+            granData, weightedNormals, areas, keys, startOffset, count, contactType);
         DEME_GPU_CALL(cudaStreamSynchronize(this_stream));
     }
 }
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index edcaf6e5..c0cd93c2 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -179,6 +179,7 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      contactPairs_t* keys,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
+                                     contact_t contactType,
                                      cudaStream_t& this_stream);
 
 // Normalizes voted normals by total area and scatters to output
diff --git a/src/demo/DEMdemo_MeshFalling.cpp b/src/demo/DEMdemo_MeshFalling.cpp
index 65a94682..26bdc0ae 100644
--- a/src/demo/DEMdemo_MeshFalling.cpp
+++ b/src/demo/DEMdemo_MeshFalling.cpp
@@ -35,8 +35,8 @@ int main() {
     DEMSim.SetMeshUniversalContact(true);
 
     // Define material properties
-    auto mat_box = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.4}, {"Crr", 0.1}});
-    auto mat_plane = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.5}, {"mu", 0.3}, {"Crr", 0.1}});
+    auto mat_box = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.4}, {"mu", 0.4}, {"Crr", 0.1}});
+    auto mat_plane = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.4}, {"mu", 0.3}, {"Crr", 0.1}});
 
     // Add a bottom plane at z = 0
     DEMSim.AddBCPlane(make_float3(0, 0, 0), make_float3(0, 0, 1), mat_plane);
@@ -45,7 +45,7 @@ int main() {
     const int num_particles_x = 6;
     const int num_particles_y = 6;
     const float particle_spacing = 2.0;
-    const float initial_height = 8.0;
+    const float initial_height = 2.0;
     const float base_size = 0.5;              // Base scale for all meshes
     const float cylinder_scale_factor = 0.5;  // Cylinders scaled down since they're taller
 
@@ -61,82 +61,85 @@ int main() {
 
     for (int i = 0; i < num_particles_x; i++) {
         for (int j = 0; j < num_particles_y; j++) {
-            float x = (i - num_particles_x / 2.0 + 0.5) * particle_spacing + pos_dist(gen);
-            float y = (j - num_particles_y / 2.0 + 0.5) * particle_spacing + pos_dist(gen);
-            float z = initial_height + (i + j) * 0.5 + pos_dist(gen) * 2;
-
-            // Select mesh type randomly: 0=cube, 1=sphere, 2=cone, 3=cylinder
-            int mesh_type = mesh_type_dist(gen);
-            std::shared_ptr<DEMMesh> particle;
-
-            if (mesh_type == 0) {
-                // Cube with non-uniform scaling
-                particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_box);
-                float scale_x = base_size * scale_dist(gen);
-                float scale_y = base_size * scale_dist(gen);
-                float scale_z = base_size * scale_dist(gen);
-                particle->Scale(make_float3(scale_x, scale_y, scale_z));  // Non-uniform scaling
-
-                // Set mass and MOI for the box (approximate as uniform density)
-                float mass = 1000.0 * scale_x * scale_y * scale_z;
-                float moi_x = mass * (scale_y * scale_y + scale_z * scale_z) / 12.0;
-                float moi_y = mass * (scale_x * scale_x + scale_z * scale_z) / 12.0;
-                float moi_z = mass * (scale_x * scale_x + scale_y * scale_y) / 12.0;
-                particle->SetMass(mass);
-                particle->SetMOI(make_float3(moi_x, moi_y, moi_z));
-            } else if (mesh_type == 1) {
-                // Sphere (unit sphere in mesh)
-                particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/sphere.obj").string(), mat_box);
-                float scale = base_size * scale_dist(gen);
-                particle->Scale(scale);
-
-                // Set mass and MOI for sphere
-                float mass = 1000.0 * (4.0 / 3.0) * math_PI * scale * scale * scale;
-                float moi = 0.4 * mass * scale * scale;  // MOI for sphere
-                particle->SetMass(mass);
-                particle->SetMOI(make_float3(moi, moi, moi));
-            } else if (mesh_type == 2) {
-                // Cone (height ~1, radius ~1 in mesh)
-                particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cone.obj").string(), mat_box);
-                // Unit cone's CoM is at this location...
-                particle->InformCentroidPrincipal(make_float3(0, 0, 3. / 4.), make_float4(0, 0, 0, 1));
-                float scale = base_size * scale_dist(gen);
-                particle->Scale(scale);
-
-                // Set mass and MOI for cone (approximate)
-                float mass = 1000.0 * (1.0 / 3.0) * math_PI * scale * scale * scale;
-                float moi_base = 0.3 * mass * scale * scale;  // Approximate MOI
-                float moi_height = 0.15 * mass * scale * scale;
-                particle->SetMass(mass);
-                particle->SetMOI(make_float3(moi_base, moi_base, moi_height));
-            } else {
-                // Cylinder (radius ~1, height ~2 in mesh)
-                particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cyl_r1_h2.obj").string(), mat_box);
-                float scale = base_size * cylinder_scale_factor * scale_dist(gen);
-                particle->Scale(scale);
-
-                // Set mass and MOI for cylinder
-                float radius = scale;
-                float height = 2.0 * scale;
-                float mass = 1000.0 * math_PI * radius * radius * height;
-                float moi_radial = mass * (3.0 * radius * radius + height * height) / 12.0;
-                float moi_axial = 0.5 * mass * radius * radius;
-                particle->SetMass(mass);
-                particle->SetMOI(make_float3(moi_radial, moi_radial, moi_axial));
+            for (int k = 0; k < 2; k++) {
+                float x = (i - num_particles_x / 2.0 + 0.5) * particle_spacing + pos_dist(gen);
+                float y = (j - num_particles_y / 2.0 + 0.5) * particle_spacing + pos_dist(gen);
+                float z = initial_height + (i + j) * 0.5 + pos_dist(gen) * 2 + k * 2.5;
+
+                // Select mesh type randomly: 0=cube, 1=sphere, 2=cone, 3=cylinder
+                int mesh_type = mesh_type_dist(gen);
+                std::shared_ptr<DEMMesh> particle;
+
+                if (mesh_type == 0) {
+                    // Cube with non-uniform scaling
+                    particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_box);
+                    float scale_x = base_size * scale_dist(gen);
+                    float scale_y = base_size * scale_dist(gen);
+                    float scale_z = base_size * scale_dist(gen);
+                    particle->Scale(make_float3(scale_x, scale_y, scale_z));  // Non-uniform scaling
+
+                    // Set mass and MOI for the box (approximate as uniform density)
+                    float mass = 1000.0 * scale_x * scale_y * scale_z;
+                    float moi_x = mass * (scale_y * scale_y + scale_z * scale_z) / 12.0;
+                    float moi_y = mass * (scale_x * scale_x + scale_z * scale_z) / 12.0;
+                    float moi_z = mass * (scale_x * scale_x + scale_y * scale_y) / 12.0;
+                    particle->SetMass(mass);
+                    particle->SetMOI(make_float3(moi_x, moi_y, moi_z));
+                } else if (mesh_type == 1) {
+                    // Sphere (unit sphere in mesh)
+                    particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/sphere.obj").string(), mat_box);
+                    float scale = base_size * scale_dist(gen);
+                    particle->Scale(scale);
+
+                    // Set mass and MOI for sphere
+                    float mass = 1000.0 * (4.0 / 3.0) * math_PI * scale * scale * scale;
+                    float moi = 0.4 * mass * scale * scale;  // MOI for sphere
+                    particle->SetMass(mass);
+                    particle->SetMOI(make_float3(moi, moi, moi));
+                } else if (mesh_type == 2) {
+                    // Cone (height ~1, radius ~1 in mesh)
+                    particle = DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cone.obj").string(), mat_box);
+                    // Unit cone's CoM is at this location...
+                    particle->InformCentroidPrincipal(make_float3(0, 0, 3. / 4.), make_float4(0, 0, 0, 1));
+                    float scale = base_size * scale_dist(gen);
+                    particle->Scale(scale);
+
+                    // Set mass and MOI for cone (approximate)
+                    float mass = 1000.0 * (1.0 / 3.0) * math_PI * scale * scale * scale;
+                    float moi_base = 0.3 * mass * scale * scale;  // Approximate MOI
+                    float moi_height = 0.15 * mass * scale * scale;
+                    particle->SetMass(mass);
+                    particle->SetMOI(make_float3(moi_base, moi_base, moi_height));
+                } else {
+                    // Cylinder (radius ~1, height ~2 in mesh)
+                    particle =
+                        DEMSim.AddWavefrontMeshObject((GET_DATA_PATH() / "mesh/cyl_r1_h2.obj").string(), mat_box);
+                    float scale = base_size * cylinder_scale_factor * scale_dist(gen);
+                    particle->Scale(scale);
+
+                    // Set mass and MOI for cylinder
+                    float radius = scale;
+                    float height = 2.0 * scale;
+                    float mass = 1000.0 * math_PI * radius * radius * height;
+                    float moi_radial = mass * (3.0 * radius * radius + height * height) / 12.0;
+                    float moi_axial = 0.5 * mass * radius * radius;
+                    particle->SetMass(mass);
+                    particle->SetMOI(make_float3(moi_radial, moi_radial, moi_axial));
+                }
+
+                particle->SetFamily(0);
+                particle->SetInitPos(make_float3(x, y, z));
+
+                // Add small initial rotation for more interesting dynamics
+                particle->SetInitQuat(make_float4(rot_dist(gen), rot_dist(gen), rot_dist(gen), 1.0));
+
+                auto tracker = DEMSim.Track(particle);
+                trackers.push_back(tracker);
             }
-
-            particle->SetFamily(0);
-            particle->SetInitPos(make_float3(x, y, z));
-
-            // Add small initial rotation for more interesting dynamics
-            particle->SetInitQuat(make_float4(rot_dist(gen), rot_dist(gen), rot_dist(gen), 1.0));
-
-            auto tracker = DEMSim.Track(particle);
-            trackers.push_back(tracker);
         }
     }
 
-    float step_time = 1e-5;
+    float step_time = 5e-6;
     DEMSim.SetInitTimeStep(step_time);
     DEMSim.SetGravitationalAcceleration(make_float3(0, 0, -9.81));
     DEMSim.SetExpandSafetyType("auto");

From b5893e5a2b6338a18fea32d820b2cb64f573a7e3 Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Tue, 27 Jan 2026 21:42:50 +0100
Subject: [PATCH 13/17] Fix merge mistakes, add STL particles to collision
 test

---
 src/algorithms/DEMDynamicMisc.cu              |  1 -
 src/algorithms/DEMStaticDeviceSubroutines.h   |  1 -
 src/demo/CMakeLists.txt                       |  1 -
 .../ModularTests/DEMTest_SimpleCollisions.cpp | 90 +++++++++++++++----
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/src/algorithms/DEMDynamicMisc.cu b/src/algorithms/DEMDynamicMisc.cu
index 52e43396..ee3bdc51 100644
--- a/src/algorithms/DEMDynamicMisc.cu
+++ b/src/algorithms/DEMDynamicMisc.cu
@@ -150,7 +150,6 @@ void prepareWeightedNormalsForVoting(DEMDataDT* granData,
                                      float3* weightedNormals,
                                      contactPairs_t startOffset,
                                      contactPairs_t count,
-                                     contact_t contactType,
                                      cudaStream_t& this_stream) {
     size_t blocks_needed = (count + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
     if (blocks_needed > 0) {
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 7eb19461..87cf6685 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -248,7 +248,6 @@ void finalizePatchResultsFromAccumulators(const PatchContactAccum* patchAccumula
                                           const float3* zeroAreaNormals,
                                           const double* zeroAreaPenetrations,
                                           const double3* zeroAreaContactPoints,
-                                          const notStupidBool_t* patchHasSAT,
                                           double* finalAreas,
                                           float3* finalNormals,
                                           double* finalPenetrations,
diff --git a/src/demo/CMakeLists.txt b/src/demo/CMakeLists.txt
index fd97e089..70f46bbc 100644
--- a/src/demo/CMakeLists.txt
+++ b/src/demo/CMakeLists.txt
@@ -24,7 +24,6 @@ SET(DEMOS
 		DEMdemo_TestPack
 		DEMdemo_TestRestart
 		DEMdemo_RotatingDrum
-		DEMdemo_DrumCubes
 		DEMdemo_Centrifuge
 		DEMdemo_DrumCubes
 		DEMdemo_ResponseAngleMesh
diff --git a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
index 34d98d64..b8a99a20 100644
--- a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
+++ b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
@@ -31,6 +31,10 @@ using namespace deme;
 
 namespace {
 
+constexpr bool kUseTriangleParticles = true; // toggle to run the STL-based triangle setup
+constexpr float kMmToMeters = 0.001f;
+constexpr double kTriangleParticleDensity = 2600.0;
+
 constexpr int kNumRuns = 10;
 constexpr double kGap = 0.01;        // 10 mm
 constexpr double kSpeed = 1.0;       // 1 m/s
@@ -97,16 +101,12 @@ double compute_min_z_rotated(const std::shared_ptr<DEMMesh>& mesh, const float4&
     return min_z;
 }
 
-std::shared_ptr<DEMMesh> load_cube_template(DEMSolver& DEMSim,
-                                            const std::shared_ptr<DEMMaterial>& mat_type,
-                                            bool per_triangle_patches) {
-    auto mesh_template = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type,
-                                             true,   // load_normals
-                                             false); // load_uv
+void assign_patch_ids(const std::shared_ptr<DEMMesh>& mesh_template,
+                      bool per_triangle_patches,
+                      const std::shared_ptr<DEMMaterial>& mat_type) {
     if (!mesh_template) {
-        return nullptr;
+        return;
     }
-
     const size_t num_tris = mesh_template->GetNumTriangles();
     std::vector<patchID_t> patch_ids(num_tris, 0);
     if (per_triangle_patches) {
@@ -115,13 +115,50 @@ std::shared_ptr<DEMMesh> load_cube_template(DEMSolver& DEMSim,
         }
     }
     mesh_template->SetPatchIDs(patch_ids);
-    // Ensure material vector matches patch count after overriding patch IDs.
     mesh_template->SetMaterial(mat_type);
+}
+
+std::shared_ptr<DEMMesh> load_cube_template(DEMSolver& DEMSim,
+                                            const std::shared_ptr<DEMMaterial>& mat_type,
+                                            bool per_triangle_patches) {
+    auto mesh_template = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type,
+                                             true,   // load_normals
+                                             false); // load_uv
+    if (!mesh_template) {
+        return nullptr;
+    }
+
+    assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
+    return mesh_template;
+}
+
+std::shared_ptr<DEMMesh> load_triangle_template(DEMSolver& DEMSim,
+                                                const std::shared_ptr<DEMMaterial>& mat_type,
+                                                bool per_triangle_patches,
+                                                float& out_mass,
+                                                float3& out_moi) {
+    std::shared_ptr<DEMMesh> mesh_template =
+        DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/simpleTriangleShape4mm.stl").string(), mat_type, true, false);
+    if (!mesh_template) {
+        return nullptr;
+    }
+    mesh_template->Scale(kMmToMeters);
+
+    double volume = 0.0;
+    float3 center = make_float3(0, 0, 0);
+    float3 inertia = make_float3(0, 0, 0);
+    mesh_template->ComputeMassProperties(volume, center, inertia);
+
+    out_mass = static_cast<float>(volume * kTriangleParticleDensity);
+    out_moi = inertia * static_cast<float>(kTriangleParticleDensity);
+
+    assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
     return mesh_template;
 }
 
 RunResult run_single_collision(const float4& init_rot,
                                bool per_triangle_patches,
+                               bool use_triangle_particles,
                                const std::string& label,
                                int run_id) {
     RunResult result;
@@ -139,9 +176,18 @@ RunResult run_single_collision(const float4& init_rot,
     float3 plane_normal = make_float3(0, 0, 1);
     auto plane = DEMSim.AddBCPlane(make_float3(0, 0, 0), plane_normal, mat_type);
     auto plane_tracker = DEMSim.Track(plane);
-    auto mesh_template = load_cube_template(DEMSim, mat_type, per_triangle_patches);
+    const char* mesh_desc = use_triangle_particles ? "triangle mesh" : "cube mesh";
+    auto mesh_template = std::shared_ptr<DEMMesh>{};
+    float particle_mass = 1.0f;
+    float3 particle_moi = make_float3(1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f);
+
+    if (use_triangle_particles) {
+        mesh_template = load_triangle_template(DEMSim, mat_type, per_triangle_patches, particle_mass, particle_moi);
+    } else {
+        mesh_template = load_cube_template(DEMSim, mat_type, per_triangle_patches);
+    }
     if (!mesh_template) {
-        std::cout << "[" << label << "] Run " << run_id << ": failed to load cube mesh" << std::endl;
+        std::cout << "[" << label << "] Run " << run_id << ": failed to load " << mesh_desc << std::endl;
         return result;
     }
     double min_z = compute_min_z_rotated(mesh_template, init_rot);
@@ -149,8 +195,8 @@ RunResult run_single_collision(const float4& init_rot,
 
     auto cube = DEMSim.AddMeshFromTemplate(mesh_template, make_float3(0, 0, 0));
     cube->SetFamily(0);
-    cube->SetMass(1.0);
-    cube->SetMOI(make_float3(1.0 / 6.0, 1.0 / 6.0, 1.0 / 6.0));
+    cube->SetMass(particle_mass);
+    cube->SetMOI(particle_moi);
     cube->SetInitQuat(init_rot);
     cube->SetInitPos(make_float3(0, 0, static_cast<float>(init_z)));
     auto cube_tracker = DEMSim.Track(cube);
@@ -251,16 +297,20 @@ float4 corner_quat() {
     return q;
 }
 
-void run_scenario(const std::string& label, const float4& rot, bool per_triangle_patches) {
+void run_scenario(const std::string& label,
+                  const float4& rot,
+                  bool per_triangle_patches,
+                  bool use_triangle_particles) {
     std::cout << "\n========================================" << std::endl;
     std::cout << label << std::endl;
     std::cout << "========================================" << std::endl;
+    std::cout << "Using mesh: " << (use_triangle_particles ? "simpleTriangleShape4mm.stl" : "cube.obj") << std::endl;
 
     std::vector<RunResult> results;
     results.reserve(kNumRuns);
 
     for (int i = 0; i < kNumRuns; ++i) {
-        RunResult r = run_single_collision(rot, per_triangle_patches, label, i);
+        RunResult r = run_single_collision(rot, per_triangle_patches, use_triangle_particles, label, i);
         results.push_back(r);
         if (r.ok) {
             std::cout << "Run " << i << ": speed=" << r.rebound_speed << " dir=(" << r.rebound_dir.x << ", "
@@ -278,14 +328,16 @@ int main() {
     std::cout << "========================================" << std::endl;
     std::cout << "DEM Simple Collisions Test" << std::endl;
     std::cout << "========================================" << std::endl;
+    std::cout << "Particle mesh mode: "
+              << (kUseTriangleParticles ? "simpleTriangleShape4mm.stl" : "cube.obj") << std::endl;
 
     float4 q_edge = edge_quat();
     float4 q_corner = corner_quat();
 
-    run_scenario("Edge impact - single patch", q_edge, false);
-    run_scenario("Edge impact - 12 patches", q_edge, true);
-    run_scenario("Corner impact - single patch", q_corner, false);
-    run_scenario("Corner impact - 12 patches", q_corner, true);
+    run_scenario("Edge impact - single patch", q_edge, false, kUseTriangleParticles);
+    run_scenario("Edge impact - 12 patches", q_edge, true, kUseTriangleParticles);
+    run_scenario("Corner impact - single patch", q_corner, false, kUseTriangleParticles);
+    run_scenario("Corner impact - 12 patches", q_corner, true, kUseTriangleParticles);
 
     std::cout << "\n========================================" << std::endl;
     std::cout << "Test completed" << std::endl;

From cde63ee3e6111801fb8516c698861dd67a0bfca1 Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Wed, 28 Jan 2026 11:14:53 +0100
Subject: [PATCH 14/17] Fix planar contact bug, fix IsWatertight check, add
 more debug info to collision test

---
 src/DEM/MeshUtils.cpp                         | 146 ++++++++++++++----
 .../ModularTests/DEMTest_SimpleCollisions.cpp | 126 ++++++++++++++-
 src/kernel/DEMCalcForceKernels_Primitive.cu   |   4 +
 3 files changed, 248 insertions(+), 28 deletions(-)

diff --git a/src/DEM/MeshUtils.cpp b/src/DEM/MeshUtils.cpp
index f16b958c..daee8c4d 100644
--- a/src/DEM/MeshUtils.cpp
+++ b/src/DEM/MeshUtils.cpp
@@ -1734,45 +1734,137 @@ void DEMMesh::ComputeMassProperties(double& volume, float3& center, float3& iner
     inertia = make_float3(static_cast<float>(Ixx), static_cast<float>(Iyy), static_cast<float>(Izz));
 }
 
-bool DEMMesh::IsWatertight(size_t* boundary_edges, size_t* nonmanifold_edges) const {
-    if (boundary_edges) {
-        *boundary_edges = 0;
-    }
-    if (nonmanifold_edges) {
-        *nonmanifold_edges = 0;
+// Section for Watertight test, false if not
+
+struct QuantKey3 {
+    int64_t x, y, z;
+    bool operator==(const QuantKey3& o) const noexcept { return x==o.x && y==o.y && z==o.z; }
+};
+struct QuantKey3Hash {
+    size_t operator()(const QuantKey3& k) const noexcept {
+        size_t h1 = std::hash<int64_t>{}(k.x);
+        size_t h2 = std::hash<int64_t>{}(k.y);
+        size_t h3 = std::hash<int64_t>{}(k.z);
+        size_t h = h1;
+        h ^= h2 + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2);
+        h ^= h3 + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2);
+        return h;
     }
-    if (m_face_v_indices.empty()) {
+};
+
+static inline int64_t q(double v, double eps) {
+    return (int64_t)std::llround(v / eps);
+}
+
+bool DEMMesh::IsWatertight(size_t* boundary_edges, size_t* nonmanifold_edges) const {
+    if (boundary_edges) *boundary_edges = 0;
+    if (nonmanifold_edges) *nonmanifold_edges = 0;
+    if (m_face_v_indices.empty()) return true;
+
+    auto count_edges_by_index = [&](size_t& boundary, size_t& nonmanifold) {
+        std::map<std::pair<size_t, size_t>, size_t> edge_counts;
+
+        for (const auto& face : m_face_v_indices) {
+            const int fx = face.x, fy = face.y, fz = face.z;
+            if (fx < 0 || fy < 0 || fz < 0) continue;
+
+            const size_t a = (size_t)fx, b = (size_t)fy, c = (size_t)fz;
+            if (a == b || b == c || c == a) continue;
+
+            std::pair<size_t, size_t> edges[3] = {
+                {std::min(a,b), std::max(a,b)},
+                {std::min(b,c), std::max(b,c)},
+                {std::min(c,a), std::max(c,a)}
+            };
+            edge_counts[edges[0]]++;
+            edge_counts[edges[1]]++;
+            edge_counts[edges[2]]++;
+        }
+
+        boundary = 0; nonmanifold = 0;
+        for (const auto& kv : edge_counts) {
+            if (kv.second == 1) boundary++;
+            else if (kv.second > 2) nonmanifold++;
+        }
+    };
+
+    size_t boundary1 = 0, nonmanifold1 = 0;
+    count_edges_by_index(boundary1, nonmanifold1);
+
+    if (boundary1 == 0 && nonmanifold1 == 0) {
+        if (boundary_edges) *boundary_edges = 0;
+        if (nonmanifold_edges) *nonmanifold_edges = 0;
         return true;
     }
 
-    std::map<std::pair<int, int>, size_t> edge_counts;
-    for (const auto& face : m_face_v_indices) {
-        std::pair<int, int> edges[3] = {{std::min(face.x, face.y), std::max(face.x, face.y)},
-                                        {std::min(face.y, face.z), std::max(face.y, face.z)},
-                                        {std::min(face.z, face.x), std::max(face.z, face.x)}};
-        for (int e = 0; e < 3; ++e) {
-            edge_counts[edges[e]]++;
-        }
+    if (m_vertices.empty()) {
+        if (boundary_edges) *boundary_edges = boundary1;
+        if (nonmanifold_edges) *nonmanifold_edges = nonmanifold1;
+        return false;
     }
 
-    size_t boundary = 0;
-    size_t nonmanifold = 0;
-    for (const auto& kv : edge_counts) {
-        if (kv.second == 1) {
-            boundary++;
-        } else if (kv.second > 2) {
-            nonmanifold++;
+    double minx = m_vertices[0].x, miny = m_vertices[0].y, minz = m_vertices[0].z;
+    double maxx = minx, maxy = miny, maxz = minz;
+    for (const auto& v : m_vertices) {
+        minx = std::min(minx, (double)v.x); miny = std::min(miny, (double)v.y);
+        minz = std::min(minz, (double)v.z);
+        maxx = std::max(maxx, (double)v.x); maxy = std::max(maxy, (double)v.y);
+        maxz = std::max(maxz, (double)v.z);
+    }
+    const double dx = maxx - minx, dy = maxy - miny, dz = maxz - minz;
+    const double diag = std::sqrt(dx*dx + dy*dy + dz*dz);
+    const double eps = std::max(diag * 1e-9, 1e-12);
+
+    std::unordered_map<QuantKey3, size_t, QuantKey3Hash> rep;
+    rep.reserve(m_vertices.size());
+
+    std::vector<size_t> canon(m_vertices.size(), (size_t)-1);
+    size_t next_id = 0;
+
+    for (size_t i = 0; i < m_vertices.size(); ++i) {
+        const auto& v = m_vertices[i];
+        QuantKey3 key{ q(v.x, eps), q(v.y, eps), q(v.z, eps) };
+
+        auto it = rep.find(key);
+        if (it == rep.end()) {
+            rep.emplace(key, next_id);
+            canon[i] = next_id;
+            next_id++;
+        } else {
+            canon[i] = it->second;
         }
     }
 
-    if (boundary_edges) {
-        *boundary_edges = boundary;
+    std::map<std::pair<size_t, size_t>, size_t> edge_counts2;
+    for (const auto& face : m_face_v_indices) {
+        const int fx = face.x, fy = face.y, fz = face.z;
+        if (fx < 0 || fy < 0 || fz < 0) continue;
+
+        const size_t a0 = (size_t)fx, b0 = (size_t)fy, c0 = (size_t)fz;
+        if (a0 >= canon.size() || b0 >= canon.size() || c0 >= canon.size()) continue;
+
+        const size_t a = canon[a0], b = canon[b0], c = canon[c0];
+        if (a == b || b == c || c == a) continue;
+
+        std::pair<size_t, size_t> edges[3] = {
+            {std::min(a,b), std::max(a,b)},
+            {std::min(b,c), std::max(b,c)},
+            {std::min(c,a), std::max(c,a)}
+        };
+        edge_counts2[edges[0]]++;
+        edge_counts2[edges[1]]++;
+        edge_counts2[edges[2]]++;
     }
-    if (nonmanifold_edges) {
-        *nonmanifold_edges = nonmanifold;
+
+    size_t boundary2 = 0, nonmanifold2 = 0;
+    for (const auto& kv : edge_counts2) {
+        if (kv.second == 1) boundary2++;
+        else if (kv.second > 2) nonmanifold2++;
     }
 
-    return boundary == 0 && nonmanifold == 0;
+    if (boundary_edges) *boundary_edges = boundary2;
+    if (nonmanifold_edges) *nonmanifold_edges = nonmanifold2;
+    return boundary2 == 0 && nonmanifold2 == 0;
 }
 
 }  // end namespace deme
diff --git a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
index b8a99a20..0662f106 100644
--- a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
+++ b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
@@ -21,17 +21,21 @@
 #include <DEM/utils/HostSideHelpers.hpp>
 
 #include <cmath>
+#include <cstdio>
+#include <filesystem>
 #include <iostream>
 #include <limits>
 #include <numeric>
 #include <string>
+#include <utility>
 #include <vector>
 
 using namespace deme;
+using namespace std::filesystem;
 
 namespace {
 
-constexpr bool kUseTriangleParticles = true; // toggle to run the STL-based triangle setup
+constexpr bool kUseTriangleParticles = false; // toggle to run the STL-based triangle setup
 constexpr float kMmToMeters = 0.001f;
 constexpr double kTriangleParticleDensity = 2600.0;
 
@@ -41,6 +45,10 @@ constexpr double kSpeed = 1.0;       // 1 m/s
 constexpr double kTimeStep = 1e-5;   // seconds
 constexpr int kMaxSteps = 200000;    // 2 seconds max
 constexpr double kContactEps = 1e-6; // contact force threshold
+constexpr bool kFixWinding = true;   // flip inward-facing triangles based on CoM
+constexpr bool kWriteFrames = true;
+constexpr unsigned int kOutputFPS = 2000;
+constexpr const char* kOutputDir = "DemoOutput_SimpleCollisions";
 
 struct RunResult {
     bool ok = false;
@@ -68,6 +76,88 @@ float3 vec_scale(const float3& v, double s) {
     return make_float3(v.x * s, v.y * s, v.z * s);
 }
 
+std::pair<float3, float3> compute_bounds(const std::vector<float3>& vertices) {
+    if (vertices.empty()) {
+        return {make_float3(0, 0, 0), make_float3(0, 0, 0)};
+    }
+    float3 min_v = vertices.front();
+    float3 max_v = vertices.front();
+    for (const auto& v : vertices) {
+        min_v.x = std::min(min_v.x, v.x);
+        min_v.y = std::min(min_v.y, v.y);
+        min_v.z = std::min(min_v.z, v.z);
+        max_v.x = std::max(max_v.x, v.x);
+        max_v.y = std::max(max_v.y, v.y);
+        max_v.z = std::max(max_v.z, v.z);
+    }
+    return {min_v, max_v};
+}
+
+void print_mesh_diagnostics(const std::shared_ptr<DEMMesh>& mesh, const std::string& label) {
+    if (!mesh) {
+        return;
+    }
+    size_t boundary_edges = 0;
+    size_t nonmanifold_edges = 0;
+    bool watertight = mesh->IsWatertight(&boundary_edges, &nonmanifold_edges);
+
+    double volume = 0.0;
+    float3 center = make_float3(0, 0, 0);
+    float3 inertia = make_float3(0, 0, 0);
+    mesh->ComputeMassProperties(volume, center, inertia);
+
+    auto [min_v, max_v] = compute_bounds(mesh->GetCoordsVertices());
+    float3 dims = max_v - min_v;
+
+    std::cout << "\n[" << label << "] mesh diagnostics" << std::endl;
+    std::cout << "Vertices: " << mesh->GetNumNodes() << " Triangles: " << mesh->GetNumTriangles() << std::endl;
+    std::cout << "Bounds min=(" << min_v.x << ", " << min_v.y << ", " << min_v.z << ") max=(" << max_v.x << ", "
+              << max_v.y << ", " << max_v.z << ") dims=(" << dims.x << ", " << dims.y << ", " << dims.z << ")"
+              << std::endl;
+    std::cout << "Watertight: " << (watertight ? "yes" : "no") << " boundary_edges=" << boundary_edges
+              << " nonmanifold_edges=" << nonmanifold_edges << std::endl;
+    std::cout << "Volume=" << volume << " CoM=(" << center.x << ", " << center.y << ", " << center.z
+              << ") MOI(unit density, CoM)=(" << inertia.x << ", " << inertia.y << ", " << inertia.z << ")"
+              << std::endl;
+}
+
+void diagnose_winding(const std::shared_ptr<DEMMesh>& mesh,
+                      const std::string& label,
+                      bool fix_winding) {
+    if (!mesh || mesh->m_face_v_indices.empty()) {
+        return;
+    }
+    double volume = 0.0;
+    float3 center = make_float3(0, 0, 0);
+    float3 inertia = make_float3(0, 0, 0);
+    mesh->ComputeMassProperties(volume, center, inertia);
+    if (volume == 0.0) {
+        center = make_float3(0, 0, 0);
+    }
+
+    size_t inward = 0;
+    for (size_t i = 0; i < mesh->m_face_v_indices.size(); ++i) {
+        const int3& f = mesh->m_face_v_indices[i];
+        const float3& v0 = mesh->m_vertices[f.x];
+        const float3& v1 = mesh->m_vertices[f.y];
+        const float3& v2 = mesh->m_vertices[f.z];
+        const float3 n = face_normal(v0, v1, v2);
+        const float3 centroid = (v0 + v1 + v2) / 3.0f;
+        const float3 to_face = centroid - center;
+        const float d = dot(n, to_face);
+        if (d < 0.0f) {
+            inward++;
+            if (fix_winding) {
+                mesh->m_face_v_indices[i] = make_int3(f.x, f.z, f.y);
+            }
+        }
+    }
+
+    std::cout << "\n[" << label << "] winding diagnostics" << std::endl;
+    std::cout << "Faces total=" << mesh->m_face_v_indices.size() << " inward=" << inward
+              << (fix_winding ? " (flipped)" : "") << std::endl;
+}
+
 Stats calc_stats(const std::vector<double>& values) {
     Stats s;
     if (values.empty()) {
@@ -152,6 +242,18 @@ std::shared_ptr<DEMMesh> load_triangle_template(DEMSolver& DEMSim,
     out_mass = static_cast<float>(volume * kTriangleParticleDensity);
     out_moi = inertia * static_cast<float>(kTriangleParticleDensity);
 
+    print_mesh_diagnostics(mesh_template, "simpleTriangleShape4mm.stl (scaled)");
+    diagnose_winding(mesh_template, "simpleTriangleShape4mm.stl (scaled)", kFixWinding);
+    if (center.x != 0.0f || center.y != 0.0f || center.z != 0.0f) {
+        for (auto& v : mesh_template->m_vertices) {
+            v.x -= center.x;
+            v.y -= center.y;
+            v.z -= center.z;
+        }
+        std::cout << "[simpleTriangleShape4mm.stl] shifted vertices to CoM frame ("
+                  << center.x << ", " << center.y << ", " << center.z << ")" << std::endl;
+    }
+
     assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
     return mesh_template;
 }
@@ -165,6 +267,7 @@ RunResult run_single_collision(const float4& init_rot,
 
     DEMSolver DEMSim;
     DEMSim.SetOutputFormat(OUTPUT_FORMAT::CSV);
+    DEMSim.SetMeshOutputFormat("VTK");
     DEMSim.InstructBoxDomainDimension(5, 5, 5);
     DEMSim.SetGravitationalAcceleration(make_float3(0, 0, 0));
     DEMSim.SetCDUpdateFreq(0);
@@ -208,10 +311,31 @@ RunResult run_single_collision(const float4& init_rot,
     bool contact_started = false;
     bool rebound_captured = false;
     double peak_normal_force = 0.0;
+    unsigned int frame_id = 0;
+    double next_frame_time = 0.0;
+    path out_dir;
+    if (kWriteFrames) {
+        out_dir = current_path() / kOutputDir / label / ("run_" + std::to_string(run_id));
+        create_directories(out_dir);
+        next_frame_time = 0.0;
+        char filename[128];
+        std::snprintf(filename, sizeof(filename), "frame_%06u.vtk", frame_id++);
+        DEMSim.WriteMeshFile(out_dir / filename);
+    }
 
     for (int step = 0; step < kMaxSteps; ++step) {
         DEMSim.DoStepDynamics();
 
+        if (kWriteFrames) {
+            double sim_time = DEMSim.GetSimTime();
+            while (sim_time + 1e-12 >= next_frame_time) {
+                char filename[128];
+                std::snprintf(filename, sizeof(filename), "frame_%06u.vtk", frame_id++);
+                DEMSim.WriteMeshFile(out_dir / filename);
+                next_frame_time += 1.0 / static_cast<double>(kOutputFPS);
+            }
+        }
+
         float3 plane_force = plane_tracker->ContactAcc();
         plane_force = vec_scale(plane_force, plane_tracker->Mass());
         double normal_force = std::abs(vec_dot(plane_force, plane_normal));
diff --git a/src/kernel/DEMCalcForceKernels_Primitive.cu b/src/kernel/DEMCalcForceKernels_Primitive.cu
index 43fbe269..a7261b7e 100644
--- a/src/kernel/DEMCalcForceKernels_Primitive.cu
+++ b/src/kernel/DEMCalcForceKernels_Primitive.cu
@@ -59,6 +59,10 @@ __device__ __forceinline__ void calculatePrimitiveContactForces_impl(deme::DEMSi
     // resulting into the correct place needs to be done here.
     deme::contactPairs_t myPatchContactID = granData->geomToPatchMap[myPrimitiveContactID];
 
+    // Default: patch-direction check should not filter non-tri-tri contacts.
+    // Tri-tri will overwrite this after computing patch direction.
+    granData->contactPatchDirectionRespected[myPrimitiveContactID] = 1;
+
     // ----------------------------------------------------------------
     // Based on A's type, equip info
     // ----------------------------------------------------------------

From 5232ad04c9005239d8f9ff6eae2a414a97c566ec Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Thu, 29 Jan 2026 16:56:22 +0100
Subject: [PATCH 15/17] Fixed and improved demo

---
 src/demo/DEMdemo_ResponseAngleMesh.cpp        | 117 +++++++--------
 .../ModularTests/DEMTest_SimpleCollisions.cpp | 135 +-----------------
 2 files changed, 65 insertions(+), 187 deletions(-)

diff --git a/src/demo/DEMdemo_ResponseAngleMesh.cpp b/src/demo/DEMdemo_ResponseAngleMesh.cpp
index 112e5a64..f5064ea8 100644
--- a/src/demo/DEMdemo_ResponseAngleMesh.cpp
+++ b/src/demo/DEMdemo_ResponseAngleMesh.cpp
@@ -16,6 +16,7 @@
 #include <chrono>
 #include <cmath>
 #include <cstdio>
+#include <cctype>
 #include <filesystem>
 #include <random>
 #include <limits>
@@ -27,34 +28,34 @@ using namespace std::filesystem;
 
 namespace {
 
-/// Load an STL mesh, scale it, attach material and register it as a template.
-std::shared_ptr<DEMMesh> LoadStlTemplate(DEMSolver& sim,
-                                         const path& file,
-                                         const std::shared_ptr<DEMMaterial>& mat,
-                                         float scale) {
-    DEMMesh mesh;
-    bool ok = mesh.LoadSTLMesh(file.string());
-    if (!ok) {
-        DEME_ERROR("Failed to load STL mesh template %s", file.string().c_str());
+std::string ToLower(std::string s) {
+    for (char& c : s) {
+        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
     }
-    mesh.SetMaterial(mat);
-    mesh.Scale(scale);
-    return sim.LoadMeshType(mesh);
+    return s;
 }
 
-/// Load an STL mesh, scale it, attach material and place it directly in the scene.
-std::shared_ptr<DEMMesh> LoadStlMesh(DEMSolver& sim,
-                                     const path& file,
-                                     const std::shared_ptr<DEMMaterial>& mat,
-                                     float scale) {
+/// Load a mesh (STL or OBJ), scale it, attach material and register it as a template.
+std::shared_ptr<DEMMesh> LoadMeshTemplate(DEMSolver& sim,
+                                          const path& file,
+                                          const std::shared_ptr<DEMMaterial>& mat,
+                                          float scale) {
     DEMMesh mesh;
-    bool ok = mesh.LoadSTLMesh(file.string());
+    std::string ext = ToLower(file.extension().string());
+    bool ok = false;
+    if (ext == ".stl") {
+        ok = mesh.LoadSTLMesh(file.string());
+    } else if (ext == ".obj") {
+        ok = mesh.LoadWavefrontMesh(file.string());
+    } else {
+        DEME_ERROR("Unsupported mesh format: %s (only .stl or .obj)", ext.c_str());
+    }
     if (!ok) {
-        DEME_ERROR("Failed to load STL mesh %s", file.string().c_str());
+        DEME_ERROR("Failed to load mesh template %s", file.string().c_str());
     }
     mesh.SetMaterial(mat);
     mesh.Scale(scale);
-    return sim.AddMesh(mesh);
+    return sim.LoadMeshType(mesh);
 }
 
 std::pair<float3, float3> ComputeBounds(const std::vector<float3>& vertices) {
@@ -82,18 +83,23 @@ int main() {
     DEMSim.SetMeshUniversalContact(true);
     const float mm_to_m = 0.001f;
     const float drum_inner_radius = 0.1f;  // 200 mm diameter
-    const float wall_clearance = 0.002f;   // leave a small gap to the mantle
+    const float wall_clearance = 0.001f;   // leave a small gap to the mantle
     const float rpm = 40.0f;
     const float drum_ang_vel = rpm * 2.0f * PI / 60.0f;
 
     auto mat_type_particle =
-        DEMSim.LoadMaterial({{"E", 1e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.01}});
-    auto mat_type_drum = DEMSim.LoadMaterial({{"E", 2e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.01}});
+        DEMSim.LoadMaterial({{"E", 1e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.00}});
+    auto mat_type_drum = DEMSim.LoadMaterial({{"E", 2e6}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.00}});
     DEMSim.SetMaterialPropertyPair("mu", mat_type_particle, mat_type_drum, 0.5);
 
-    // Load particle mesh template from STL (approx. 4 mm triangular prism)
-    path tri_path = GET_DATA_PATH() / "mesh" / "simpleTriangleShape4mm.stl";
-    auto tri_template = LoadStlTemplate(DEMSim, tri_path, mat_type_particle, mm_to_m);
+    // --------------------- Particle settings block ---------------------
+    // Mesh file can be .stl or .obj (path is relative to data/mesh).
+    const path particle_mesh_file = GET_DATA_PATH() / "mesh" / "cube.obj"; // "simpleTriangleShape4mm.stl"
+    const float particle_mesh_scale = mm_to_m * 5.0f; // 1.0f for STLs in mm size
+    const unsigned int target_particles = 5000;
+    // -------------------------------------------------------------------
+
+    auto tri_template = LoadMeshTemplate(DEMSim, particle_mesh_file, mat_type_particle, particle_mesh_scale);
     auto [tri_min, tri_max] = ComputeBounds(tri_template->GetCoordsVertices());
     const float3 tri_dims = tri_max - tri_min;
     const float tri_diag = std::sqrt(tri_dims.x * tri_dims.x + tri_dims.y * tri_dims.y + tri_dims.z * tri_dims.z);
@@ -105,50 +111,41 @@ int main() {
     tri_template->ComputeMassProperties(tri_volume, tri_center, tri_inertia);
     const float particle_mass = static_cast<float>(tri_volume * particle_density);
     const float3 particle_moi = tri_inertia * particle_density;
-    std::cout << "Particle STL volume (m^3): " << tri_volume << std::endl;
-    std::cout << "Particle STL MOI (unit density, CoM): " << tri_inertia.x << ", " << tri_inertia.y << ", "
+    std::cout << "Particle volume (m^3): " << tri_volume << ", mass (kg): "<< particle_mass << std::endl;
+    std::cout << "Particle MOI (unit density, CoM): " << tri_inertia.x << ", " << tri_inertia.y << ", "
               << tri_inertia.z << std::endl;
     const double cube_vol = std::pow(4.0e-3, 3);
-    std::cout << "Particle mass (kg): " << particle_mass << std::endl;
 
-    // Load drum mantle from STL; STL units are mm with z in [0, 100]
-    path drum_path = GET_DATA_PATH() / "mesh" / "drum.stl";
-    auto drum_mesh = LoadStlMesh(DEMSim, drum_path, mat_type_drum, mm_to_m);
-    auto [drum_min, drum_max] = ComputeBounds(drum_mesh->GetCoordsVertices());
-    const float drum_height = drum_max.z - drum_min.z;
+    // Analytical drum mantle (planar contact cylinder) with end caps.
+    const float drum_height = 0.1f;
+    const float drum_mass = 1.0f;
+    const float IZZ = drum_mass * drum_inner_radius * drum_inner_radius / 2.0f;
+    const float IYY = (drum_mass / 12.0f) * (3.0f * drum_inner_radius * drum_inner_radius + drum_height * drum_height);
     unsigned int drum_family = 100;
-    drum_mesh->SetFamily(drum_family);
-    const float drum_density = 2600.0f;
-    double drum_volume = 0.0;
-    float3 drum_center = make_float3(0, 0, 0);
-    float3 drum_inertia = make_float3(0, 0, 0);
-    drum_mesh->ComputeMassProperties(drum_volume, drum_center, drum_inertia);
-    const float drum_mass = static_cast<float>(drum_volume * drum_density);
-    drum_mesh->SetMass(drum_mass);
-    drum_mesh->SetMOI(drum_inertia * drum_density);
-    std::cout << "Drum STL volume (m^3): " << drum_volume << std::endl;
-    std::cout << "Drum STL MOI (unit density, CoM): " << drum_inertia.x << ", " << drum_inertia.y << ", "
-              << drum_inertia.z << std::endl;
-    std::cout << "Drum mass (kg): " << drum_mass << std::endl;
+
+    auto drum = DEMSim.AddExternalObject();
+    drum->AddPlanarContactCylinder(make_float3(0, 0, drum_height / 2.0f), make_float3(0, 0, 1), drum_inner_radius,
+                                   mat_type_drum, ENTITY_NORMAL_INWARD);
+    drum->SetFamily(drum_family);
+    drum->SetMass(drum_mass);
+    drum->SetMOI(make_float3(IYY, IYY, IZZ));
     DEMSim.SetFamilyPrescribedAngVel(drum_family, "0", "0", to_string_with_precision(drum_ang_vel));
 
-    // Add top and bottom planes at z = 0 and z = 0.1 m. They rotate with the drum family (axis-aligned so rotation
-    // does not change their normals).
+    // Add top and bottom planes at z = 0 and z = drum_height. They rotate with the drum family.
     auto end_caps = DEMSim.AddExternalObject();
-    end_caps->AddPlane(make_float3(0, 0, drum_max.z), make_float3(0, 0, -1), mat_type_drum);
-    end_caps->AddPlane(make_float3(0, 0, drum_min.z), make_float3(0, 0, 1), mat_type_drum);
+    end_caps->AddPlane(make_float3(0, 0, drum_height), make_float3(0, 0, -1), mat_type_drum);
+    end_caps->AddPlane(make_float3(0, 0, 0), make_float3(0, 0, 1), mat_type_drum);
     end_caps->SetFamily(drum_family);
 
-    auto drum_tracker = DEMSim.Track(drum_mesh);
+    auto drum_tracker = DEMSim.Track(drum);
     auto cap_tracker = DEMSim.Track(end_caps);
 
-    // Sample 5000 particles inside the cylindrical volume with a small wall clearance.
-    const unsigned int target_particles = 5000;
+    // Sample particles inside the cylindrical volume with a small wall clearance.
     const float sample_radius = drum_inner_radius - wall_clearance - tri_radius;
     const float sample_halfheight = drum_height / 2.0f - wall_clearance - tri_radius;
-    HCPSampler sampler(tri_diag * 1.05f);
+    HCPSampler sampler(tri_diag * 1.01f);
     auto candidate_pos =
-        sampler.SampleCylinderZ(make_float3(0, 0, drum_min.z + drum_height / 2.0f), sample_radius, sample_halfheight);
+        sampler.SampleCylinderZ(make_float3(0, 0, drum_height / 2.0f), sample_radius, sample_halfheight);
     if (candidate_pos.size() < target_particles) {
         DEME_WARNING("Sampler produced fewer points (%zu) than requested (%u). Using all generated points.",
                      candidate_pos.size(), target_particles);
@@ -166,7 +163,8 @@ int main() {
         tri->SetMOI(particle_moi);
         tri->SetInitQuat(make_float4(0.f, 0.f, 0.f, 1.0f));
     }
-    std::cout << "Placed " << candidate_pos.size() << " STL particles inside the drum." << std::endl;
+    const float total_particle_mass = particle_mass * candidate_pos.size();
+    std::cout << "Placed " << candidate_pos.size() << " particles with a mass of "<< total_particle_mass <<" kg inside the drum." <<std::endl;
 
     auto max_v_finder = DEMSim.CreateInspector("max_absv");
     float max_v;
@@ -177,7 +175,10 @@ int main() {
     DEMSim.SetGPUTimersEnabled(true);
     DEMSim.SetGravitationalAcceleration(make_float3(0, -9.81, 0));
     DEMSim.SetExpandSafetyType("auto");
-    DEMSim.SetExpandSafetyAdder(drum_ang_vel * drum_inner_radius);
+    const float vmax_grav = std::sqrt(2.0f * 9.81f * drum_inner_radius);
+    const float vmax_rot = drum_ang_vel * drum_inner_radius;
+    const float vmax = (vmax_grav > vmax_rot) ? vmax_grav : vmax_rot;
+    DEMSim.SetExpandSafetyAdder(vmax);
     DEMSim.Initialize();
 
     path out_dir = current_path();
diff --git a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
index 0662f106..4722c916 100644
--- a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
+++ b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
@@ -21,34 +21,27 @@
 #include <DEM/utils/HostSideHelpers.hpp>
 
 #include <cmath>
-#include <cstdio>
-#include <filesystem>
 #include <iostream>
 #include <limits>
 #include <numeric>
 #include <string>
-#include <utility>
 #include <vector>
 
 using namespace deme;
-using namespace std::filesystem;
 
 namespace {
 
-constexpr bool kUseTriangleParticles = false; // toggle to run the STL-based triangle setup
+constexpr bool kUseTriangleParticles = true; // toggle to run the STL-based triangle setup
 constexpr float kMmToMeters = 0.001f;
 constexpr double kTriangleParticleDensity = 2600.0;
 
 constexpr int kNumRuns = 10;
-constexpr double kGap = 0.01;        // 10 mm
+constexpr double kGap = 0.005;        // 5 mm
 constexpr double kSpeed = 1.0;       // 1 m/s
 constexpr double kTimeStep = 1e-5;   // seconds
-constexpr int kMaxSteps = 200000;    // 2 seconds max
+constexpr int kMaxSteps = 100000;    // 1 second max
 constexpr double kContactEps = 1e-6; // contact force threshold
-constexpr bool kFixWinding = true;   // flip inward-facing triangles based on CoM
-constexpr bool kWriteFrames = true;
-constexpr unsigned int kOutputFPS = 2000;
-constexpr const char* kOutputDir = "DemoOutput_SimpleCollisions";
+double vmax = kSpeed;
 
 struct RunResult {
     bool ok = false;
@@ -76,88 +69,6 @@ float3 vec_scale(const float3& v, double s) {
     return make_float3(v.x * s, v.y * s, v.z * s);
 }
 
-std::pair<float3, float3> compute_bounds(const std::vector<float3>& vertices) {
-    if (vertices.empty()) {
-        return {make_float3(0, 0, 0), make_float3(0, 0, 0)};
-    }
-    float3 min_v = vertices.front();
-    float3 max_v = vertices.front();
-    for (const auto& v : vertices) {
-        min_v.x = std::min(min_v.x, v.x);
-        min_v.y = std::min(min_v.y, v.y);
-        min_v.z = std::min(min_v.z, v.z);
-        max_v.x = std::max(max_v.x, v.x);
-        max_v.y = std::max(max_v.y, v.y);
-        max_v.z = std::max(max_v.z, v.z);
-    }
-    return {min_v, max_v};
-}
-
-void print_mesh_diagnostics(const std::shared_ptr<DEMMesh>& mesh, const std::string& label) {
-    if (!mesh) {
-        return;
-    }
-    size_t boundary_edges = 0;
-    size_t nonmanifold_edges = 0;
-    bool watertight = mesh->IsWatertight(&boundary_edges, &nonmanifold_edges);
-
-    double volume = 0.0;
-    float3 center = make_float3(0, 0, 0);
-    float3 inertia = make_float3(0, 0, 0);
-    mesh->ComputeMassProperties(volume, center, inertia);
-
-    auto [min_v, max_v] = compute_bounds(mesh->GetCoordsVertices());
-    float3 dims = max_v - min_v;
-
-    std::cout << "\n[" << label << "] mesh diagnostics" << std::endl;
-    std::cout << "Vertices: " << mesh->GetNumNodes() << " Triangles: " << mesh->GetNumTriangles() << std::endl;
-    std::cout << "Bounds min=(" << min_v.x << ", " << min_v.y << ", " << min_v.z << ") max=(" << max_v.x << ", "
-              << max_v.y << ", " << max_v.z << ") dims=(" << dims.x << ", " << dims.y << ", " << dims.z << ")"
-              << std::endl;
-    std::cout << "Watertight: " << (watertight ? "yes" : "no") << " boundary_edges=" << boundary_edges
-              << " nonmanifold_edges=" << nonmanifold_edges << std::endl;
-    std::cout << "Volume=" << volume << " CoM=(" << center.x << ", " << center.y << ", " << center.z
-              << ") MOI(unit density, CoM)=(" << inertia.x << ", " << inertia.y << ", " << inertia.z << ")"
-              << std::endl;
-}
-
-void diagnose_winding(const std::shared_ptr<DEMMesh>& mesh,
-                      const std::string& label,
-                      bool fix_winding) {
-    if (!mesh || mesh->m_face_v_indices.empty()) {
-        return;
-    }
-    double volume = 0.0;
-    float3 center = make_float3(0, 0, 0);
-    float3 inertia = make_float3(0, 0, 0);
-    mesh->ComputeMassProperties(volume, center, inertia);
-    if (volume == 0.0) {
-        center = make_float3(0, 0, 0);
-    }
-
-    size_t inward = 0;
-    for (size_t i = 0; i < mesh->m_face_v_indices.size(); ++i) {
-        const int3& f = mesh->m_face_v_indices[i];
-        const float3& v0 = mesh->m_vertices[f.x];
-        const float3& v1 = mesh->m_vertices[f.y];
-        const float3& v2 = mesh->m_vertices[f.z];
-        const float3 n = face_normal(v0, v1, v2);
-        const float3 centroid = (v0 + v1 + v2) / 3.0f;
-        const float3 to_face = centroid - center;
-        const float d = dot(n, to_face);
-        if (d < 0.0f) {
-            inward++;
-            if (fix_winding) {
-                mesh->m_face_v_indices[i] = make_int3(f.x, f.z, f.y);
-            }
-        }
-    }
-
-    std::cout << "\n[" << label << "] winding diagnostics" << std::endl;
-    std::cout << "Faces total=" << mesh->m_face_v_indices.size() << " inward=" << inward
-              << (fix_winding ? " (flipped)" : "") << std::endl;
-}
-
 Stats calc_stats(const std::vector<double>& values) {
     Stats s;
     if (values.empty()) {
@@ -242,18 +153,6 @@ std::shared_ptr<DEMMesh> load_triangle_template(DEMSolver& DEMSim,
     out_mass = static_cast<float>(volume * kTriangleParticleDensity);
     out_moi = inertia * static_cast<float>(kTriangleParticleDensity);
 
-    print_mesh_diagnostics(mesh_template, "simpleTriangleShape4mm.stl (scaled)");
-    diagnose_winding(mesh_template, "simpleTriangleShape4mm.stl (scaled)", kFixWinding);
-    if (center.x != 0.0f || center.y != 0.0f || center.z != 0.0f) {
-        for (auto& v : mesh_template->m_vertices) {
-            v.x -= center.x;
-            v.y -= center.y;
-            v.z -= center.z;
-        }
-        std::cout << "[simpleTriangleShape4mm.stl] shifted vertices to CoM frame ("
-                  << center.x << ", " << center.y << ", " << center.z << ")" << std::endl;
-    }
-
     assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
     return mesh_template;
 }
@@ -267,12 +166,11 @@ RunResult run_single_collision(const float4& init_rot,
 
     DEMSolver DEMSim;
     DEMSim.SetOutputFormat(OUTPUT_FORMAT::CSV);
-    DEMSim.SetMeshOutputFormat("VTK");
     DEMSim.InstructBoxDomainDimension(5, 5, 5);
     DEMSim.SetGravitationalAcceleration(make_float3(0, 0, 0));
-    DEMSim.SetCDUpdateFreq(0);
-    DEMSim.UseAdaptiveUpdateFreq(false);
     DEMSim.SetMeshUniversalContact(true);
+    DEMSim.SetExpandSafetyType("auto");
+    DEMSim.SetExpandSafetyAdder(vmax);
 
     auto mat_type = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}, {"Crr", 0.00}});
 
@@ -311,31 +209,10 @@ RunResult run_single_collision(const float4& init_rot,
     bool contact_started = false;
     bool rebound_captured = false;
     double peak_normal_force = 0.0;
-    unsigned int frame_id = 0;
-    double next_frame_time = 0.0;
-    path out_dir;
-    if (kWriteFrames) {
-        out_dir = current_path() / kOutputDir / label / ("run_" + std::to_string(run_id));
-        create_directories(out_dir);
-        next_frame_time = 0.0;
-        char filename[128];
-        std::snprintf(filename, sizeof(filename), "frame_%06u.vtk", frame_id++);
-        DEMSim.WriteMeshFile(out_dir / filename);
-    }
 
     for (int step = 0; step < kMaxSteps; ++step) {
         DEMSim.DoStepDynamics();
 
-        if (kWriteFrames) {
-            double sim_time = DEMSim.GetSimTime();
-            while (sim_time + 1e-12 >= next_frame_time) {
-                char filename[128];
-                std::snprintf(filename, sizeof(filename), "frame_%06u.vtk", frame_id++);
-                DEMSim.WriteMeshFile(out_dir / filename);
-                next_frame_time += 1.0 / static_cast<double>(kOutputFPS);
-            }
-        }
-
         float3 plane_force = plane_tracker->ContactAcc();
         plane_force = vec_scale(plane_force, plane_tracker->Mass());
         double normal_force = std::abs(vec_dot(plane_force, plane_normal));

From a6d960705b53d7e1efba7a9384b41ded6c827d9a Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Thu, 29 Jan 2026 23:26:26 +0100
Subject: [PATCH 16/17] Switch to multi contact concept for concave shapes
 (default) -  tri edge neighbors for contact islands - finer contact mesh is
 "winner" and defines the islands - concave demo change (cross.stl) to check
 function

---
 data/mesh/cross.stl                           | Bin 0 -> 2284 bytes
 data/mesh/cross_fine.stl                      | Bin 0 -> 17684 bytes
 src/DEM/API.h                                 |   6 +
 src/DEM/APIPrivate.cpp                        | 180 ++++++-
 src/DEM/APIPublic.cpp                         |   5 +
 src/DEM/BdrsAndObjs.h                         |  12 +
 src/DEM/Defines.h                             |  14 +
 src/DEM/dT.cpp                                |  85 ++-
 src/DEM/dT.h                                  |  27 +
 src/DEM/kT.cpp                                |  78 ++-
 src/DEM/kT.h                                  |  28 +
 src/algorithms/DEMContactDetection.cu         | 491 +++++++++++++++++-
 src/algorithms/DEMContactDetectionKernels.cuh | 331 +++++++++++-
 src/algorithms/DEMStaticDeviceSubroutines.h   |   3 +
 src/demo/DEMdemo_DrumCubes.cpp                |   2 +
 src/demo/DEMdemo_ResponseAngleMesh.cpp        |  33 +-
 16 files changed, 1218 insertions(+), 77 deletions(-)
 create mode 100644 data/mesh/cross.stl
 create mode 100644 data/mesh/cross_fine.stl

diff --git a/data/mesh/cross.stl b/data/mesh/cross.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a0c19126e2d897ceea6d5a1df376d319e270105a
GIT binary patch
literal 2284
zcma)7OKwv^5S*jTg0PCXLYf5<NGy<?3k2CgmJ&O%9T2;B81BJj$8k6U)ibYaYTg5h
zk)EescU5=I$M*U7u;1p-`@`Mod*0lB%=f3`W8T#0yX}|Vb>8fEkGsP;e>iNP&lhJM
z?=Ro(PP_c+>m@&2wr`S*>x!qJDgFM@!nvgM^pcX##Ma+4a7t86V<uEFM<v%ZmCs%y
zGQ@~!Kj(>}%nB10VpIczeIx6z#;i^P`;4dd#=KTUhOE*Qu^!iSS5I$y4`LGqa2vi~
zKD;g+1uk$Tm71&fuR)*Y%*?DT$9Sj!76q_oh|5?BHC{E9_o%LH;SBHEPZVj)kX6P?
zbA7O*2i@3MA^M8~qKFQ2Y4NC((yuF72eOgzU8XbZSQJTWf#H2t>H<=~?Z7-eWQeWd
z=qHM4%+Qspf&1_~tabl&BcYg@$`$%;s_KF&HdbnE|E!9*abSh-VW$!Aqq;(5V3hXG
zJwv_^uA_7(tF;*UQ*%|%z<+&^sZWu4#)*Tjlx6>=P*z}{OH(I~SO@oLX3(Jxim9(d
z8LAG<7WICSjrze@-j-2OSzw3f;oPmo7Yg6mS;4bdR2UVj+Nz%MLSem!uZcGk?7$0!
zWWwFD?^;2op`Smf!WnoT6V3y_y+hVUW>i`zz~T-s75-k5j8zY0RUH^n`MuIJAU1Ie
zEDB(*0~z81PJCy8+luS|2XO4Uvx!X<z<*I7%F}aPL=D{jXZf#&NOhyLi2_&@)0m;F
HGFJHmbX;o;

literal 0
HcmV?d00001

diff --git a/data/mesh/cross_fine.stl b/data/mesh/cross_fine.stl
new file mode 100644
index 0000000000000000000000000000000000000000..d83f148a1b281395ec714bd82afd9b6cd7adf961
GIT binary patch
literal 17684
zcmb_iKj<b`6&+hi6|mCchlwi$Yz2kvzMz5xQfXm8!A`K*R3fP!C@2_G*e=L6iQsl2
zm5}UPEP|jSSO|iMU;^3BN*lp*?!R~M%$xaHc**z8{mq>7&b{~DH}k&Pyz;egeEkcr
z{NKJl`Q1-!u>P3eYByus9^muF-EE`K56yl3d&lT6KLO)?MZE^!8T4-&{|ZJ|)Lz5K
ze|PBh!~h#t4;mboth-gC0}fj<hB7?DgT^6q*PxEtE{_tGL;@tyk~xmo8j@{O23f0G
zJ$m8kc=W3;zo;m$57Eysa~zXK=b1OZ^4yazz4__)^s1wX^av9J?8>Nm&fYT?sHS?J
z)1FbI@>z>Kz`kZkB%(33_l(-RvsM?3;|z+%jP{_B7!$RqZbw2eUYXrKBjNd3tF2F+
zhggs7A0CPFes!NmHR;B#$UVbRvlEvZ_ojw-yXz$lxDgc?G#R&5wp`G)-oXIUqv92h
zggWl?e9AmAGRl2LFB#0E>A1Y^K96crE4UF=VmQik+A$s<^B}_KL(%*`%m+2bjEWd)
znDuq$yqZvFwOT}#=iD@KIAji{*T+#=FXe&wd4)BjIcm>N8Xnb-;cTmPPmF~|o)0pC
zw(;oms;nqJao~H_?kju8n5z~H<{@bFH?d%Jxmp-;zcR*7Lm7vtTw|{D(10SY7L9#%
zH7{Xcd>Y5wXHJyp;HY++hBMOB;G(*57j4iP`q|INqo<#Jf@c(;I6(Qh^waaBX#C@c
zACJ#`@TxH!MTYC`H8P{>IXf{rBUzv@enbd=&&V3{?i_hk2G}>PGU6GPw3CJ~@|@QQ
zy~hrWGUwh)S=$&L0x-l|`fAR{)VtECInDXt*nTg0(4aGHpJ+ZVOLb?L@Qg9~%P$!N
ztM)bXdEI?hS<UaFD-DkQZqt$It6gcxe4CzW^oVL(YP6Y$3E*(4F=ieIM$w4a+tfH@
zUTSsMV^12p%sX}!9MATNp#jD-iM@$wb+L1iE4ok2sH_J<_slb+>ScD)*iKL_%^-X=
zt!~!9eZ*r2Dm37$tr!zkwKD^E(5C&v-#D{Y7mW3Mv>Gimy4MQp&tiY${fdg52Jm9m
zS1GC*NkrHP&-Zq&oQIF~<Ew}m_q9uIV-Q6K?Sg^3cZ}PnFSHp|*DDg0F^FJ?XZ9iw
zpvlUS2m{O%RmWg3UU`@p9N+Uus|g6s2`v~n=i~Z$VK@F2wd!kz=RvIM92m$U=Ru_C
z4SKX-z?Z1V@Tdk6dR{$#9~eNdG>AQ^)h;Tp6&Y%w(NpGN?Q7-ulc+2YM`iZyj9lFm
z6(XYTYo%vrt#m$k)^m|{-t5f;j*j5bKY+uGp{uqj!(%7XePY#w(QTbKWysowpBePk
zjG^lsXXSkfjvk4ubNd^NejODM5tYt4&s=3rlxq>IcA5FGdvF62zS>G7Vh74K_6+Pj
zcu#D)PmEfDA$RR|)WAN3XPfR5Gb-zCsaBa$^)ed_lF+MLKS8yWIeax`WDR?M0Ku$7
z1HPKpsxl_3YG+37tBQ)cHIrD*dDMsdQy{&DXBsKnQT4#+Rx7x3Mb&XbZvYe#(Pv%A
zlJ|)?@^4!067MOyBJ0rC2iKmRkB+}x!@FG~8{~&Z7xjaNN8}Yw46s*uID<zu?GW{G
z_NXy*B+n^n^rJ>pwdpn0i98^)*O1tghOWWN;O@fSCt_T$5670y1Up3q|3>yX_CB#q
z6dA5}#i)7<jZTdP3gc75tB^E=!F|<?%4aR|05dV*FY^E@YwsDgcW13;v<F6+^BL8Q
z_KGo4i|TeH0ppdqqGlwuR(aJvw5Kxn3`BYNxxc?t&;3YdI7*&{1A!osnM-IuE5>cp
zCtOXsu_q1QhhZ{qn+%Sk=Kbq?9v*2P2_th~F2q{TjEr($(fmGWFprUHW&SRzK6hfQ
zfHNHBIY9t+$UNo2>s~kdY7rI2=2)W9W>iG+{ImljX)rokV)RAki6L-fID5fBbTl8=
ztDtB39hqZn$pgnd%GrcpYvk&3bhSg~yQqkwhO#>%aAPF0qU>L=e0-j#zhumH4wR2u
zbJ}SXM%)<^8(H5Hl|%zZx+hjf(eR3TR6LqSr>AqU&s?$|o;6VsRTsuV!x`ym;87`~
zR5$LTLp4%V(Bj<VJqGrQ_C66PAD77V{3sgSkJ$Ufgd)TBQXU&MGNbA(G=#B0HK~Y3
z2>(iB&RXPgD?=gyDpzE$k#-rYx-B$nw|g(;ikL><#1L<p^En^0B8RBboaajC`QYHT
zYUO<;yPNyOcsoC)Ppq1+bG`?61}mzoROiFjNnsQXf7eQ8)+-Eo@4D9@Bi_NnSG&rC
zh|p+)?~(ah1E`#naQrS>^2p;xPGA>C(TLcAat*JYM}=L*U`2JG7z~_?H{9^mG;{1+
zo{)hx_+mvxR2Ph@ccQ_!oEXES3RFw;0bfm`>NTo}1gOjgqkHC+F;P`JX~4~$IBRvm
zNHbXVBElgG-J`<#lj`dF2o!hbsp`V<6_ui@kwk=z__(*jXD>axXFD3xBPM!m@ULL>
zHN#gFvGfxIerP}^#%<Fl{+hw-?AOdrjBXluIy}UOD4Itl9Y1(R^62!A3`dE^5D31O
z49~+*&d?PVwE~-07!fO+QS@m$MmGk~!Dz(kdG+{x&>#akJ}xn$*XYp>44_vGV$%$E
z{+*&q8eUyR@cT+b$E*=1MmG&cb+4klZmJ_1K+||ddUwI#^<8zHvD5IrB4e88BI|tE
zn>!pm4<6;yHZx=Bs>M~l%kbD;!`Vu?(jY@tEk@Ej$XRK~oxxC9kN9Bdonh2GA_tr|
z>%my&K~!iA{za3hy4@3pYDDb#`>11#r3TTF6MQvgVDEuBmw)qmucG9x4fL$AGO!Pk
zv-Unw*ow*#LZf@;nNjslG`15|OA*3Xlc=_=QRg`{E*TS5wX=pjKL+QcqN2JxjguLa
zfA40^w1#IIDO>JI(>#IEy}DWBqzXe6z@YWCw(}~2FWuws7!^IA>DH$WapNOBizC*g
z_8fe(ZW=G?--3qEqiDeQx01%dfl*~7ulVml(XbIJBUVmJwOZ!^qGUcJiQgM1QS}tn
zlZZe5Ps#JlnMeFGg@2KUX#D*7zu(!m?X9W75pLRB-~8atAKvNW<2u5TZ{)FH3}wHL
z-RAc{r@o^3?bX$BD|^T2$`Z!#h%)04l@g9oku4)U{o;?AhmB@%93m>Yk~Nl$ZVaLi
z437WKAAgEEGRJ9$S`E+XoH4tbW3k6*@K@^tSG-ofCWv<byi24jhDH<b7^r~`U9S%r
z*ZUXDi@pMsQO!N>jOg|}f<+DM{6!wF!H9sAT222V528JG`o2~~!}7!6W|7BGje4h^
z@8BJ;nNL)G8hkW<|GRI0UPsCd-e2cfnSI9i^}F}){Oq3}T2#zKSn@A(KloVg)@>?S
zm*9+|GMn#ct{D1TBd>|fK4^?NAB^hA-ZSF4-Hgc*+SU5tTytz5{ml^l-MwT}^YtMt
zGKMm5(Euut@Q-!dlJ$GJJ{p~}m$mA1KBC*3Yb@s@F*Fj(e(<=|*i<9e$KQ$Fv|oJh
z!}gs&e`<NB&Q+Z3E5;nxR+B{a@4v?TzlLDKWJTFJ<DA6%v3A9nD4TOIuk)CU9Q&0<
zyhX#<vkI9#lbAC&b^Kn!8N`aN_mWP=tAF~*oo)L_V%Hf|_L2cH)+*lt=32R0A2|>2
z`jhn`QE4O@sUw^}Yrz=auj*?bFMC(=s8y7#F~ZGy&Qnx57p+yTAqF>-b>6%?GU~QG
z{0SgA$*f(?$3$u78e!IxfVzg|cb{YIR~m8WK|XoC=U&k9`Av<Cn)j6YTrozUy_<EI
z*G;SUF=OCx=v|<1?~)l+jpRHJMiFxkU%T>Z26GN`+^h8w^If}dwP)yBE2*eNQ}Vdl
zku~4{)HSi#$%sbn24?M1p6yHOolMXY;rSk}#GZF&r+L)aIXB*0yF8Es&AVM=uQ{FM
z@i^^khwb=qq*m;G;ZNf)?oh`S)=Kt?@M+oCGhocQ9YFZg$9$rwH!U=Xv@61bFM8=b
zSRMcE@`#QB!z1#Mz<`Cu6V;G8PrkDhZH$tKcZk%A5w#huvO>e+&9RG~oi7^O>*M~j
z&%JF?jX5rV-*o;?#5uFhJbwGqZ~T|yFmzMYf&n6<;`8-|=N08~bKEn=U%u7-R~$9c
z=Zpc3lt)HoHp9UOwvVbXn&mZ!blYcM7-P+-c@##QdAWi{6)T5h;^Uo2RQB;_PYmlN
zL76Y?be`9QQTmEw(y|tPSeI|sYTh%3b~vxZ_)UGB>iEE-6Bm(?<?BVj^0m_NPVu{t
zMMZoUjYA$p$oR;~eMRGYMwPC{ofOPnb&ZkxDxN+-%lIua2b#vrKN@d8`06k9N<0$>
zo^<$lPQ<G9$LUTz<~^6=XEx9s9~d=qR9WwT)5zmywvKPrEvr_Z(xYb02w`2m5moJW
zUX^HYlv>$Q(fno}{SNCj5z43|5vx`i_QXj&CV34<^H>>L|A?wtg$I#tl~E(%>^a-;
zof_Ql{_S7?d0z8~{qC^c4USeTk~$gbd5){O3p1N=wLL?3m^yBLGC2MiRo-W1c4p+L
z7CE(&2Z;Ug{bzr~8I&t&2QYxdInM=9vqP6O9aB04{;s-UaXt>u4`bz=05#j+sb$uh
z>F{Vi;2j@WopVKcFEO$dRVSW|WB2)8jSOcCSiTO7Hc^bo*~aLLsDe?oje*0y6Z5Lg
zH6&D7%f1?{K%IX@^SjQuR@3J>S2{EDo*{RPvOUVoIQCMkkyiSuD=Vs-?(GrPm@5qE
zVC*6R$|I2(w2{D|b!J}g?cNo6zUjldd>`f`&%s&m)~7+fJ9|%=hDsdgV|aaP4GoUN
zVSRw1nb(RKqmxmi%8_{A?xU*Ds9dX@Q`V>qj_04u1_Ke%R(bf^CevqA!19F|OnZiW
zx3dmus<~s}2_yefCI2#ps^b5b;rQp7&tEKcr7qSvq}W$w*M}&y+Yd9?0f*H*9L=a|
jPKq{0{T($t{u*InlwF@>+o(}{!5Dp?M~z+TWh?5x>ct&S

literal 0
HcmV?d00001

diff --git a/src/DEM/API.h b/src/DEM/API.h
index 0a2aa32c..3ba4c82e 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -1905,6 +1905,8 @@ class DEMSolver {
     std::vector<float3> m_input_mesh_obj_xyz;
     std::vector<float4> m_input_mesh_obj_rot;
     std::vector<unsigned int> m_input_mesh_obj_family;
+    std::vector<notStupidBool_t> m_input_mesh_obj_convex;
+    std::vector<notStupidBool_t> m_input_mesh_obj_never_winner;
 
     // Processed unique family prescription info
     std::vector<familyPrescription_t> m_unique_family_prescription;
@@ -1940,6 +1942,10 @@ class DEMSolver {
     std::vector<bodyID_t> m_mesh_facet_owner;
     // Patch ID for each mesh facet, flattened
     std::vector<bodyID_t> m_mesh_facet_patch;
+    // Per-facet edge neighbors (global triangle indices, NULL_BODYID if boundary)
+    std::vector<bodyID_t> m_mesh_facet_neighbor1;
+    std::vector<bodyID_t> m_mesh_facet_neighbor2;
+    std::vector<bodyID_t> m_mesh_facet_neighbor3;
     // Three nodes of each triangle, flattened
     std::vector<DEMTriangle> m_mesh_facets;
 
diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp
index 5148f130..531c860f 100644
--- a/src/DEM/APIPrivate.cpp
+++ b/src/DEM/APIPrivate.cpp
@@ -13,13 +13,141 @@
 #include <thread>
 #include <chrono>
 #include <cstring>
+#include <cstdint>
+#include <cmath>
 #include <limits>
 #include <algorithm>
 #include <map>
 #include <tuple>
+#include <unordered_map>
+#include <array>
 
 namespace deme {
 
+namespace {
+
+struct EdgeInfo {
+    size_t tri = 0;
+    int edge = 0;
+};
+
+struct QuantKey3 {
+    int64_t x, y, z;
+    bool operator==(const QuantKey3& o) const noexcept { return x == o.x && y == o.y && z == o.z; }
+};
+struct QuantKey3Hash {
+    size_t operator()(const QuantKey3& k) const noexcept {
+        size_t h1 = std::hash<int64_t>{}(k.x);
+        size_t h2 = std::hash<int64_t>{}(k.y);
+        size_t h3 = std::hash<int64_t>{}(k.z);
+        size_t h = h1;
+        h ^= h2 + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        h ^= h3 + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        return h;
+    }
+};
+
+inline uint64_t makeEdgeKey(int a, int b) {
+    const uint32_t lo = static_cast<uint32_t>(std::min(a, b));
+    const uint32_t hi = static_cast<uint32_t>(std::max(a, b));
+    return (static_cast<uint64_t>(lo) << 32) | static_cast<uint64_t>(hi);
+}
+
+static inline int64_t quantize(double v, double eps) {
+    return static_cast<int64_t>(std::llround(v / eps));
+}
+
+std::vector<std::array<bodyID_t, 3>> buildTriangleEdgeNeighbors(const std::vector<int3>& face_v_indices,
+                                                                 const std::vector<float3>& vertices) {
+    const size_t n_faces = face_v_indices.size();
+    std::vector<std::array<bodyID_t, 3>> neighbors(n_faces, {NULL_BODYID, NULL_BODYID, NULL_BODYID});
+    if (n_faces == 0) {
+        return neighbors;
+    }
+
+    std::vector<size_t> canon;
+    if (!vertices.empty()) {
+        double minx = vertices[0].x, miny = vertices[0].y, minz = vertices[0].z;
+        double maxx = minx, maxy = miny, maxz = minz;
+        for (const auto& v : vertices) {
+            minx = std::min(minx, (double)v.x);
+            miny = std::min(miny, (double)v.y);
+            minz = std::min(minz, (double)v.z);
+            maxx = std::max(maxx, (double)v.x);
+            maxy = std::max(maxy, (double)v.y);
+            maxz = std::max(maxz, (double)v.z);
+        }
+        const double dx = maxx - minx, dy = maxy - miny, dz = maxz - minz;
+        const double diag = std::sqrt(dx * dx + dy * dy + dz * dz);
+        const double eps = std::max(diag * 1e-9, 1e-12);
+
+        std::unordered_map<QuantKey3, size_t, QuantKey3Hash> rep;
+        rep.reserve(vertices.size());
+        canon.assign(vertices.size(), static_cast<size_t>(-1));
+        size_t next_id = 0;
+        for (size_t i = 0; i < vertices.size(); ++i) {
+            const auto& v = vertices[i];
+            QuantKey3 key{quantize(v.x, eps), quantize(v.y, eps), quantize(v.z, eps)};
+            auto it = rep.find(key);
+            if (it == rep.end()) {
+                rep.emplace(key, next_id);
+                canon[i] = next_id;
+                next_id++;
+            } else {
+                canon[i] = it->second;
+            }
+        }
+    }
+
+    std::unordered_map<uint64_t, std::vector<EdgeInfo>> edge_map;
+    edge_map.reserve(n_faces * 3);
+
+    for (size_t i = 0; i < n_faces; ++i) {
+        const int3& face = face_v_indices[i];
+        const int v0_raw = face.x;
+        const int v1_raw = face.y;
+        const int v2_raw = face.z;
+        if (v0_raw < 0 || v1_raw < 0 || v2_raw < 0) {
+            continue;
+        }
+        int v0 = v0_raw;
+        int v1 = v1_raw;
+        int v2 = v2_raw;
+        if (!canon.empty()) {
+            if (static_cast<size_t>(v0_raw) >= canon.size() || static_cast<size_t>(v1_raw) >= canon.size() ||
+                static_cast<size_t>(v2_raw) >= canon.size()) {
+                continue;
+            }
+            v0 = static_cast<int>(canon[static_cast<size_t>(v0_raw)]);
+            v1 = static_cast<int>(canon[static_cast<size_t>(v1_raw)]);
+            v2 = static_cast<int>(canon[static_cast<size_t>(v2_raw)]);
+        }
+        if (v0 == v1 || v1 == v2 || v2 == v0) {
+            continue;
+        }
+        const uint64_t e0 = makeEdgeKey(v0, v1);
+        const uint64_t e1 = makeEdgeKey(v1, v2);
+        const uint64_t e2 = makeEdgeKey(v2, v0);
+        edge_map[e0].push_back(EdgeInfo{i, 0});
+        edge_map[e1].push_back(EdgeInfo{i, 1});
+        edge_map[e2].push_back(EdgeInfo{i, 2});
+    }
+
+    for (const auto& entry : edge_map) {
+        const auto& info = entry.second;
+        if (info.size() == 2) {
+            const EdgeInfo& a = info[0];
+            const EdgeInfo& b = info[1];
+            neighbors[a.tri][a.edge] = static_cast<bodyID_t>(b.tri);
+            neighbors[b.tri][b.edge] = static_cast<bodyID_t>(a.tri);
+        }
+    }
+
+    return neighbors;
+}
+
+}  // namespace
+
 void DEMSolver::assertSysInit(const std::string& method_name) {
     if (!sys_initialized) {
         DEME_ERROR("DEMSolver's method %s can only be called after calling Initialize()", method_name.c_str());
@@ -840,17 +968,38 @@ void DEMSolver::preprocessTriangleObjs() {
         m_input_mesh_obj_xyz.push_back(mesh_obj->init_pos);
         m_input_mesh_obj_rot.push_back(mesh_obj->init_oriQ);
         m_input_mesh_obj_family.push_back(mesh_obj->family_code);
+        m_input_mesh_obj_convex.push_back(mesh_obj->is_convex ? 1 : 0);
+        m_input_mesh_obj_never_winner.push_back(mesh_obj->never_winner ? 1 : 0);
         m_mesh_facet_owner.insert(m_mesh_facet_owner.end(), mesh_obj->GetNumTriangles(), thisMeshObj);
 
-        // Initialize patch IDs if not already set (default: all facets in patch 0)
-        if (!mesh_obj->patches_explicitly_set && mesh_obj->m_patch_ids.empty()) {
-            mesh_obj->SetPatchIDs({0});
+        const bodyID_t tri_offset = static_cast<bodyID_t>(m_mesh_facets.size());
+        const auto local_neighbors = buildTriangleEdgeNeighbors(mesh_obj->m_face_v_indices, mesh_obj->m_vertices);
+
+        // Force single-patch semantics: one patch per mesh (all facets in patch 0)
+        if (mesh_obj->patches_explicitly_set || mesh_obj->GetNumPatches() > 1) {
+            DEME_WARNING(
+                "Mesh patch IDs were provided or computed, but single-patch mode is enabled; all facets will be "
+                "assigned to one patch.");
+        }
+        if (mesh_obj->GetNumTriangles() > 0) {
+            mesh_obj->m_patch_ids.assign(mesh_obj->GetNumTriangles(), 0);
+        } else {
+            mesh_obj->m_patch_ids.clear();
+        }
+        mesh_obj->nPatches = 1;
+        mesh_obj->patches_explicitly_set = true;
+        mesh_obj->m_patch_locations.clear();
+        mesh_obj->patch_locations_explicitly_set = false;
+        if (mesh_obj->materials.size() != 1 && !mesh_obj->materials.empty()) {
+            auto mat = mesh_obj->materials[0];
+            mesh_obj->materials.assign(1, mat);
+            DEME_WARNING("Mesh provided multiple patch materials; single-patch mode keeps only the first material.");
         }
 
         // Populate patch owner and material arrays (one entry per patch in this mesh)
         // Note patch_id in a mesh is always 0-based, and contiguous
         std::vector<materialsOffset_t> patch_materials(mesh_obj->GetNumPatches());
-        for (size_t facet_idx = 0; facet_idx < mesh_obj->GetNumPatches(); facet_idx++) {
+        for (size_t facet_idx = 0; facet_idx < mesh_obj->GetNumTriangles(); facet_idx++) {
             // patch_id is per-triangle
             bodyID_t patch_id = mesh_obj->m_patch_ids.at(facet_idx);
             // Assign this facet's material to its patch (will overwrite for each facet, but they should be consistent
@@ -886,6 +1035,11 @@ void DEMSolver::preprocessTriangleObjs() {
                 }
             }
             m_mesh_facets.push_back(tri);
+
+            const auto& nb = local_neighbors[i];
+            m_mesh_facet_neighbor1.push_back(nb[0] == NULL_BODYID ? NULL_BODYID : nb[0] + tri_offset);
+            m_mesh_facet_neighbor2.push_back(nb[1] == NULL_BODYID ? NULL_BODYID : nb[1] + tri_offset);
+            m_mesh_facet_neighbor3.push_back(nb[2] == NULL_BODYID ? NULL_BODYID : nb[2] + tri_offset);
         }
         thisLoadPatchCount += mesh_obj->GetNumPatches();
 
@@ -1345,8 +1499,10 @@ void DEMSolver::initializeGPUArrays() {
         // Analytical objects' initial stats
         m_input_ext_obj_xyz, m_input_ext_obj_rot, m_input_ext_obj_family,
         // Meshed objects' initial stats
-        cached_mesh_objs, m_input_mesh_obj_xyz, m_input_mesh_obj_rot, m_input_mesh_obj_family, m_mesh_facet_owner,
-        m_mesh_facet_patch, m_mesh_facets, m_mesh_patch_owner, m_mesh_patch_materials,
+        cached_mesh_objs, m_input_mesh_obj_xyz, m_input_mesh_obj_rot, m_input_mesh_obj_family,
+        m_input_mesh_obj_convex, m_input_mesh_obj_never_winner, m_mesh_facet_owner, m_mesh_facet_patch,
+        m_mesh_facet_neighbor1, m_mesh_facet_neighbor2, m_mesh_facet_neighbor3, m_mesh_facets, m_mesh_patch_owner,
+        m_mesh_patch_materials,
         // Clump template name mapping
         m_template_number_name_map,
         // Clump template info (mass, sphere components, materials etc.)
@@ -1368,7 +1524,8 @@ void DEMSolver::initializeGPUArrays() {
         // Analytical objects' initial stats
         m_input_ext_obj_family,
         // Meshed objects' initial stats
-        m_input_mesh_obj_family, m_mesh_facet_owner, m_mesh_facet_patch, m_mesh_facets,
+        m_input_mesh_obj_family, m_input_mesh_obj_convex, m_input_mesh_obj_never_winner, m_mesh_facet_owner,
+        m_mesh_facet_patch, m_mesh_facet_neighbor1, m_mesh_facet_neighbor2, m_mesh_facet_neighbor3, m_mesh_facets,
         // Analytical obj physics properties
         m_ext_obj_comp_num,
         // Family mask
@@ -1398,8 +1555,10 @@ void DEMSolver::updateClumpMeshArrays(size_t nOwners,
         // Analytical objects' initial stats
         m_input_ext_obj_xyz, m_input_ext_obj_rot, m_input_ext_obj_family,
         // Meshed objects' initial stats
-        cached_mesh_objs, m_input_mesh_obj_xyz, m_input_mesh_obj_rot, m_input_mesh_obj_family, m_mesh_facet_owner,
-        m_mesh_facet_patch, m_mesh_facets, m_mesh_patch_owner, m_mesh_patch_materials,
+        cached_mesh_objs, m_input_mesh_obj_xyz, m_input_mesh_obj_rot, m_input_mesh_obj_family,
+        m_input_mesh_obj_convex, m_input_mesh_obj_never_winner, m_mesh_facet_owner, m_mesh_facet_patch,
+        m_mesh_facet_neighbor1, m_mesh_facet_neighbor2, m_mesh_facet_neighbor3, m_mesh_facets, m_mesh_patch_owner,
+        m_mesh_patch_materials,
         // Clump template info (mass, sphere components, materials etc.)
         flattened_clump_templates,
         // Analytical obj physics properties
@@ -1420,7 +1579,8 @@ void DEMSolver::updateClumpMeshArrays(size_t nOwners,
         // Analytical objects' initial stats
         m_input_ext_obj_family,
         // Meshed objects' initial stats
-        m_input_mesh_obj_family, m_mesh_facet_owner, m_mesh_facet_patch, m_mesh_facets,
+        m_input_mesh_obj_family, m_input_mesh_obj_convex, m_input_mesh_obj_never_winner, m_mesh_facet_owner,
+        m_mesh_facet_patch, m_mesh_facet_neighbor1, m_mesh_facet_neighbor2, m_mesh_facet_neighbor3, m_mesh_facets,
         // Analytical obj physics properties
         m_ext_obj_comp_num,
         // Family mask
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index de4d0b8c..e79e8ee0 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -2389,6 +2389,8 @@ void DEMSolver::ReleaseFlattenedArrays() {
     deallocate_array(m_input_mesh_obj_xyz);
     deallocate_array(m_input_mesh_obj_rot);
     deallocate_array(m_input_mesh_obj_family);
+    deallocate_array(m_input_mesh_obj_convex);
+    deallocate_array(m_input_mesh_obj_never_winner);
 
     deallocate_array(m_unique_family_prescription);
     deallocate_array(m_input_clump_family);
@@ -2404,6 +2406,9 @@ void DEMSolver::ReleaseFlattenedArrays() {
 
     deallocate_array(m_mesh_facet_owner);
     deallocate_array(m_mesh_facet_patch);
+    deallocate_array(m_mesh_facet_neighbor1);
+    deallocate_array(m_mesh_facet_neighbor2);
+    deallocate_array(m_mesh_facet_neighbor3);
     deallocate_array(m_mesh_facets);
     deallocate_array(m_mesh_patch_owner);
     deallocate_array(m_mesh_patch_materials);
diff --git a/src/DEM/BdrsAndObjs.h b/src/DEM/BdrsAndObjs.h
index 68b2bfb0..6a848f5e 100644
--- a/src/DEM/BdrsAndObjs.h
+++ b/src/DEM/BdrsAndObjs.h
@@ -370,6 +370,10 @@ class DEMMesh : public DEMInitializer {
     // If true, when the mesh is initialized into the system, it will re-order the nodes of each triangle so that the
     // normals derived from right-hand-rule are the same as the normals in the mesh file
     bool use_mesh_normals = false;
+    // If true, this mesh is treated as convex for contact island reduction.
+    bool is_convex = false;
+    // If true, this mesh is never selected as the winner side for island labeling.
+    bool never_winner = false;
 
     DEMMesh() { obj_type = OWNER_TYPE::MESH; }
     DEMMesh(std::string input_file) {
@@ -407,6 +411,14 @@ class DEMMesh : public DEMInitializer {
     /// Instruct that when the mesh is initialized into the system, it will re-order the nodes of each triangle so that
     /// the normals derived from right-hand-rule are the same as the normals in the mesh file
     void UseNormals(bool use = true) { use_mesh_normals = use; }
+    /// Mark this mesh as convex for contact reduction purposes.
+    void SetConvex(bool convex = true) { is_convex = convex; }
+    /// Query whether this mesh is marked convex.
+    bool IsConvex() const { return is_convex; }
+    /// Prevent this mesh from ever being chosen as the winner side in island labeling.
+    void SetNeverWinner(bool never = true) { never_winner = never; }
+    /// Query whether this mesh is marked as never-winner.
+    bool IsNeverWinner() const { return never_winner; }
 
     /// Access the n-th triangle in mesh
     DEMTriangle GetTriangle(size_t index) const {  // No need to wrap (for Shlok)
diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index d8dfe9f2..a9c7f799 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -363,6 +363,7 @@ struct DEMDataDT {
     bodyID_t* idPatchA;
     bodyID_t* idPatchB;
     contact_t* contactTypePatch;
+    bodyID_t* contactPatchIsland;
     contactPairs_t* contactMapping;
 
     // Family mask
@@ -388,7 +389,12 @@ struct DEMDataDT {
     bodyID_t* ownerTriMesh;
     bodyID_t* ownerPatchMesh;
     bodyID_t* ownerAnalBody;
+    notStupidBool_t* ownerMeshConvex;
+    notStupidBool_t* ownerMeshNeverWinner;
     bodyID_t* triPatchID;
+    bodyID_t* triNeighbor1;
+    bodyID_t* triNeighbor2;
+    bodyID_t* triNeighbor3;
     float3* relPosNode1;
     float3* relPosNode2;
     float3* relPosNode3;
@@ -464,7 +470,12 @@ struct DEMDataKT {
     clumpComponentOffsetExt_t* clumpComponentOffsetExt;
     bodyID_t* ownerTriMesh;
     bodyID_t* ownerAnalBody;
+    notStupidBool_t* ownerMeshConvex;
+    notStupidBool_t* ownerMeshNeverWinner;
     bodyID_t* triPatchID;
+    bodyID_t* triNeighbor1;
+    bodyID_t* triNeighbor2;
+    bodyID_t* triNeighbor3;
     float3* relPosNode1;
     float3* relPosNode2;
     float3* relPosNode3;
@@ -486,6 +497,8 @@ struct DEMDataKT {
     bodyID_t* previous_idPatchB;
     contact_t* contactTypePatch;
     contact_t* previous_contactTypePatch;
+    bodyID_t* contactPatchIsland;
+    bodyID_t* previous_contactPatchIsland;
     contactPairs_t* geomToPatchMap;
 
     // data pointers that is kT's transfer destination
@@ -500,6 +513,7 @@ struct DEMDataKT {
     bodyID_t* pDTOwnedBuffer_idPatchA = nullptr;
     bodyID_t* pDTOwnedBuffer_idPatchB = nullptr;
     contact_t* pDTOwnedBuffer_contactTypePatch = nullptr;
+    bodyID_t* pDTOwnedBuffer_contactPatchIsland = nullptr;
     contactPairs_t* pDTOwnedBuffer_geomToPatchMap = nullptr;
 
     // The collection of pointers to DEM template arrays such as radiiSphere, still useful when there are template info
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 4163ee8b..9ad0a201 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -82,6 +82,7 @@ void DEMDynamicThread::packDataPointers() {
     idPatchA.bindDevicePointer(&(granData->idPatchA));
     idPatchB.bindDevicePointer(&(granData->idPatchB));
     contactTypePatch.bindDevicePointer(&(granData->contactTypePatch));
+    contactPatchIsland.bindDevicePointer(&(granData->contactPatchIsland));
 
     familyMaskMatrix.bindDevicePointer(&(granData->familyMasks));
     familyExtraMarginSize.bindDevicePointer(&(granData->familyExtraMarginSize));
@@ -115,8 +116,13 @@ void DEMDynamicThread::packDataPointers() {
 
     // Mesh and analytical-related
     ownerTriMesh.bindDevicePointer(&(granData->ownerTriMesh));
+    ownerMeshConvex.bindDevicePointer(&(granData->ownerMeshConvex));
+    ownerMeshNeverWinner.bindDevicePointer(&(granData->ownerMeshNeverWinner));
     ownerPatchMesh.bindDevicePointer(&(granData->ownerPatchMesh));
     triPatchID.bindDevicePointer(&(granData->triPatchID));
+    triNeighbor1.bindDevicePointer(&(granData->triNeighbor1));
+    triNeighbor2.bindDevicePointer(&(granData->triNeighbor2));
+    triNeighbor3.bindDevicePointer(&(granData->triNeighbor3));
     ownerAnalBody.bindDevicePointer(&(granData->ownerAnalBody));
     relPosNode1.bindDevicePointer(&(granData->relPosNode1));
     relPosNode2.bindDevicePointer(&(granData->relPosNode2));
@@ -245,6 +251,7 @@ void DEMDynamicThread::migrateDataToDevice() {
     contactTypePatch.toDeviceAsync(streamInfo.stream);
     idPatchA.toDeviceAsync(streamInfo.stream);
     idPatchB.toDeviceAsync(streamInfo.stream);
+    contactPatchIsland.toDeviceAsync(streamInfo.stream);
 
     familyMaskMatrix.toDeviceAsync(streamInfo.stream);
     familyExtraMarginSize.toDeviceAsync(streamInfo.stream);
@@ -273,8 +280,13 @@ void DEMDynamicThread::migrateDataToDevice() {
     volumeOwnerBody.toDeviceAsync(streamInfo.stream);
 
     ownerTriMesh.toDeviceAsync(streamInfo.stream);
+    ownerMeshConvex.toDeviceAsync(streamInfo.stream);
+    ownerMeshNeverWinner.toDeviceAsync(streamInfo.stream);
     ownerPatchMesh.toDeviceAsync(streamInfo.stream);
     triPatchID.toDeviceAsync(streamInfo.stream);
+    triNeighbor1.toDeviceAsync(streamInfo.stream);
+    triNeighbor2.toDeviceAsync(streamInfo.stream);
+    triNeighbor3.toDeviceAsync(streamInfo.stream);
     ownerAnalBody.toDeviceAsync(streamInfo.stream);
     relPosNode1.toDeviceAsync(streamInfo.stream);
     relPosNode2.toDeviceAsync(streamInfo.stream);
@@ -343,6 +355,7 @@ void DEMDynamicThread::migrateContactInfoToHost() {
     contactTypePatch.toHost();
     idPatchA.toHost();
     idPatchB.toHost();
+    contactPatchIsland.toHost();
 
     // Contact results
     contactForces.toHost();
@@ -599,6 +612,8 @@ void DEMDynamicThread::allocateGPUArrays(size_t nOwnerBodies,
     DEME_DUAL_ARRAY_RESIZE(alphaZ, nOwnerBodies, 0);
     DEME_DUAL_ARRAY_RESIZE(accSpecified, nOwnerBodies, 0);
     DEME_DUAL_ARRAY_RESIZE(angAccSpecified, nOwnerBodies, 0);
+    DEME_DUAL_ARRAY_RESIZE(ownerMeshConvex, nOwnerBodies, 0);
+    DEME_DUAL_ARRAY_RESIZE(ownerMeshNeverWinner, nOwnerBodies, 0);
 
     // Resize the family mask `matrix' (in fact it is flattened)
     DEME_DUAL_ARRAY_RESIZE(familyMaskMatrix, (NUM_AVAL_FAMILIES + 1) * NUM_AVAL_FAMILIES / 2, DONT_PREVENT_CONTACT);
@@ -630,6 +645,9 @@ void DEMDynamicThread::allocateGPUArrays(size_t nOwnerBodies,
     DEME_DUAL_ARRAY_RESIZE(relPosNode2, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode3, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(triPatchID, nTriGM, 0);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriGM, NULL_BODYID);
 
     // Resize to the number of mesh patches
     DEME_DUAL_ARRAY_RESIZE(ownerPatchMesh, nMeshPatches, 0);
@@ -797,8 +815,13 @@ void DEMDynamicThread::populateEntityArrays(const std::vector<std::shared_ptr<DE
                                             const std::vector<float3>& input_mesh_obj_xyz,
                                             const std::vector<float4>& input_mesh_obj_rot,
                                             const std::vector<unsigned int>& input_mesh_obj_family,
+                                            const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                            const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                             const std::vector<unsigned int>& mesh_facet_owner,
                                             const std::vector<bodyID_t>& mesh_facet_patch,
+                                            const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                                            const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                                            const std::vector<bodyID_t>& mesh_facet_neighbor3,
                                             const std::vector<DEMTriangle>& mesh_facets,
                                             const std::vector<bodyID_t>& mesh_patch_owner,
                                             const std::vector<materialsOffset_t>& mesh_patch_materials,
@@ -1192,14 +1215,20 @@ void DEMDynamicThread::populateEntityArrays(const std::vector<std::shared_ptr<DE
             ownerTriMesh[nExistingFacets + k] = owner_offset_for_mesh_obj + this_facet_owner;
             // Tri's patch belonging needs to take into account those patches that are previously added
             triPatchID[nExistingFacets + k] = nExistingMeshPatches + mesh_facet_patch.at(k);
+            triNeighbor1[nExistingFacets + k] = mesh_facet_neighbor1.at(k);
+            triNeighbor2[nExistingFacets + k] = mesh_facet_neighbor2.at(k);
+            triNeighbor3[nExistingFacets + k] = mesh_facet_neighbor3.at(k);
             DEMTriangle this_tri = mesh_facets.at(k);
             relPosNode1[nExistingFacets + k] = this_tri.p1;
             relPosNode2[nExistingFacets + k] = this_tri.p2;
             relPosNode3[nExistingFacets + k] = this_tri.p3;
         }
 
+        const bodyID_t owner_id = owner_offset_for_mesh_obj + i;
         family_t this_family_num = input_mesh_obj_family.at(i);
-        familyID[i + owner_offset_for_mesh_obj] = this_family_num;
+        familyID[owner_id] = this_family_num;
+        ownerMeshConvex[owner_id] = input_mesh_obj_convex.at(i);
+        ownerMeshNeverWinner[owner_id] = input_mesh_obj_never_winner.at(i);
 
         // Cached initial values for wildcards of this mesh is not needed anymore
         m_meshes.back()->ClearWildcards();
@@ -1289,8 +1318,13 @@ void DEMDynamicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClumpB
                                      const std::vector<float3>& input_mesh_obj_xyz,
                                      const std::vector<float4>& input_mesh_obj_rot,
                                      const std::vector<unsigned int>& input_mesh_obj_family,
+                                     const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                     const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                      const std::vector<unsigned int>& mesh_facet_owner,
                                      const std::vector<bodyID_t>& mesh_facet_patch,
+                                     const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                                     const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                                     const std::vector<bodyID_t>& mesh_facet_neighbor3,
                                      const std::vector<DEMTriangle>& mesh_facets,
                                      const std::vector<bodyID_t>& mesh_patch_owner,
                                      const std::vector<materialsOffset_t>& mesh_patch_materials,
@@ -1318,9 +1352,10 @@ void DEMDynamicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClumpB
     // For initialization, owner array offset is 0
     populateEntityArrays(input_clump_batches, input_ext_obj_xyz, input_ext_obj_rot, input_ext_obj_family,
                          input_mesh_objs, input_mesh_obj_xyz, input_mesh_obj_rot, input_mesh_obj_family,
-                         mesh_facet_owner, mesh_facet_patch, mesh_facets, mesh_patch_owner, mesh_patch_materials,
-                         clump_templates, ext_obj_mass_types, ext_obj_moi_types, ext_obj_comp_num, mesh_obj_mass_types,
-                         mesh_obj_moi_types, mesh_obj_mass_offsets, 0, 0, 0, 0);
+                         input_mesh_obj_convex, input_mesh_obj_never_winner, mesh_facet_owner, mesh_facet_patch,
+                         mesh_facet_neighbor1, mesh_facet_neighbor2, mesh_facet_neighbor3, mesh_facets,
+                         mesh_patch_owner, mesh_patch_materials, clump_templates, ext_obj_mass_types, ext_obj_moi_types,
+                         ext_obj_comp_num, mesh_obj_mass_types, mesh_obj_moi_types, mesh_obj_mass_offsets, 0, 0, 0, 0);
 
     buildTrackedObjs(input_clump_batches, ext_obj_comp_num, input_mesh_objs, tracked_objs, 0, 0, 0, 0);
 }
@@ -1333,8 +1368,13 @@ void DEMDynamicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<D
                                              const std::vector<float3>& input_mesh_obj_xyz,
                                              const std::vector<float4>& input_mesh_obj_rot,
                                              const std::vector<unsigned int>& input_mesh_obj_family,
+                                             const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                             const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                              const std::vector<unsigned int>& mesh_facet_owner,
                                              const std::vector<bodyID_t>& mesh_facet_patch,
+                                             const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                                             const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                                             const std::vector<bodyID_t>& mesh_facet_neighbor3,
                                              const std::vector<DEMTriangle>& mesh_facets,
                                              const std::vector<bodyID_t>& mesh_patch_owner,
                                              const std::vector<materialsOffset_t>& mesh_patch_materials,
@@ -1366,10 +1406,11 @@ void DEMDynamicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<D
     // Analytical objects-related arrays should be empty
     populateEntityArrays(input_clump_batches, input_ext_obj_xyz, input_ext_obj_rot, input_ext_obj_family,
                          input_mesh_objs, input_mesh_obj_xyz, input_mesh_obj_rot, input_mesh_obj_family,
-                         mesh_facet_owner, mesh_facet_patch, mesh_facets, mesh_patch_owner, mesh_patch_materials,
-                         clump_templates, ext_obj_mass_types, ext_obj_moi_types, ext_obj_comp_num, mesh_obj_mass_types,
-                         mesh_obj_moi_types, mesh_obj_mass_offsets, nExistingOwners, nExistingSpheres, nExistingFacets,
-                         nExistingPatches);
+                         input_mesh_obj_convex, input_mesh_obj_never_winner, mesh_facet_owner, mesh_facet_patch,
+                         mesh_facet_neighbor1, mesh_facet_neighbor2, mesh_facet_neighbor3, mesh_facets,
+                         mesh_patch_owner, mesh_patch_materials, clump_templates, ext_obj_mass_types, ext_obj_moi_types,
+                         ext_obj_comp_num, mesh_obj_mass_types, mesh_obj_moi_types, mesh_obj_mass_offsets,
+                         nExistingOwners, nExistingSpheres, nExistingFacets, nExistingPatches);
 
     // Make changes to tracked objects (potentially add more)
     buildTrackedObjs(input_clump_batches, ext_obj_comp_num, input_mesh_objs, tracked_objs, nExistingOwners,
@@ -2486,6 +2527,7 @@ inline void DEMDynamicThread::contactPatchArrayResize(size_t nPatchPairs) {
     DEME_DUAL_ARRAY_RESIZE(idPatchA, nPatchPairs, 0);
     DEME_DUAL_ARRAY_RESIZE(idPatchB, nPatchPairs, 0);
     DEME_DUAL_ARRAY_RESIZE(contactTypePatch, nPatchPairs, NOT_A_CONTACT);
+    DEME_DUAL_ARRAY_RESIZE(contactPatchIsland, nPatchPairs, NULL_BODYID);
 
     // Re-packing pointers to device now is automatic
     // Sync pointers to device can be delayed... we'll only need to do that before kernel calls
@@ -2551,6 +2593,7 @@ inline void DEMDynamicThread::unpackMyBuffer() {
         swapped = swap_device_buffer(idPatchA, idPatchA_buffer[read_idx]) && swapped;
         swapped = swap_device_buffer(idPatchB, idPatchB_buffer[read_idx]) && swapped;
         swapped = swap_device_buffer(contactTypePatch, contactTypePatch_buffer[read_idx]) && swapped;
+        swapped = swap_device_buffer(contactPatchIsland, contactPatchIsland_buffer[read_idx]) && swapped;
     }
 #endif
     xfer::XferList xu;
@@ -2565,6 +2608,7 @@ inline void DEMDynamicThread::unpackMyBuffer() {
         xu.add(granData->idPatchA, idPatchA_buffer[read_idx].data(), nPatch * sizeof(bodyID_t));
         xu.add(granData->idPatchB, idPatchB_buffer[read_idx].data(), nPatch * sizeof(bodyID_t));
         xu.add(granData->contactTypePatch, contactTypePatch_buffer[read_idx].data(), nPatch * sizeof(contact_t));
+        xu.add(granData->contactPatchIsland, contactPatchIsland_buffer[read_idx].data(), nPatch * sizeof(bodyID_t));
     }
 
     if (!solverFlags.isHistoryless) {
@@ -2593,6 +2637,7 @@ inline void DEMDynamicThread::unpackMyBuffer() {
         kT->granData->pDTOwnedBuffer_idPatchA = idPatchA_buffer[kt_write_buf].data();
         kT->granData->pDTOwnedBuffer_idPatchB = idPatchB_buffer[kt_write_buf].data();
         kT->granData->pDTOwnedBuffer_contactTypePatch = contactTypePatch_buffer[kt_write_buf].data();
+        kT->granData->pDTOwnedBuffer_contactPatchIsland = contactPatchIsland_buffer[kt_write_buf].data();
         if (!solverFlags.isHistoryless) {
             kT->granData->pDTOwnedBuffer_contactMapping = contactMapping_buffer[kt_write_buf].data();
         }
@@ -3185,21 +3230,31 @@ inline void DEMDynamicThread::unpack_impl() {
     //     entry.second.second);
     // }
 
-    // Now for patch-based contacts, we do the same thing. Note the unique types herein will be the same as thosein.
-    cubRunLengthEncode<contact_t, contactPairs_t>(granData->contactTypePatch, existingContactTypes.device(), typeCounts,
+    // Same for patch-based contacts, but unique types go to a scratch vector to keep the primitive type list intact.
+    contact_t* patchTypesDevice = (contact_t*)solverScratchSpace.allocateTempVector(
+        "patchContactTypes", (NUM_SUPPORTED_CONTACT_TYPES + 1) * sizeof(contact_t));
+    cubRunLengthEncode<contact_t, contactPairs_t>(granData->contactTypePatch, patchTypesDevice, typeCounts,
                                                   solverScratchSpace.getDualStructDevice("numExistingTypes"),
                                                   *solverScratchSpace.numContacts, streamInfo.stream,
                                                   solverScratchSpace);
-    cubPrefixScan<contactPairs_t, contactPairs_t>(typeCounts, typeStartOffsetsPatch.device(), m_numExistingTypes,
+    solverScratchSpace.syncDualStructDeviceToHost("numExistingTypes");
+    size_t numPatchTypes = *solverScratchSpace.getDualStructHost("numExistingTypes");
+    cubPrefixScan<contactPairs_t, contactPairs_t>(typeCounts, typeStartOffsetsPatch.device(), numPatchTypes,
                                                   streamInfo.stream, solverScratchSpace);
     typeStartOffsetsPatch.toHost();
+    std::vector<contact_t> patchTypesHost(numPatchTypes);
+    if (numPatchTypes > 0) {
+        DEME_GPU_CALL(cudaMemcpy(patchTypesHost.data(), patchTypesDevice, numPatchTypes * sizeof(contact_t),
+                                 cudaMemcpyDeviceToHost));
+    }
     typeStartCountPatchMap.SetAll({0, 0});
-    for (size_t i = 0; i < m_numExistingTypes; i++) {
-        typeStartCountPatchMap[existingContactTypes[i]] = std::make_pair(
-            typeStartOffsetsPatch[i], (i + 1 < m_numExistingTypes ? typeStartOffsetsPatch[i + 1]
-                                                                  : (contactPairs_t)*solverScratchSpace.numContacts) -
+    for (size_t i = 0; i < numPatchTypes; i++) {
+        typeStartCountPatchMap[patchTypesHost[i]] = std::make_pair(
+            typeStartOffsetsPatch[i], (i + 1 < numPatchTypes ? typeStartOffsetsPatch[i + 1]
+                                                             : (contactPairs_t)*solverScratchSpace.numContacts) -
                                           typeStartOffsetsPatch[i]);
     }
+    solverScratchSpace.finishUsingTempVector("patchContactTypes");
 
     solverScratchSpace.finishUsingTempVector("typeCounts");
     solverScratchSpace.finishUsingDualStruct("numExistingTypes");
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index e3ffbe50..4406b56b 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -263,6 +263,8 @@ class DEMDynamicThread {
                                                 DeviceArray<bodyID_t>(&m_approxDeviceBytesUsed)};
     DeviceArray<contact_t> contactTypePatch_buffer[2] = {DeviceArray<contact_t>(&m_approxDeviceBytesUsed),
                                                          DeviceArray<contact_t>(&m_approxDeviceBytesUsed)};
+    DeviceArray<bodyID_t> contactPatchIsland_buffer[2] = {DeviceArray<bodyID_t>(&m_approxDeviceBytesUsed),
+                                                          DeviceArray<bodyID_t>(&m_approxDeviceBytesUsed)};
     DeviceArray<contactPairs_t> geomToPatchMap_buffer[2] = {DeviceArray<contactPairs_t>(&m_approxDeviceBytesUsed),
                                                             DeviceArray<contactPairs_t>(&m_approxDeviceBytesUsed)};
     DeviceArray<contactPairs_t> contactMapping_buffer[2] = {DeviceArray<contactPairs_t>(&m_approxDeviceBytesUsed),
@@ -396,6 +398,7 @@ class DEMDynamicThread {
     DualArray<bodyID_t> idPatchA = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> idPatchB = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<contact_t> contactTypePatch = DualArray<contact_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> contactPatchIsland = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<contactPairs_t> geomToPatchMap =
         DualArray<contactPairs_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
 
@@ -456,9 +459,18 @@ class DEMDynamicThread {
     DualArray<bodyID_t> ownerClumpBody = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> ownerTriMesh = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> ownerAnalBody = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    // Mesh owner flags (indexed by owner body ID)
+    DualArray<notStupidBool_t> ownerMeshConvex =
+        DualArray<notStupidBool_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<notStupidBool_t> ownerMeshNeverWinner =
+        DualArray<notStupidBool_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     // Mesh patch information: each facet belongs to a patch, and each patch has material properties
     // Patch ID for each triangle facet (maps facet to patch)
     DualArray<bodyID_t> triPatchID = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    // Triangle edge neighbors (global triangle indices; NULL_BODYID for boundary)
+    DualArray<bodyID_t> triNeighbor1 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> triNeighbor2 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> triNeighbor3 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     // Mesh patch owner IDs (one per patch, flattened across all meshes)
     DualArray<bodyID_t> ownerPatchMesh = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
 
@@ -749,8 +761,13 @@ class DEMDynamicThread {
                               const std::vector<float3>& input_mesh_obj_xyz,
                               const std::vector<float4>& input_mesh_obj_rot,
                               const std::vector<unsigned int>& input_mesh_obj_family,
+                              const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                              const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                               const std::vector<unsigned int>& mesh_facet_owner,
                               const std::vector<bodyID_t>& mesh_facet_patch,
+                              const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                              const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                              const std::vector<bodyID_t>& mesh_facet_neighbor3,
                               const std::vector<DEMTriangle>& mesh_facets,
                               const std::vector<bodyID_t>& mesh_patch_owner,
                               const std::vector<materialsOffset_t>& mesh_patch_materials,
@@ -784,8 +801,13 @@ class DEMDynamicThread {
                        const std::vector<float3>& input_mesh_obj_xyz,
                        const std::vector<float4>& input_mesh_obj_rot,
                        const std::vector<unsigned int>& input_mesh_obj_family,
+                       const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                       const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                        const std::vector<unsigned int>& mesh_facet_owner,
                        const std::vector<bodyID_t>& mesh_facet_patch,
+                       const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                       const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                       const std::vector<bodyID_t>& mesh_facet_neighbor3,
                        const std::vector<DEMTriangle>& mesh_facets,
                        const std::vector<bodyID_t>& mesh_patch_owner,
                        const std::vector<materialsOffset_t>& mesh_patch_materials,
@@ -814,8 +836,13 @@ class DEMDynamicThread {
                                const std::vector<float3>& input_mesh_obj_xyz,
                                const std::vector<float4>& input_mesh_obj_rot,
                                const std::vector<unsigned int>& input_mesh_obj_family,
+                               const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                               const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                const std::vector<unsigned int>& mesh_facet_owner,
                                const std::vector<bodyID_t>& mesh_facet_patch,
+                               const std::vector<bodyID_t>& mesh_facet_neighbor1,
+                               const std::vector<bodyID_t>& mesh_facet_neighbor2,
+                               const std::vector<bodyID_t>& mesh_facet_neighbor3,
                                const std::vector<DEMTriangle>& mesh_facets,
                                const std::vector<bodyID_t>& mesh_patch_owner,
                                const std::vector<materialsOffset_t>& mesh_patch_materials,
diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp
index ab8c824b..e7c5eb03 100644
--- a/src/DEM/kT.cpp
+++ b/src/DEM/kT.cpp
@@ -80,6 +80,7 @@ inline void DEMKinematicThread::transferPatchArrayResize(int buffer_idx, size_t
     DEME_DEVICE_ARRAY_RESIZE(dT->idPatchA_buffer[buffer_idx], nContactPairs);
     DEME_DEVICE_ARRAY_RESIZE(dT->idPatchB_buffer[buffer_idx], nContactPairs);
     DEME_DEVICE_ARRAY_RESIZE(dT->contactTypePatch_buffer[buffer_idx], nContactPairs);
+    DEME_DEVICE_ARRAY_RESIZE(dT->contactPatchIsland_buffer[buffer_idx], nContactPairs);
     if (!solverFlags.isHistoryless) {
         DEME_DEVICE_ARRAY_RESIZE(dT->contactMapping_buffer[buffer_idx], nContactPairs);
         granData->pDTOwnedBuffer_contactMapping = dT->contactMapping_buffer[buffer_idx].data();
@@ -87,6 +88,7 @@ inline void DEMKinematicThread::transferPatchArrayResize(int buffer_idx, size_t
     granData->pDTOwnedBuffer_idPatchA = dT->idPatchA_buffer[buffer_idx].data();
     granData->pDTOwnedBuffer_idPatchB = dT->idPatchB_buffer[buffer_idx].data();
     granData->pDTOwnedBuffer_contactTypePatch = dT->contactTypePatch_buffer[buffer_idx].data();
+    granData->pDTOwnedBuffer_contactPatchIsland = dT->contactPatchIsland_buffer[buffer_idx].data();
 
     // Unset the device change we just made
     DEME_GPU_CALL(cudaSetDevice(streamInfo.device));
@@ -378,6 +380,8 @@ inline void DEMKinematicThread::sendToTheirBuffer() {
         resize_patch = DEME_MAX(resize_patch, idPatchA.size());
         resize_patch = DEME_MAX(resize_patch, idPatchB.size());
         resize_patch = DEME_MAX(resize_patch, contactTypePatch.size());
+        // Keep patch-side buffers in lockstep; missing one can corrupt swap/copies and crash kernels.
+        resize_patch = DEME_MAX(resize_patch, contactPatchIsland.size());
         if (!solverFlags.isHistoryless) {
             resize_patch = DEME_MAX(resize_patch, contactMapping.size());
         }
@@ -389,7 +393,8 @@ inline void DEMKinematicThread::sendToTheirBuffer() {
                             resize_prim > dT->geomToPatchMap_buffer[write_idx].size();
     bool need_resize_patch = resize_patch > dT->idPatchA_buffer[write_idx].size() ||
                              resize_patch > dT->idPatchB_buffer[write_idx].size() ||
-                             resize_patch > dT->contactTypePatch_buffer[write_idx].size();
+                             resize_patch > dT->contactTypePatch_buffer[write_idx].size() ||
+                             resize_patch > dT->contactPatchIsland_buffer[write_idx].size();
     if (!solverFlags.isHistoryless) {
         need_resize_patch = need_resize_patch || (resize_patch > dT->contactMapping_buffer[write_idx].size());
     }
@@ -411,6 +416,8 @@ inline void DEMKinematicThread::sendToTheirBuffer() {
         output_swapped = swap_device_buffer(idPatchA, dT->idPatchA_buffer[write_idx]) && output_swapped;
         output_swapped = swap_device_buffer(idPatchB, dT->idPatchB_buffer[write_idx]) && output_swapped;
         output_swapped = swap_device_buffer(contactTypePatch, dT->contactTypePatch_buffer[write_idx]) && output_swapped;
+        output_swapped =
+            swap_device_buffer(contactPatchIsland, dT->contactPatchIsland_buffer[write_idx]) && output_swapped;
         if (!solverFlags.isHistoryless) {
             output_swapped = swap_device_buffer(contactMapping, dT->contactMapping_buffer[write_idx]) && output_swapped;
         }
@@ -424,6 +431,7 @@ inline void DEMKinematicThread::sendToTheirBuffer() {
     granData->pDTOwnedBuffer_idPatchA = dT->idPatchA_buffer[write_idx].data();
     granData->pDTOwnedBuffer_idPatchB = dT->idPatchB_buffer[write_idx].data();
     granData->pDTOwnedBuffer_contactTypePatch = dT->contactTypePatch_buffer[write_idx].data();
+    granData->pDTOwnedBuffer_contactPatchIsland = dT->contactPatchIsland_buffer[write_idx].data();
     if (!solverFlags.isHistoryless) {
         granData->pDTOwnedBuffer_contactMapping = dT->contactMapping_buffer[write_idx].data();
     }
@@ -458,6 +466,8 @@ inline void DEMKinematicThread::sendToTheirBuffer() {
         xs.add(dT->idPatchA_buffer[write_idx].data(), granData->idPatchA, nPatch * sizeof(bodyID_t));
         xs.add(dT->idPatchB_buffer[write_idx].data(), granData->idPatchB, nPatch * sizeof(bodyID_t));
         xs.add(dT->contactTypePatch_buffer[write_idx].data(), granData->contactTypePatch, nPatch * sizeof(contact_t));
+        xs.add(dT->contactPatchIsland_buffer[write_idx].data(), granData->contactPatchIsland,
+               nPatch * sizeof(bodyID_t));
         if (!solverFlags.isHistoryless) {
             xs.add(dT->contactMapping_buffer[write_idx].data(), granData->contactMapping,
                    nPatch * sizeof(contactPairs_t));
@@ -536,8 +546,8 @@ void DEMKinematicThread::workerThread() {
                              contactTypePrimitive, previous_idPrimitiveA, previous_idPrimitiveB,
                              previous_contactTypePrimitive, contactPersistency, contactMapping, idPatchA, idPatchB,
                              previous_idPatchA, previous_idPatchB, contactTypePatch, previous_contactTypePatch,
-                             typeStartCountPatchMap, geomToPatchMap, streamInfo.stream, solverScratchSpace, timers,
-                             stateParams);
+                             contactPatchIsland, previous_contactPatchIsland, typeStartCountPatchMap, geomToPatchMap,
+                             streamInfo.stream, solverScratchSpace, timers, stateParams);
             CDAccumTimer.End();
 
             timers.GetTimer("Send to dT buffer").start();
@@ -703,6 +713,8 @@ void DEMKinematicThread::packDataPointers() {
     previous_idPatchB.bindDevicePointer(&(granData->previous_idPatchB));
     contactTypePatch.bindDevicePointer(&(granData->contactTypePatch));
     previous_contactTypePatch.bindDevicePointer(&(granData->previous_contactTypePatch));
+    contactPatchIsland.bindDevicePointer(&(granData->contactPatchIsland));
+    previous_contactPatchIsland.bindDevicePointer(&(granData->previous_contactPatchIsland));
     geomToPatchMap.bindDevicePointer(&(granData->geomToPatchMap));
 
     familyMaskMatrix.bindDevicePointer(&(granData->familyMasks));
@@ -716,7 +728,12 @@ void DEMKinematicThread::packDataPointers() {
 
     // Mesh-related
     ownerTriMesh.bindDevicePointer(&(granData->ownerTriMesh));
+    ownerMeshConvex.bindDevicePointer(&(granData->ownerMeshConvex));
+    ownerMeshNeverWinner.bindDevicePointer(&(granData->ownerMeshNeverWinner));
     triPatchID.bindDevicePointer(&(granData->triPatchID));
+    triNeighbor1.bindDevicePointer(&(granData->triNeighbor1));
+    triNeighbor2.bindDevicePointer(&(granData->triNeighbor2));
+    triNeighbor3.bindDevicePointer(&(granData->triNeighbor3));
     relPosNode1.bindDevicePointer(&(granData->relPosNode1));
     relPosNode2.bindDevicePointer(&(granData->relPosNode2));
     relPosNode3.bindDevicePointer(&(granData->relPosNode3));
@@ -754,6 +771,8 @@ void DEMKinematicThread::migrateDataToDevice() {
     previous_idPatchB.toDeviceAsync(streamInfo.stream);
     contactTypePatch.toDeviceAsync(streamInfo.stream);
     previous_contactTypePatch.toDeviceAsync(streamInfo.stream);
+    contactPatchIsland.toDeviceAsync(streamInfo.stream);
+    previous_contactPatchIsland.toDeviceAsync(streamInfo.stream);
     familyMaskMatrix.toDeviceAsync(streamInfo.stream);
     familyExtraMarginSize.toDeviceAsync(streamInfo.stream);
 
@@ -763,7 +782,12 @@ void DEMKinematicThread::migrateDataToDevice() {
     ownerAnalBody.toDeviceAsync(streamInfo.stream);
 
     ownerTriMesh.toDeviceAsync(streamInfo.stream);
+    ownerMeshConvex.toDeviceAsync(streamInfo.stream);
+    ownerMeshNeverWinner.toDeviceAsync(streamInfo.stream);
     triPatchID.toDeviceAsync(streamInfo.stream);
+    triNeighbor1.toDeviceAsync(streamInfo.stream);
+    triNeighbor2.toDeviceAsync(streamInfo.stream);
+    triNeighbor3.toDeviceAsync(streamInfo.stream);
     relPosNode1.toDeviceAsync(streamInfo.stream);
     relPosNode2.toDeviceAsync(streamInfo.stream);
     relPosNode3.toDeviceAsync(streamInfo.stream);
@@ -801,6 +825,7 @@ void DEMKinematicThread::packTransferPointers(DEMDynamicThread*& dT) {
     granData->pDTOwnedBuffer_idPatchA = dT->idPatchA_buffer[write_idx].data();
     granData->pDTOwnedBuffer_idPatchB = dT->idPatchB_buffer[write_idx].data();
     granData->pDTOwnedBuffer_contactTypePatch = dT->contactTypePatch_buffer[write_idx].data();
+    granData->pDTOwnedBuffer_contactPatchIsland = dT->contactPatchIsland_buffer[write_idx].data();
     granData->pDTOwnedBuffer_contactMapping = dT->contactMapping_buffer[write_idx].data();
 }
 
@@ -901,6 +926,8 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
     DEME_DUAL_ARRAY_RESIZE(oriQx, nOwnerBodies, 0);
     DEME_DUAL_ARRAY_RESIZE(oriQy, nOwnerBodies, 0);
     DEME_DUAL_ARRAY_RESIZE(oriQz, nOwnerBodies, 0);
+    DEME_DUAL_ARRAY_RESIZE(ownerMeshConvex, nOwnerBodies, 0);
+    DEME_DUAL_ARRAY_RESIZE(ownerMeshNeverWinner, nOwnerBodies, 0);
     DEME_DEVICE_ARRAY_RESIZE(marginSizeSphere, nSpheresGM);
     DEME_DEVICE_ARRAY_RESIZE(marginSizeAnalytical, nAnalGM);
     DEME_DEVICE_ARRAY_RESIZE(marginSizeTriangle, nTriGM);
@@ -948,6 +975,9 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
     // Resize to the number of triangle facets
     DEME_DUAL_ARRAY_RESIZE(ownerTriMesh, nTriGM, 0);
     DEME_DUAL_ARRAY_RESIZE(triPatchID, nTriGM, 0);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriGM, NULL_BODYID);
     DEME_DUAL_ARRAY_RESIZE(relPosNode1, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode2, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode3, nTriGM, make_float3(0));
@@ -986,6 +1016,7 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
         DEME_DUAL_ARRAY_RESIZE(idPatchA, cnt_arr_size, 0);
         DEME_DUAL_ARRAY_RESIZE(idPatchB, cnt_arr_size, 0);
         DEME_DUAL_ARRAY_RESIZE(contactTypePatch, cnt_arr_size, NOT_A_CONTACT);
+        DEME_DUAL_ARRAY_RESIZE(contactPatchIsland, cnt_arr_size, NULL_BODYID);
         DEME_DUAL_ARRAY_RESIZE(geomToPatchMap, cnt_arr_size, 0);
 
         if (!solverFlags.isHistoryless) {
@@ -997,6 +1028,7 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
             DEME_DUAL_ARRAY_RESIZE(previous_idPatchA, cnt_arr_size, 0);
             DEME_DUAL_ARRAY_RESIZE(previous_idPatchB, cnt_arr_size, 0);
             DEME_DUAL_ARRAY_RESIZE(previous_contactTypePatch, cnt_arr_size, NOT_A_CONTACT);
+            DEME_DUAL_ARRAY_RESIZE(previous_contactPatchIsland, cnt_arr_size, NULL_BODYID);
         }
     }
 }
@@ -1010,8 +1042,13 @@ void DEMKinematicThread::registerPolicies(const std::vector<notStupidBool_t>& fa
 void DEMKinematicThread::populateEntityArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                                               const std::vector<unsigned int>& input_ext_obj_family,
                                               const std::vector<unsigned int>& input_mesh_obj_family,
+                                              const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                              const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                               const std::vector<unsigned int>& input_mesh_facet_owner,
                                               const std::vector<bodyID_t>& input_mesh_facet_patch,
+                                              const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                                              const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                                              const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                                               const std::vector<DEMTriangle>& input_mesh_facets,
                                               const ClumpTemplateFlatten& clump_templates,
                                               const std::vector<unsigned int>& ext_obj_comp_num,
@@ -1129,14 +1166,20 @@ void DEMKinematicThread::populateEntityArrays(const std::vector<std::shared_ptr<
                 break;
             ownerTriMesh[nExistingFacets + k] = owner_offset_for_mesh_obj + this_facet_owner;
             triPatchID[nExistingFacets + k] = nExistingMeshPatches + input_mesh_facet_patch.at(k);
+            triNeighbor1[nExistingFacets + k] = input_mesh_facet_neighbor1.at(k);
+            triNeighbor2[nExistingFacets + k] = input_mesh_facet_neighbor2.at(k);
+            triNeighbor3[nExistingFacets + k] = input_mesh_facet_neighbor3.at(k);
             DEMTriangle this_tri = input_mesh_facets.at(k);
             relPosNode1[nExistingFacets + k] = this_tri.p1;
             relPosNode2[nExistingFacets + k] = this_tri.p2;
             relPosNode3[nExistingFacets + k] = this_tri.p3;
         }
 
+        const bodyID_t owner_id = owner_offset_for_mesh_obj + i;
         family_t this_family_num = input_mesh_obj_family.at(i);
-        familyID[i + owner_offset_for_mesh_obj] = this_family_num;
+        familyID[owner_id] = this_family_num;
+        ownerMeshConvex[owner_id] = input_mesh_obj_convex.at(i);
+        ownerMeshNeverWinner[owner_id] = input_mesh_obj_never_winner.at(i);
         // DEME_DEBUG_PRINTF("kT just loaded a mesh in family %u", +(this_family_num));
         // DEME_DEBUG_PRINTF("Number of triangle facets loaded thus far: %zu", k);
     }
@@ -1145,8 +1188,13 @@ void DEMKinematicThread::populateEntityArrays(const std::vector<std::shared_ptr<
 void DEMKinematicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                                        const std::vector<unsigned int>& input_ext_obj_family,
                                        const std::vector<unsigned int>& input_mesh_obj_family,
+                                       const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                       const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                        const std::vector<unsigned int>& input_mesh_facet_owner,
                                        const std::vector<bodyID_t>& input_mesh_facet_patch,
+                                       const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                                       const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                                       const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                                        const std::vector<DEMTriangle>& input_mesh_facets,
                                        const std::vector<unsigned int>& ext_obj_comp_num,
                                        const std::vector<notStupidBool_t>& family_mask_matrix,
@@ -1156,15 +1204,22 @@ void DEMKinematicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClum
 
     registerPolicies(family_mask_matrix);
 
-    populateEntityArrays(input_clump_batches, input_ext_obj_family, input_mesh_obj_family, input_mesh_facet_owner,
-                         input_mesh_facet_patch, input_mesh_facets, clump_templates, ext_obj_comp_num, 0, 0, 0, 0);
+    populateEntityArrays(input_clump_batches, input_ext_obj_family, input_mesh_obj_family, input_mesh_obj_convex,
+                         input_mesh_obj_never_winner, input_mesh_facet_owner, input_mesh_facet_patch,
+                         input_mesh_facet_neighbor1, input_mesh_facet_neighbor2, input_mesh_facet_neighbor3,
+                         input_mesh_facets, clump_templates, ext_obj_comp_num, 0, 0, 0, 0);
 }
 
 void DEMKinematicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                                                const std::vector<unsigned int>& input_ext_obj_family,
                                                const std::vector<unsigned int>& input_mesh_obj_family,
+                                               const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                                               const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                                const std::vector<unsigned int>& input_mesh_facet_owner,
                                                const std::vector<bodyID_t>& input_mesh_facet_patch,
+                                               const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                                               const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                                               const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                                                const std::vector<DEMTriangle>& input_mesh_facets,
                                                const std::vector<unsigned int>& ext_obj_comp_num,
                                                const std::vector<notStupidBool_t>& family_mask_matrix,
@@ -1177,9 +1232,11 @@ void DEMKinematicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr
                                                size_t nExistingPatches,
                                                unsigned int nExistingObj,
                                                unsigned int nExistingAnalGM) {
-    populateEntityArrays(input_clump_batches, input_ext_obj_family, input_mesh_obj_family, input_mesh_facet_owner,
-                         input_mesh_facet_patch, input_mesh_facets, clump_templates, ext_obj_comp_num, nExistingOwners,
-                         nExistingSpheres, nExistingFacets, nExistingPatches);
+    populateEntityArrays(input_clump_batches, input_ext_obj_family, input_mesh_obj_family, input_mesh_obj_convex,
+                         input_mesh_obj_never_winner, input_mesh_facet_owner, input_mesh_facet_patch,
+                         input_mesh_facet_neighbor1, input_mesh_facet_neighbor2, input_mesh_facet_neighbor3,
+                         input_mesh_facets, clump_templates, ext_obj_comp_num, nExistingOwners, nExistingSpheres,
+                         nExistingFacets, nExistingPatches);
 }
 
 void DEMKinematicThread::updatePrevContactArrays(DualStruct<DEMDataDT>& dT_data, size_t nContacts) {
@@ -1187,7 +1244,8 @@ void DEMKinematicThread::updatePrevContactArrays(DualStruct<DEMDataDT>& dT_data,
     // Note kT never had the responsibility to migrate contact info to host, even at Update, as even in this case
     // its host-side update comes from dT
     overwritePrevContactArrays(granData, dT_data, previous_idPatchA, previous_idPatchB, previous_contactTypePatch,
-                               typeStartCountPatchMap, simParams, solverScratchSpace, streamInfo.stream, nContacts);
+                               previous_contactPatchIsland, typeStartCountPatchMap, simParams, solverScratchSpace,
+                               streamInfo.stream, nContacts);
     DEME_DEBUG_PRINTF("Number of contacts after a user-manual contact load: %zu", nContacts);
     DEME_DEBUG_PRINTF("Number of spheres after a user-manual contact load: %zu", (size_t)simParams->nSpheresGM);
 }
diff --git a/src/DEM/kT.h b/src/DEM/kT.h
index 57800f7e..5620b030 100644
--- a/src/DEM/kT.h
+++ b/src/DEM/kT.h
@@ -195,10 +195,19 @@ class DEMKinematicThread {
     DualArray<bodyID_t> ownerClumpBody = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> ownerTriMesh = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> ownerAnalBody = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    // Mesh owner flags (indexed by owner body ID)
+    DualArray<notStupidBool_t> ownerMeshConvex =
+        DualArray<notStupidBool_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<notStupidBool_t> ownerMeshNeverWinner =
+        DualArray<notStupidBool_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
 
     // Mesh patch information: each facet belongs to a patch
     // Patch ID for each triangle facet (maps facet to patch)
     DualArray<bodyID_t> triPatchID = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    // Triangle edge neighbors (global triangle indices; NULL_BODYID for boundary)
+    DualArray<bodyID_t> triNeighbor1 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> triNeighbor2 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> triNeighbor3 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
 
     // The ID that maps this sphere component's geometry-defining parameters, when this component is jitified
     DualArray<clumpComponentOffset_t> clumpComponentOffset =
@@ -224,6 +233,10 @@ class DEMKinematicThread {
     DualArray<contact_t> contactTypePatch = DualArray<contact_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<contact_t> previous_contactTypePatch =
         DualArray<contact_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    // Island label per patch contact (winner-side primitive label)
+    DualArray<bodyID_t> contactPatchIsland = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
+    DualArray<bodyID_t> previous_contactPatchIsland =
+        DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
 
     // Mapping array: maps from primitive-based pair index to patch-based pair index
     // Same length as primitive pair arrays (idPrimitiveA/B). For each primitive pair,
@@ -340,8 +353,13 @@ class DEMKinematicThread {
     void populateEntityArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                               const std::vector<unsigned int>& input_ext_obj_family,
                               const std::vector<unsigned int>& input_mesh_obj_family,
+                              const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                              const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                               const std::vector<unsigned int>& input_mesh_facet_owner,
                               const std::vector<bodyID_t>& input_mesh_facet_patch,
+                              const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                              const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                              const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                               const std::vector<DEMTriangle>& input_mesh_facets,
                               const ClumpTemplateFlatten& clump_templates,
                               const std::vector<unsigned int>& ext_obj_comp_num,
@@ -354,8 +372,13 @@ class DEMKinematicThread {
     void initGPUArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                        const std::vector<unsigned int>& input_ext_obj_family,
                        const std::vector<unsigned int>& input_mesh_obj_family,
+                       const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                       const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                        const std::vector<unsigned int>& input_mesh_facet_owner,
                        const std::vector<bodyID_t>& input_mesh_facet_patch,
+                       const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                       const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                       const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                        const std::vector<DEMTriangle>& input_mesh_facets,
                        const std::vector<unsigned int>& ext_obj_comp_num,
                        const std::vector<notStupidBool_t>& family_mask_matrix,
@@ -366,8 +389,13 @@ class DEMKinematicThread {
     void updateClumpMeshArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
                                const std::vector<unsigned int>& input_ext_obj_family,
                                const std::vector<unsigned int>& input_mesh_obj_family,
+                               const std::vector<notStupidBool_t>& input_mesh_obj_convex,
+                               const std::vector<notStupidBool_t>& input_mesh_obj_never_winner,
                                const std::vector<unsigned int>& input_mesh_facet_owner,
                                const std::vector<bodyID_t>& input_mesh_facet_patch,
+                               const std::vector<bodyID_t>& input_mesh_facet_neighbor1,
+                               const std::vector<bodyID_t>& input_mesh_facet_neighbor2,
+                               const std::vector<bodyID_t>& input_mesh_facet_neighbor3,
                                const std::vector<DEMTriangle>& input_mesh_facets,
                                const std::vector<unsigned int>& ext_obj_comp_num,
                                const std::vector<notStupidBool_t>& family_mask_matrix,
diff --git a/src/algorithms/DEMContactDetection.cu b/src/algorithms/DEMContactDetection.cu
index 5565e0ee..eee8d0a9 100644
--- a/src/algorithms/DEMContactDetection.cu
+++ b/src/algorithms/DEMContactDetection.cu
@@ -55,11 +55,13 @@ inline void patchArraysResize(size_t nPatchInvolvedContacts,
                               DualArray<bodyID_t>& idA,
                               DualArray<bodyID_t>& idB,
                               DualArray<contact_t>& contactTypePatch,
+                              DualArray<bodyID_t>& contactPatchIsland,
                               DualStruct<DEMDataKT>& granData) {
     // Note these resizing are automatically on kT's device
     DEME_DUAL_ARRAY_RESIZE_NOVAL(idA, nPatchInvolvedContacts);
     DEME_DUAL_ARRAY_RESIZE_NOVAL(idB, nPatchInvolvedContacts);
     DEME_DUAL_ARRAY_RESIZE_NOVAL(contactTypePatch, nPatchInvolvedContacts);
+    DEME_DUAL_ARRAY_RESIZE_NOVAL(contactPatchIsland, nPatchInvolvedContacts);
 
     // Re-packing pointers now is automatic
 
@@ -301,6 +303,8 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                       DualArray<bodyID_t>& previous_idPatchB,
                       DualArray<contact_t>& contactTypePatch,
                       DualArray<contact_t>& previous_contactTypePatch,
+                      DualArray<bodyID_t>& contactPatchIsland,
+                      DualArray<bodyID_t>& previous_contactPatchIsland,
                       ContactTypeMap<std::pair<contactPairs_t, contactPairs_t>>& typeStartCountPatchMap,
                       DualArray<contactPairs_t>& geomToPatchMap,
                       cudaStream_t& this_stream,
@@ -1366,48 +1370,429 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                                                               isNewGroup, numTotalCnts);
             }
 
-            // Prefix scan gives 0-based patch-contact indices for each primitive contact.
-            cubDEMInclusiveScan<contactPairs_t, contactPairs_t>(isNewGroup, granData->geomToPatchMap, numTotalCnts,
-                                                                this_stream, scratchPad);
+            // Inclusive scan of isNewGroup gives each primitive contact its 0-based group index (group = type + patch pair).
+            contactPairs_t* groupIndex =
+                (contactPairs_t*)scratchPad.allocateTempVector("groupIndex", numTotalCnts * sizeof(contactPairs_t));
+            cubDEMInclusiveScan<contactPairs_t, contactPairs_t>(isNewGroup, groupIndex, numTotalCnts, this_stream,
+                                                                scratchPad);
 
             // Flip the first element to 1 so it can be used for selection flags.
             setFirstFlagToOne<<<1, 1, 0, this_stream>>>(isNewGroup, numTotalCnts);
 
-            scratchPad.allocateDualStruct("numUniquePatchPairs");
-            cubDEMSum<contactPairs_t, size_t>(isNewGroup, scratchPad.getDualStructDevice("numUniquePatchPairs"),
+            scratchPad.allocateDualStruct("numUniqueGroups");
+            cubDEMSum<contactPairs_t, size_t>(isNewGroup, scratchPad.getDualStructDevice("numUniqueGroups"),
                                               numTotalCnts, this_stream, scratchPad);
-            scratchPad.syncDualStructDeviceToHost("numUniquePatchPairs");
-            size_t numUniquePatchPairs = *scratchPad.getDualStructHost("numUniquePatchPairs");
+            scratchPad.syncDualStructDeviceToHost("numUniqueGroups");
+            size_t numGroups = *scratchPad.getDualStructHost("numUniqueGroups");
+
+            // Select group contact types (one per group).
+            contact_t* groupContactTypes = nullptr;
+            if (numGroups > 0) {
+                groupContactTypes = (contact_t*)scratchPad.allocateTempVector("groupContactTypes",
+                                                                              numGroups * sizeof(contact_t));
+                cubDEMSelectFlagged<contact_t, contactPairs_t>(
+                    granData->contactTypePrimitive, groupContactTypes, isNewGroup,
+                    scratchPad.getDualStructDevice("numUniqueGroups"), numTotalCnts, this_stream, scratchPad);
+            }
+            // Select representative primitive IDs per group (first contact in each group).
+            bodyID_t* groupPrimA = nullptr;
+            bodyID_t* groupPrimB = nullptr;
+            if (numGroups > 0) {
+                groupPrimA =
+                    (bodyID_t*)scratchPad.allocateTempVector("groupPrimA", numGroups * sizeof(bodyID_t));
+                groupPrimB =
+                    (bodyID_t*)scratchPad.allocateTempVector("groupPrimB", numGroups * sizeof(bodyID_t));
+                cubDEMSelectFlagged<bodyID_t, contactPairs_t>(
+                    granData->idPrimitiveA, groupPrimA, isNewGroup,
+                    scratchPad.getDualStructDevice("numUniqueGroups"), numTotalCnts, this_stream, scratchPad);
+                cubDEMSelectFlagged<bodyID_t, contactPairs_t>(
+                    granData->idPrimitiveB, groupPrimB, isNewGroup,
+                    scratchPad.getDualStructDevice("numUniqueGroups"), numTotalCnts, this_stream, scratchPad);
+            }
+
+            // Count unique primitives per group on each side.
+            contactPairs_t* groupUniqueCountA =
+                (contactPairs_t*)scratchPad.allocateTempVector("groupUniqueCountA", numGroups * sizeof(contactPairs_t));
+            contactPairs_t* groupUniqueCountB =
+                (contactPairs_t*)scratchPad.allocateTempVector("groupUniqueCountB", numGroups * sizeof(contactPairs_t));
+            if (numGroups > 0) {
+                DEME_GPU_CALL(cudaMemsetAsync(groupUniqueCountA, 0, numGroups * sizeof(contactPairs_t), this_stream));
+                DEME_GPU_CALL(cudaMemsetAsync(groupUniqueCountB, 0, numGroups * sizeof(contactPairs_t), this_stream));
+            }
+
+            uint64_t* keyA = (uint64_t*)scratchPad.allocateTempVector("groupPrimKeyA", numTotalCnts * sizeof(uint64_t));
+            uint64_t* keyA_sorted =
+                (uint64_t*)scratchPad.allocateTempVector("groupPrimKeyA_sorted", numTotalCnts * sizeof(uint64_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                buildGroupPrimitiveKeys<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(groupIndex, granData->idPrimitiveA, keyA, numTotalCnts);
+            }
+            cubDEMSortKeys<uint64_t>(keyA, keyA_sorted, numTotalCnts, this_stream, scratchPad);
+
+            uint64_t* uniqueKeyA =
+                (uint64_t*)scratchPad.allocateTempVector("uniqueKeyA", numTotalCnts * sizeof(uint64_t));
+            scratchPad.allocateDualStruct("numUniqueKeyA");
+            cubDEMUnique<uint64_t>(keyA_sorted, uniqueKeyA, scratchPad.getDualStructDevice("numUniqueKeyA"),
+                                   numTotalCnts, this_stream, scratchPad);
+            scratchPad.syncDualStructDeviceToHost("numUniqueKeyA");
+            size_t numUniqueKeyA = *scratchPad.getDualStructHost("numUniqueKeyA");
+            if (numUniqueKeyA > 0) {
+                contactPairs_t* uniqueGroupA = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueGroupA", numUniqueKeyA * sizeof(contactPairs_t));
+                size_t blocks_needed_unique =
+                    (numUniqueKeyA + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                extractGroupIndexFromKey<<<dim3(blocks_needed_unique), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(uniqueKeyA, uniqueGroupA, numUniqueKeyA);
+
+                contactPairs_t* uniqueGroupsA = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueGroupsA", numUniqueKeyA * sizeof(contactPairs_t));
+                contactPairs_t* countsA = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueCountsA", numUniqueKeyA * sizeof(contactPairs_t));
+                scratchPad.allocateDualStruct("numGroupsA");
+                cubDEMRunLengthEncode<contactPairs_t, contactPairs_t>(
+                    uniqueGroupA, uniqueGroupsA, countsA, scratchPad.getDualStructDevice("numGroupsA"),
+                    numUniqueKeyA, this_stream, scratchPad);
+                scratchPad.syncDualStructDeviceToHost("numGroupsA");
+                size_t numGroupsA = *scratchPad.getDualStructHost("numGroupsA");
+                if (numGroupsA > 0) {
+                    size_t blocks_needed_groups =
+                        (numGroupsA + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                    scatterGroupCounts<<<dim3(blocks_needed_groups), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        uniqueGroupsA, countsA, groupUniqueCountA, numGroupsA);
+                }
+                scratchPad.finishUsingTempVector("uniqueGroupsA");
+                scratchPad.finishUsingTempVector("uniqueCountsA");
+                scratchPad.finishUsingDualStruct("numGroupsA");
+                scratchPad.finishUsingTempVector("uniqueGroupA");
+            }
+            scratchPad.finishUsingTempVector("uniqueKeyA");
+            scratchPad.finishUsingDualStruct("numUniqueKeyA");
+            scratchPad.finishUsingTempVector("groupPrimKeyA_sorted");
+            scratchPad.finishUsingTempVector("groupPrimKeyA");
+
+            uint64_t* keyB = (uint64_t*)scratchPad.allocateTempVector("groupPrimKeyB", numTotalCnts * sizeof(uint64_t));
+            uint64_t* keyB_sorted =
+                (uint64_t*)scratchPad.allocateTempVector("groupPrimKeyB_sorted", numTotalCnts * sizeof(uint64_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                buildGroupPrimitiveKeys<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(groupIndex, granData->idPrimitiveB, keyB, numTotalCnts);
+            }
+            cubDEMSortKeys<uint64_t>(keyB, keyB_sorted, numTotalCnts, this_stream, scratchPad);
+
+            uint64_t* uniqueKeyB =
+                (uint64_t*)scratchPad.allocateTempVector("uniqueKeyB", numTotalCnts * sizeof(uint64_t));
+            scratchPad.allocateDualStruct("numUniqueKeyB");
+            cubDEMUnique<uint64_t>(keyB_sorted, uniqueKeyB, scratchPad.getDualStructDevice("numUniqueKeyB"),
+                                   numTotalCnts, this_stream, scratchPad);
+            scratchPad.syncDualStructDeviceToHost("numUniqueKeyB");
+            size_t numUniqueKeyB = *scratchPad.getDualStructHost("numUniqueKeyB");
+            if (numUniqueKeyB > 0) {
+                contactPairs_t* uniqueGroupB = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueGroupB", numUniqueKeyB * sizeof(contactPairs_t));
+                size_t blocks_needed_unique =
+                    (numUniqueKeyB + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                extractGroupIndexFromKey<<<dim3(blocks_needed_unique), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(uniqueKeyB, uniqueGroupB, numUniqueKeyB);
+
+                contactPairs_t* uniqueGroupsB = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueGroupsB", numUniqueKeyB * sizeof(contactPairs_t));
+                contactPairs_t* countsB = (contactPairs_t*)scratchPad.allocateTempVector(
+                    "uniqueCountsB", numUniqueKeyB * sizeof(contactPairs_t));
+                scratchPad.allocateDualStruct("numGroupsB");
+                cubDEMRunLengthEncode<contactPairs_t, contactPairs_t>(
+                    uniqueGroupB, uniqueGroupsB, countsB, scratchPad.getDualStructDevice("numGroupsB"),
+                    numUniqueKeyB, this_stream, scratchPad);
+                scratchPad.syncDualStructDeviceToHost("numGroupsB");
+                size_t numGroupsB = *scratchPad.getDualStructHost("numGroupsB");
+                if (numGroupsB > 0) {
+                    size_t blocks_needed_groups =
+                        (numGroupsB + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                    scatterGroupCounts<<<dim3(blocks_needed_groups), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        uniqueGroupsB, countsB, groupUniqueCountB, numGroupsB);
+                }
+                scratchPad.finishUsingTempVector("uniqueGroupsB");
+                scratchPad.finishUsingTempVector("uniqueCountsB");
+                scratchPad.finishUsingDualStruct("numGroupsB");
+                scratchPad.finishUsingTempVector("uniqueGroupB");
+            }
+            scratchPad.finishUsingTempVector("uniqueKeyB");
+            scratchPad.finishUsingDualStruct("numUniqueKeyB");
+            scratchPad.finishUsingTempVector("groupPrimKeyB_sorted");
+            scratchPad.finishUsingTempVector("groupPrimKeyB");
+
+            // Decide winner side per group.
+            notStupidBool_t* groupWinnerIsA =
+                (notStupidBool_t*)scratchPad.allocateTempVector("groupWinnerIsA", numGroups * sizeof(notStupidBool_t));
+            notStupidBool_t* groupWinnerIsTri = (notStupidBool_t*)scratchPad.allocateTempVector(
+                "groupWinnerIsTri", numGroups * sizeof(notStupidBool_t));
+            notStupidBool_t* groupForceSingleIsland = (notStupidBool_t*)scratchPad.allocateTempVector(
+                "groupForceSingleIsland", numGroups * sizeof(notStupidBool_t));
+            if (numGroups > 0) {
+                size_t blocks_needed_groups =
+                    (numGroups + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                computeGroupWinners<<<dim3(blocks_needed_groups), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                    groupContactTypes, groupPrimA, groupPrimB, groupUniqueCountA, groupUniqueCountB,
+                    granData->ownerTriMesh, granData->ownerMeshConvex, granData->ownerMeshNeverWinner, groupWinnerIsA,
+                    groupWinnerIsTri, groupForceSingleIsland, numGroups);
+            }
+
+            // Winner primitive per contact.
+            bodyID_t* winnerPrimitive =
+                (bodyID_t*)scratchPad.allocateTempVector("winnerPrimitive", numTotalCnts * sizeof(bodyID_t));
+            notStupidBool_t* winnerIsTri =
+                (notStupidBool_t*)scratchPad.allocateTempVector("winnerIsTri", numTotalCnts * sizeof(notStupidBool_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                selectWinnerPrimitive<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                        this_stream>>>(groupIndex, granData->idPrimitiveA, granData->idPrimitiveB,
+                                                       groupWinnerIsA, groupWinnerIsTri, groupForceSingleIsland,
+                                                       winnerPrimitive, winnerIsTri, numTotalCnts);
+            }
+
+            // Build active triangle keys and compact.
+            uint64_t* activeTriKeysAll =
+                (uint64_t*)scratchPad.allocateTempVector("activeTriKeysAll", numTotalCnts * sizeof(uint64_t));
+            notStupidBool_t* activeTriFlags = winnerIsTri;  // reuse winnerIsTri as flags
+            if (blocks_needed_for_patch_ids > 0) {
+                buildActiveTriKeys<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                     this_stream>>>(groupIndex, winnerPrimitive, activeTriFlags, activeTriKeysAll,
+                                                    activeTriFlags, numTotalCnts);
+            }
+            uint64_t* activeTriKeys =
+                (uint64_t*)scratchPad.allocateTempVector("activeTriKeys", numTotalCnts * sizeof(uint64_t));
+            scratchPad.allocateDualStruct("numActiveTriKeys");
+            cubDEMSelectFlagged<uint64_t, notStupidBool_t>(activeTriKeysAll, activeTriKeys, activeTriFlags,
+                                                           scratchPad.getDualStructDevice("numActiveTriKeys"),
+                                                           numTotalCnts, this_stream, scratchPad);
+            scratchPad.syncDualStructDeviceToHost("numActiveTriKeys");
+            size_t numActiveTriKeys = *scratchPad.getDualStructHost("numActiveTriKeys");
+
+            uint64_t* activeTriKeysUnique = nullptr;
+            bodyID_t* activeLabelsA = nullptr;
+            bodyID_t* activeLabelsB = nullptr;
+            contactPairs_t* groupActiveCount = nullptr;
+            contactPairs_t* groupActiveStart = nullptr;
+            bodyID_t* finalActiveLabels = nullptr;
+            size_t numUniqueActiveTri = 0;
+
+            if (numActiveTriKeys > 0) {
+                uint64_t* activeTriKeys_sorted = (uint64_t*)scratchPad.allocateTempVector(
+                    "activeTriKeys_sorted", numActiveTriKeys * sizeof(uint64_t));
+                cubDEMSortKeys<uint64_t>(activeTriKeys, activeTriKeys_sorted, numActiveTriKeys, this_stream,
+                                         scratchPad);
+
+                activeTriKeysUnique = (uint64_t*)scratchPad.allocateTempVector(
+                    "activeTriKeys_unique", numActiveTriKeys * sizeof(uint64_t));
+                scratchPad.allocateDualStruct("numUniqueActiveTri");
+                cubDEMUnique<uint64_t>(activeTriKeys_sorted, activeTriKeysUnique,
+                                       scratchPad.getDualStructDevice("numUniqueActiveTri"), numActiveTriKeys,
+                                       this_stream, scratchPad);
+                scratchPad.syncDualStructDeviceToHost("numUniqueActiveTri");
+                numUniqueActiveTri = *scratchPad.getDualStructHost("numUniqueActiveTri");
+
+                if (numUniqueActiveTri > 0) {
+                    activeLabelsA = (bodyID_t*)scratchPad.allocateTempVector(
+                        "activeTriLabelsA", numUniqueActiveTri * sizeof(bodyID_t));
+                    activeLabelsB = (bodyID_t*)scratchPad.allocateTempVector(
+                        "activeTriLabelsB", numUniqueActiveTri * sizeof(bodyID_t));
+                    size_t blocks_needed_active =
+                        (numUniqueActiveTri + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                    initActiveTriLabels<<<dim3(blocks_needed_active), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                         this_stream>>>(activeTriKeysUnique, activeLabelsA, numUniqueActiveTri);
+                    initActiveTriLabels<<<dim3(blocks_needed_active), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                         this_stream>>>(activeTriKeysUnique, activeLabelsB, numUniqueActiveTri);
+
+                    groupActiveCount = (contactPairs_t*)scratchPad.allocateTempVector(
+                        "groupActiveCount", numGroups * sizeof(contactPairs_t));
+                    DEME_GPU_CALL(cudaMemsetAsync(groupActiveCount, 0, numGroups * sizeof(contactPairs_t), this_stream));
+                    countActiveTriPerGroup<<<dim3(blocks_needed_active), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                             this_stream>>>(activeTriKeysUnique, groupActiveCount, numUniqueActiveTri);
+
+                    groupActiveStart = (contactPairs_t*)scratchPad.allocateTempVector(
+                        "groupActiveStart", numGroups * sizeof(contactPairs_t));
+                    if (numGroups > 0) {
+                        cubDEMPrefixScan<contactPairs_t, contactPairs_t>(groupActiveCount, groupActiveStart, numGroups,
+                                                                         this_stream, scratchPad);
+                    }
+
+                    // Fixed kLabelIters ping-pong label passes. NOTE(review): confirm this is enough for convergence.
+                    const int kLabelIters = 4;
+                    bodyID_t* labelsIn = activeLabelsA;
+                    bodyID_t* labelsOut = activeLabelsB;
+                    for (int iter = 0; iter < kLabelIters; ++iter) {
+                        propagateActiveTriLabels<<<dim3(blocks_needed_active), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                                   this_stream>>>(activeTriKeysUnique, labelsIn, labelsOut,
+                                                                  groupActiveStart, groupActiveCount,
+                                                                  granData->triNeighbor1, granData->triNeighbor2,
+                                                                  granData->triNeighbor3, numUniqueActiveTri);
+                        bodyID_t* tmp = labelsIn;
+                        labelsIn = labelsOut;
+                        labelsOut = tmp;
+                    }
+                    finalActiveLabels = labelsIn;
+                }
+
+                scratchPad.finishUsingTempVector("activeTriKeys_sorted");
+                scratchPad.finishUsingDualStruct("numUniqueActiveTri");
+            }
+
+            // Assign island label per contact (winner primitive label or propagated triangle label).
+            bodyID_t* contactIslandLabel =
+                (bodyID_t*)scratchPad.allocateTempVector("contactIslandLabel", numTotalCnts * sizeof(bodyID_t));
+            if (numUniqueActiveTri > 0) {
+                size_t blocks_needed_labels =
+                    (numTotalCnts + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                assignContactIslandLabel<<<dim3(blocks_needed_labels), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(groupIndex, winnerPrimitive, winnerIsTri, activeTriKeysUnique,
+                                                          finalActiveLabels, groupActiveStart, groupActiveCount,
+                                                          contactIslandLabel, numTotalCnts);
+            } else {
+                size_t blocks_needed_labels =
+                    (numTotalCnts + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                if (blocks_needed_labels > 0) {
+                    copyBodyIDArray<<<dim3(blocks_needed_labels), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        winnerPrimitive, contactIslandLabel, numTotalCnts);
+                }
+            }
+
+            // Build composite key parts (primary + secondary) for island grouping.
+            uint64_t* islandKeyHi =
+                (uint64_t*)scratchPad.allocateTempVector("islandKeyHi", numTotalCnts * sizeof(uint64_t));
+            uint64_t* islandKeyLo =
+                (uint64_t*)scratchPad.allocateTempVector("islandKeyLo", numTotalCnts * sizeof(uint64_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                buildIslandCompositeKeyParts<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                               this_stream>>>(contactPatchPairs, granData->contactTypePrimitive,
+                                                              contactIslandLabel, islandKeyHi, islandKeyLo,
+                                                              numTotalCnts);
+            }
 
-            if (numUniquePatchPairs > idPatchA.size()) {
-                DEME_DUAL_ARRAY_RESIZE_NOVAL(idPatchA, numUniquePatchPairs);
-                DEME_DUAL_ARRAY_RESIZE_NOVAL(idPatchB, numUniquePatchPairs);
-                DEME_DUAL_ARRAY_RESIZE_NOVAL(contactTypePatch, numUniquePatchPairs);
+            contactPairs_t* island_sort_indices =
+                (contactPairs_t*)scratchPad.allocateTempVector("islandSortIndices", idx_arr_bytes);
+            contactPairs_t* island_sort_indices_sorted =
+                (contactPairs_t*)scratchPad.allocateTempVector("islandSortIndices_sorted", idx_arr_bytes);
+            if (blocks_needed_for_patch_ids > 0) {
+                lineNumbers<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                    island_sort_indices, numTotalCnts);
+            }
+
+            // Low-key sort, then stable high-key sort: avoids the CUDA 13/CUB compile error on 128-bit ulonglong2 keys.
+            uint64_t* islandKeyLo_sorted =
+                (uint64_t*)scratchPad.allocateTempVector("islandKeyLo_sorted", numTotalCnts * sizeof(uint64_t));
+            cubDEMSortByKeys<uint64_t, contactPairs_t>(islandKeyLo, islandKeyLo_sorted, island_sort_indices,
+                                                       island_sort_indices_sorted, numTotalCnts, this_stream,
+                                                       scratchPad);
+
+            uint64_t* islandKeyHi_by_lo =
+                (uint64_t*)scratchPad.allocateTempVector("islandKeyHi_by_lo", numTotalCnts * sizeof(uint64_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                gatherByIndex<uint64_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        islandKeyHi, islandKeyHi_by_lo, island_sort_indices_sorted, numTotalCnts);
+            }
+
+            uint64_t* islandKeyHi_sorted =
+                (uint64_t*)scratchPad.allocateTempVector("islandKeyHi_sorted", numTotalCnts * sizeof(uint64_t));
+            // Stable sort by primary key (contactType + patchA), preserving low-key order.
+            cubDEMSortByKeys<uint64_t, contactPairs_t>(islandKeyHi_by_lo, islandKeyHi_sorted,
+                                                       island_sort_indices_sorted, island_sort_indices, numTotalCnts,
+                                                       this_stream, scratchPad);
+
+            // Reorder primitive arrays by island keys.
+            if (blocks_needed_for_patch_ids > 0) {
+                gatherByIndex<bodyID_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        granData->idPrimitiveA, idA_sorted, island_sort_indices, numTotalCnts);
+                gatherByIndex<bodyID_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        granData->idPrimitiveB, idB_sorted, island_sort_indices, numTotalCnts);
+                gatherByIndex<contact_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        granData->contactTypePrimitive, contactType_sorted, island_sort_indices, numTotalCnts);
+                gatherByIndex<notStupidBool_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        granData->contactPersistency, contactPersistency_sorted, island_sort_indices,
+                        numTotalCnts);
+                gatherByIndex<patchIDPair_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        contactPatchPairs, patchPairs_sorted, island_sort_indices, numTotalCnts);
+            }
+
+            bodyID_t* contactIslandLabel_sorted = (bodyID_t*)scratchPad.allocateTempVector(
+                "contactIslandLabel_sorted", numTotalCnts * sizeof(bodyID_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                gatherByIndex<bodyID_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        contactIslandLabel, contactIslandLabel_sorted, island_sort_indices, numTotalCnts);
+            }
+
+            DEME_GPU_CALL(cudaMemcpyAsync(granData->idPrimitiveA, idA_sorted, total_ids_bytes,
+                                          cudaMemcpyDeviceToDevice, this_stream));
+            DEME_GPU_CALL(cudaMemcpyAsync(granData->idPrimitiveB, idB_sorted, total_ids_bytes,
+                                          cudaMemcpyDeviceToDevice, this_stream));
+            DEME_GPU_CALL(cudaMemcpyAsync(granData->contactTypePrimitive, contactType_sorted, type_arr_bytes,
+                                          cudaMemcpyDeviceToDevice, this_stream));
+            DEME_GPU_CALL(cudaMemcpyAsync(granData->contactPersistency, contactPersistency_sorted,
+                                          total_persistency_bytes, cudaMemcpyDeviceToDevice, this_stream));
+            DEME_GPU_CALL(cudaMemcpyAsync(contactPatchPairs, patchPairs_sorted, patch_arr_bytes,
+                                          cudaMemcpyDeviceToDevice, this_stream));
+
+            // Build final geomToPatchMap based on island keys.
+            contactPairs_t* isNewIslandGroup =
+                (contactPairs_t*)scratchPad.allocateTempVector("isNewIslandGroup", numTotalCnts * sizeof(contactPairs_t));
+            if (blocks_needed_for_patch_ids > 0) {
+                uint64_t* islandKeyLo_sorted_by_hi = (uint64_t*)scratchPad.allocateTempVector(
+                    "islandKeyLo_sorted_by_hi", numTotalCnts * sizeof(uint64_t));
+                gatherByIndex<uint64_t>
+                    <<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0, this_stream>>>(
+                        islandKeyLo, islandKeyLo_sorted_by_hi, island_sort_indices, numTotalCnts);
+                markNewCompositeGroups64<<<dim3(blocks_needed_for_patch_ids), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
+                                           this_stream>>>(islandKeyHi_sorted, islandKeyLo_sorted_by_hi,
+                                                          isNewIslandGroup, numTotalCnts);
+                scratchPad.finishUsingTempVector("islandKeyLo_sorted_by_hi");
+            }
+            cubDEMInclusiveScan<contactPairs_t, contactPairs_t>(isNewIslandGroup, granData->geomToPatchMap, numTotalCnts,
+                                                                this_stream, scratchPad);
+            setFirstFlagToOne<<<1, 1, 0, this_stream>>>(isNewIslandGroup, numTotalCnts);
+
+            scratchPad.allocateDualStruct("numUniqueIslands");
+            cubDEMSum<contactPairs_t, size_t>(isNewIslandGroup, scratchPad.getDualStructDevice("numUniqueIslands"),
+                                              numTotalCnts, this_stream, scratchPad);
+            scratchPad.syncDualStructDeviceToHost("numUniqueIslands");
+            size_t numUniqueIslands = *scratchPad.getDualStructHost("numUniqueIslands");
+
+            if (numUniqueIslands > idPatchA.size()) {
+                DEME_DUAL_ARRAY_RESIZE_NOVAL(idPatchA, numUniqueIslands);
+                DEME_DUAL_ARRAY_RESIZE_NOVAL(idPatchB, numUniqueIslands);
+                DEME_DUAL_ARRAY_RESIZE_NOVAL(contactTypePatch, numUniqueIslands);
+                DEME_DUAL_ARRAY_RESIZE_NOVAL(contactPatchIsland, numUniqueIslands);
                 granData.toDevice();
             }
 
             patchIDPair_t* unique_patch_pairs = nullptr;
-            if (numUniquePatchPairs > 0) {
+            if (numUniqueIslands > 0) {
                 unique_patch_pairs = (patchIDPair_t*)scratchPad.allocateTempVector(
-                    "unique_patch_pairs", numUniquePatchPairs * sizeof(patchIDPair_t));
+                    "unique_patch_pairs", numUniqueIslands * sizeof(patchIDPair_t));
                 cubDEMSelectFlagged<patchIDPair_t, contactPairs_t>(
-                    contactPatchPairs, unique_patch_pairs, isNewGroup,
-                    scratchPad.getDualStructDevice("numUniquePatchPairs"), numTotalCnts, this_stream, scratchPad);
+                    contactPatchPairs, unique_patch_pairs, isNewIslandGroup,
+                    scratchPad.getDualStructDevice("numUniqueIslands"), numTotalCnts, this_stream, scratchPad);
                 cubDEMSelectFlagged<contact_t, contactPairs_t>(
-                    granData->contactTypePrimitive, granData->contactTypePatch, isNewGroup,
-                    scratchPad.getDualStructDevice("numUniquePatchPairs"), numTotalCnts, this_stream, scratchPad);
+                    granData->contactTypePrimitive, granData->contactTypePatch, isNewIslandGroup,
+                    scratchPad.getDualStructDevice("numUniqueIslands"), numTotalCnts, this_stream, scratchPad);
+                cubDEMSelectFlagged<bodyID_t, contactPairs_t>(
+                    contactIslandLabel_sorted, contactPatchIsland.data(), isNewIslandGroup,
+                    scratchPad.getDualStructDevice("numUniqueIslands"), numTotalCnts, this_stream, scratchPad);
                 size_t blocks_needed_for_decode =
-                    (numUniquePatchPairs + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
+                    (numUniqueIslands + DEME_MAX_THREADS_PER_BLOCK - 1) / DEME_MAX_THREADS_PER_BLOCK;
                 decodePatchPairsToSeparateArrays<<<dim3(blocks_needed_for_decode), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
                                                    this_stream>>>(unique_patch_pairs, granData->idPatchA,
-                                                                  granData->idPatchB, numUniquePatchPairs);
+                                                                  granData->idPatchB, numUniqueIslands);
             }
 
-            *scratchPad.numContacts = numUniquePatchPairs;
+            *scratchPad.numContacts = numUniqueIslands;
 
             // Build per-type start/count map for patch contacts.
             typeStartCountPatchMap_thisStep.SetAll({0, 0});
-            if (numUniquePatchPairs > 0) {
+            if (numUniqueIslands > 0) {
                 contact_t* unique_types = (contact_t*)scratchPad.allocateTempVector(
                     "unique_types", NUM_SUPPORTED_CONTACT_TYPES * sizeof(contact_t));
                 contactPairs_t* type_counts = (contactPairs_t*)scratchPad.allocateTempVector(
@@ -1416,7 +1801,7 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
 
                 cubDEMRunLengthEncode<contact_t, contactPairs_t>(granData->contactTypePatch, unique_types, type_counts,
                                                                  scratchPad.getDualStructDevice("numUniqueTypes"),
-                                                                 numUniquePatchPairs, this_stream, scratchPad);
+                                                                 numUniqueIslands, this_stream, scratchPad);
                 scratchPad.syncDualStructDeviceToHost("numUniqueTypes");
                 size_t numTypes = *scratchPad.getDualStructHost("numUniqueTypes");
 
@@ -1443,8 +1828,54 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
             if (unique_patch_pairs) {
                 scratchPad.finishUsingTempVector("unique_patch_pairs");
             }
+            scratchPad.finishUsingTempVector("contactIslandLabel_sorted");
+            if (groupContactTypes) {
+                scratchPad.finishUsingTempVector("groupContactTypes");
+            }
+            if (groupPrimA) {
+                scratchPad.finishUsingTempVector("groupPrimA");
+            }
+            if (groupPrimB) {
+                scratchPad.finishUsingTempVector("groupPrimB");
+            }
+            scratchPad.finishUsingTempVector("groupUniqueCountA");
+            scratchPad.finishUsingTempVector("groupUniqueCountB");
+            scratchPad.finishUsingTempVector("groupWinnerIsA");
+            scratchPad.finishUsingTempVector("groupWinnerIsTri");
+            scratchPad.finishUsingTempVector("groupForceSingleIsland");
+            scratchPad.finishUsingTempVector("winnerPrimitive");
+            scratchPad.finishUsingTempVector("winnerIsTri");
+            scratchPad.finishUsingTempVector("activeTriKeysAll");
+            scratchPad.finishUsingTempVector("activeTriKeys");
+            scratchPad.finishUsingDualStruct("numActiveTriKeys");
+            if (activeTriKeysUnique) {
+                scratchPad.finishUsingTempVector("activeTriKeys_unique");
+            }
+            if (activeLabelsA) {
+                scratchPad.finishUsingTempVector("activeTriLabelsA");
+            }
+            if (activeLabelsB) {
+                scratchPad.finishUsingTempVector("activeTriLabelsB");
+            }
+            if (groupActiveCount) {
+                scratchPad.finishUsingTempVector("groupActiveCount");
+            }
+            if (groupActiveStart) {
+                scratchPad.finishUsingTempVector("groupActiveStart");
+            }
+            scratchPad.finishUsingTempVector("contactIslandLabel");
+            scratchPad.finishUsingTempVector("islandKeyHi");
+            scratchPad.finishUsingTempVector("islandKeyLo");
+            scratchPad.finishUsingTempVector("islandKeyLo_sorted");
+            scratchPad.finishUsingTempVector("islandKeyHi_by_lo");
+            scratchPad.finishUsingTempVector("islandKeyHi_sorted");
+            scratchPad.finishUsingTempVector("islandSortIndices");
+            scratchPad.finishUsingTempVector("islandSortIndices_sorted");
+            scratchPad.finishUsingTempVector("isNewIslandGroup");
+            scratchPad.finishUsingDualStruct("numUniqueIslands");
+            scratchPad.finishUsingTempVector("groupIndex");
             scratchPad.finishUsingTempVector("isNewGroup");
-            scratchPad.finishUsingDualStruct("numUniquePatchPairs");
+            scratchPad.finishUsingDualStruct("numUniqueGroups");
             scratchPad.finishUsingTempVector("contactPatchPairs");
             scratchPad.finishUsingTempVector("patchPairs_sorted");
             scratchPad.finishUsingTempVector("contactSortIndices");
@@ -1517,9 +1948,9 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                     // Both steps have contacts of this type - perform mapping
                     buildPatchContactMappingForType<<<dim3(blocks_needed), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
                                                       this_stream>>>(
-                        granData->idPatchA, granData->idPatchB, granData->previous_idPatchA,
-                        granData->previous_idPatchB, granData->contactMapping, curr_start, curr_count, prev_start,
-                        prev_count);
+                        granData->idPatchA, granData->idPatchB, granData->contactPatchIsland,
+                        granData->previous_idPatchA, granData->previous_idPatchB, granData->previous_contactPatchIsland,
+                        granData->contactMapping, curr_start, curr_count, prev_start, prev_count);
                 }
             }
             // Synchronize once after all mapping kernels are launched
@@ -1531,7 +1962,7 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
             size_t patch_type_arr_bytes = (*scratchPad.numContacts) * sizeof(contact_t);
             if (*scratchPad.numContacts > previous_idPatchA.size()) {
                 patchArraysResize(*scratchPad.numContacts, previous_idPatchA, previous_idPatchB,
-                                  previous_contactTypePatch, granData);
+                                  previous_contactTypePatch, previous_contactPatchIsland, granData);
             }
             int dev = 0;
             DEME_GPU_CALL(cudaGetDevice(&dev));
@@ -1540,6 +1971,7 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                 xt.add(granData->previous_idPatchA, granData->idPatchA, patch_id_arr_bytes);
                 xt.add(granData->previous_idPatchB, granData->idPatchB, patch_id_arr_bytes);
                 xt.add(granData->previous_contactTypePatch, granData->contactTypePatch, patch_type_arr_bytes);
+                xt.add(granData->previous_contactPatchIsland, granData->contactPatchIsland, patch_id_arr_bytes);
                 xt.run(dev, dev, this_stream);
             }
 
@@ -1600,6 +2032,7 @@ void overwritePrevContactArrays(DualStruct<DEMDataKT>& kT_data,
                                 DualArray<bodyID_t>& previous_idPatchA,
                                 DualArray<bodyID_t>& previous_idPatchB,
                                 DualArray<contact_t>& previous_contactTypePatch,
+                                DualArray<bodyID_t>& previous_contactPatchIsland,
                                 ContactTypeMap<std::pair<contactPairs_t, contactPairs_t>>& typeStartCountPatchMap,
                                 DualStruct<DEMSimParams>& simParams,
                                 DEMSolverScratchData& scratchPad,
@@ -1607,7 +2040,8 @@ void overwritePrevContactArrays(DualStruct<DEMDataKT>& kT_data,
                                 size_t nContacts) {
     // Make sure the storage is large enough
     if (nContacts > previous_idPatchA.size()) {
-        patchArraysResize(nContacts, previous_idPatchA, previous_idPatchB, previous_contactTypePatch, kT_data);
+        patchArraysResize(nContacts, previous_idPatchA, previous_idPatchB, previous_contactTypePatch,
+                          previous_contactPatchIsland, kT_data);
     }
 
     // No sort, copy over
@@ -1618,6 +2052,7 @@ void overwritePrevContactArrays(DualStruct<DEMDataKT>& kT_data,
         xt.add(kT_data->previous_idPatchA, dT_data->idPatchA, nContacts * sizeof(bodyID_t));
         xt.add(kT_data->previous_idPatchB, dT_data->idPatchB, nContacts * sizeof(bodyID_t));
         xt.add(kT_data->previous_contactTypePatch, dT_data->contactTypePatch, nContacts * sizeof(contact_t));
+        xt.add(kT_data->previous_contactPatchIsland, dT_data->contactPatchIsland, nContacts * sizeof(bodyID_t));
         xt.run(dev, dev, this_stream);
     }
 
diff --git a/src/algorithms/DEMContactDetectionKernels.cuh b/src/algorithms/DEMContactDetectionKernels.cuh
index 52d54b28..aa52a54e 100644
--- a/src/algorithms/DEMContactDetectionKernels.cuh
+++ b/src/algorithms/DEMContactDetectionKernels.cuh
@@ -248,6 +248,312 @@ __global__ void gatherByIndex(const T* in, T* out, const contactPairs_t* idx, si
     }
 }
 
+// Pack (groupIndex, primitiveID) into one 64-bit key per element (group in the high 32 bits) for unique counting.
+__global__ void buildGroupPrimitiveKeys(const contactPairs_t* groupIndex,
+                                        const bodyID_t* primitiveIDs,
+                                        uint64_t* keys,
+                                        size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const uint64_t group_bits = static_cast<uint64_t>(groupIndex[tid]) << 32;
+    keys[tid] = group_bits | static_cast<uint64_t>(primitiveIDs[tid]);
+}
+
+// Recover the group index stored in the upper 32 bits of each packed key.
+__global__ void extractGroupIndexFromKey(const uint64_t* keys, contactPairs_t* groupIndex, size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const uint64_t packed = keys[tid];
+    groupIndex[tid] = static_cast<contactPairs_t>(packed >> 32);
+}
+
+// Write each run-length count into the dense per-group slot selected by its group ID.
+__global__ void scatterGroupCounts(const contactPairs_t* groupIDs,
+                                   const contactPairs_t* counts,
+                                   contactPairs_t* groupCounts,
+                                   size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const contactPairs_t dest = groupIDs[tid];
+    groupCounts[dest] = counts[tid];
+}
+
+// Determine winner side for each group. One thread per group (myID < n); writes winnerIsA, winnerIsTri and forceSingleIsland.
+__global__ void computeGroupWinners(const contact_t* groupTypes,
+                                    const bodyID_t* groupPrimA,
+                                    const bodyID_t* groupPrimB,
+                                    const contactPairs_t* countA,
+                                    const contactPairs_t* countB,
+                                    const bodyID_t* ownerTriMesh,
+                                    const notStupidBool_t* ownerMeshConvex,
+                                    const notStupidBool_t* ownerMeshNeverWinner,
+                                    notStupidBool_t* winnerIsA,
+                                    notStupidBool_t* winnerIsTri,
+                                    notStupidBool_t* forceSingleIsland,
+                                    size_t n) {
+    contactPairs_t myID = blockIdx.x * blockDim.x + threadIdx.x;
+    if (myID < n) {
+        const contact_t ctype = groupTypes[myID];
+        const geoType_t typeA = decodeTypeA<contact_t, geoType_t>(ctype);
+        const geoType_t typeB = decodeTypeB<contact_t, geoType_t>(ctype);
+        const contactPairs_t nA = countA[myID];
+        const contactPairs_t nB = countB[myID];
+        const bool A_is_tri = (typeA == GEO_T_TRIANGLE);
+        const bool B_is_tri = (typeB == GEO_T_TRIANGLE);
+        bool A_convex = false;  // mesh flags stay false for non-triangle sides and for triangles without an owner
+        bool B_convex = false;
+        bool A_never = false;
+        bool B_never = false;
+        if (A_is_tri) {
+            const bodyID_t ownerA = ownerTriMesh[groupPrimA[myID]];  // owner mesh of the group's A primitive
+            if (ownerA != NULL_BODYID) {
+                A_convex = (ownerMeshConvex[ownerA] != 0);
+                A_never = (ownerMeshNeverWinner[ownerA] != 0);
+            }
+        }
+        if (B_is_tri) {
+            const bodyID_t ownerB = ownerTriMesh[groupPrimB[myID]];  // owner mesh of the group's B primitive
+            if (ownerB != NULL_BODYID) {
+                B_convex = (ownerMeshConvex[ownerB] != 0);
+                B_never = (ownerMeshNeverWinner[ownerB] != 0);
+            }
+        }
+        const bool single_island = (A_is_tri && B_is_tri && A_convex && B_convex);  // both sides are convex-mesh triangles
+        forceSingleIsland[myID] = single_island ? 1 : 0;
+
+        notStupidBool_t pickA = 0;
+        if (A_never && !B_never) {  // a never-winner flag on exactly one side overrides the counts
+            pickA = 0;
+        } else if (B_never && !A_never) {
+            pickA = 1;
+        } else if (nA > nB) {  // otherwise the side with the larger count wins
+            pickA = 1;
+        } else if (nA < nB) {
+            pickA = 0;
+        } else {  // equal counts: break the tie by geometry type and convexity
+            if (A_is_tri && B_is_tri) {
+                if (A_convex != B_convex) {
+                    pickA = A_convex ? 0 : 1;  // prefer concave if tied
+                } else {
+                    pickA = 0;  // deterministic tie-break: prefer B
+                }
+            } else if (A_is_tri && !B_is_tri) {  // only A is a triangle: pick A
+                pickA = 1;
+            } else if (B_is_tri && !A_is_tri) {  // only B is a triangle: pick B
+                pickA = 0;
+            } else {
+                pickA = 0;  // deterministic tie-break: prefer B
+            }
+        }
+        winnerIsA[myID] = pickA;
+        if (single_island) {
+            winnerIsTri[myID] = 0;  // forced single-island groups report a non-triangle winner regardless of its type
+        } else {
+            const geoType_t winnerType = (pickA ? typeA : typeB);
+            winnerIsTri[myID] = (winnerType == GEO_T_TRIANGLE) ? 1 : 0;
+        }
+    }
+}
+
+// Per element: store the primitive from its group's winning side and whether it is a triangle (0/0 if single-island).
+__global__ void selectWinnerPrimitive(const contactPairs_t* groupIndex,
+                                      const bodyID_t* idA,
+                                      const bodyID_t* idB,
+                                      const notStupidBool_t* groupWinnerIsA,
+                                      const notStupidBool_t* groupWinnerIsTri,
+                                      const notStupidBool_t* groupForceSingleIsland,
+                                      bodyID_t* winnerID,
+                                      notStupidBool_t* winnerIsTri,
+                                      size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const contactPairs_t group = groupIndex[tid];
+    // Defaults cover the forced-single-island case: winner ID 0, not a triangle.
+    bodyID_t chosen = 0;
+    notStupidBool_t chosenIsTri = 0;
+    if (groupForceSingleIsland[group] == 0) {
+        chosen = (groupWinnerIsA[group] != 0) ? idA[tid] : idB[tid];
+        chosenIsTri = groupWinnerIsTri[group];
+    }
+    winnerID[tid] = chosen;
+    winnerIsTri[tid] = chosenIsTri;
+}
+
+// Emit a (groupIndex, triID) key plus a selection flag per element, ready for flag-based compaction.
+__global__ void buildActiveTriKeys(const contactPairs_t* groupIndex,
+                                   const bodyID_t* winnerID,
+                                   const notStupidBool_t* winnerIsTri,
+                                   uint64_t* keys,
+                                   notStupidBool_t* flags,
+                                   size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const notStupidBool_t active = winnerIsTri[tid];
+    flags[tid] = active;
+    // Non-triangle winners keep a zero key (their flag is zero as well).
+    uint64_t packed = 0;
+    if (active) {
+        packed = (static_cast<uint64_t>(groupIndex[tid]) << 32) | static_cast<uint64_t>(winnerID[tid]);
+    }
+    keys[tid] = packed;
+}
+
+// Seed each label with the triangle ID held in the low 32 bits of the corresponding key.
+__global__ void initActiveTriLabels(const uint64_t* keys, bodyID_t* labels, size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const uint64_t low_word = keys[tid] & 0xffffffffull;
+    labels[tid] = static_cast<bodyID_t>(low_word);
+}
+
+// Tally keys per group: atomically bump the counter of the group encoded in each key's high 32 bits.
+__global__ void countActiveTriPerGroup(const uint64_t* keys, contactPairs_t* groupCounts, size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const contactPairs_t group = static_cast<contactPairs_t>(keys[tid] >> 32);
+    contactPairs_t* slot = groupCounts + group;
+    atomicAdd(slot, (contactPairs_t)1);
+}
+
+// Label propagation for active triangles within each group: one step of taking the min label over self and in-group neighbors.
+__global__ void propagateActiveTriLabels(const uint64_t* keys,
+                                         const bodyID_t* labelsIn,
+                                         bodyID_t* labelsOut,
+                                         const contactPairs_t* groupStart,
+                                         const contactPairs_t* groupCount,
+                                         const bodyID_t* triNeighbor1,
+                                         const bodyID_t* triNeighbor2,
+                                         const bodyID_t* triNeighbor3,
+                                         size_t n) {
+    contactPairs_t myID = blockIdx.x * blockDim.x + threadIdx.x;
+    if (myID < n) {
+        const uint64_t key = keys[myID];
+        const contactPairs_t grp = static_cast<contactPairs_t>(key >> 32);  // group index: high 32 bits
+        const bodyID_t triID = static_cast<bodyID_t>(key & 0xffffffffull);  // triangle ID: low 32 bits
+        const contactPairs_t start = groupStart[grp];  // offset of this group's segment in keys/labelsIn
+        const contactPairs_t count = groupCount[grp];  // number of keys in that segment
+        bodyID_t label = labelsIn[myID];
+
+        bodyID_t nbs[3] = {triNeighbor1[triID], triNeighbor2[triID], triNeighbor3[triID]};
+        for (int e = 0; e < 3; ++e) {
+            const bodyID_t nb = nbs[e];
+            if (nb == NULL_BODYID || count == 0) {  // no neighbor in this slot, or nothing to search
+                continue;
+            }
+            const uint64_t target = (static_cast<uint64_t>(grp) << 32) | static_cast<uint64_t>(nb);
+            contactPairs_t left = 0;  // lower-bound binary search for target in keys[start, start + count)
+            contactPairs_t right = count;  // NOTE(review): assumes that segment is sorted ascending — confirm against caller
+            while (left < right) {
+                contactPairs_t mid = left + (right - left) / 2;
+                const uint64_t mid_key = keys[start + mid];
+                if (mid_key < target) {
+                    left = mid + 1;
+                } else {
+                    right = mid;
+                }
+            }
+            if (left < count) {
+                const uint64_t found = keys[start + left];
+                if (found == target) {  // the neighbor is present in this group's segment
+                    const bodyID_t nb_label = labelsIn[start + left];
+                    if (nb_label < label) {  // keep the smallest label seen
+                        label = nb_label;
+                    }
+                }
+            }
+        }
+        labelsOut[myID] = label;  // reads come from labelsIn only; the result goes to the separate labelsOut array
+    }
+}
+
+// Assign per-contact island labels using winner primitive and active triangle labels; falls back to the winner ID itself.
+__global__ void assignContactIslandLabel(const contactPairs_t* groupIndex,
+                                         const bodyID_t* winnerID,
+                                         const notStupidBool_t* winnerIsTri,
+                                         const uint64_t* activeKeys,
+                                         const bodyID_t* activeLabels,
+                                         const contactPairs_t* groupStart,
+                                         const contactPairs_t* groupCount,
+                                         bodyID_t* outLabels,
+                                         size_t n) {
+    contactPairs_t myID = blockIdx.x * blockDim.x + threadIdx.x;
+    if (myID < n) {
+        const bodyID_t prim = winnerID[myID];
+        if (winnerIsTri[myID] == 0) {  // non-triangle winner: the label is the winner ID
+            outLabels[myID] = prim;
+            return;
+        }
+        const contactPairs_t grp = groupIndex[myID];
+        const contactPairs_t start = groupStart[grp];  // offset of this group's segment in activeKeys/activeLabels
+        const contactPairs_t count = groupCount[grp];  // number of active keys in that segment
+        if (count == 0) {  // empty segment: nothing to look up
+            outLabels[myID] = prim;
+            return;
+        }
+        const uint64_t target = (static_cast<uint64_t>(grp) << 32) | static_cast<uint64_t>(prim);
+        contactPairs_t left = 0;  // lower-bound binary search for target in activeKeys[start, start + count)
+        contactPairs_t right = count;  // NOTE(review): assumes that segment is sorted ascending — confirm against caller
+        while (left < right) {
+            contactPairs_t mid = left + (right - left) / 2;
+            const uint64_t mid_key = activeKeys[start + mid];
+            if (mid_key < target) {
+                left = mid + 1;
+            } else {
+                right = mid;
+            }
+        }
+        if (left < count && activeKeys[start + left] == target) {  // exact match: take the stored label
+            outLabels[myID] = activeLabels[start + left];
+        } else {  // key not found: fall back to the winner ID
+            outLabels[myID] = prim;
+        }
+    }
+}
+
+// One thread per element: copies in[i] to out[i] for every i < n.
+__global__ void copyBodyIDArray(const bodyID_t* in, bodyID_t* out, size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const bodyID_t value = in[tid];
+    out[tid] = value;
+}
+
+// Split each contact into two 64-bit sort keys: (contactType, patchA) and (patchB, island label).
+__global__ void buildIslandCompositeKeyParts(const patchIDPair_t* patchPairs,
+                                             const contact_t* contactTypes,
+                                             const bodyID_t* labels,
+                                             uint64_t* key_hi,
+                                             uint64_t* key_lo,
+                                             size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    const patchIDPair_t packedPair = patchPairs[tid];
+    const uint64_t patchA_bits = static_cast<uint64_t>(packedPair >> 32);
+    const uint64_t patchB_bits = static_cast<uint64_t>(packedPair & 0xffffffffull);
+    const uint64_t label_bits = static_cast<uint64_t>(labels[tid]);
+    // Primary key: contact type in the high word, upper half of the patch pair in the low word.
+    key_hi[tid] = (static_cast<uint64_t>(contactTypes[tid]) << 32) | patchA_bits;
+    // Secondary key: lower half of the patch pair in the high word, island label in the low word.
+    key_lo[tid] = (patchB_bits << 32) | label_bits;
+}
+
+// Flag (with 1) every element whose (key_hi, key_lo) pair differs from its predecessor's; element 0 gets 0.
+__global__ void markNewCompositeGroups64(const uint64_t* key_hi,
+                                         const uint64_t* key_lo,
+                                         contactPairs_t* isNewGroup,
+                                         size_t n) {
+    const contactPairs_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    // The first element has no predecessor, so its flag stays at 0.
+    contactPairs_t flag = 0;
+    if (tid != 0) {
+        const bool same_hi = (key_hi[tid] == key_hi[tid - 1]);
+        const bool same_lo = (key_lo[tid] == key_lo[tid - 1]);
+        flag = (same_hi && same_lo) ? 0 : 1;
+    }
+    isNewGroup[tid] = flag;
+}
+
 // Build a sortable 64-bit key from (idB, contactType, persistency_preference).
 // - High 32 bits: idB (so contacts with the same idB group together)
 // - Low bits: contactType then persistency (so within a duplicate group, the preferred contact comes first)
@@ -325,8 +631,10 @@ __global__ void setNullMappingForType(contactPairs_t* contactMapping,
 //   prev_count: Number of contacts of this type in previous step
 __global__ void buildPatchContactMappingForType(bodyID_t* curr_idPatchA,
                                                 bodyID_t* curr_idPatchB,
+                                                bodyID_t* curr_patchIsland,
                                                 bodyID_t* prev_idPatchA,
                                                 bodyID_t* prev_idPatchB,
+                                                bodyID_t* prev_patchIsland,
                                                 contactPairs_t* contactMapping,
                                                 contactPairs_t curr_start,
                                                 contactPairs_t curr_count,
@@ -339,6 +647,7 @@ __global__ void buildPatchContactMappingForType(bodyID_t* curr_idPatchA,
 
         bodyID_t curr_A = curr_idPatchA[curr_idx];
         bodyID_t curr_B = curr_idPatchB[curr_idx];
+        bodyID_t curr_L = curr_patchIsland[curr_idx];
 
         // Default: no match found
         contactPairs_t my_partner = NULL_MAPPING_PARTNER;
@@ -353,8 +662,9 @@ __global__ void buildPatchContactMappingForType(bodyID_t* curr_idPatchA,
             bodyID_t prev_A = prev_idPatchA[prev_idx];
             bodyID_t prev_B = prev_idPatchB[prev_idx];
 
-            // Compare (A, B) pairs lexicographically
-            if (prev_A < curr_A || (prev_A == curr_A && prev_B < curr_B)) {
+            // Compare (A, B, label) lexicographically
+            if (prev_A < curr_A || (prev_A == curr_A && (prev_B < curr_B ||
+                                                         (prev_B == curr_B && prev_patchIsland[prev_idx] < curr_L)))) {
                 left = mid + 1;
             } else {
                 right = mid;
@@ -366,7 +676,8 @@ __global__ void buildPatchContactMappingForType(bodyID_t* curr_idPatchA,
             contactPairs_t prev_idx = prev_start + left;
             bodyID_t prev_A = prev_idPatchA[prev_idx];
             bodyID_t prev_B = prev_idPatchB[prev_idx];
-            if (prev_A == curr_A && prev_B == curr_B) {
+            bodyID_t prev_L = prev_patchIsland[prev_idx];
+            if (prev_A == curr_A && prev_B == curr_B && prev_L == curr_L) {
                 my_partner = prev_idx;
             }
         }
@@ -380,9 +691,11 @@ __global__ void buildPatchContactMappingForType(bodyID_t* curr_idPatchA,
 // For each current contact, we use binary search to find the matching contact in the previous array.
 __global__ void buildPatchContactMapping(bodyID_t* curr_idPatchA,
                                          bodyID_t* curr_idPatchB,
+                                         bodyID_t* curr_patchIsland,
                                          contact_t* curr_contactTypePatch,
                                          bodyID_t* prev_idPatchA,
                                          bodyID_t* prev_idPatchB,
+                                         bodyID_t* prev_patchIsland,
                                          contact_t* previous_contactTypePatch,
                                          contactPairs_t* contactMapping,
                                          size_t numCurrContacts,
@@ -391,6 +704,7 @@ __global__ void buildPatchContactMapping(bodyID_t* curr_idPatchA,
     if (myID < numCurrContacts) {
         bodyID_t curr_A = curr_idPatchA[myID];
         bodyID_t curr_B = curr_idPatchB[myID];
+        bodyID_t curr_L = curr_patchIsland[myID];
         contact_t curr_type = curr_contactTypePatch[myID];
 
         // Default: no match found
@@ -426,19 +740,19 @@ __global__ void buildPatchContactMapping(bodyID_t* curr_idPatchA,
         }
         size_t type_end = left;
 
-        // Within this type segment, use binary search to find the matching A/B pair
-        // The segment is sorted by the combined patch ID pair (A in high bits, B in low bits)
-        // The encoding ensures that (smaller_A, larger_B) pattern creates a sortable value
+        // Within this type segment, use binary search to find the matching A/B/label triple
+        // The segment is sorted by patch pair then island label.
         left = type_start;
         right = type_end;
         while (left < right) {
             size_t mid = left + (right - left) / 2;
             bodyID_t prev_A = prev_idPatchA[mid];
             bodyID_t prev_B = prev_idPatchB[mid];
+            bodyID_t prev_L = prev_patchIsland[mid];
 
             // Compare (A, B) pairs lexicographically
             // Since they're sorted by patch ID pair where smaller ID is in high bits
-            if (prev_A < curr_A || (prev_A == curr_A && prev_B < curr_B)) {
+            if (prev_A < curr_A || (prev_A == curr_A && (prev_B < curr_B || (prev_B == curr_B && prev_L < curr_L)))) {
                 left = mid + 1;
             } else {
                 right = mid;
@@ -449,7 +763,8 @@ __global__ void buildPatchContactMapping(bodyID_t* curr_idPatchA,
         if (left < type_end) {
             bodyID_t prev_A = prev_idPatchA[left];
             bodyID_t prev_B = prev_idPatchB[left];
-            if (prev_A == curr_A && prev_B == curr_B) {
+            bodyID_t prev_L = prev_patchIsland[left];
+            if (prev_A == curr_A && prev_B == curr_B && prev_L == curr_L) {
                 my_partner = left;
             }
         }
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index 87cf6685..252930b3 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -128,6 +128,8 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                       DualArray<bodyID_t>& previous_idPatchB,
                       DualArray<contact_t>& contactTypePatch,
                       DualArray<contact_t>& previous_contactTypePatch,
+                      DualArray<bodyID_t>& contactPatchIsland,
+                      DualArray<bodyID_t>& previous_contactPatchIsland,
                       ContactTypeMap<std::pair<contactPairs_t, contactPairs_t>>& typeStartCountPatchMap,
                       DualArray<contactPairs_t>& geomToPatchMap,
                       cudaStream_t& this_stream,
@@ -149,6 +151,7 @@ void overwritePrevContactArrays(DualStruct<DEMDataKT>& kT_data,
                                 DualArray<bodyID_t>& previous_idPatchA,
                                 DualArray<bodyID_t>& previous_idPatchB,
                                 DualArray<contact_t>& previous_contactTypePatch,
+                                DualArray<bodyID_t>& previous_contactPatchIsland,
                                 ContactTypeMap<std::pair<contactPairs_t, contactPairs_t>>& typeStartCountPatchMap,
                                 DualStruct<DEMSimParams>& simParams,
                                 DEMSolverScratchData& scratchPad,
diff --git a/src/demo/DEMdemo_DrumCubes.cpp b/src/demo/DEMdemo_DrumCubes.cpp
index d47d05bf..a9a4faf1 100644
--- a/src/demo/DEMdemo_DrumCubes.cpp
+++ b/src/demo/DEMdemo_DrumCubes.cpp
@@ -43,6 +43,8 @@ int main() {
     // Load cube mesh template (12 triangles) and scale to 10 mm
     auto cube_template = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type_cube, true, false);
     cube_template->Scale(cube_size);
+    cube_template->SetConvex(true);
+    cube_template->SetNeverWinner(true);
 
     // Drum definition
     float3 CylCenter = make_float3(0, 0, 0);
diff --git a/src/demo/DEMdemo_ResponseAngleMesh.cpp b/src/demo/DEMdemo_ResponseAngleMesh.cpp
index f5064ea8..85370fd9 100644
--- a/src/demo/DEMdemo_ResponseAngleMesh.cpp
+++ b/src/demo/DEMdemo_ResponseAngleMesh.cpp
@@ -94,8 +94,8 @@ int main() {
 
     // --------------------- Particle settings block ---------------------
     // Mesh file can be .stl or .obj (path is relative to data/mesh).
-    const path particle_mesh_file = GET_DATA_PATH() / "mesh" / "cube.obj"; // "simpleTriangleShape4mm.stl"
-    const float particle_mesh_scale = mm_to_m * 5.0f; // 1.0f for STLs in mm size
+    const path particle_mesh_file = GET_DATA_PATH() / "mesh" / "cross_fine.stl"; // "simpleTriangleShape4mm.stl"
+    const float particle_mesh_scale = mm_to_m * 0.5f; // 1.0f for STLs in mm size
     const unsigned int target_particles = 5000;
     // -------------------------------------------------------------------
 
@@ -141,11 +141,32 @@ int main() {
     auto cap_tracker = DEMSim.Track(end_caps);
 
     // Sample particles inside the cylindrical volume with a small wall clearance.
-    const float sample_radius = drum_inner_radius - wall_clearance - tri_radius;
-    const float sample_halfheight = drum_height / 2.0f - wall_clearance - tri_radius;
+    const float r_sphere = tri_radius;  // = 0.5 * tri_diag
+    // AABB clearance for a cylinder aligned with z:
+    // radial clearance uses the half-diagonal in XY; z-clearance uses half-height in Z.
+    const float r_xy_aabb = 0.5f * std::sqrt(tri_dims.x * tri_dims.x + tri_dims.y * tri_dims.y);
+    const float r_z_aabb  = 0.5f * tri_dims.z;
+    // Spacing of the HCP lattice (center-to-center). Keep conservative spacing (uses tri_diag).
+    // Clearance model only changes usable container dimensions.
     HCPSampler sampler(tri_diag * 1.01f);
-    auto candidate_pos =
-        sampler.SampleCylinderZ(make_float3(0, 0, drum_height / 2.0f), sample_radius, sample_halfheight);
+    auto sample_with_clearance = [&](float r_xy, float r_z) {
+        const float sample_radius     = drum_inner_radius - wall_clearance - r_xy;
+        const float sample_halfheight = drum_height * 0.5f - wall_clearance - r_z;
+        // Guard against negative dimensions
+        if (sample_radius <= 0.f || sample_halfheight <= 0.f) {
+            return std::vector<float3>{};
+        }
+
+        return sampler.SampleCylinderZ(make_float3(0, 0, drum_height / 2.0f), sample_radius, sample_halfheight);
+    };
+    // Generate both candidate sets
+    auto cand_sphere = sample_with_clearance(r_sphere, r_sphere);
+    auto cand_aabb   = sample_with_clearance(r_xy_aabb, r_z_aabb);
+    // Pick denser (more points). If equal, prefer sphere for robustness.
+    bool use_aabb = cand_aabb.size() > cand_sphere.size();
+    auto& candidate_pos = use_aabb ? cand_aabb : cand_sphere;
+    std::cout << "Sampling clearance mode: " << (use_aabb ? "AABB" : "Sphere")
+              << " (AABB=" << cand_aabb.size() << ", Sphere=" << cand_sphere.size() << ")\n";
     if (candidate_pos.size() < target_particles) {
         DEME_WARNING("Sampler produced fewer points (%zu) than requested (%u). Using all generated points.",
                      candidate_pos.size(), target_particles);

From 351a8316c8695bbe55154e9f7aa4fddff4a2160a Mon Sep 17 00:00:00 2001
From: Florian Reinle <f.reinle@otec.de>
Date: Sat, 31 Jan 2026 14:46:56 +0100
Subject: [PATCH 17/17] Cleanup and expanded SimpleCollision demo - clean up
 mesh splitting section including test - added full fast path for convex shapes
 that never win (contact island always more coarse on their side) -
 SimpleCollision demo expanded

---
 src/DEM/API.h                                 |   3 +
 src/DEM/APIPrivate.cpp                        |  81 +-
 src/DEM/APIPublic.cpp                         |   5 +-
 src/DEM/Defines.h                             |   4 +
 src/DEM/MeshUtils.cpp                         | 851 +-----------------
 src/DEM/dT.cpp                                |  45 +-
 src/DEM/dT.h                                  |  20 +-
 src/DEM/kT.cpp                                |  44 +-
 src/DEM/kT.h                                  |   8 +-
 src/DEM/utils/HostSideHelpers.hpp             |  73 ++
 src/algorithms/DEMContactDetection.cu         |   1 +
 src/algorithms/DEMContactDetectionKernels.cuh |  12 +-
 src/demo/DEMdemo_ResponseAngleMesh.cpp        |   6 +-
 src/demo/ModularTests/CMakeLists.txt          |   1 -
 src/demo/ModularTests/DEMTest_MeshPatch.cpp   | 271 ------
 .../ModularTests/DEMTest_SimpleCollisions.cpp | 257 ++++--
 16 files changed, 367 insertions(+), 1315 deletions(-)
 delete mode 100644 src/demo/ModularTests/DEMTest_MeshPatch.cpp

diff --git a/src/DEM/API.h b/src/DEM/API.h
index 3ba4c82e..22a0376d 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -1750,6 +1750,8 @@ class DEMSolver {
     size_t nSpheresGM = 0;
     // Total number of triangle facets
     size_t nTriGM = 0;
+    // Total number of triangles that need neighbor info (compact neighbor array size)
+    size_t nTriNeighbors = 0;
     // Total number of mesh patches
     size_t nMeshPatches = 0;
     // Number of analytical entites (as components of some external objects)
@@ -2080,6 +2082,7 @@ class DEMSolver {
                                size_t nSpheres,
                                size_t nTriMesh,
                                size_t nFacets,
+                               size_t nTriNeighbors,
                                size_t nMeshPatches,
                                unsigned int nExtObj_old,
                                unsigned int nAnalGM_old);
diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp
index 531c860f..8e7be66e 100644
--- a/src/DEM/APIPrivate.cpp
+++ b/src/DEM/APIPrivate.cpp
@@ -31,32 +31,12 @@ struct EdgeInfo {
     int edge = 0;
 };
 
-struct QuantKey3 {
-    int64_t x, y, z;
-    bool operator==(const QuantKey3& o) const noexcept { return x == o.x && y == o.y && z == o.z; }
-};
-struct QuantKey3Hash {
-    size_t operator()(const QuantKey3& k) const noexcept {
-        size_t h1 = std::hash<int64_t>{}(k.x);
-        size_t h2 = std::hash<int64_t>{}(k.y);
-        size_t h3 = std::hash<int64_t>{}(k.z);
-        size_t h = h1;
-        h ^= h2 + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
-        h ^= h3 + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
-        return h;
-    }
-};
-
 inline uint64_t makeEdgeKey(int a, int b) {
     const uint32_t lo = static_cast<uint32_t>(std::min(a, b));
     const uint32_t hi = static_cast<uint32_t>(std::max(a, b));
     return (static_cast<uint64_t>(lo) << 32) | static_cast<uint64_t>(hi);
 }
 
-static inline int64_t quantize(double v, double eps) {
-    return static_cast<int64_t>(std::llround(v / eps));
-}
-
 std::vector<std::array<bodyID_t, 3>> buildTriangleEdgeNeighbors(const std::vector<int3>& face_v_indices,
                                                                  const std::vector<float3>& vertices) {
     const size_t n_faces = face_v_indices.size();
@@ -67,36 +47,8 @@ std::vector<std::array<bodyID_t, 3>> buildTriangleEdgeNeighbors(const std::vecto
 
     std::vector<size_t> canon;
     if (!vertices.empty()) {
-        double minx = vertices[0].x, miny = vertices[0].y, minz = vertices[0].z;
-        double maxx = minx, maxy = miny, maxz = minz;
-        for (const auto& v : vertices) {
-            minx = std::min(minx, (double)v.x);
-            miny = std::min(miny, (double)v.y);
-            minz = std::min(minz, (double)v.z);
-            maxx = std::max(maxx, (double)v.x);
-            maxy = std::max(maxy, (double)v.y);
-            maxz = std::max(maxz, (double)v.z);
-        }
-        const double dx = maxx - minx, dy = maxy - miny, dz = maxz - minz;
-        const double diag = std::sqrt(dx * dx + dy * dy + dz * dz);
-        const double eps = std::max(diag * 1e-9, 1e-12);
-
-        std::unordered_map<QuantKey3, size_t, QuantKey3Hash> rep;
-        rep.reserve(vertices.size());
-        canon.assign(vertices.size(), static_cast<size_t>(-1));
-        size_t next_id = 0;
-        for (size_t i = 0; i < vertices.size(); ++i) {
-            const auto& v = vertices[i];
-            QuantKey3 key{quantize(v.x, eps), quantize(v.y, eps), quantize(v.z, eps)};
-            auto it = rep.find(key);
-            if (it == rep.end()) {
-                rep.emplace(key, next_id);
-                canon[i] = next_id;
-                next_id++;
-            } else {
-                canon[i] = it->second;
-            }
-        }
+        const double eps = computeVertexQuantEps(vertices);
+        canon = buildCanonicalVertexMap(vertices, eps);
     }
 
     std::unordered_map<uint64_t, std::vector<EdgeInfo>> edge_map;
@@ -973,7 +925,12 @@ void DEMSolver::preprocessTriangleObjs() {
         m_mesh_facet_owner.insert(m_mesh_facet_owner.end(), mesh_obj->GetNumTriangles(), thisMeshObj);
 
         const bodyID_t tri_offset = static_cast<bodyID_t>(m_mesh_facets.size());
-        const auto local_neighbors = buildTriangleEdgeNeighbors(mesh_obj->m_face_v_indices, mesh_obj->m_vertices);
+        std::vector<std::array<bodyID_t, 3>> local_neighbors;
+        if (mesh_obj->IsConvex() && mesh_obj->IsNeverWinner()) {
+            local_neighbors.assign(mesh_obj->GetNumTriangles(), {NULL_BODYID, NULL_BODYID, NULL_BODYID});
+        } else {
+            local_neighbors = buildTriangleEdgeNeighbors(mesh_obj->m_face_v_indices, mesh_obj->m_vertices);
+        }
 
         // Force single-patch semantics: one patch per mesh (all facets in patch 0)
         if (mesh_obj->patches_explicitly_set || mesh_obj->GetNumPatches() > 1) {
@@ -1469,17 +1426,30 @@ void DEMSolver::setSimParams() {
 }
 
 void DEMSolver::allocateGPUArrays() {
+    size_t tri_neighbors = 0;
+    for (const auto& mesh_obj : cached_mesh_objs) {
+        if (!mesh_obj) {
+            continue;
+        }
+        if (!(mesh_obj->IsConvex() && mesh_obj->IsNeverWinner())) {
+            tri_neighbors += mesh_obj->GetNumTriangles();
+        }
+    }
+    nTriNeighbors = tri_neighbors;
+
     // Resize arrays based on the statistical data we have
     std::thread dThread = std::move(std::thread([this]() {
         this->dT->allocateGPUArrays(this->nOwnerBodies, this->nOwnerClumps, this->nExtObj, this->nTriMeshes,
-                                    this->nSpheresGM, this->nTriGM, this->nMeshPatches, this->nAnalGM,
+                                    this->nSpheresGM, this->nTriGM, this->nTriNeighbors, this->nMeshPatches,
+                                    this->nAnalGM,
                                     this->nExtraContacts, this->nDistinctMassProperties,
                                     this->nDistinctClumpBodyTopologies, this->nDistinctClumpComponents,
                                     this->nJitifiableClumpComponents, this->nMatTuples);
     }));
     std::thread kThread = std::move(std::thread([this]() {
         this->kT->allocateGPUArrays(this->nOwnerBodies, this->nOwnerClumps, this->nExtObj, this->nTriMeshes,
-                                    this->nSpheresGM, this->nTriGM, this->nAnalGM, this->nExtraContacts,
+                                    this->nSpheresGM, this->nTriGM, this->nTriNeighbors, this->nAnalGM,
+                                    this->nExtraContacts,
                                     this->nDistinctMassProperties, this->nDistinctClumpBodyTopologies,
                                     this->nDistinctClumpComponents, this->nJitifiableClumpComponents, this->nMatTuples);
     }));
@@ -1542,6 +1512,7 @@ void DEMSolver::updateClumpMeshArrays(size_t nOwners,
                                       size_t nSpheres,
                                       size_t nTriMesh,
                                       size_t nFacets,
+                                      size_t nTriNeighbors,
                                       size_t nMeshPatches,
                                       unsigned int nExtObj,
                                       unsigned int nAnalGM) {
@@ -1572,7 +1543,7 @@ void DEMSolver::updateClumpMeshArrays(size_t nOwners,
         // I/O and misc.
         m_no_output_families, m_tracked_objs,
         // Number of entities, old
-        nOwners, nClumps, nSpheres, nTriMesh, nFacets, nMeshPatches, nExtObj, nAnalGM);
+        nOwners, nClumps, nSpheres, nTriMesh, nFacets, nTriNeighbors, nMeshPatches, nExtObj, nAnalGM);
     kT->updateClumpMeshArrays(
         // Clump batchs' initial stats
         cached_input_clump_batches,
@@ -1588,7 +1559,7 @@ void DEMSolver::updateClumpMeshArrays(size_t nOwners,
         // Templates and misc.
         flattened_clump_templates,
         // Number of entities, old
-        nOwners, nClumps, nSpheres, nTriMesh, nFacets, nMeshPatches, nExtObj, nAnalGM);
+        nOwners, nClumps, nSpheres, nTriMesh, nFacets, nTriNeighbors, nMeshPatches, nExtObj, nAnalGM);
 }
 
 void DEMSolver::packDataPointers() {
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index e79e8ee0..b572acca 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -2524,6 +2524,7 @@ void DEMSolver::Update() {
     size_t nSpheres_old = nSpheresGM;
     size_t nTriMesh_old = nTriMeshes;
     size_t nFacets_old = nTriGM;
+    size_t nTriNeighbors_old = nTriNeighbors;
     size_t nPatch_old = nMeshPatches;
     unsigned int nAnalGM_old = nAnalGM;
     unsigned int nExtObj_old = nExtObj;
@@ -2534,8 +2535,8 @@ void DEMSolver::Update() {
     updateTotalEntityNum();
     allocateGPUArrays();
     // `Update' method needs to know the number of existing clumps and spheres (before this addition)
-    updateClumpMeshArrays(nOwners_old, nClumps_old, nSpheres_old, nTriMesh_old, nFacets_old, nPatch_old, nExtObj_old,
-                          nAnalGM_old);
+    updateClumpMeshArrays(nOwners_old, nClumps_old, nSpheres_old, nTriMesh_old, nFacets_old, nTriNeighbors_old,
+                          nPatch_old, nExtObj_old, nAnalGM_old);
     packDataPointers();
 
     // Now that all params prepared, and all data pointers packed on host side, we need to migrate that imformation to
diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index a9c7f799..462a05f1 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -392,6 +392,8 @@ struct DEMDataDT {
     notStupidBool_t* ownerMeshConvex;
     notStupidBool_t* ownerMeshNeverWinner;
     bodyID_t* triPatchID;
+    // Map global triangle ID -> compact neighbor index (NULL_BODYID if neighbors are not stored)
+    bodyID_t* triNeighborIndex;
     bodyID_t* triNeighbor1;
     bodyID_t* triNeighbor2;
     bodyID_t* triNeighbor3;
@@ -473,6 +475,8 @@ struct DEMDataKT {
     notStupidBool_t* ownerMeshConvex;
     notStupidBool_t* ownerMeshNeverWinner;
     bodyID_t* triPatchID;
+    // Map global triangle ID -> compact neighbor index (NULL_BODYID if neighbors are not stored)
+    bodyID_t* triNeighborIndex;
     bodyID_t* triNeighbor1;
     bodyID_t* triNeighbor2;
     bodyID_t* triNeighbor3;
diff --git a/src/DEM/MeshUtils.cpp b/src/DEM/MeshUtils.cpp
index daee8c4d..12ad108f 100644
--- a/src/DEM/MeshUtils.cpp
+++ b/src/DEM/MeshUtils.cpp
@@ -795,804 +795,6 @@ static std::vector<std::vector<EdgeAdjInfo>> buildAdjacencyWithEdgeInfo(const st
     return adj;
 }
 
-// ------------------------------------------------------------
-// Smart patch splitter
-// ------------------------------------------------------------
-unsigned int DEMMesh::SplitIntoConvexPatches(float hard_angle_deg,
-                                             const PatchSplitOptions& opt_in,
-                                             PatchQualityReport* out_report,
-                                             const PatchQualityOptions& qopt) {
-    if (nTri == 0) {
-        patches_explicitly_set = false;
-        nPatches = 1;
-        if (out_report) {
-            out_report->overall = PatchQualityLevel::SAFE;
-            out_report->constraint_status = PatchConstraintStatus::SATISFIED;
-            out_report->achieved_patches = 1;
-            out_report->requested_min = 1;
-            out_report->requested_max = 1;
-            out_report->per_patch.clear();
-        }
-        return 0;
-    }
-
-    if (hard_angle_deg <= 0.0f) {
-        DEME_ERROR("SplitIntoConvexPatches: hard_angle_deg must be > 0.");
-    }
-    if (opt_in.patch_min == 0) {
-        DEME_ERROR("SplitIntoConvexPatches: patch_min must be >= 1.");
-    }
-    if (opt_in.patch_min > opt_in.patch_max) {
-        DEME_ERROR("SplitIntoConvexPatches: patch_min cannot be > patch_max.");
-    }
-
-    // Copy options (we may adjust defaults in a controlled way)
-    PatchSplitOptions opt = opt_in;
-
-    hard_angle_deg = std::min(180.0f, std::max(0.0f, hard_angle_deg));
-
-    // Resolve hysteresis
-    float soft_angle_deg = (opt.soft_angle_deg >= 0.0f) ? opt.soft_angle_deg : hard_angle_deg;
-    soft_angle_deg = std::min(hard_angle_deg, std::max(0.0f, soft_angle_deg));
-
-    // If user activates hysteresis (soft < hard) but didn't enable patch-normal gating, set a sensible default:
-    // otherwise the mid-band has no extra decision signal.
-    bool patch_gate_enabled = (opt.patch_normal_max_deg >= 0.0f);
-    if (!patch_gate_enabled && soft_angle_deg < hard_angle_deg) {
-        opt.patch_normal_max_deg = soft_angle_deg;
-        patch_gate_enabled = true;
-    }
-
-    float patch_normal_max_deg = opt.patch_normal_max_deg;  // may be <0 => disabled
-    if (patch_gate_enabled) {
-        patch_normal_max_deg = std::min(180.0f, std::max(0.0f, patch_normal_max_deg));
-    }
-
-    const float cos_hard = std::cos(deg2rad(hard_angle_deg));
-    const float cos_soft = std::cos(deg2rad(soft_angle_deg));
-    float cos_patch = -1.0f;
-    if (patch_gate_enabled) {
-        cos_patch = std::cos(deg2rad(patch_normal_max_deg));
-    }
-
-    // Precompute face normals and areas
-    std::vector<float3> face_normals(nTri);
-    std::vector<float> face_areas(nTri, 0.0f);
-    for (size_t i = 0; i < nTri; ++i) {
-        const int3& f = m_face_v_indices[i];
-        const float3& v0 = m_vertices[f.x];
-        const float3& v1 = m_vertices[f.y];
-        const float3& v2 = m_vertices[f.z];
-        face_normals[i] = computeFaceNormal(v0, v1, v2);
-        face_areas[i] = computeTriangleArea(v0, v1, v2);
-        if (face_areas[i] <= DEME_TINY_FLOAT)
-            face_areas[i] = 0.0f;
-    }
-
-    // Adjacency with edge info
-    auto adjacency = buildAdjacencyWithEdgeInfo(m_face_v_indices);
-
-    // Seed order
-    std::vector<size_t> seeds(nTri);
-    for (size_t i = 0; i < nTri; ++i)
-        seeds[i] = i;
-    if (opt.seed_largest_first) {
-        std::stable_sort(seeds.begin(), seeds.end(), [&](size_t a, size_t b) { return face_areas[a] > face_areas[b]; });
-    }
-
-    // Core segmentation routine (no post-merge/split)
-    auto segment_once = [&](const PatchSplitOptions& o,
-                            float soft_deg,
-                            bool patch_gate,
-                            float cosPatch,
-                            std::vector<patchID_t>& out_ids,
-                            unsigned int& out_nP) {
-        out_ids.assign(nTri, (patchID_t)-1);
-
-        int current_patch_id = 0;
-        std::vector<size_t> queue;
-        queue.reserve(256);
-
-        for (size_t si = 0; si < nTri; ++si) {
-            size_t seed = seeds[si];
-            if (out_ids[seed] != (patchID_t)-1)
-                continue;
-
-            if (current_patch_id > std::numeric_limits<patchID_t>::max()) {
-                DEME_ERROR("SplitIntoPatches: too many patches for patchID_t.");
-            }
-
-            float3 sumN = mul3(face_normals[seed], face_areas[seed]);
-            float sumA = face_areas[seed];
-            float3 patchN = normalize3(sumN);
-
-            queue.clear();
-            queue.push_back(seed);
-            out_ids[seed] = (patchID_t)current_patch_id;
-
-            size_t qi = 0;
-            while (qi < queue.size()) {
-                size_t cur = queue[qi++];
-
-                for (const auto& e : adjacency[cur]) {
-                    size_t nb = e.nbr;
-                    if (out_ids[nb] != (patchID_t)-1)
-                        continue;
-
-                    const float3& n_cur = face_normals[cur];
-                    const float3& n_nb = face_normals[nb];
-
-                    // Hard barrier (mandatory)
-                    float d_cn = clamp11(dot3(n_cur, n_nb));
-                    if (d_cn < cos_hard)
-                        continue;
-
-                    // Optional concavity barrier
-                    if (o.block_concave_edges && e.oriented_ok) {
-                        const float3& vA = m_vertices[e.va];
-                        const float3& vB = m_vertices[e.vb];
-                        float dih = signedDihedralDeg(n_cur, n_nb, vA, vB);
-                        if (dih < -o.concave_allow_deg)
-                            continue;
-                    }
-
-                    // Hysteresis band:
-                    // - if below soft: we still require patch gate if enabled (otherwise accept)
-                    // - if between soft and hard: require patch gate if enabled; otherwise accept (legacy-like)
-                    bool in_soft = (d_cn >= cos_soft);
-
-                    if (patch_gate) {
-                        float d_pn = clamp11(dot3(patchN, n_nb));
-                        if (d_pn < cosPatch)
-                            continue;
-                        // pass patch gate => accept
-                    } else {
-                        // no patch gate => legacy-like behavior (soft only matters if patch gate is active)
-                        (void)in_soft;
-                    }
-
-                    out_ids[nb] = (patchID_t)current_patch_id;
-                    queue.push_back(nb);
-
-                    if (face_areas[nb] > 0.0f) {
-                        sumN = add3(sumN, mul3(n_nb, face_areas[nb]));
-                        sumA += face_areas[nb];
-                        patchN = normalize3(sumN);
-                    }
-                }
-            }
-
-            current_patch_id++;
-        }
-
-        out_nP = (unsigned int)current_patch_id;
-    };
-
-    // A small helper to compress patch IDs to [0..nP-1]
-    auto compress_ids = [&](std::vector<patchID_t>& ids, unsigned int& out_nP) {
-        auto res = rank_transform<patchID_t>(ids);
-        ids = std::move(res.first);
-        // recompute nP
-        patchID_t mx = 0;
-        for (auto v : ids)
-            if (v > mx) mx = v;
-        out_nP = (unsigned int)(mx + 1);
-    };
-
-    // Enforce patch_max by merging adjacent patches where allowed (hard/concave respected)
-    auto enforce_patch_max = [&](std::vector<patchID_t>& ids, unsigned int& pcount, PatchConstraintStatus& cstat) {
-        if (pcount <= opt.patch_max)
-            return;
-
-        // Build patch mean normals (area-weighted)
-        std::vector<float3> pSumN(pcount, make_float3(0, 0, 0));
-        std::vector<float> pSumA(pcount, 0.0f);
-
-        for (size_t t = 0; t < nTri; ++t) {
-            int p = (int)ids[t];
-            if (face_areas[t] > 0.0f) {
-                pSumN[p] = add3(pSumN[p], mul3(face_normals[t], face_areas[t]));
-                pSumA[p] += face_areas[t];
-            }
-        }
-
-        struct DSU {
-            std::vector<int> parent, rnk;
-            std::vector<float3>* sumN;
-            std::vector<float>* sumA;
-
-            DSU(int n, std::vector<float3>& sN, std::vector<float>& sA) : parent(n), rnk(n, 0), sumN(&sN), sumA(&sA) {
-                for (int i = 0; i < n; ++i) parent[i] = i;
-            }
-            int find(int x) {
-                while (parent[x] != x) {
-                    parent[x] = parent[parent[x]];
-                    x = parent[x];
-                }
-                return x;
-            }
-            bool unite(int a, int b) {
-                a = find(a); b = find(b);
-                if (a == b) return false;
-                if (rnk[a] < rnk[b]) std::swap(a, b);
-                parent[b] = a;
-                if (rnk[a] == rnk[b]) rnk[a]++;
-                (*sumN)[a] = add3((*sumN)[a], (*sumN)[b]);
-                (*sumA)[a] += (*sumA)[b];
-                return true;
-            }
-            float3 patchN(int x) {
-                x = find(x);
-                return normalize3((*sumN)[x]);
-            }
-        };
-
-        DSU dsu((int)pcount, pSumN, pSumA);
-
-        struct Cand { float cost; int a; int b; };
-        struct Cmp { bool operator()(const Cand& x, const Cand& y) const { return x.cost > y.cost; } };
-
-        auto cost_between = [&](int a, int b) {
-            float3 na = dsu.patchN(a);
-            float3 nb = dsu.patchN(b);
-            float d = clamp11(dot3(na, nb));
-            return 1.0f - d;  // smaller is better (more parallel)
-        };
-
-        // Candidate patch adjacency across mergeable edges (hard + optional concavity)
-        std::map<std::pair<int, int>, float> best_cost;
-
-        for (size_t t = 0; t < nTri; ++t) {
-            int pt = (int)ids[t];
-            for (const auto& e : adjacency[t]) {
-                size_t nb = e.nbr;
-                int pn = (int)ids[nb];
-                if (pt == pn)
-                    continue;
-
-                float d = clamp11(dot3(face_normals[t], face_normals[nb]));
-                if (d < cos_hard)
-                    continue;
-
-                if (opt.block_concave_edges && e.oriented_ok) {
-                    const float3& vA = m_vertices[e.va];
-                    const float3& vB = m_vertices[e.vb];
-                    float dih = signedDihedralDeg(face_normals[t], face_normals[nb], vA, vB);
-                    if (dih < -opt.concave_allow_deg)
-                        continue;
-                }
-
-                int a = std::min(pt, pn);
-                int b = std::max(pt, pn);
-                float c = cost_between(a, b);
-
-                auto key = std::make_pair(a, b);
-                auto it = best_cost.find(key);
-                if (it == best_cost.end() || c < it->second)
-                    best_cost[key] = c;
-            }
-        }
-
-        std::priority_queue<Cand, std::vector<Cand>, Cmp> pq;
-        for (const auto& kv : best_cost)
-            pq.push(Cand{kv.second, kv.first.first, kv.first.second});
-
-        unsigned int cur = pcount;
-        while (cur > opt.patch_max && !pq.empty()) {
-            auto c = pq.top(); pq.pop();
-            int ra = dsu.find(c.a);
-            int rb = dsu.find(c.b);
-            if (ra == rb)
-                continue;
-            if (dsu.unite(ra, rb))
-                cur--;
-        }
-
-        // If we couldn't merge enough, mark as unmergeable
-        if (cur > opt.patch_max)
-            cstat = PatchConstraintStatus::TOO_MANY_UNMERGEABLE;
-
-        // Write back merged ids and compress
-        std::unordered_map<int, patchID_t> rep2new;
-        rep2new.reserve(pcount * 2);
-
-        patchID_t next = 0;
-        for (size_t i = 0; i < nTri; ++i) {
-            int p = (int)ids[i];
-            int r = dsu.find(p);
-            auto it = rep2new.find(r);
-            if (it == rep2new.end()) {
-                rep2new.emplace(r, next);
-                ids[i] = next;
-                next++;
-            } else {
-                ids[i] = it->second;
-            }
-        }
-        pcount = (unsigned int)next;
-    };
-
-    // Enforce patch_min by splitting worst-spread patches (count-only)
-    auto enforce_patch_min = [&](std::vector<patchID_t>& ids, unsigned int& pcount, PatchConstraintStatus& cstat) {
-        if (pcount >= opt.patch_min)
-            return;
-
-        auto rebuild_patch_lists = [&](std::vector<std::vector<size_t>>& pTris) {
-            pTris.assign(pcount, {});
-            for (size_t i = 0; i < nTri; ++i) {
-                int p = (int)ids[i];
-                pTris[p].push_back(i);
-            }
-        };
-
-        std::vector<std::vector<size_t>> pTris;
-        rebuild_patch_lists(pTris);
-
-        auto patch_mean_normal = [&](int p) {
-            float3 sumN = make_float3(0, 0, 0);
-            float sumA = 0.0f;
-            for (size_t t : pTris[p]) {
-                if (face_areas[t] > 0.0f) {
-                    sumN = add3(sumN, mul3(face_normals[t], face_areas[t]));
-                    sumA += face_areas[t];
-                }
-            }
-            (void)sumA;
-            return normalize3(sumN);
-        };
-
-        auto pick_patch_to_split = [&]() -> int {
-            float worst = 1.0f;
-            int worst_p = -1;
-            for (int p = 0; p < (int)pcount; ++p) {
-                if (pTris[p].size() < 2)
-                    continue;
-                float3 pn = patch_mean_normal(p);
-                float minDot = 1.0f;
-                for (size_t t : pTris[p]) {
-                    float d = clamp11(dot3(pn, face_normals[t]));
-                    minDot = std::min(minDot, d);
-                }
-                if (minDot < worst) {
-                    worst = minDot;
-                    worst_p = p;
-                }
-            }
-            return worst_p;
-        };
-
-        struct Node { float cost; size_t tri; int label; };
-        struct NodeCmp { bool operator()(const Node& a, const Node& b) const { return a.cost > b.cost; } };
-
-        std::vector<int8_t> label(nTri, -2);
-        std::vector<size_t> touched; touched.reserve(2048);
-
-        while (pcount < opt.patch_min) {
-            int p = pick_patch_to_split();
-            if (p < 0) {
-                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
-                break;
-            }
-            const auto& tris = pTris[p];
-            if (tris.size() < 2) {
-                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
-                break;
-            }
-
-            // choose 2 seeds with farthest normals (2-sweep)
-            size_t t0 = tris[0];
-            size_t sA = t0;
-            float best = 1.0f;
-            for (size_t t : tris) {
-                float d = clamp11(dot3(face_normals[t0], face_normals[t]));
-                if (d < best) { best = d; sA = t; }
-            }
-            size_t sB = sA;
-            best = 1.0f;
-            for (size_t t : tris) {
-                float d = clamp11(dot3(face_normals[sA], face_normals[t]));
-                if (d < best) { best = d; sB = t; }
-            }
-            if (sA == sB) {
-                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
-                break;
-            }
-
-            touched.clear();
-            for (size_t t : tris) {
-                label[t] = -1;
-                touched.push_back(t);
-            }
-
-            std::priority_queue<Node, std::vector<Node>, NodeCmp> pq;
-            label[sA] = 0; label[sB] = 1;
-            pq.push(Node{0.0f, sA, 0});
-            pq.push(Node{0.0f, sB, 1});
-
-            const float3 seedN[2] = {face_normals[sA], face_normals[sB]};
-
-            while (!pq.empty()) {
-                Node cur = pq.top(); pq.pop();
-                size_t t = cur.tri;
-                int lbl = cur.label;
-                if (label[t] != lbl)
-                    continue;
-
-                for (const auto& e : adjacency[t]) {
-                    size_t nb = e.nbr;
-                    if (label[nb] != -1)
-                        continue;
-
-                    float d = clamp11(dot3(face_normals[t], face_normals[nb]));
-                    if (d < cos_hard)
-                        continue;
-
-                    float dn = clamp11(dot3(face_normals[nb], seedN[lbl]));
-                    float cost = 1.0f - dn;
-
-                    label[nb] = (int8_t)lbl;
-                    pq.push(Node{cost, nb, lbl});
-                }
-            }
-
-            size_t c0 = 0, c1 = 0;
-            for (size_t t : tris) {
-                if (label[t] == 0) c0++;
-                else if (label[t] == 1) c1++;
-            }
-            if (c0 == 0 || c1 == 0) {
-                for (size_t t : touched) label[t] = -2;
-                cstat = PatchConstraintStatus::TOO_FEW_UNSPLITTABLE;
-                break;
-            }
-
-            patchID_t newP = (patchID_t)pcount;
-            pcount++;
-
-            for (size_t t : tris) {
-                ids[t] = (label[t] == 1) ? newP : (patchID_t)p;
-            }
-
-            for (size_t t : touched) label[t] = -2;
-
-            // compress & rebuild
-            compress_ids(ids, pcount);
-            rebuild_patch_lists(pTris);
-        }
-    };
-
-    // Quality report computation
-    auto compute_report = [&](const std::vector<patchID_t>& ids,
-                              unsigned int pcount,
-                              PatchConstraintStatus cstat,
-                              PatchQualityReport& rep) {
-        rep.per_patch.assign(pcount, PatchQualityPatch{});
-        rep.overall = PatchQualityLevel::SAFE;
-        rep.constraint_status = cstat;
-        rep.achieved_patches = pcount;
-        rep.requested_min = opt.patch_min;
-        rep.requested_max = opt.patch_max;
-
-        std::vector<std::vector<size_t>> pTris(pcount);
-        for (size_t i = 0; i < nTri; ++i) {
-            int p = (int)ids[i];
-            pTris[p].push_back(i);
-        }
-
-        std::vector<float3> pSumN(pcount, make_float3(0, 0, 0));
-        std::vector<float>  pSumA(pcount, 0.0f);
-
-        for (int p = 0; p < (int)pcount; ++p) {
-            for (size_t t : pTris[p]) {
-                if (face_areas[t] > 0.0f) {
-                    pSumN[p] = add3(pSumN[p], mul3(face_normals[t], face_areas[t]));
-                    pSumA[p] += face_areas[t];
-                }
-            }
-        }
-
-        // reference angle for classification
-        float ref_angle_deg = patch_gate_enabled ? patch_normal_max_deg : hard_angle_deg;
-
-        for (int p = 0; p < (int)pcount; ++p) {
-            PatchQualityPatch pq;
-            pq.n_tris = (unsigned int)pTris[p].size();
-
-            float3 meanN = normalize3(pSumN[p]);
-            float sumA = pSumA[p];
-            float r = (sumA > DEME_TINY_FLOAT) ? (norm3(pSumN[p]) / sumA) : 0.0f;
-            pq.coherence_r = std::min(1.0f, std::max(0.0f, r));
-
-            float minDot = 1.0f;
-            for (size_t t : pTris[p]) {
-                float d = clamp11(dot3(meanN, face_normals[t]));
-                minDot = std::min(minDot, d);
-            }
-            pq.worst_angle_deg = rad2deg(std::acos(clamp11(minDot)));
-
-            unsigned int hard_cross = 0;
-            unsigned int conc_cross = 0;
-            unsigned int unoriented = 0;
-
-            for (size_t t : pTris[p]) {
-                for (const auto& e : adjacency[t]) {
-                    size_t nb = e.nbr;
-                    if ((int)ids[nb] != p)
-                        continue;
-
-                    float d = clamp11(dot3(face_normals[t], face_normals[nb]));
-                    if (d < cos_hard)
-                        hard_cross++;
-
-                    if (opt.block_concave_edges) {
-                        if (!e.oriented_ok) {
-                            unoriented++;
-                        } else {
-                            const float3& vA = m_vertices[e.va];
-                            const float3& vB = m_vertices[e.vb];
-                            float dih = signedDihedralDeg(face_normals[t], face_normals[nb], vA, vB);
-                            if (dih < -opt.concave_allow_deg)
-                                conc_cross++;
-                        }
-                    }
-                }
-            }
-
-            pq.hard_crossings = hard_cross / 2;
-            pq.concave_crossings = conc_cross / 2;
-            pq.unoriented_edges = unoriented / 2;
-
-            PatchQualityLevel lvl = PatchQualityLevel::SAFE;
-
-            if (qopt.hard_crossings_are_critical && pq.hard_crossings > 0) {
-                lvl = PatchQualityLevel::CRITICAL;
-            }
-
-            if (lvl != PatchQualityLevel::CRITICAL) {
-                bool angle_ok = (pq.worst_angle_deg <= ref_angle_deg);
-                bool angle_warn = (pq.worst_angle_deg <= ref_angle_deg + qopt.warn_worst_angle_margin_deg);
-
-                if (pq.coherence_r < qopt.warn_r || !angle_warn) {
-                    lvl = PatchQualityLevel::CRITICAL;
-                } else if (pq.coherence_r < qopt.safe_r || !angle_ok) {
-                    lvl = PatchQualityLevel::WARN;
-                }
-            }
-
-            if (opt.block_concave_edges && pq.concave_crossings > 0) {
-                if (qopt.concave_crossings_are_critical)
-                    lvl = PatchQualityLevel::CRITICAL;
-                else if (lvl == PatchQualityLevel::SAFE)
-                    lvl = PatchQualityLevel::WARN;
-            }
-
-            if (opt.block_concave_edges && pq.unoriented_edges >= qopt.unoriented_warn_threshold && lvl == PatchQualityLevel::SAFE) {
-                lvl = PatchQualityLevel::WARN;
-            }
-
-            pq.level = lvl;
-            rep.per_patch[p] = pq;
-
-            if ((int)lvl > (int)rep.overall)
-                rep.overall = lvl;
-        }
-    };
-
-    // ------------------------------------------------------------
-    // Optional auto tuning (OFF unless opt.auto_tune.enabled == true)
-    // ------------------------------------------------------------
-    auto run_full = [&](PatchSplitOptions run_opt,
-                        std::vector<patchID_t>& ids_out,
-                        unsigned int& pcount_out,
-                        PatchConstraintStatus& cstat_out,
-                        PatchQualityReport* rep_out) {
-        cstat_out = PatchConstraintStatus::SATISFIED;
-
-        float run_soft = (run_opt.soft_angle_deg >= 0.0f) ? run_opt.soft_angle_deg : hard_angle_deg;
-        run_soft = std::min(hard_angle_deg, std::max(0.0f, run_soft));
-
-        bool run_patch_gate = (run_opt.patch_normal_max_deg >= 0.0f);
-        if (!run_patch_gate && run_soft < hard_angle_deg) {
-            run_opt.patch_normal_max_deg = run_soft;
-            run_patch_gate = true;
-        }
-
-        float run_cos_patch = -1.0f;
-        if (run_patch_gate) {
-            float run_patch_deg = std::min(180.0f, std::max(0.0f, run_opt.patch_normal_max_deg));
-            run_cos_patch = std::cos(deg2rad(run_patch_deg));
-        }
-
-        // segment
-        segment_once(run_opt, run_soft, run_patch_gate, run_cos_patch, ids_out, pcount_out);
-        compress_ids(ids_out, pcount_out);
-
-        // enforce max, then min (count-only)
-        enforce_patch_max(ids_out, pcount_out, cstat_out);
-        enforce_patch_min(ids_out, pcount_out, cstat_out);
-
-        // final compress
-        compress_ids(ids_out, pcount_out);
-
-        if (rep_out) {
-            PatchQualityReport tmp;
-            // Update globals for report reference (patch_gate_enabled etc.) are based on outer opt;
-            // for report classification, we reuse "current" (outer) patch_gate_enabled and patch_normal_max_deg.
-            // For best accuracy you can compute ref_angle from run_opt as well; keep simple here.
-            compute_report(ids_out, pcount_out, cstat_out, tmp);
-            *rep_out = std::move(tmp);
-        }
-    };
-
-    std::vector<patchID_t> best_ids;
-    unsigned int best_pcount = 0;
-    PatchConstraintStatus best_cstat = PatchConstraintStatus::SATISFIED;
-    PatchQualityReport best_rep;
-
-    if (!opt.auto_tune.enabled) {
-        run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
-    } else {
-        // Auto-tuning is conservative: it will not run if you hard-fix the count (patch_min == patch_max),
-        // because then your intention is explicit ("keep the cube a cube").
-        if (opt.patch_min == opt.patch_max) {
-            run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
-        } else {
-            // Start from user options; search by tightening/loosening patch_normal_max_deg (and soft if present)
-            PatchSplitOptions cur = opt;
-
-            auto severity_score = [&](PatchQualityLevel lvl) { return (int)lvl; };
-
-            bool have_best = false;
-
-            for (unsigned int it = 0; it < opt.auto_tune.max_iters; ++it) {
-                std::vector<patchID_t> ids;
-                unsigned int pc = 0;
-                PatchConstraintStatus cs = PatchConstraintStatus::SATISFIED;
-                PatchQualityReport rep;
-
-                run_full(cur, ids, pc, cs, &rep);
-
-                // candidate score: prioritize meeting constraints, then quality, then fewer patches
-                bool constraints_ok = (cs == PatchConstraintStatus::SATISFIED);
-                int sev = severity_score(rep.overall);
-
-                auto better_than = [&](bool ok, int s, unsigned int p) {
-                    if (!have_best) return true;
-                    bool best_ok = (best_cstat == PatchConstraintStatus::SATISFIED);
-                    int best_sev = severity_score(best_rep.overall);
-                    if (ok != best_ok) return ok;          // prefer satisfied
-                    if (s != best_sev) return s < best_sev; // prefer safer
-                    return p < best_pcount;                // prefer fewer patches
-                };
-
-                if (better_than(constraints_ok, sev, pc)) {
-                    best_ids = std::move(ids);
-                    best_pcount = pc;
-                    best_cstat = cs;
-                    best_rep = std::move(rep);
-                    have_best = true;
-                }
-
-                // stop if good enough
-                if (constraints_ok && (int)best_rep.overall <= (int)opt.auto_tune.target_level)
-                    break;
-
-                // Adjust rules:
-                // - If CRITICAL and we can afford more patches => tighten (smaller patch_normal_max, smaller soft)
-                // - If too many unmergeable patches => loosen (bigger patch_normal_max, bigger soft, disable concavity if needed)
-                // - If too few patches => tighten
-                if (cs == PatchConstraintStatus::TOO_MANY_UNMERGEABLE) {
-                    // loosen
-                    if (cur.patch_normal_max_deg >= 0.0f)
-                        cur.patch_normal_max_deg = std::min(180.0f, cur.patch_normal_max_deg + opt.auto_tune.step_deg);
-                    if (cur.soft_angle_deg >= 0.0f)
-                        cur.soft_angle_deg = std::min(hard_angle_deg, cur.soft_angle_deg + opt.auto_tune.step_deg);
-                    if (cur.block_concave_edges && opt.auto_tune.allow_enable_concavity) {
-                        // concavity block can prevent merging; relax it
-                        cur.block_concave_edges = false;
-                    }
-                } else if (pc < opt.patch_min || rep.overall == PatchQualityLevel::CRITICAL) {
-                    // tighten if possible
-                    if (cur.patch_normal_max_deg < 0.0f)
-                        cur.patch_normal_max_deg = std::min(hard_angle_deg, 45.0f);  // enable with a sane default
-                    else
-                        cur.patch_normal_max_deg = std::max(0.0f, cur.patch_normal_max_deg - opt.auto_tune.step_deg);
-
-                    if (cur.soft_angle_deg >= 0.0f)
-                        cur.soft_angle_deg = std::max(0.0f, cur.soft_angle_deg - opt.auto_tune.step_deg);
-
-                    if (!cur.block_concave_edges && opt.auto_tune.allow_enable_concavity) {
-                        cur.block_concave_edges = true;
-                        cur.concave_allow_deg = std::max(0.0f, cur.concave_allow_deg);
-                    }
-                } else if (pc > opt.patch_max) {
-                    // loosen (but note: enforce_patch_max already tries)
-                    if (cur.patch_normal_max_deg >= 0.0f)
-                        cur.patch_normal_max_deg = std::min(180.0f, cur.patch_normal_max_deg + opt.auto_tune.step_deg);
-                    if (cur.soft_angle_deg >= 0.0f)
-                        cur.soft_angle_deg = std::min(hard_angle_deg, cur.soft_angle_deg + opt.auto_tune.step_deg);
-                } else {
-                    // stable but not good enough; slightly tighten coherence if we have headroom under patch_max
-                    if (pc < opt.patch_max) {
-                        if (cur.patch_normal_max_deg < 0.0f)
-                            cur.patch_normal_max_deg = std::min(hard_angle_deg, 45.0f);
-                        else
-                            cur.patch_normal_max_deg = std::max(0.0f, cur.patch_normal_max_deg - opt.auto_tune.step_deg);
-                    } else {
-                        break;
-                    }
-                }
-            }
-
-            // If never found, fall back
-            if (!have_best) {
-                run_full(opt, best_ids, best_pcount, best_cstat, out_report ? &best_rep : nullptr);
-            }
-        }
-    }
-
-    // Commit to mesh state
-    m_patch_ids = std::move(best_ids);
-    nPatches = best_pcount;
-    patches_explicitly_set = true;
-
-    // Feedback output
-    if (out_report) {
-        *out_report = std::move(best_rep);
-    }
-
-    // Material broadcasting (same as existing behavior)
-    if (isMaterialSet && materials.size() == 1) {
-        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
-    }
-    if (isMaterialSet && materials.size() != nPatches) {
-        DEME_ERROR(
-            "The number of materials set (%zu) does not match the number of patches (%u). Please set the "
-            "material for each patch or use a single material for all patches.",
-            materials.size(), nPatches);
-    }
-
-    return nPatches;
-}
-
-// Manually set patch IDs for each triangle
-void DEMMesh::SetPatchIDs(const std::vector<patchID_t>& patch_ids) {
-    assertTriLength(patch_ids.size(), "SetPatchIDs");
-
-    // Use rank-transformed patch IDs to ensure they are contiguous and start from 0
-    auto [compressed_ids, changed] = rank_transform<patchID_t>(patch_ids);
-
-    if (changed) {
-        DEME_WARNING(
-            std::string("Patch IDs you supplied for a mesh were not contiguous or did not start from 0.\nThey have "
-                        "been transformed to be contiguous and start from 0."));
-    }
-
-    // Copy the patch IDs
-    m_patch_ids = compressed_ids;
-
-    // Calculate the number of patches (maximum patch ID + 1)
-    if (!compressed_ids.empty()) {
-        int max_patch_id = *std::max_element(compressed_ids.begin(), compressed_ids.end());
-        nPatches = max_patch_id + 1;
-    } else {
-        nPatches = 1;
-    }
-
-    patches_explicitly_set = true;
-
-    // If material is set and we can broadcast it to all patches, we do so
-    if (isMaterialSet && materials.size() == 1) {
-        materials = std::vector<std::shared_ptr<DEMMaterial>>(nPatches, materials[0]);
-    }
-    // If material is set and we cannot broadcast it to all patches, we raise error
-    if (isMaterialSet && materials.size() != nPatches) {
-        DEME_ERROR(
-            "The number of materials set (%zu) does not match the number of patches (%u). Please set the "
-            "material for each patch or use a single material for all patches.",
-            materials.size(), nPatches);
-    }
-}
 
 // Compute patch locations (relative to CoM, which is implicitly at 0,0,0)
 // If not explicitly set, calculates as:
@@ -1736,26 +938,6 @@ void DEMMesh::ComputeMassProperties(double& volume, float3& center, float3& iner
 
 // Section for Watertight test, false if not
 
-struct QuantKey3 {
-    int64_t x, y, z;
-    bool operator==(const QuantKey3& o) const noexcept { return x==o.x && y==o.y && z==o.z; }
-};
-struct QuantKey3Hash {
-    size_t operator()(const QuantKey3& k) const noexcept {
-        size_t h1 = std::hash<int64_t>{}(k.x);
-        size_t h2 = std::hash<int64_t>{}(k.y);
-        size_t h3 = std::hash<int64_t>{}(k.z);
-        size_t h = h1;
-        h ^= h2 + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2);
-        h ^= h3 + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2);
-        return h;
-    }
-};
-
-static inline int64_t q(double v, double eps) {
-    return (int64_t)std::llround(v / eps);
-}
-
 bool DEMMesh::IsWatertight(size_t* boundary_edges, size_t* nonmanifold_edges) const {
     if (boundary_edges) *boundary_edges = 0;
     if (nonmanifold_edges) *nonmanifold_edges = 0;
@@ -1803,37 +985,8 @@ bool DEMMesh::IsWatertight(size_t* boundary_edges, size_t* nonmanifold_edges) co
         return false;
     }
 
-    double minx = m_vertices[0].x, miny = m_vertices[0].y, minz = m_vertices[0].z;
-    double maxx = minx, maxy = miny, maxz = minz;
-    for (const auto& v : m_vertices) {
-        minx = std::min(minx, (double)v.x); miny = std::min(miny, (double)v.y);
-        minz = std::min(minz, (double)v.z);
-        maxx = std::max(maxx, (double)v.x); maxy = std::max(maxy, (double)v.y);
-        maxz = std::max(maxz, (double)v.z);
-    }
-    const double dx = maxx - minx, dy = maxy - miny, dz = maxz - minz;
-    const double diag = std::sqrt(dx*dx + dy*dy + dz*dz);
-    const double eps = std::max(diag * 1e-9, 1e-12);
-
-    std::unordered_map<QuantKey3, size_t, QuantKey3Hash> rep;
-    rep.reserve(m_vertices.size());
-
-    std::vector<size_t> canon(m_vertices.size(), (size_t)-1);
-    size_t next_id = 0;
-
-    for (size_t i = 0; i < m_vertices.size(); ++i) {
-        const auto& v = m_vertices[i];
-        QuantKey3 key{ q(v.x, eps), q(v.y, eps), q(v.z, eps) };
-
-        auto it = rep.find(key);
-        if (it == rep.end()) {
-            rep.emplace(key, next_id);
-            canon[i] = next_id;
-            next_id++;
-        } else {
-            canon[i] = it->second;
-        }
-    }
+    const double eps = computeVertexQuantEps(m_vertices);
+    const auto canon = buildCanonicalVertexMap(m_vertices, eps);
 
     std::map<std::pair<size_t, size_t>, size_t> edge_counts2;
     for (const auto& face : m_face_v_indices) {
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 9ad0a201..82c053e7 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -120,6 +120,7 @@ void DEMDynamicThread::packDataPointers() {
     ownerMeshNeverWinner.bindDevicePointer(&(granData->ownerMeshNeverWinner));
     ownerPatchMesh.bindDevicePointer(&(granData->ownerPatchMesh));
     triPatchID.bindDevicePointer(&(granData->triPatchID));
+    triNeighborIndex.bindDevicePointer(&(granData->triNeighborIndex));
     triNeighbor1.bindDevicePointer(&(granData->triNeighbor1));
     triNeighbor2.bindDevicePointer(&(granData->triNeighbor2));
     triNeighbor3.bindDevicePointer(&(granData->triNeighbor3));
@@ -284,6 +285,7 @@ void DEMDynamicThread::migrateDataToDevice() {
     ownerMeshNeverWinner.toDeviceAsync(streamInfo.stream);
     ownerPatchMesh.toDeviceAsync(streamInfo.stream);
     triPatchID.toDeviceAsync(streamInfo.stream);
+    triNeighborIndex.toDeviceAsync(streamInfo.stream);
     triNeighbor1.toDeviceAsync(streamInfo.stream);
     triNeighbor2.toDeviceAsync(streamInfo.stream);
     triNeighbor3.toDeviceAsync(streamInfo.stream);
@@ -562,6 +564,7 @@ void DEMDynamicThread::allocateGPUArrays(size_t nOwnerBodies,
                                          size_t nTriMeshes,
                                          size_t nSpheresGM,
                                          size_t nTriGM,
+                                         size_t nTriNeighbors,
                                          size_t nMeshPatches,
                                          unsigned int nAnalGM,
                                          size_t nExtraContacts,
@@ -645,9 +648,10 @@ void DEMDynamicThread::allocateGPUArrays(size_t nOwnerBodies,
     DEME_DUAL_ARRAY_RESIZE(relPosNode2, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode3, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(triPatchID, nTriGM, 0);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriGM, NULL_BODYID);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriGM, NULL_BODYID);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighborIndex, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriNeighbors, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriNeighbors, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriNeighbors, NULL_BODYID);
 
     // Resize to the number of mesh patches
     DEME_DUAL_ARRAY_RESIZE(ownerPatchMesh, nMeshPatches, 0);
@@ -835,7 +839,8 @@ void DEMDynamicThread::populateEntityArrays(const std::vector<std::shared_ptr<DE
                                             size_t nExistOwners,
                                             size_t nExistSpheres,
                                             size_t nExistingFacets,
-                                            size_t nExistingMeshPatches) {
+                                            size_t nExistingMeshPatches,
+                                            size_t nExistingTriNeighbors) {
     // Load in clump components info (but only if instructed to use jitified clump templates). This step will be
     // repeated even if we are just adding some more clumps to system, not a complete re-initialization.
     size_t k = 0;
@@ -1125,6 +1130,7 @@ void DEMDynamicThread::populateEntityArrays(const std::vector<std::shared_ptr<DE
     unsigned int offset_for_mesh_obj_mass_template = offset_for_ext_obj_mass_template + input_ext_obj_xyz.size();
     // k for indexing the triangle facets
     k = 0;
+    size_t neighbor_write = nExistingTriNeighbors;
     // p for indexing patches (flattened across all meshes)
     size_t p = 0;
     for (size_t i = 0; i < input_mesh_objs.size(); i++) {
@@ -1208,20 +1214,29 @@ void DEMDynamicThread::populateEntityArrays(const std::vector<std::shared_ptr<DE
         // Per-facet info
         //// TODO: This flatten-then-init approach is historical and too ugly.
         size_t this_facet_owner = mesh_facet_owner.at(k);
+        const bool mesh_needs_neighbors =
+            !(input_mesh_obj_convex.at(this_facet_owner) != 0 && input_mesh_obj_never_winner.at(this_facet_owner) != 0);
         for (; k < mesh_facet_owner.size(); k++) {
             // mesh_facet_owner run length is the num of facets in this mesh entity
             if (mesh_facet_owner.at(k) != this_facet_owner)
                 break;
-            ownerTriMesh[nExistingFacets + k] = owner_offset_for_mesh_obj + this_facet_owner;
+            const size_t global_tri = nExistingFacets + k;
+            ownerTriMesh[global_tri] = owner_offset_for_mesh_obj + this_facet_owner;
             // Tri's patch belonging needs to take into account those patches that are previously added
-            triPatchID[nExistingFacets + k] = nExistingMeshPatches + mesh_facet_patch.at(k);
-            triNeighbor1[nExistingFacets + k] = mesh_facet_neighbor1.at(k);
-            triNeighbor2[nExistingFacets + k] = mesh_facet_neighbor2.at(k);
-            triNeighbor3[nExistingFacets + k] = mesh_facet_neighbor3.at(k);
+            triPatchID[global_tri] = nExistingMeshPatches + mesh_facet_patch.at(k);
+            if (mesh_needs_neighbors) {
+                triNeighborIndex[global_tri] = neighbor_write;
+                triNeighbor1[neighbor_write] = mesh_facet_neighbor1.at(k);
+                triNeighbor2[neighbor_write] = mesh_facet_neighbor2.at(k);
+                triNeighbor3[neighbor_write] = mesh_facet_neighbor3.at(k);
+                neighbor_write++;
+            } else {
+                triNeighborIndex[global_tri] = NULL_BODYID;
+            }
             DEMTriangle this_tri = mesh_facets.at(k);
-            relPosNode1[nExistingFacets + k] = this_tri.p1;
-            relPosNode2[nExistingFacets + k] = this_tri.p2;
-            relPosNode3[nExistingFacets + k] = this_tri.p3;
+            relPosNode1[global_tri] = this_tri.p1;
+            relPosNode2[global_tri] = this_tri.p2;
+            relPosNode3[global_tri] = this_tri.p3;
         }
 
         const bodyID_t owner_id = owner_offset_for_mesh_obj + i;
@@ -1355,7 +1370,8 @@ void DEMDynamicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClumpB
                          input_mesh_obj_convex, input_mesh_obj_never_winner, mesh_facet_owner, mesh_facet_patch,
                          mesh_facet_neighbor1, mesh_facet_neighbor2, mesh_facet_neighbor3, mesh_facets,
                          mesh_patch_owner, mesh_patch_materials, clump_templates, ext_obj_mass_types, ext_obj_moi_types,
-                         ext_obj_comp_num, mesh_obj_mass_types, mesh_obj_moi_types, mesh_obj_mass_offsets, 0, 0, 0, 0);
+                         ext_obj_comp_num, mesh_obj_mass_types, mesh_obj_moi_types, mesh_obj_mass_offsets, 0, 0, 0, 0,
+                         0);
 
     buildTrackedObjs(input_clump_batches, ext_obj_comp_num, input_mesh_objs, tracked_objs, 0, 0, 0, 0);
 }
@@ -1396,6 +1412,7 @@ void DEMDynamicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<D
                                              size_t nExistingSpheres,
                                              size_t nExistingTriMesh,
                                              size_t nExistingFacets,
+                                             size_t nExistingTriNeighbors,
                                              size_t nExistingPatches,
                                              unsigned int nExistingObj,
                                              unsigned int nExistingAnalGM) {
@@ -1410,7 +1427,7 @@ void DEMDynamicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<D
                          mesh_facet_neighbor1, mesh_facet_neighbor2, mesh_facet_neighbor3, mesh_facets,
                          mesh_patch_owner, mesh_patch_materials, clump_templates, ext_obj_mass_types, ext_obj_moi_types,
                          ext_obj_comp_num, mesh_obj_mass_types, mesh_obj_moi_types, mesh_obj_mass_offsets,
-                         nExistingOwners, nExistingSpheres, nExistingFacets, nExistingPatches);
+                         nExistingOwners, nExistingSpheres, nExistingFacets, nExistingPatches, nExistingTriNeighbors);
 
     // Make changes to tracked objects (potentially add more)
     buildTrackedObjs(input_clump_batches, ext_obj_comp_num, input_mesh_objs, tracked_objs, nExistingOwners,
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index 4406b56b..f1b214b1 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -467,7 +467,8 @@ class DEMDynamicThread {
     // Mesh patch information: each facet belongs to a patch, and each patch has material properties
     // Patch ID for each triangle facet (maps facet to patch)
     DualArray<bodyID_t> triPatchID = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
-    // Triangle edge neighbors (global triangle indices; NULL_BODYID for boundary)
+    // Triangle edge neighbors (compact): triNeighborIndex[tri] = slot in triNeighbor1/2/3, NULL_BODYID if none
+    DualArray<bodyID_t> triNeighborIndex = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor1 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor2 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor3 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
@@ -735,6 +736,7 @@ class DEMDynamicThread {
                            size_t nTriMeshes,
                            size_t nSpheresGM,
                            size_t nTriGM,
+                           size_t nTriNeighbors,
                            size_t nMeshPatches,
                            unsigned int nAnalGM,
                            size_t nExtraContacts,
@@ -775,13 +777,14 @@ class DEMDynamicThread {
                               const std::vector<float>& ext_obj_mass_types,
                               const std::vector<float3>& ext_obj_moi_types,
                               const std::vector<unsigned int>& ext_obj_comp_num,
-                             const std::vector<float>& mesh_obj_mass_types,
-                             const std::vector<float3>& mesh_obj_moi_types,
-                             const std::vector<inertiaOffset_t>& mesh_obj_mass_offsets,
-                             size_t nExistOwners,
-                             size_t nExistSpheres,
-                             size_t nExistingFacets,
-                             size_t nExistingPatches);
+                              const std::vector<float>& mesh_obj_mass_types,
+                              const std::vector<float3>& mesh_obj_moi_types,
+                              const std::vector<inertiaOffset_t>& mesh_obj_mass_offsets,
+                              size_t nExistOwners,
+                              size_t nExistSpheres,
+                              size_t nExistingFacets,
+                              size_t nExistingPatches,
+                              size_t nExistingTriNeighbors);
     void registerPolicies(const std::unordered_map<unsigned int, std::string>& template_number_name_map,
                           const ClumpTemplateFlatten& clump_templates,
                           const std::vector<float>& ext_obj_mass_types,
@@ -864,6 +867,7 @@ class DEMDynamicThread {
                                size_t nExistingSpheres,
                                size_t nExistingTriMesh,
                                size_t nExistingFacets,
+                               size_t nExistingTriNeighbors,
                                size_t nExistingPatches,
                                unsigned int nExistingObj,
                                unsigned int nExistingAnalGM);
diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp
index e7c5eb03..093eb326 100644
--- a/src/DEM/kT.cpp
+++ b/src/DEM/kT.cpp
@@ -731,6 +731,7 @@ void DEMKinematicThread::packDataPointers() {
     ownerMeshConvex.bindDevicePointer(&(granData->ownerMeshConvex));
     ownerMeshNeverWinner.bindDevicePointer(&(granData->ownerMeshNeverWinner));
     triPatchID.bindDevicePointer(&(granData->triPatchID));
+    triNeighborIndex.bindDevicePointer(&(granData->triNeighborIndex));
     triNeighbor1.bindDevicePointer(&(granData->triNeighbor1));
     triNeighbor2.bindDevicePointer(&(granData->triNeighbor2));
     triNeighbor3.bindDevicePointer(&(granData->triNeighbor3));
@@ -785,6 +786,7 @@ void DEMKinematicThread::migrateDataToDevice() {
     ownerMeshConvex.toDeviceAsync(streamInfo.stream);
     ownerMeshNeverWinner.toDeviceAsync(streamInfo.stream);
     triPatchID.toDeviceAsync(streamInfo.stream);
+    triNeighborIndex.toDeviceAsync(streamInfo.stream);
     triNeighbor1.toDeviceAsync(streamInfo.stream);
     triNeighbor2.toDeviceAsync(streamInfo.stream);
     triNeighbor3.toDeviceAsync(streamInfo.stream);
@@ -890,6 +892,7 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
                                            size_t nTriMeshes,
                                            size_t nSpheresGM,
                                            size_t nTriGM,
+                                           size_t nTriNeighbors,
                                            unsigned int nAnalGM,
                                            size_t nExtraContacts,
                                            unsigned int nMassProperties,
@@ -975,9 +978,10 @@ void DEMKinematicThread::allocateGPUArrays(size_t nOwnerBodies,
     // Resize to the number of triangle facets
     DEME_DUAL_ARRAY_RESIZE(ownerTriMesh, nTriGM, 0);
     DEME_DUAL_ARRAY_RESIZE(triPatchID, nTriGM, 0);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriGM, NULL_BODYID);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriGM, NULL_BODYID);
-    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighborIndex, nTriGM, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor1, nTriNeighbors, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor2, nTriNeighbors, NULL_BODYID);
+    DEME_DUAL_ARRAY_RESIZE(triNeighbor3, nTriNeighbors, NULL_BODYID);
     DEME_DUAL_ARRAY_RESIZE(relPosNode1, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode2, nTriGM, make_float3(0));
     DEME_DUAL_ARRAY_RESIZE(relPosNode3, nTriGM, make_float3(0));
@@ -1055,7 +1059,8 @@ void DEMKinematicThread::populateEntityArrays(const std::vector<std::shared_ptr<
                                               size_t nExistOwners,
                                               size_t nExistSpheres,
                                               size_t nExistingFacets,
-                                              size_t nExistingMeshPatches) {
+                                              size_t nExistingMeshPatches,
+                                              size_t nExistingTriNeighbors) {
     // All the input vectors should have the same length, nClumpTopo
     size_t k = 0;
     std::vector<unsigned int> prescans_comp;
@@ -1157,22 +1162,32 @@ void DEMKinematicThread::populateEntityArrays(const std::vector<std::shared_ptr<
     size_t owner_offset_for_mesh_obj = owner_offset_for_ext_obj + input_ext_obj_family.size();
     // k for indexing the triangle facets
     k = 0;
+    size_t neighbor_write = nExistingTriNeighbors;
     for (size_t i = 0; i < input_mesh_obj_family.size(); i++) {
         // Per-facet info
         size_t this_facet_owner = input_mesh_facet_owner.at(k);
+        const bool mesh_needs_neighbors =
+            !(input_mesh_obj_convex.at(this_facet_owner) != 0 && input_mesh_obj_never_winner.at(this_facet_owner) != 0);
         for (; k < input_mesh_facet_owner.size(); k++) {
             // input_mesh_facet_owner run length is the num of facets in this mesh entity
             if (input_mesh_facet_owner.at(k) != this_facet_owner)
                 break;
-            ownerTriMesh[nExistingFacets + k] = owner_offset_for_mesh_obj + this_facet_owner;
-            triPatchID[nExistingFacets + k] = nExistingMeshPatches + input_mesh_facet_patch.at(k);
-            triNeighbor1[nExistingFacets + k] = input_mesh_facet_neighbor1.at(k);
-            triNeighbor2[nExistingFacets + k] = input_mesh_facet_neighbor2.at(k);
-            triNeighbor3[nExistingFacets + k] = input_mesh_facet_neighbor3.at(k);
+            const size_t global_tri = nExistingFacets + k;
+            ownerTriMesh[global_tri] = owner_offset_for_mesh_obj + this_facet_owner;
+            triPatchID[global_tri] = nExistingMeshPatches + input_mesh_facet_patch.at(k);
+            if (mesh_needs_neighbors) {
+                triNeighborIndex[global_tri] = neighbor_write;
+                triNeighbor1[neighbor_write] = input_mesh_facet_neighbor1.at(k);
+                triNeighbor2[neighbor_write] = input_mesh_facet_neighbor2.at(k);
+                triNeighbor3[neighbor_write] = input_mesh_facet_neighbor3.at(k);
+                neighbor_write++;
+            } else {
+                triNeighborIndex[global_tri] = NULL_BODYID;
+            }
             DEMTriangle this_tri = input_mesh_facets.at(k);
-            relPosNode1[nExistingFacets + k] = this_tri.p1;
-            relPosNode2[nExistingFacets + k] = this_tri.p2;
-            relPosNode3[nExistingFacets + k] = this_tri.p3;
+            relPosNode1[global_tri] = this_tri.p1;
+            relPosNode2[global_tri] = this_tri.p2;
+            relPosNode3[global_tri] = this_tri.p3;
         }
 
         const bodyID_t owner_id = owner_offset_for_mesh_obj + i;
@@ -1207,7 +1222,7 @@ void DEMKinematicThread::initGPUArrays(const std::vector<std::shared_ptr<DEMClum
     populateEntityArrays(input_clump_batches, input_ext_obj_family, input_mesh_obj_family, input_mesh_obj_convex,
                          input_mesh_obj_never_winner, input_mesh_facet_owner, input_mesh_facet_patch,
                          input_mesh_facet_neighbor1, input_mesh_facet_neighbor2, input_mesh_facet_neighbor3,
-                         input_mesh_facets, clump_templates, ext_obj_comp_num, 0, 0, 0, 0);
+                         input_mesh_facets, clump_templates, ext_obj_comp_num, 0, 0, 0, 0, 0);
 }
 
 void DEMKinematicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
@@ -1229,6 +1244,7 @@ void DEMKinematicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr
                                                size_t nExistingSpheres,
                                                size_t nExistingTriMesh,
                                                size_t nExistingFacets,
+                                               size_t nExistingTriNeighbors,
                                                size_t nExistingPatches,
                                                unsigned int nExistingObj,
                                                unsigned int nExistingAnalGM) {
@@ -1236,7 +1252,7 @@ void DEMKinematicThread::updateClumpMeshArrays(const std::vector<std::shared_ptr
                          input_mesh_obj_never_winner, input_mesh_facet_owner, input_mesh_facet_patch,
                          input_mesh_facet_neighbor1, input_mesh_facet_neighbor2, input_mesh_facet_neighbor3,
                          input_mesh_facets, clump_templates, ext_obj_comp_num, nExistingOwners, nExistingSpheres,
-                         nExistingFacets, nExistingPatches);
+                         nExistingFacets, nExistingPatches, nExistingTriNeighbors);
 }
 
 void DEMKinematicThread::updatePrevContactArrays(DualStruct<DEMDataDT>& dT_data, size_t nContacts) {
diff --git a/src/DEM/kT.h b/src/DEM/kT.h
index 5620b030..e7fb6492 100644
--- a/src/DEM/kT.h
+++ b/src/DEM/kT.h
@@ -204,7 +204,8 @@ class DEMKinematicThread {
     // Mesh patch information: each facet belongs to a patch
     // Patch ID for each triangle facet (maps facet to patch)
     DualArray<bodyID_t> triPatchID = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
-    // Triangle edge neighbors (global triangle indices; NULL_BODYID for boundary)
+    // Compact edge-neighbor storage: triNeighborIndex[tri] is the slot in triNeighbor1/2/3 (NULL_BODYID if none)
+    DualArray<bodyID_t> triNeighborIndex = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor1 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor2 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
     DualArray<bodyID_t> triNeighbor3 = DualArray<bodyID_t>(&m_approxHostBytesUsed, &m_approxDeviceBytesUsed);
@@ -340,6 +341,7 @@ class DEMKinematicThread {
                            size_t nTriMeshes,
                            size_t nSpheresGM,
                            size_t nTriGM,
+                           size_t nTriNeighbors,
                            unsigned int nAnalGM,
                            size_t nExtraContacts,
                            unsigned int nMassProperties,
@@ -366,7 +368,8 @@ class DEMKinematicThread {
                               size_t nExistOwners,
                               size_t nExistSpheres,
                               size_t nExistingFacets,
-                              size_t nExistingMeshPatches);
+                              size_t nExistingMeshPatches,
+                              size_t nExistingTriNeighbors);
 
     /// Initialize arrays
     void initGPUArrays(const std::vector<std::shared_ptr<DEMClumpBatch>>& input_clump_batches,
@@ -405,6 +408,7 @@ class DEMKinematicThread {
                                size_t nExistingSpheres,
                                size_t nExistingTriMesh,
                                size_t nExistingFacets,
+                               size_t nExistingTriNeighbors,
                                size_t nExistingPatches,
                                unsigned int nExistingObj,
                                unsigned int nExistingAnalGM);
diff --git a/src/DEM/utils/HostSideHelpers.hpp b/src/DEM/utils/HostSideHelpers.hpp
index 34f0cd4a..789f9346 100644
--- a/src/DEM/utils/HostSideHelpers.hpp
+++ b/src/DEM/utils/HostSideHelpers.hpp
@@ -23,6 +23,8 @@
 #include <utility>
 #include <tuple>
 #include <type_traits>
+#include <unordered_map>
+#include <cstdint>
 
 #include "../kernel/DEMHelperKernels.cuh"
 #include "../VariableTypes.h"
@@ -30,6 +32,77 @@
 
 namespace deme {
 
+namespace detail {
+
+// Integer lattice cell of a vertex; two vertices are treated as coincident when all three cells match.
+struct QuantKey3 {
+    int64_t x, y, z;
+    bool operator==(const QuantKey3& rhs) const noexcept { return x == rhs.x && y == rhs.y && z == rhs.z; }
+};
+
+// Hash functor for QuantKey3: chains the per-component std::hash values with a shift/xor mixing step.
+struct QuantKey3Hash {
+    size_t operator()(const QuantKey3& key) const noexcept {
+        size_t seed = std::hash<int64_t>{}(key.x);
+        seed ^= std::hash<int64_t>{}(key.y) + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
+        seed ^= std::hash<int64_t>{}(key.z) + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
+        return seed;
+    }
+};
+
+// Map one coordinate to its lattice cell: v / eps rounded to the nearest integer (ties away from zero).
+inline int64_t quantize_coord(double v, double eps) {
+    return static_cast<int64_t>(std::llround(v / eps));
+}
+
+}  // namespace detail
+
+// Welding tolerance: 1e-9 x bounding-box diagonal of the finite coordinates, floored at 1e-12; 0.0 when empty.
+inline double computeVertexQuantEps(const std::vector<float3>& vertices) {
+    if (vertices.empty()) {
+        return 0.0;
+    }
+    double lo[3] = {HUGE_VAL, HUGE_VAL, HUGE_VAL}, hi[3] = {-HUGE_VAL, -HUGE_VAL, -HUGE_VAL}, diag_sq = 0.0;
+    for (const auto& v : vertices) {
+        const double c[3] = {v.x, v.y, v.z};
+        for (int a = 0; a < 3; ++a) {
+            lo[a] = std::isfinite(c[a]) ? std::min(lo[a], c[a]) : lo[a];  // NaN/Inf is skipped: it must not
+            hi[a] = std::isfinite(c[a]) ? std::max(hi[a], c[a]) : hi[a];  // turn the returned eps into NaN/Inf
+        }
+    }
+    for (int a = 0; a < 3; ++a) {
+        diag_sq += (hi[a] >= lo[a]) ? (hi[a] - lo[a]) * (hi[a] - lo[a]) : 0.0;  // axis with no finite data adds 0
+    }
+    return std::max(std::sqrt(diag_sq) * 1e-9, 1e-12);
+}
+
+// Assign each vertex a canonical ID so that vertices falling into the same eps-sized lattice cell share one ID.
+// IDs are dense, start at 0 and are handed out in order of first appearance; the result has one entry per
+// vertex (empty input gives an empty result). A non-positive, NaN or infinite eps is replaced by 1e-12;
+// NaN/Inf in particular would turn every v / eps into NaN or 0 and make the quantized keys meaningless.
+inline std::vector<size_t> buildCanonicalVertexMap(const std::vector<float3>& vertices, double eps) {
+    if (vertices.empty()) {
+        return {};
+    }
+    if (!(eps > 0.0) || !std::isfinite(eps)) {
+        eps = 1e-12;
+    }
+
+    // Lattice cell -> canonical ID of the first vertex seen in that cell.
+    std::unordered_map<detail::QuantKey3, size_t, detail::QuantKey3Hash> rep;
+    rep.reserve(vertices.size());
+    std::vector<size_t> canon;
+    canon.reserve(vertices.size());
+    for (const auto& v : vertices) {
+        const detail::QuantKey3 key{detail::quantize_coord(v.x, eps), detail::quantize_coord(v.y, eps),
+                                    detail::quantize_coord(v.z, eps)};
+        // emplace() inserts only for an unseen cell, in which case rep.size() is exactly the next unused ID;
+        // for a known cell it leaves the map untouched and hands back the existing entry.
+        canon.push_back(rep.emplace(key, rep.size()).first->second);
+    }
+    return canon;
+}
+
 // Generic helper to access tuple of pointers
 template <typename Tuple, size_t... Is>
 auto dereference_at(const Tuple& ptrs, size_t idx, std::index_sequence<Is...>) {
diff --git a/src/algorithms/DEMContactDetection.cu b/src/algorithms/DEMContactDetection.cu
index eee8d0a9..447c30e4 100644
--- a/src/algorithms/DEMContactDetection.cu
+++ b/src/algorithms/DEMContactDetection.cu
@@ -1622,6 +1622,7 @@ void contactDetection(std::shared_ptr<JitHelper::CachedProgram>& bin_sphere_kern
                         propagateActiveTriLabels<<<dim3(blocks_needed_active), dim3(DEME_MAX_THREADS_PER_BLOCK), 0,
                                                    this_stream>>>(activeTriKeysUnique, labelsIn, labelsOut,
                                                                   groupActiveStart, groupActiveCount,
+                                                                  granData->triNeighborIndex,
                                                                   granData->triNeighbor1, granData->triNeighbor2,
                                                                   granData->triNeighbor3, numUniqueActiveTri);
                         bodyID_t* tmp = labelsIn;
diff --git a/src/algorithms/DEMContactDetectionKernels.cuh b/src/algorithms/DEMContactDetectionKernels.cuh
index aa52a54e..ed6a6df9 100644
--- a/src/algorithms/DEMContactDetectionKernels.cuh
+++ b/src/algorithms/DEMContactDetectionKernels.cuh
@@ -322,7 +322,9 @@ __global__ void computeGroupWinners(const contact_t* groupTypes,
         forceSingleIsland[myID] = single_island ? 1 : 0;
 
         notStupidBool_t pickA = 0;
-        if (A_never && !B_never) {
+        if (A_never && B_never) {
+            pickA = 0;  // deterministic: prefer B when both are never-winner
+        } else if (A_never && !B_never) {
             pickA = 0;
         } else if (B_never && !A_never) {
             pickA = 1;
@@ -421,6 +423,7 @@ __global__ void propagateActiveTriLabels(const uint64_t* keys,
                                          bodyID_t* labelsOut,
                                          const contactPairs_t* groupStart,
                                          const contactPairs_t* groupCount,
+                                         const bodyID_t* triNeighborIndex,
                                          const bodyID_t* triNeighbor1,
                                          const bodyID_t* triNeighbor2,
                                          const bodyID_t* triNeighbor3,
@@ -434,7 +437,12 @@ __global__ void propagateActiveTriLabels(const uint64_t* keys,
         const contactPairs_t count = groupCount[grp];
         bodyID_t label = labelsIn[myID];
 
-        bodyID_t nbs[3] = {triNeighbor1[triID], triNeighbor2[triID], triNeighbor3[triID]};
+        const bodyID_t nb_idx = triNeighborIndex[triID];
+        if (nb_idx == NULL_BODYID) {
+            labelsOut[myID] = label;
+            return;
+        }
+        bodyID_t nbs[3] = {triNeighbor1[nb_idx], triNeighbor2[nb_idx], triNeighbor3[nb_idx]};
         for (int e = 0; e < 3; ++e) {
             const bodyID_t nb = nbs[e];
             if (nb == NULL_BODYID || count == 0) {
diff --git a/src/demo/DEMdemo_ResponseAngleMesh.cpp b/src/demo/DEMdemo_ResponseAngleMesh.cpp
index 85370fd9..f0489ed1 100644
--- a/src/demo/DEMdemo_ResponseAngleMesh.cpp
+++ b/src/demo/DEMdemo_ResponseAngleMesh.cpp
@@ -78,7 +78,7 @@ int main() {
     DEMSolver DEMSim;
     DEMSim.SetOutputFormat(OUTPUT_FORMAT::CSV);
     DEMSim.SetOutputContent(OUTPUT_CONTENT::FAMILY);
-    DEMSim.SetMeshOutputFormat("STL");
+    DEMSim.SetMeshOutputFormat("VTK");
     DEMSim.SetNoForceRecord();
     DEMSim.SetMeshUniversalContact(true);
     const float mm_to_m = 0.001f;
@@ -109,6 +109,8 @@ int main() {
     float3 tri_center = make_float3(0, 0, 0);
     float3 tri_inertia = make_float3(0, 0, 0);
     tri_template->ComputeMassProperties(tri_volume, tri_center, tri_inertia);
+    // tri_template->SetConvex(true); // for convex particles only
+    // tri_template->SetNeverWinner(true); // if the mesh is coarser than the other contacts
     const float particle_mass = static_cast<float>(tri_volume * particle_density);
     const float3 particle_moi = tri_inertia * particle_density;
     std::cout << "Particle volume (m^3): " << tri_volume << ", mass (kg): "<< particle_mass << std::endl;
@@ -218,7 +220,7 @@ int main() {
         std::cout << "Frame: " << currframe << std::endl;
         DEMSim.ShowThreadCollaborationStats();
         char filename[100];
-        sprintf(filename, "DEMdemo_output_%04d.stl", currframe);
+        sprintf(filename, "DEMdemo_output_%04d.vtk", currframe);
         DEMSim.WriteMeshFile(out_dir / filename);
         currframe++;
         max_v = max_v_finder->GetValue();
diff --git a/src/demo/ModularTests/CMakeLists.txt b/src/demo/ModularTests/CMakeLists.txt
index 8d10e1fd..05562886 100644
--- a/src/demo/ModularTests/CMakeLists.txt
+++ b/src/demo/ModularTests/CMakeLists.txt
@@ -13,7 +13,6 @@ SET(LIBRARIES
 SET(MODULAR_TESTS
 		DEMTest_MeshTemplate
 		DEMTest_PatchLocations
-		DEMTest_MeshPatch
 		DEMTest_SimpleCollisions
 )
 
diff --git a/src/demo/ModularTests/DEMTest_MeshPatch.cpp b/src/demo/ModularTests/DEMTest_MeshPatch.cpp
deleted file mode 100644
index 06c9b5e2..00000000
--- a/src/demo/ModularTests/DEMTest_MeshPatch.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//  Copyright (c) 2021, SBEL GPU Development Team
-//  Copyright (c) 2021, University of Wisconsin - Madison
-//
-//	SPDX-License-Identifier: BSD-3-Clause
-
-// =============================================================================
-// A demo that tests mesh patch splitting functionality.
-// This demo loads a mesh and splits it into convex patches based on angle
-// thresholds, demonstrating the mesh patch splitting utility.
-// =============================================================================
-
-#include <core/ApiVersion.h>
-#include <core/utils/ThreadManager.h>
-#include <DEM/API.h>
-#include <DEM/utils/Samplers.hpp>
-
-#include <filesystem>
-#include <cstdio>
-#include <iostream>
-#include <iomanip>
-#include <limits>
-
-using namespace deme;
-using namespace std::filesystem;
-
-int main() {
-    std::cout << "========================================" << std::endl;
-    std::cout << "DEM Mesh Patch Splitting Demo" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    // Test with a simple cube mesh
-    std::cout << "\n--- Test 1: Cube Mesh with Default Patch Info ---" << std::endl;
-    auto cube_mesh = std::make_shared<DEMMesh>();
-    bool loaded = cube_mesh->LoadWavefrontMesh((GET_DATA_PATH() / "mesh/cube.obj").string());
-
-    if (loaded) {
-        std::cout << "Loaded cube mesh successfully" << std::endl;
-        std::cout << "Number of triangles: " << cube_mesh->GetNumTriangles() << std::endl;
-        std::cout << "Number of vertices: " << cube_mesh->GetNumNodes() << std::endl;
-
-        // Test default patch info (should be all in patch 0)
-        std::cout << "\nDefault patch info (assuming convex mesh):" << std::endl;
-        std::cout << "Number of patches: " << cube_mesh->GetNumPatches() << std::endl;
-        std::cout << "Patches explicitly set: " << (cube_mesh->ArePatchesExplicitlySet() ? "yes" : "no") << std::endl;
-        const auto& default_patch_ids = cube_mesh->GetPatchIDs();
-        std::cout << "All triangles in patch 0: "
-                  << (std::all_of(default_patch_ids.begin(), default_patch_ids.end(), [](int id) { return id == 0; })
-                          ? "yes"
-                          : "no")
-                  << std::endl;
-
-        // Test different angle thresholds
-        std::cout << "\n--- Test 2: Automatic Patch Splitting ---" << std::endl;
-        float thresholds[] = {10.0f, 45.0f, 90.0f, 300.0f};
-
-        for (float threshold : thresholds) {
-            size_t num_patches = cube_mesh->SplitIntoConvexPatches(threshold);
-            std::cout << "\nAngle threshold: " << std::fixed << std::setprecision(1) << threshold << " degrees"
-                      << std::endl;
-            std::cout << "Number of patches: " << num_patches << std::endl;
-            std::cout << "Patches explicitly set: " << (cube_mesh->ArePatchesExplicitlySet() ? "yes" : "no")
-                      << std::endl;
-
-            // Show patch distribution
-            const auto& patch_ids = cube_mesh->GetPatchIDs();
-
-            // Count triangles per patch
-            std::map<int, int> patch_counts;
-            for (int patch_id : patch_ids) {
-                patch_counts[patch_id]++;
-            }
-
-            std::cout << "Patch distribution:" << std::endl;
-            for (const auto& entry : patch_counts) {
-                std::cout << "  Patch " << entry.first << ": " << entry.second << " triangles" << std::endl;
-            }
-        }
-
-        // Optimized patch settings for convex-focused splitting (prefer single patch)
-        std::cout << "\n--- Test 2b: Optimized Convex Patch Splitting (Cube) ---" << std::endl;
-        DEMMesh::PatchSplitOptions opt;
-        opt.soft_angle_deg = -1.0f;
-        opt.patch_normal_max_deg = -1.0f;
-        opt.block_concave_edges = true;
-        opt.concave_allow_deg = 0.0f;
-        opt.patch_min = 1;
-        opt.patch_max = std::numeric_limits<unsigned int>::max();
-        opt.seed_largest_first = true;
-        opt.auto_tune.enabled = false;
-
-        DEMMesh::PatchQualityReport rep_cube;
-        size_t num_patches_opt = cube_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_cube);
-        std::cout << "Optimized patches: " << num_patches_opt << " (quality "
-                  << static_cast<int>(rep_cube.overall) << ")" << std::endl;
-
-        // Test manual patch ID setting
-        std::cout << "\n--- Test 3: Manual Patch ID Setting ---" << std::endl;
-        size_t num_tris = cube_mesh->GetNumTriangles();
-        std::vector<patchID_t> manual_patches(num_tris);
-        // Split triangles into 3 patches based on index
-        for (size_t i = 0; i < num_tris; ++i) {
-            manual_patches[i] = i % 3;  // Assign patches 0, 1, 2 cyclically
-        }
-
-        cube_mesh->SetPatchIDs(manual_patches);
-        std::cout << "Manually set patch IDs (cycling 0, 1, 2)" << std::endl;
-        std::cout << "Number of patches: " << cube_mesh->GetNumPatches() << std::endl;
-        std::cout << "Patches explicitly set: " << (cube_mesh->ArePatchesExplicitlySet() ? "yes" : "no") << std::endl;
-
-        // Count triangles per patch
-        const auto& manual_patch_ids = cube_mesh->GetPatchIDs();
-        std::map<int, int> manual_patch_counts;
-        for (int patch_id : manual_patch_ids) {
-            manual_patch_counts[patch_id]++;
-        }
-        std::cout << "Manual patch distribution:" << std::endl;
-        for (const auto& entry : manual_patch_counts) {
-            std::cout << "  Patch " << entry.first << ": " << entry.second << " triangles" << std::endl;
-        }
-    } else {
-        std::cout << "Failed to load cube mesh" << std::endl;
-    }
-
-    // Test with sphere mesh if available
-    std::cout << "\n--- Test 4: Sphere Mesh ---" << std::endl;
-    auto sphere_mesh = std::make_shared<DEMMesh>();
-    loaded = sphere_mesh->LoadWavefrontMesh((GET_DATA_PATH() / "mesh/sphere.obj").string());
-
-    if (loaded) {
-        std::cout << "Loaded sphere mesh successfully" << std::endl;
-        std::cout << "Number of triangles: " << sphere_mesh->GetNumTriangles() << std::endl;
-        std::cout << "Number of vertices: " << sphere_mesh->GetNumNodes() << std::endl;
-
-        // Optimized patch split (prefer single patch)
-        DEMMesh::PatchSplitOptions opt;
-        opt.soft_angle_deg = -1.0f;
-        opt.patch_normal_max_deg = -1.0f;
-        opt.block_concave_edges = true;
-        opt.concave_allow_deg = 0.0f;
-        opt.patch_min = 1;
-        opt.patch_max = std::numeric_limits<unsigned int>::max();
-        opt.seed_largest_first = true;
-        opt.auto_tune.enabled = false;
-
-        DEMMesh::PatchQualityReport rep_sphere;
-        size_t num_patches = sphere_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_sphere);
-        std::cout << "Split into " << num_patches << " patches (optimized, quality "
-                  << static_cast<int>(rep_sphere.overall) << ")" << std::endl;
-
-        if (sphere_mesh->ArePatchesExplicitlySet()) {
-            const auto& patch_ids = sphere_mesh->GetPatchIDs();
-
-            // Count triangles per patch
-            std::map<int, int> patch_counts;
-            for (int patch_id : patch_ids) {
-                patch_counts[patch_id]++;
-            }
-
-            std::cout << "Number of patches with different sizes:" << std::endl;
-            std::map<int, int> size_distribution;
-            for (const auto& entry : patch_counts) {
-                size_distribution[entry.second]++;
-            }
-            for (const auto& entry : size_distribution) {
-                std::cout << "  " << entry.second << " patches with " << entry.first << " triangles each" << std::endl;
-            }
-        }
-    } else {
-        std::cout << "Sphere mesh not available, skipping" << std::endl;
-    }
-
-    // Test edge case: empty mesh
-    std::cout << "\n--- Test 5: Empty Mesh ---" << std::endl;
-    auto empty_mesh = std::make_shared<DEMMesh>();
-    std::cout << "Empty mesh default patches: " << empty_mesh->GetNumPatches() << " (expected: 1)" << std::endl;
-    std::cout << "Patches explicitly set: " << (empty_mesh->ArePatchesExplicitlySet() ? "yes" : "no")
-              << " (expected: no)" << std::endl;
-
-    // Test concave mesh (drum)
-    std::cout << "\n--- Test 6: Concave Drum Mesh (STL) ---" << std::endl;
-    auto drum_mesh = std::make_shared<DEMMesh>();
-    loaded = drum_mesh->LoadSTLMesh((GET_DATA_PATH() / "mesh/drum.stl").string());
-    if (loaded) {
-        std::cout << "Loaded drum mesh successfully" << std::endl;
-        std::cout << "Number of triangles: " << drum_mesh->GetNumTriangles() << std::endl;
-        std::cout << "Number of vertices: " << drum_mesh->GetNumNodes() << std::endl;
-
-        DEMMesh::PatchSplitOptions opt;
-        opt.soft_angle_deg = -1.0f;
-        opt.patch_normal_max_deg = -1.0f;
-        opt.block_concave_edges = true;
-        opt.concave_allow_deg = 0.0f;
-        opt.patch_min = 1;
-        opt.patch_max = std::numeric_limits<unsigned int>::max();
-        opt.seed_largest_first = true;
-        opt.auto_tune.enabled = false;
-
-        DEMMesh::PatchQualityReport rep_drum;
-        size_t num_patches = drum_mesh->SplitIntoConvexPatches(120.0f, opt, &rep_drum);
-        std::cout << "Split into " << num_patches << " patches (concave, quality "
-                  << static_cast<int>(rep_drum.overall) << ")" << std::endl;
-    } else {
-        std::cout << "Drum mesh not available, skipping" << std::endl;
-    }
-
-    // Test PLY export with per-patch colors (debug view)
-    std::cout << "\n--- Test 7: PLY Export with Patch Colors (per mesh) ---" << std::endl;
-    {
-        path out_dir = current_path();
-        out_dir /= "DemoOutput_MeshPatch";
-        create_directory(out_dir);
-
-        auto export_mesh = [&](const std::string& label, const path& mesh_path, bool is_stl) {
-            DEMSolver DEMSim;
-            DEMSim.SetVerbosity("INFO");
-            DEMSim.SetMeshOutputFormat("PLY");
-            DEMSim.EnableMeshPatchColorOutput(true);
-            DEMSim.InstructBoxDomainDimension(10, 10, 10);
-            DEMSim.SetMeshUniversalContact(true);
-
-            auto mat_type = DEMSim.LoadMaterial({{"E", 1e9}, {"nu", 0.3}, {"CoR", 0.6}, {"mu", 0.5}});
-
-            std::shared_ptr<DEMMesh> mesh_template;
-            if (is_stl) {
-                mesh_template = DEMSim.LoadMeshType(mesh_path.string(), mat_type, true, false);
-            } else {
-                mesh_template = DEMSim.LoadMeshType(mesh_path.string(), mat_type, true, false);
-            }
-
-            if (!mesh_template) {
-                std::cout << "Failed to load mesh template for " << label << std::endl;
-                return;
-            }
-
-            DEMMesh::PatchSplitOptions opt;
-            opt.soft_angle_deg = -1.0f;
-            opt.patch_normal_max_deg = -1.0f;
-            opt.block_concave_edges = true;
-            opt.concave_allow_deg = 0.0f;
-            opt.patch_min = 1;
-            opt.patch_max = std::numeric_limits<unsigned int>::max();
-            opt.seed_largest_first = true;
-            opt.auto_tune.enabled = false;
-
-            mesh_template->SplitIntoConvexPatches(120.0f, opt);
-            mesh_template->SetMaterial(mat_type);
-
-            auto mesh_instance = DEMSim.AddMeshFromTemplate(mesh_template, make_float3(0, 0, 0));
-            mesh_instance->SetFamily(0);
-            mesh_instance->SetMass(1000.);
-            mesh_instance->SetMOI(make_float3(200., 200., 200.));
-
-            DEMSim.Initialize();
-
-            path ply_file = out_dir / ("mesh_patch_colors_" + label + ".ply");
-            DEMSim.WriteMeshFile(ply_file);
-            DEMSim.WaitForPendingOutput();
-            std::cout << "Wrote patch-colored PLY to: " << ply_file << std::endl;
-        };
-
-        export_mesh("cube", GET_DATA_PATH() / "mesh/cube.obj", false);
-        export_mesh("sphere", GET_DATA_PATH() / "mesh/sphere.obj", false);
-        export_mesh("drum", GET_DATA_PATH() / "mesh/drum.stl", true);
-    }
-
-    std::cout << "\n========================================" << std::endl;
-    std::cout << "Demo completed successfully!" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    return 0;
-}
diff --git a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
index 4722c916..17798bb4 100644
--- a/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
+++ b/src/demo/ModularTests/DEMTest_SimpleCollisions.cpp
@@ -31,23 +31,38 @@ using namespace deme;
 
 namespace {
 
-constexpr bool kUseTriangleParticles = true; // toggle to run the STL-based triangle setup
+constexpr bool kUseTriangleParticles = false; // toggle to run the STL-based triangle setup
 constexpr float kMmToMeters = 0.001f;
 constexpr double kTriangleParticleDensity = 2600.0;
 
 constexpr int kNumRuns = 10;
 constexpr double kGap = 0.005;        // 0.5 mm
-constexpr double kSpeed = 1.0;       // 1 m/s
-constexpr double kTimeStep = 1e-5;   // seconds
-constexpr int kMaxSteps = 100000;    // 1 seconds max
-constexpr double kContactEps = 1e-6; // contact force threshold
+constexpr double kSpeed = 1.0;        // 1 m/s magnitude
+constexpr double kTimeStep = 1e-5;
+constexpr int kMaxSteps = 50000; // observe 0.5 s in total (kMaxSteps * kTimeStep)
+constexpr double kContactEps = 1e-6;
+
+// NEW: impact angle controls
+constexpr double kImpactThetaDeg = 0.0;   // 0 = vertical down, 90 = pure lateral
+constexpr double kImpactPhiDeg   = 0.0;   // azimuth in XY plane: 0 -> +X, 90 -> +Y
+
+// NEW: multi-impact tracking
+constexpr int kMaxImpactsToRecord = 8;
+
 double vmax = kSpeed;
 
+struct ImpactEvent {  // record of one contact episode (one impact) within a run
+    bool has_rebound = false;           // true once a rebound was captured at the end of this contact episode
+    double peak_normal_force = 0.0;     // peak normal force (Fn) during this episode
+    double rebound_speed = 0.0;         // |v| right after separation (only meaningful if has_rebound)
+    float3 rebound_dir = make_float3(0,0,0);  // presumably the velocity direction after separation -- TODO confirm
+    int start_step = -1;  // presumably the step index at which the episode starts; -1 = not set -- TODO confirm
+    int end_step   = -1;  // presumably the step index at which the episode ends; -1 = not set -- TODO confirm
+};
+
 struct RunResult {
     bool ok = false;
-    double rebound_speed = 0.0;
-    double peak_normal_force = 0.0;
-    float3 rebound_dir = make_float3(0, 0, 0);
+    std::vector<ImpactEvent> impacts;   // NEW: can contain multiple episodes
 };
 
 struct Stats {
@@ -71,9 +86,7 @@ float3 vec_scale(const float3& v, double s) {
 
 Stats calc_stats(const std::vector<double>& values) {
     Stats s;
-    if (values.empty()) {
-        return s;
-    }
+    if (values.empty()) return s; // no samples: hand back the default-constructed Stats before touching front()
     s.min = values.front();
     s.max = values.front();
     double sum = 0.0;
@@ -102,47 +115,40 @@ double compute_min_z_rotated(const std::shared_ptr<DEMMesh>& mesh, const float4&
     return min_z;
 }
 
-void assign_patch_ids(const std::shared_ptr<DEMMesh>& mesh_template,
-                      bool per_triangle_patches,
-                      const std::shared_ptr<DEMMaterial>& mat_type) {
-    if (!mesh_template) {
-        return;
-    }
-    const size_t num_tris = mesh_template->GetNumTriangles();
-    std::vector<patchID_t> patch_ids(num_tris, 0);
-    if (per_triangle_patches) {
-        for (size_t i = 0; i < num_tris; ++i) {
-            patch_ids[i] = static_cast<patchID_t>(i);
-        }
-    }
-    mesh_template->SetPatchIDs(patch_ids);
-    mesh_template->SetMaterial(mat_type);
+// Build the initial velocity from speed + angles: theta is the tilt away from straight down (-Z), phi the azimuth in the XY plane
+float3 build_velocity(double speed, double theta_deg, double phi_deg) {
+    const double theta = theta_deg * PI / 180.0; // degrees -> radians
+    const double phi   = phi_deg   * PI / 180.0; // degrees -> radians
+
+    // normal component (downwards for approaching)
+    const double v_n = -speed * std::cos(theta);
+    // tangential magnitude
+    const double v_t =  speed * std::sin(theta);
+
+    const double vx = v_t * std::cos(phi);
+    const double vy = v_t * std::sin(phi);
+    const double vz = v_n;
+
+    return make_float3((float)vx, (float)vy, (float)vz);
 }
 
 std::shared_ptr<DEMMesh> load_cube_template(DEMSolver& DEMSim,
-                                            const std::shared_ptr<DEMMaterial>& mat_type,
-                                            bool per_triangle_patches) {
+                                           const std::shared_ptr<DEMMaterial>& mat_type) {
     auto mesh_template = DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/cube.obj").string(), mat_type,
-                                             true,   // load_normals
-                                             false); // load_uv
-    if (!mesh_template) {
-        return nullptr;
-    }
-
-    assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
+                                             true, false); // load_normals = true, load_uv = false
+    if (!mesh_template) return nullptr;
+    mesh_template->SetMaterial(mat_type);
     return mesh_template;
 }
 
 std::shared_ptr<DEMMesh> load_triangle_template(DEMSolver& DEMSim,
-                                                const std::shared_ptr<DEMMaterial>& mat_type,
-                                                bool per_triangle_patches,
-                                                float& out_mass,
-                                                float3& out_moi) {
+                                               const std::shared_ptr<DEMMaterial>& mat_type,
+                                               float& out_mass,
+                                               float3& out_moi) {
     std::shared_ptr<DEMMesh> mesh_template =
         DEMSim.LoadMeshType((GET_DATA_PATH() / "mesh/simpleTriangleShape4mm.stl").string(), mat_type, true, false);
-    if (!mesh_template) {
-        return nullptr;
-    }
+    if (!mesh_template) return nullptr;
+
     mesh_template->Scale(kMmToMeters);
 
     double volume = 0.0;
@@ -153,15 +159,15 @@ std::shared_ptr<DEMMesh> load_triangle_template(DEMSolver& DEMSim,
     out_mass = static_cast<float>(volume * kTriangleParticleDensity);
     out_moi = inertia * static_cast<float>(kTriangleParticleDensity);
 
-    assign_patch_ids(mesh_template, per_triangle_patches, mat_type);
+    mesh_template->SetMaterial(mat_type); // patch IDs are no longer overridden here; only the material is set
     return mesh_template;
 }
 
 RunResult run_single_collision(const float4& init_rot,
-                               bool per_triangle_patches,
                                bool use_triangle_particles,
                                const std::string& label,
-                               int run_id) {
+                               int run_id,
+                               const float3& init_vel) { // init_vel: velocity given to the body once the solver is initialized
     RunResult result;
 
     DEMSolver DEMSim;
@@ -177,76 +183,101 @@ RunResult run_single_collision(const float4& init_rot,
     float3 plane_normal = make_float3(0, 0, 1);
     auto plane = DEMSim.AddBCPlane(make_float3(0, 0, 0), plane_normal, mat_type);
     auto plane_tracker = DEMSim.Track(plane);
-    const char* mesh_desc = use_triangle_particles ? "triangle mesh" : "cube mesh";
+
     auto mesh_template = std::shared_ptr<DEMMesh>{};
     float particle_mass = 1.0f;
     float3 particle_moi = make_float3(1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f);
 
     if (use_triangle_particles) {
-        mesh_template = load_triangle_template(DEMSim, mat_type, per_triangle_patches, particle_mass, particle_moi);
+        mesh_template = load_triangle_template(DEMSim, mat_type, particle_mass, particle_moi);
     } else {
-        mesh_template = load_cube_template(DEMSim, mat_type, per_triangle_patches);
+        mesh_template = load_cube_template(DEMSim, mat_type);
     }
     if (!mesh_template) {
-        std::cout << "[" << label << "] Run " << run_id << ": failed to load " << mesh_desc << std::endl;
+        std::cout << "[" << label << "] Run " << run_id << ": failed to load mesh template" << std::endl;
         return result;
     }
+
     double min_z = compute_min_z_rotated(mesh_template, init_rot);
     double init_z = kGap - min_z;
 
-    auto cube = DEMSim.AddMeshFromTemplate(mesh_template, make_float3(0, 0, 0));
-    cube->SetFamily(0);
-    cube->SetMass(particle_mass);
-    cube->SetMOI(particle_moi);
-    cube->SetInitQuat(init_rot);
-    cube->SetInitPos(make_float3(0, 0, static_cast<float>(init_z)));
-    auto cube_tracker = DEMSim.Track(cube);
+    auto body = DEMSim.AddMeshFromTemplate(mesh_template, make_float3(0, 0, 0));
+    body->SetFamily(0);
+    body->SetMass(particle_mass);
+    body->SetMOI(particle_moi);
+    body->SetInitQuat(init_rot);
+    body->SetInitPos(make_float3(0, 0, static_cast<float>(init_z)));
+    auto body_tracker = DEMSim.Track(body);
 
     DEMSim.SetInitTimeStep(kTimeStep);
     DEMSim.Initialize();
-    cube_tracker->SetVel(make_float3(0, 0, -static_cast<float>(kSpeed)));
 
-    bool contact_started = false;
-    bool rebound_captured = false;
-    double peak_normal_force = 0.0;
+    // NEW: angled initial velocity
+    body_tracker->SetVel(init_vel);
+
+    bool in_contact = false;
+    ImpactEvent current{};
+    int impacts_recorded = 0;
 
     for (int step = 0; step < kMaxSteps; ++step) {
         DEMSim.DoStepDynamics();
 
+        // NOTE: this is your current way to estimate contact force on the plane
         float3 plane_force = plane_tracker->ContactAcc();
         plane_force = vec_scale(plane_force, plane_tracker->Mass());
         double normal_force = std::abs(vec_dot(plane_force, plane_normal));
-        peak_normal_force = std::max(peak_normal_force, normal_force);
 
-        if (normal_force > kContactEps) {
-            contact_started = true;
+        // start of a new contact episode
+        if (!in_contact && normal_force > kContactEps) {
+            in_contact = true;
+            current = ImpactEvent{};
+            current.start_step = step;
+            current.peak_normal_force = normal_force;
+        }
+
+        // update peak during contact
+        if (in_contact) {
+            current.peak_normal_force = std::max(current.peak_normal_force, normal_force);
         }
 
-        float3 vel = cube_tracker->Vel();
-        double vel_n = vec_dot(vel, plane_normal);
+        // end of contact episode
+        if (in_contact && normal_force <= kContactEps) {
+            in_contact = false;
+            current.end_step = step;
+
+            // capture rebound info if moving away (positive normal velocity)
+            float3 vel = body_tracker->Vel();
+            double vel_n = vec_dot(vel, plane_normal);
+
+            if (vel_n > 0.0) {
+                double speed = vec_length(vel);
+                float3 dir = make_float3(0, 0, 0);
+                if (speed > 0) {
+                    dir = vec_scale(vel, 1.0 / speed);
+                }
+                current.has_rebound = true;
+                current.rebound_speed = speed;
+                current.rebound_dir = dir;
+            }
+
+            result.impacts.push_back(current);
+            impacts_recorded++;
 
-        if (contact_started && normal_force <= kContactEps && vel_n > 0.0) {
-            double speed = vec_length(vel);
-            float3 dir = make_float3(0, 0, 0);
-            if (speed > 0) {
-                dir = vec_scale(vel, 1.0 / speed);
+            if (impacts_recorded >= kMaxImpactsToRecord) {
+                break;
             }
-            result.ok = true;
-            result.rebound_speed = speed;
-            result.peak_normal_force = peak_normal_force;
-            result.rebound_dir = dir;
-            rebound_captured = true;
-            break;
         }
     }
 
-    if (!rebound_captured) {
-        std::cout << "[" << label << "] Run " << run_id << ": rebound not captured within max steps" << std::endl;
+    result.ok = !result.impacts.empty();
+    if (!result.ok) {
+        std::cout << "[" << label << "] Run " << run_id << ": no impacts recorded within max steps" << std::endl;
     }
 
     return result;
 }
 
+// Updated stats: by default we evaluate the FIRST rebound episode that has_rebound==true
 void print_stats_block(const std::string& label,
                        const std::vector<RunResult>& results) {
     std::vector<double> speeds;
@@ -254,16 +285,29 @@ void print_stats_block(const std::string& label,
     std::vector<double> dir_x;
     std::vector<double> dir_y;
     std::vector<double> dir_z;
+    std::vector<double> n_impacts; // number of recorded contact episodes, one entry per successful run
 
     for (const auto& r : results) {
-        if (!r.ok) {
+        if (!r.ok) continue; // runs without any recorded impact contribute to no statistic
+
+        n_impacts.push_back((double)r.impacts.size());
+
+        // pick the first episode that ended with a rebound (has_rebound == true)
+        const ImpactEvent* chosen = nullptr;
+        for (const auto& ev : r.impacts) {
+            if (ev.has_rebound) { chosen = &ev; break; }
+        }
+        if (!chosen) {
+            // no episode rebounded: contribute only the first episode's peak force (speed/direction samples are skipped)
+            forces.push_back(r.impacts.front().peak_normal_force);
             continue;
         }
-        speeds.push_back(r.rebound_speed);
-        forces.push_back(r.peak_normal_force);
-        dir_x.push_back(r.rebound_dir.x);
-        dir_y.push_back(r.rebound_dir.y);
-        dir_z.push_back(r.rebound_dir.z);
+
+        speeds.push_back(chosen->rebound_speed);
+        forces.push_back(chosen->peak_normal_force);
+        dir_x.push_back(chosen->rebound_dir.x);
+        dir_y.push_back(chosen->rebound_dir.y);
+        dir_z.push_back(chosen->rebound_dir.z);
     }
 
     Stats s_speed = calc_stats(speeds);
@@ -271,8 +315,11 @@ void print_stats_block(const std::string& label,
     Stats s_dx = calc_stats(dir_x);
     Stats s_dy = calc_stats(dir_y);
     Stats s_dz = calc_stats(dir_z);
+    Stats s_ni = calc_stats(n_impacts); // statistics over the per-run count of recorded contact episodes
 
     std::cout << "\n=== " << label << " stats (population stddev) ===" << std::endl;
+    std::cout << "Impacts per run: mean=" << s_ni.mean << " min=" << s_ni.min << " max=" << s_ni.max
+              << " std=" << s_ni.stddev << std::endl;
     std::cout << "Rebound speed [m/s]: mean=" << s_speed.mean << " min=" << s_speed.min << " max=" << s_speed.max
               << " std=" << s_speed.stddev << std::endl;
     std::cout << "Peak normal force [N]: mean=" << s_force.mean << " min=" << s_force.min << " max=" << s_force.max
@@ -285,6 +332,11 @@ void print_stats_block(const std::string& label,
               << " std=" << s_dz.stddev << std::endl;
 }
 
+// Initial orientations used by the impact scenarios
+float4 flat_quat() {
+    return make_float4(0, 0, 0, 1); // the unrotated value edge_quat() starts from; presumably the (x, y, z, w) identity - TODO confirm
+}
+
 float4 edge_quat() {
     float4 q = make_float4(0, 0, 0, 1);
     q = RotateQuat(q, make_float3(1, 0, 0), static_cast<float>(PI / 4.0));
@@ -300,23 +352,35 @@ float4 corner_quat() {
 
 void run_scenario(const std::string& label,
                   const float4& rot,
-                  bool per_triangle_patches,
-                  bool use_triangle_particles) {
+                  bool use_triangle_particles,
+                  const float3& init_vel) { // init_vel is forwarded unchanged to every run of this scenario
     std::cout << "\n========================================" << std::endl;
     std::cout << label << std::endl;
     std::cout << "========================================" << std::endl;
     std::cout << "Using mesh: " << (use_triangle_particles ? "simpleTriangleShape4mm.stl" : "cube.obj") << std::endl;
+    std::cout << "Init vel: (" << init_vel.x << ", " << init_vel.y << ", " << init_vel.z << ")"
+              << " |v|=" << vec_length(init_vel) << std::endl;
 
     std::vector<RunResult> results;
     results.reserve(kNumRuns);
 
     for (int i = 0; i < kNumRuns; ++i) {
-        RunResult r = run_single_collision(rot, per_triangle_patches, use_triangle_particles, label, i);
+        RunResult r = run_single_collision(rot, use_triangle_particles, label, i, init_vel);
         results.push_back(r);
+
         if (r.ok) {
-            std::cout << "Run " << i << ": speed=" << r.rebound_speed << " dir=(" << r.rebound_dir.x << ", "
-                      << r.rebound_dir.y << ", " << r.rebound_dir.z << ") force=" << r.peak_normal_force
-                      << std::endl;
+            std::cout << "Run " << i << ": impacts=" << r.impacts.size();
+            // report the first episode that ended with a rebound; otherwise fall back to the first episode's peak force
+            const ImpactEvent* chosen = nullptr;
+            for (const auto& ev : r.impacts) { if (ev.has_rebound) { chosen = &ev; break; } }
+            if (chosen) {
+                std::cout << " rebound_speed=" << chosen->rebound_speed
+                          << " dir=(" << chosen->rebound_dir.x << ", " << chosen->rebound_dir.y << ", " << chosen->rebound_dir.z << ")"
+                          << " peakFn=" << chosen->peak_normal_force;
+            } else {
+                std::cout << " (no rebound captured) peakFn_first=" << r.impacts.front().peak_normal_force;
+            }
+            std::cout << std::endl;
         }
     }
 
@@ -332,13 +396,16 @@ int main() {
     std::cout << "Particle mesh mode: "
               << (kUseTriangleParticles ? "simpleTriangleShape4mm.stl" : "cube.obj") << std::endl;
 
-    float4 q_edge = edge_quat();
+    // Build the impact velocity once from the speed/angle constants; every scenario uses the same vector
+    float3 init_vel = build_velocity(kSpeed, kImpactThetaDeg, kImpactPhiDeg);
+
+    float4 q_flat   = flat_quat();
+    float4 q_edge   = edge_quat();
     float4 q_corner = corner_quat();
 
-    run_scenario("Edge impact - single patch", q_edge, false, kUseTriangleParticles);
-    run_scenario("Edge impact - 12 patches", q_edge, true, kUseTriangleParticles);
-    run_scenario("Corner impact - single patch", q_corner, false, kUseTriangleParticles);
-    run_scenario("Corner impact - 12 patches", q_corner, true, kUseTriangleParticles);
+    run_scenario("Flat impact",   q_flat,   kUseTriangleParticles, init_vel);
+    run_scenario("Edge impact",   q_edge,   kUseTriangleParticles, init_vel);
+    run_scenario("Corner impact", q_corner, kUseTriangleParticles, init_vel);
 
     std::cout << "\n========================================" << std::endl;
     std::cout << "Test completed" << std::endl;