
Commit dafa369

pmudry and claude committed
CUDA progressive renderer optimizations: eliminate D2H round-trip, flatten materials, improve BVH
- GPU-side gamma correction kernel avoids copying the full float4 accum buffer to host each frame; only the small uint8 display image (3 bytes/pixel vs 16) is transferred back
- Accumulation buffer stays permanently on GPU; camera reset uses cudaMemset instead of free/realloc
- Cache the cudaGetDeviceProperties result instead of querying every frame
- Flatten CRTP material dispatch into a direct switch in scatter_material(), reducing register pressure
- BVH child ordering uses the ray direction sign along the split axis (one comparison) instead of two length_squared() distance computations per interior node
- BVH nodes padded to 64-byte cache-line alignment for single-transaction fetches
- Add --motion-samples CLI parameter (default 10) for minimum samples during camera motion
- Update optimization plan with status tracking

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1f5929a commit dafa369

File tree

9 files changed: +324 −152 lines changed

CUDA_OPTIMIZATION_PLAN.md

Lines changed: 30 additions & 9 deletions
@@ -3,15 +3,17 @@
 ## Context
 RayON's CUDA renderer is significantly slower than a comparable Vulkan RT raytracer (RayTracingInVulkan) primarily because it performs BVH traversal and intersection in software on shader cores, while Vulkan uses dedicated RT cores. This plan catalogs actionable optimizations and assesses OptiX migration.
 
+See `explanations/VULKAN_VS_CUDA_PERFORMANCE.md` for the detailed comparison.
+
 ## Optimization Options
 
-### Option 1: Enable `--use_fast_math` (Easy, ~10-30% speedup)
-Uncomment `--use_fast_math` in CMakeLists.txt line ~208. Enables fast `rsqrtf`, fused multiply-add, relaxed denormals. Negligible visual impact for a renderer.
+### Option 1: Enable `--use_fast_math` — DONE
+Enables fast `rsqrtf`, fused multiply-add, relaxed denormals. Negligible visual impact.
 - **File**: `CMakeLists.txt`
 
-### Option 2: Fix accumulation buffer memory layout (Easy, ~5-15% speedup)
-Change from 3 separate float writes per pixel to `float4` coalesced writes.
-- **Files**: `gpu_renderers/shaders/render_acc_kernel.cu`, `renderer_cuda_device.cu`, `renderer_cuda_progressive_host.hpp`
+### Option 2: Fix accumulation buffer memory layout — DONE
+Kernel already uses `float4` coalesced reads/writes.
+- **Files**: `gpu_renderers/shaders/render_acc_kernel.cu`
 
 ### Option 3: Increase occupancy / tune kernel launch (Medium, ~10-20% speedup)
 Profile with `ncu`, test 512 threads/block, evaluate register pressure vs. occupancy tradeoff.
@@ -22,12 +24,11 @@ Bind BVH node and geometry arrays as CUDA texture objects for better cache behav
 - **Files**: `renderer_cuda_device.cu`, `scene_builder_cuda.cu`, `cuda_raytracer.cuh`
 
 ### Option 5: Compact BVH node layout (Medium, ~10-20% speedup)
-Pack BVH nodes to 64-byte cache-line alignment. Consider MBVH (4-wide) to reduce tree depth.
+Pack BVH nodes to 64-byte cache-line alignment. Store child AABBs together so both children can be tested with a single cache line fetch.
 - **Files**: `cuda_scene.cuh`, `cuda_raytracer.cuh`, `scene_builder_cuda.cu`, `scenes/scene_description.hpp`
 
-### Option 6: Russian roulette from bounce 1 (Easy, ~5-10% speedup)
-Start Russian roulette termination earlier (currently bounce 3) with energy compensation.
-- **Files**: `gpu_renderers/cuda_raytracer.cuh`
+### Option 6: Russian roulette from bounce 1 — DONE
+Already starts at bounce 1 with energy compensation in `cuda_raytracer.cuh`.
 
 ### Option 7: Wavefront path tracing (Hard, ~30-50% speedup)
 Split monolithic kernel into separate stages (ray gen → intersect → shade per material → bounce). Eliminates most warp divergence. Major architectural change.
@@ -40,6 +41,26 @@ Replace 1:1 pixel-thread mapping with fixed thread count pulling from global que
 ### Option 9: Migrate to OptiX (Hard, ~5-10x speedup)
 Use NVIDIA OptiX SDK to access hardware RT cores for BVH traversal and intersection. This is the only path to match Vulkan RT performance. See detailed assessment below.
 
+### Option A: Eliminate D2H round-trip in progressive renderer — DONE
+Accumulation buffer stays on GPU. GPU-side `gammaCorrectKernel` produces uint8 display image directly. Only the small uint8 image (3 bytes/pixel) is copied to host instead of the full float4 buffer (16 bytes/pixel). Also uses `cudaMemset` instead of free/realloc on camera change.
+- **Files**: `render_acc_kernel.cu`, `renderer_cuda_device.cu`, `renderer_cuda_progressive_host.hpp`
+
+### Option B: Cache device properties — DONE
+`getOptimalBlockSize()` caches result in static variable instead of calling `cudaGetDeviceProperties()` every frame.
+- **File**: `renderer_cuda_device.cu`
+
+### Option C: CUDA streams for async display copy (Medium, ~10-15% latency hiding)
+Overlap kernel execution with display buffer transfer using CUDA streams. Currently the pipeline is fully synchronous.
+- **Files**: `renderer_cuda_device.cu`
+
+### Option E: BVH child ordering by ray direction sign (Medium, ~5-15% speedup)
+Replace expensive distance-to-center heuristic with ray direction sign along split axis. One comparison instead of two `length_squared()` computations per interior node.
+- **Files**: `cuda_raytracer.cuh`
+
+### Option F: Flatten material dispatch in ray_color (Medium, ~5-15% speedup)
+Remove CRTP lambda dispatch (`dispatch_material_bool`) and replace with explicit switch. Reduces register pressure and gives `nvcc` better optimization control.
+- **Files**: `cuda_raytracer.cuh`
+
 ## OptiX Migration Assessment
 
 ### What OptiX Provides

src/rayon/gpu_renderers/cuda_raytracer.cuh

Lines changed: 99 additions & 84 deletions
@@ -407,35 +407,35 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
         }
         else
         {
-            // Interior node: push children onto stack
-            // Push farther child first for better traversal order
+            // Interior node: push children, near child last (processed first)
+            // Use split axis + ray direction sign to determine near/far child
+            // This is a single comparison vs. two length_squared() computations
             int left_child = node.data.interior.left_child;
             int right_child = node.data.interior.right_child;
 
-            // Simple heuristic: test which child is closer
-            f3 left_center = (scene.bvh_nodes[left_child].bounds_min + scene.bvh_nodes[left_child].bounds_max) * 0.5f;
-            f3 right_center =
-                (scene.bvh_nodes[right_child].bounds_min + scene.bvh_nodes[right_child].bounds_max) * 0.5f;
-
-            float dist_left = (left_center - r.orig).length_squared();
-            float dist_right = (right_center - r.orig).length_squared();
-
-            if (dist_left < dist_right)
+            // Determine which child is "near" based on ray direction along split axis
+            float dir_component;
+            switch (node.split_axis)
             {
-                // Right is farther, push it first
-                if (stack_ptr < 32)
-                    stack[stack_ptr++] = right_child;
-                if (stack_ptr < 32)
-                    stack[stack_ptr++] = left_child;
-            }
-            else
-            {
-                // Left is farther, push it first
-                if (stack_ptr < 32)
-                    stack[stack_ptr++] = left_child;
-                if (stack_ptr < 32)
-                    stack[stack_ptr++] = right_child;
+            case 0:
+                dir_component = r.dir.x;
+                break;
+            case 1:
+                dir_component = r.dir.y;
+                break;
+            default:
+                dir_component = r.dir.z;
+                break;
             }
+
+            // If ray goes in positive direction along split axis, left child is near
+            int near_child = dir_component >= 0.0f ? left_child : right_child;
+            int far_child = dir_component >= 0.0f ? right_child : left_child;
+
+            if (stack_ptr < 32)
+                stack[stack_ptr++] = far_child;
+            if (stack_ptr < 32)
+                stack[stack_ptr++] = near_child;
         }
     }
 }
@@ -474,10 +474,69 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
 }
 
 /**
- * @brief Ray color computation using new material system
+ * @brief Inline material scatter — flat switch, no CRTP dispatch overhead.
  *
- * This version uses compile-time material dispatch via CRTP templates.
- * The compiler generates optimized code for each material type with zero overhead.
+ * Handles emission accumulation and scatter in a single switch.
+ * Returns true if the ray was scattered, false if absorbed/emissive.
+ */
+__device__ __forceinline__ bool scatter_material(const hit_record_simple &rec, const ray_simple &current_ray,
+                                                 ray_simple &scattered_ray, f3 &attenuation, f3 &emitted,
+                                                 curandState *state)
+{
+    using namespace Materials;
+
+    switch (rec.material)
+    {
+    case LAMBERTIAN:
+    {
+        Lambertian mat(LambertianParams{rec.color});
+        emitted = f3(0.0f, 0.0f, 0.0f);
+        return mat.scatter(current_ray, rec, attenuation, scattered_ray, state);
+    }
+    case MIRROR:
+    {
+        Mirror mat(MirrorParams{rec.color});
+        emitted = f3(0.0f, 0.0f, 0.0f);
+        return mat.scatter(current_ray, rec, attenuation, scattered_ray, state);
+    }
+    case ROUGH_MIRROR:
+    {
+        RoughMirror mat(RoughMirrorParams{rec.color, rec.roughness});
+        emitted = f3(0.0f, 0.0f, 0.0f);
+        return mat.scatter(current_ray, rec, attenuation, scattered_ray, state);
+    }
+    case GLASS:
+    {
+        Glass mat(GlassParams{rec.refractive_index});
+        emitted = f3(0.0f, 0.0f, 0.0f);
+        return mat.scatter(current_ray, rec, attenuation, scattered_ray, state);
+    }
+    case LIGHT:
+    {
+        emitted = rec.emission * g_light_intensity;
+        return false;
+    }
+    case CONSTANT:
+    {
+        emitted = rec.color;
+        return false;
+    }
+    case SHOW_NORMALS:
+    {
+        emitted = 0.5f * (rec.normal + f3(1.0f, 1.0f, 1.0f));
+        return false;
+    }
+    default:
+        emitted = f3(0.0f, 0.0f, 0.0f);
+        return false;
+    }
+}
+
+/**
+ * @brief Ray color computation with flattened material dispatch
+ *
+ * Uses a direct switch for material scatter/emission instead of CRTP template
+ * dispatch, reducing register pressure and giving nvcc better optimization control.
  */
 __device__ inline f3 ray_color(const ray_simple &r, const CudaScene::Scene &scene, curandState *state, int depth
 #ifdef DIAGS
@@ -486,8 +545,6 @@ __device__ inline f3 ray_color(const ray_simple &r, const CudaScene::Scene &scen
 #endif
 )
 {
-    using namespace Materials;
-
     f3 accumulated_color(0.0f, 0.0f, 0.0f);
     f3 accumulated_attenuation(1.0f, 1.0f, 1.0f);
     ray_simple current_ray = r;
@@ -501,69 +558,27 @@ __device__ inline f3 ray_color(const ray_simple &r, const CudaScene::Scene &scen
 
         if (hit_scene(scene, current_ray, 0.001f, FLT_MAX, rec))
         {
-            // Create material descriptor from hit record
-            // TODO: This is a temporary adapter - ideally hit_scene would return MaterialDescriptor directly
-            MaterialDescriptor mat_desc;
+            f3 attenuation;
+            ray_simple scattered_ray;
+            f3 emitted;
 
-            switch (rec.material)
+            bool did_scatter = scatter_material(rec, current_ray, scattered_ray, attenuation, emitted, state);
+
+            if (emitted.length_squared() > 0.0f)
             {
-            case LAMBERTIAN:
-                mat_desc = MaterialDescriptor::makeLambertian(rec.color);
-                break;
-            case MIRROR:
-                mat_desc = MaterialDescriptor::makeMirror(rec.color);
-                break;
-            case ROUGH_MIRROR:
-                mat_desc = MaterialDescriptor::makeRoughMirror(rec.color, rec.roughness);
-                break;
-            case GLASS:
-                mat_desc = MaterialDescriptor::makeGlass(rec.refractive_index);
-                break;
-            case LIGHT:
-                mat_desc = MaterialDescriptor::makeLight(rec.emission);
-                break;
-            case CONSTANT:
-                mat_desc = MaterialDescriptor::makeConstant(rec.color);
-                break;
-            case SHOW_NORMALS:
-                mat_desc = MaterialDescriptor::makeShowNormals(rec.normal);
-                break;
+                accumulated_color = accumulated_color + accumulated_attenuation * emitted;
             }
 
-            // Dispatch to appropriate material using compile-time template magic
-            bool scattered = dispatch_material_bool(
-                mat_desc,
-                [&](auto material) -> bool
-                {
-                    // Check if emissive first
-                    f3 emitted = material.emission();
-                    if (emitted.length_squared() > 0.0f)
-                    {
-                        accumulated_color = accumulated_color + f3(accumulated_attenuation.x * emitted.x,
-                                                                   accumulated_attenuation.y * emitted.y,
-                                                                   accumulated_attenuation.z * emitted.z);
-                        return false; // Light materials don't scatter
-                    }
-
-                    // Scatter the ray
-                    f3 attenuation;
-                    ray_simple scattered_ray;
-                    if (material.scatter(current_ray, rec, attenuation, scattered_ray, state))
-                    {
-                        current_ray = scattered_ray;
-                        accumulated_attenuation =
-                            f3(accumulated_attenuation.x * attenuation.x, accumulated_attenuation.y * attenuation.y,
-                               accumulated_attenuation.z * attenuation.z);
-                        return true;
-                    }
-                    return false;
-                });
-
-            if (!scattered)
+            if (!did_scatter)
             {
                 return accumulated_color;
            }
 
+            current_ray = scattered_ray;
+            accumulated_attenuation =
+                f3(accumulated_attenuation.x * attenuation.x, accumulated_attenuation.y * attenuation.y,
+                   accumulated_attenuation.z * attenuation.z);
+
             // Russian Roulette path termination (from bounce 1 for early path culling)
             if (bounce > 0)
             {

src/rayon/gpu_renderers/cuda_scene.cuh

Lines changed: 19 additions & 5 deletions
@@ -166,9 +166,22 @@ struct Geometry
 // BVH STRUCTURES (for Phase 5)
 //==============================================================================
 
-struct BVHNode
+/**
+ * @brief Cache-line-aligned BVH node (64 bytes)
+ *
+ * Packed to exactly one 64-byte cache line so that each node fetch loads
+ * all needed data in a single memory transaction. Layout:
+ *   bytes  0-11: bounds_min (f3)
+ *   bytes 12-23: bounds_max (f3)
+ *   bytes 24-27: left_child / first_geom_idx
+ *   bytes 28-31: right_child / geom_count
+ *   byte     32: is_leaf
+ *   byte     33: split_axis
+ *   bytes 34-63: padding (reserved for future use)
+ */
+struct alignas(64) BVHNode
 {
-    f3 bounds_min, bounds_max;
+    f3 bounds_min, bounds_max; // 24 bytes
 
     union NodeData
     {
@@ -185,10 +198,11 @@ struct BVHNode
         } leaf;
 
         __host__ __device__ NodeData() {} // Empty constructor for union
-    } data;
+    } data; // 8 bytes
 
-    bool is_leaf;
-    uint8_t split_axis;
+    bool is_leaf;       // 1 byte
+    uint8_t split_axis; // 1 byte
+    uint8_t _pad[30];   // Pad to 64 bytes
 };
 
 //==============================================================================

0 commit comments