pmudry
diff --git a/‎.clangd‎
Lines changed: 19 additions & 16 deletions b/‎.clangd‎
Lines changed: 19 additions & 16 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 11 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎src/rayon/constants.hpp‎
Lines changed: 1 addition & 1 deletion b/‎src/rayon/constants.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/rayon/data_structures/material.hpp‎
Lines changed: 4 additions & 5 deletions b/‎src/rayon/data_structures/material.hpp‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎src/rayon/gpu_renderers/cuda_float3.cuh‎
Lines changed: 10 additions & 34 deletions b/‎src/rayon/gpu_renderers/cuda_float3.cuh‎
Lines changed: 10 additions & 34 deletions
diff --git a/‎src/rayon/gpu_renderers/cuda_raytracer.cuh‎
Lines changed: 1 addition & 1 deletion b/‎src/rayon/gpu_renderers/cuda_raytracer.cuh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/rayon/gpu_renderers/cuda_utils.cu‎
Lines changed: 21 additions & 21 deletions b/‎src/rayon/gpu_renderers/cuda_utils.cu‎
Lines changed: 21 additions & 21 deletions
diff --git a/‎src/rayon/gpu_renderers/cuda_utils.cuh‎
Lines changed: 4 additions & 4 deletions b/‎src/rayon/gpu_renderers/cuda_utils.cuh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/rayon/gpu_renderers/materials/legacy/show_normals.cuh‎
Lines changed: 3 additions & 2 deletions b/‎src/rayon/gpu_renderers/materials/legacy/show_normals.cuh‎
Lines changed: 3 additions & 2 deletions
@@ -15,25 +15,28 @@ CompileFlags:
     - -rdynamic
     - -lineinfo
     - -maxrregcount*
+    - -arch=*
+    - --gpu-architecture=*
+    - -gencode=*
+    - --use_fast_math
   Add:
     - --no-cuda-version-check
     # The include section is automatically generated by cmake at build, do NOT modify.
-    - -I/home/pmudry/git/302_raytracer/src
-    - -I/home/pmudry/git/302_raytracer/src/external
-    - -I/home/pmudry/git/302_raytracer/src/rayon
-    - -I/home/pmudry/git/302_raytracer/src/rayon/camera
-    - -I/home/pmudry/git/302_raytracer/src/rayon/camera/sdl
-    - -I/home/pmudry/git/302_raytracer/src/rayon/cpu_renderers
-    - -I/home/pmudry/git/302_raytracer/src/rayon/cpu_renderers/cpu_shapes
-    - -I/home/pmudry/git/302_raytracer/src/rayon/cpu_renderers/utils
-    - -I/home/pmudry/git/302_raytracer/src/rayon/data_structures
-    - -I/home/pmudry/git/302_raytracer/src/rayon/gpu_renderers
-    - -I/home/pmudry/git/302_raytracer/src/rayon/gpu_renderers/materials
-    - -I/home/pmudry/git/302_raytracer/src/rayon/gpu_renderers/materials/advanced
-    - -I/home/pmudry/git/302_raytracer/src/rayon/gpu_renderers/materials/legacy
-    - -I/home/pmudry/git/302_raytracer/src/rayon/gpu_renderers/shaders
-    - -I/home/pmudry/git/302_raytracer/src/rayon/render
-    - -I/home/pmudry/git/302_raytracer/src/rayon/scenes
+    - -I/home/pyrrhus/git/302_raytracer/src
+    - -I/home/pyrrhus/git/302_raytracer/src/external
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/camera
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/camera/sdl
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/cpu_renderers
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/cpu_renderers/cpu_shapes
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/cpu_renderers/utils
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/data_structures
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/gpu_renderers
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/gpu_renderers/materials
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/gpu_renderers/materials/legacy
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/gpu_renderers/shaders
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/render
+    - -I/home/pyrrhus/git/302_raytracer/src/rayon/scenes
 Diagnostics:
   UnusedIncludes: Strict
   Suppress:
 
@@ -19,6 +19,8 @@ endif()
 
 project(RAYON_RAYTRACER LANGUAGES CXX CUDA)
 
+# Option to enable diagnostic output
+option(ENABLE_DIAGS "Enable diagnostic output in CUDA and CPU renderers" OFF)
 
 # Ensure compile_commands.json is generated in the source directory
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -55,7 +57,7 @@ if(CMAKE_CUDA_COMPILER)
     set(CMAKE_CUDA_STANDARD 17)
     set(CMAKE_CUDA_STANDARD_REQUIRED ON)
     set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-    set(CMAKE_CUDA_ARCHITECTURES 90) # Set CUDA architectures (adjust based on your GPU - this covers common GPUs)
+    set(CMAKE_CUDA_ARCHITECTURES "native") # Set CUDA architecture to the machine where it's built
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -t 0") # Enable parallel CUDA compilation for faster builds
     set(CUDA_FOUND TRUE)
 else()
@@ -133,6 +135,14 @@ target_compile_definitions(rayon PRIVATE
     RT_BUILD_TYPE_STRING="$<IF:$<BOOL:${CMAKE_BUILD_TYPE}>,${CMAKE_BUILD_TYPE},$<CONFIG>>"
 )
 
+# Add DIAGS definition if enabled
+if(ENABLE_DIAGS)
+    target_compile_definitions(rayon PRIVATE DIAGS)
+    message(STATUS "Diagnostics enabled (ENABLE_DIAGS=ON)")
+else()
+    message(STATUS "Diagnostics disabled (set ENABLE_DIAGS=ON to enable)")
+endif()
+
 # Generate .clangd with -I flags
 # This updates the .clangd file's Add section with the current include directories
 # The rest of .clangd remains untouched
 
@@ -6,7 +6,7 @@ namespace constants
 {
     const std::string ver_major = "1";
     const std::string ver_minor = "2";
-    const std::string ver_patch = "1";
+    const std::string ver_patch = "3";
     const std::string version = ver_major + "." + ver_minor + "." + ver_patch;
 
     // Image-specific settings
 
@@ -22,9 +22,9 @@ class Constant : public Material
    Constant(const Color &a) : color(a) {}
 
    virtual bool scatter(const Ray &r_in, const Hit_record &rec, Color &attenuation, Ray &scattered) const override
-   {            
+   {
       attenuation = color;
-      scattered = Ray(rec.p, Vec3(0,0,0)); // No scattering, the ray is absorbed
+      scattered = Ray(rec.p, Vec3(0, 0, 0)); // No scattering, the ray is absorbed
       return true;
    }
 
@@ -38,7 +38,7 @@ class ShowNormals : public Material
    ShowNormals(const Color &a) : albedo(a) {}
 
    virtual bool scatter(const Ray &r_in, const Hit_record &rec, Color &attenuation, Ray &scattered) const override
-   {            
+   {
       attenuation = 0.5 * (rec.normal + Vec3_ONES);
       scattered = Ray(rec.p, Vec3_ZEROES); // No scattering
       return true;
@@ -62,12 +62,11 @@ class Lambertian : public Material
          scatter_direction = rec.normal;
 
       scattered = Ray(rec.p, scatter_direction);
-      
+
       attenuation = albedo;
       return true;
    }
 
  public:
    Color albedo; // The amount of reflected light, 0 for no reflection, 1 for full reflection
 };
-
@@ -7,8 +7,8 @@
  */
 #pragma once
 
-#include <cuda_runtime.h>
 #include <cmath>
+#include <cuda_runtime.h>
 
 //==============================================================================
 // VECTOR MATH AND UTILITY STRUCTURES
@@ -24,26 +24,17 @@ struct f2
    __host__ __device__ f2() : x(0), y(0) {}
    __host__ __device__ f2(float x_, float y_) : x(x_), y(y_) {}
 
-   __host__ __device__ f2 operator+(const f2 &other) const
-   {
-      return f2(x + other.x, y + other.y);
-   }
+   __host__ __device__ f2 operator+(const f2 &other) const { return f2(x + other.x, y + other.y); }
 
-   __host__ __device__ f2 operator-(const f2 &other) const
-   {
-      return f2(x - other.x, y - other.y);
-   }
+   __host__ __device__ f2 operator-(const f2 &other) const { return f2(x - other.x, y - other.y); }
 
    __host__ __device__ f2 operator*(float t) const { return f2(x * t, y * t); }
 
    __host__ __device__ f2 operator/(float t) const { return f2(x / t, y / t); }
 };
 
 /** @brief Scalar multiplication from left */
-__device__ __forceinline__ f2 operator*(float t, const f2 &v)
-{
-   return f2(t * v.x, t * v.y);
-}
+__device__ __forceinline__ f2 operator*(float t, const f2 &v) { return f2(t * v.x, t * v.y); }
 
 /**
  * @brief Simple 3D vector structure optimized for CUDA
@@ -54,18 +45,12 @@ struct f3
    float x, y, z;
 
    __host__ __device__ f3() : x(0), y(0), z(0) {}
-   
+
    __host__ __device__ f3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
 
-   __host__ __device__ f3 operator+(const f3 &other) const
-   {
-      return f3(x + other.x, y + other.y, z + other.z);
-   }
+   __host__ __device__ f3 operator+(const f3 &other) const { return f3(x + other.x, y + other.y, z + other.z); }
 
-   __host__ __device__ f3 operator-(const f3 &other) const
-   {
-      return f3(x - other.x, y - other.y, z - other.z);
-   }
+   __host__ __device__ f3 operator-(const f3 &other) const { return f3(x - other.x, y - other.y, z - other.z); }
 
    __host__ __device__ f3 operator*(float t) const { return f3(x * t, y * t, z * t); }
 
@@ -84,16 +69,10 @@ const f3 f3_ZEROES(0.0f, 0.0f, 0.0f);
 const f3 f3_ONES(1.0f, 1.0f, 1.0f);
 
 /** @brief Scalar multiplication from left */
-__device__ __forceinline__ f3 operator*(float t, const f3 &v)
-{
-   return v * t;
-}
+__device__ __forceinline__ f3 operator*(float t, const f3 &v) { return v * t; }
 
 /** @brief Compute dot product of two vectors */
-__device__ __forceinline__ float dot(const f3 &a, const f3 &b)
-{
-   return a.x * b.x + a.y * b.y + a.z * b.z;
-}
+__device__ __forceinline__ float dot(const f3 &a, const f3 &b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
 
 /** @brief Compute cross product of two vectors */
 __device__ __forceinline__ f3 cross(const f3 &a, const f3 &b)
@@ -102,10 +81,7 @@ __device__ __forceinline__ f3 cross(const f3 &a, const f3 &b)
 }
 
 /** @brief Normalize a vector to unit length */
-__device__ __forceinline__ f3 normalize(const f3 &v)
-{
-   return v / v.length();
-}
+__device__ __forceinline__ f3 normalize(const f3 &v) { return v / v.length(); }
 
 /** @brief Convert a normal to a debug RGB color */
 __device__ __forceinline__ f3 normal_to_color(const f3 &n)
 
@@ -578,7 +578,7 @@ __device__ inline f3 ray_color(const ray_simple &r, const CudaScene::Scene &scen
          // Sky/background
          f3 unit_direction = normalize(current_ray.dir);
          float t = 0.5f * (unit_direction.y + 1.0f);
-         f3 sky_color = (1.0f - t) * f3(1.0f, 1.0f, 1.0f) + t * f3(0.5f, 0.7f, 1.0f);         
+         f3 sky_color = (1.0f - t) * f3(1.0f, 1.0f, 1.0f) + t * f3(0.5f, 0.7f, 1.0f);
          accumulated_color = accumulated_color + accumulated_attenuation * sky_color * g_background_intensity;
          return accumulated_color;
       }
 
@@ -3,26 +3,26 @@
 // Implement kernel in a single translation unit to avoid nvlink multiple definition errors
 __global__ void init_random_states(curandState *rand_states, int num_states, unsigned long long seed, int width)
 {
-    // Support both 1D and 2D grid launches
-    int idx;
-    if (gridDim.y == 1)
-    {
-        // 1D launch
-        idx = blockIdx.x * blockDim.x + threadIdx.x;
-    }
-    else
-    {
-        // 2D launch - compute proper 1D index
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-        idx = y * width + x;
-    }
+   // Support both 1D and 2D grid launches
+   int idx;
+   if (gridDim.y == 1)
+   {
+      // 1D launch
+      idx = blockIdx.x * blockDim.x + threadIdx.x;
+   }
+   else
+   {
+      // 2D launch - compute proper 1D index
+      int x = blockIdx.x * blockDim.x + threadIdx.x;
+      int y = blockIdx.y * blockDim.y + threadIdx.y;
+      idx = y * width + x;
+   }
 
-    if (idx < num_states)
-    {
-        // Initialize fast RNG state - we repurpose curandState storage
-        // Simple but effective: combine seed with index for per-pixel unique sequences
-        unsigned int *fast_state = (unsigned int*)&rand_states[idx];
-        *fast_state = (unsigned int)(seed + idx * 747796405u);
-    }
+   if (idx < num_states)
+   {
+      // Initialize fast RNG state - we repurpose curandState storage
+      // Simple but effective: combine seed with index for per-pixel unique sequences
+      unsigned int *fast_state = (unsigned int *)&rand_states[idx];
+      *fast_state = (unsigned int)(seed + idx * 747796405u);
+   }
 }
@@ -99,10 +99,10 @@ static __device__ inline f3 randPosInSphere(curandState *state, f3 center, float
 static __device__ inline void build_orthonormal_basis(const f3 &n, f3 &u, f3 &v)
 {
    // from "Building an Orthonormal Basis, Pixar" / Shirley
-   if (fabs(n.x) > fabs(n.z))   
-      u = normalize(f3(-n.y, n.x, 0.0f));   
-   else   
-      u = normalize(f3(0.0f, -n.z, n.y));   
+   if (fabs(n.x) > fabs(n.z))
+      u = normalize(f3(-n.y, n.x, 0.0f));
+   else
+      u = normalize(f3(0.0f, -n.z, n.y));
    v = cross(n, u);
 }
 
 
@@ -52,9 +52,10 @@ struct ShowNormals : public MaterialBase<ShowNormals>
     * @brief Get emitted color (displays the surface normal as color)
     * @return Normal vector mapped to RGB color space [0,1]³
     */
-   __device__ __forceinline__ f3 emission() const { 
+   __device__ __forceinline__ f3 emission() const
+   {
       // Map normal from [-1,1] to [0,1] for color display
-      return 0.5f * (params.normal + f3(1.0f, 1.0f, 1.0f)); 
+      return 0.5f * (params.normal + f3(1.0f, 1.0f, 1.0f));
    }
 };
Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ namespace constants`
`6`	`6`	`{`
`7`	`7`	`const std::string ver_major = "1";`
`8`	`8`	`const std::string ver_minor = "2";`
`9`		`- const std::string ver_patch = "1";`
	`9`	`+ const std::string ver_patch = "3";`
`10`	`10`	`const std::string version = ver_major + "." + ver_minor + "." + ver_patch;`
`11`	`11`
`12`	`12`	`// Image-specific settings`
Original file line number	Diff line number	Diff line change
`@@ -578,7 +578,7 @@ __device__ inline f3 ray_color(const ray_simple &r, const CudaScene::Scene &scen`
`578`	`578`	`// Sky/background`
`579`	`579`	`f3 unit_direction = normalize(current_ray.dir);`
`580`	`580`	`float t = 0.5f * (unit_direction.y + 1.0f);`
`581`		`- f3 sky_color = (1.0f - t) * f3(1.0f, 1.0f, 1.0f) + t * f3(0.5f, 0.7f, 1.0f);`
	`581`	`+ f3 sky_color = (1.0f - t) * f3(1.0f, 1.0f, 1.0f) + t * f3(0.5f, 0.7f, 1.0f);`
`582`	`582`	`accumulated_color = accumulated_color + accumulated_attenuation * sky_color * g_background_intensity;`
`583`	`583`	`return accumulated_color;`
`584`	`584`	`}`
Original file line number	Diff line number	Diff line change
`@@ -99,10 +99,10 @@ static __device__ inline f3 randPosInSphere(curandState *state, f3 center, float`
`99`	`99`	`static __device__ inline void build_orthonormal_basis(const f3 &n, f3 &u, f3 &v)`
`100`	`100`	`{`
`101`	`101`	`// from "Building an Orthonormal Basis, Pixar" / Shirley`
`102`		`- if (fabs(n.x) > fabs(n.z))`
`103`		`- u = normalize(f3(-n.y, n.x, 0.0f));`
`104`		`- else`
`105`		`- u = normalize(f3(0.0f, -n.z, n.y));`
	`102`	`+ if (fabs(n.x) > fabs(n.z))`
	`103`	`+ u = normalize(f3(-n.y, n.x, 0.0f));`
	`104`	`+ else`
	`105`	`+ u = normalize(f3(0.0f, -n.z, n.y));`
`106`	`106`	`v = cross(n, u);`
`107`	`107`	`}`
`108`	`108`