[re_renderer] Uniform buffer utility using CpuWriteGpuReadBelt (#1400)
* Utility function for creating a uniform buffer from a struct
* Uniform buffers are now forced at compile time to be 256-byte aligned
* CpuWriteGpuReadBelt no longer takes mutable buffer pool
* Renderers and frame_global_command_encoder are now behind locks
---------

Co-authored-by: Clement Rey <cr.rey.clement@gmail.com>
2 people authored and emilk committed Mar 2, 2023
1 parent 7f81b4f commit 77b7eea
Showing 19 changed files with 322 additions and 253 deletions.
11 changes: 6 additions & 5 deletions crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -39,6 +39,7 @@ where
/// Do *not* make this public as we need to guarantee that the memory is *never* read from!
#[inline(always)]
fn as_slice(&mut self) -> &mut [T] {
// TODO(andreas): Is this access slow given that it internally goes through a trait interface? Should we keep the pointer around?
&mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()]
}

@@ -47,13 +48,13 @@ where
/// Panics if the data no longer fits into the buffer.
#[inline]
pub fn extend_from_slice(&mut self, elements: &[T]) {
self.as_slice().copy_from_slice(elements);
self.as_slice()[..elements.len()].copy_from_slice(elements);
self.unwritten_element_range.start += elements.len();
}

/// Pushes several elements into the buffer.
///
/// Panics if the data no longer fits into the buffer.
/// Extends until it runs out of either space or elements.
#[inline]
pub fn extend(&mut self, elements: impl Iterator<Item = T>) {
let mut num_elements = 0;
@@ -68,8 +69,8 @@ where
///
/// Panics if the data no longer fits into the buffer.
#[inline]
pub fn push(&mut self, element: &T) {
self.as_slice()[0] = *element;
pub fn push(&mut self, element: T) {
self.as_slice()[0] = element;
self.unwritten_element_range.start += 1;
}

@@ -285,7 +286,7 @@ impl CpuWriteGpuReadBelt {
pub fn allocate<T: bytemuck::Pod>(
&mut self,
device: &wgpu::Device,
buffer_pool: &mut GpuBufferPool,
buffer_pool: &GpuBufferPool,
num_elements: usize,
) -> CpuWriteGpuReadBuffer<T> {
// Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string.
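Note on the two changes above: `copy_from_slice` panics unless source and destination are exactly the same length, so `extend_from_slice` must first narrow the unwritten region to `elements.len()`; and since `T: Pod` implies `Copy`, `push` can take its element by value. A minimal, self-contained sketch of this write-cursor pattern (`WriteCursor` and its method names are illustrative, not re_renderer's API):

struct WriteCursor<T: Copy> {
    storage: Vec<T>,
    unwritten: std::ops::Range<usize>,
}

impl<T: Copy> WriteCursor<T> {
    fn with_capacity(n: usize, fill: T) -> Self {
        Self { storage: vec![fill; n], unwritten: 0..n }
    }

    /// Copies `elements` to the front of the unwritten region.
    /// Slicing to `elements.len()` is the fix from this commit:
    /// without it, `copy_from_slice` panics on any partial fill.
    fn extend_from_slice(&mut self, elements: &[T]) {
        let dst = &mut self.storage[self.unwritten.clone()];
        dst[..elements.len()].copy_from_slice(elements);
        self.unwritten.start += elements.len();
    }

    /// Takes `element` by value: `Pod` types are `Copy`, so passing
    /// by reference bought nothing.
    fn push(&mut self, element: T) {
        self.storage[self.unwritten.start] = element;
        self.unwritten.start += 1;
    }
}

fn main() {
    let mut cursor = WriteCursor::with_capacity(4, 0u32);
    cursor.extend_from_slice(&[1, 2]); // partial fill no longer panics
    cursor.push(3);
    assert_eq!(cursor.unwritten, 3..4); // one element of capacity left
}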
4 changes: 4 additions & 0 deletions crates/re_renderer/src/allocator/mod.rs
@@ -4,5 +4,9 @@
//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources.

mod cpu_write_gpu_read_belt;
mod uniform_buffer_fill;

pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer};
pub use uniform_buffer_fill::{
create_and_fill_uniform_buffer, create_and_fill_uniform_buffer_batch,
};
102 changes: 102 additions & 0 deletions crates/re_renderer/src/allocator/uniform_buffer_fill.rs
@@ -0,0 +1,102 @@
pub use super::cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer};

use crate::{wgpu_resources::BindGroupEntry, DebugLabel, RenderContext};

struct UniformBufferAlignmentCheck<T> {
pub _marker: std::marker::PhantomData<T>,
}
impl<T> UniformBufferAlignmentCheck<T> {
/// wgpu requires uniform buffers to be aligned to up to 256 bytes.
///
/// This is a property of device limits, see [`WebGPU` specification](https://www.w3.org/TR/webgpu/#limits).
/// Implementations are allowed to advertise a lower alignment requirement, however
/// 256 bytes is fairly common even in modern hardware and is even hardcoded for DX12.
///
/// Technically this is only relevant when sub-allocating a buffer, as the wgpu backend
/// is internally forced to make sure that the start of any [`wgpu::Buffer`] with [`wgpu::BufferUsages::UNIFORM`] usage
/// has this alignment. Practically, ensuring this alignment everywhere keeps sub-allocation offsets and buffer copies simple.
///
/// Alternatively to enforcing this alignment on the type we could:
/// * only align on the gpu buffer
/// -> causes more fine grained `copy_buffer_to_buffer` calls on the gpu encoder
/// * only align on the [`CpuWriteGpuReadBuffer`] & gpu buffer
/// -> causes more complicated offset computation on [`CpuWriteGpuReadBuffer`] as well as either
/// holes at padding (-> undefined values & slow for write combined!) or complicated nulling of padding
///
/// About the [`bytemuck::Pod`] requirement (dragged in by [`CpuWriteGpuReadBuffer`]):
/// [`bytemuck::Pod`] forces us to be explicit about padding as it doesn't allow invisible padding bytes!
/// We could drop this and thus make it easier to define uniform buffer types.
/// But this leads to more unsafe code, harder to avoid holes in write combined memory access
/// and potentially undefined values in the padding bytes on GPU.
const CHECK: () = assert!(
std::mem::align_of::<T>() >= 256,
"Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`"
);
}
/// Utility for fast & efficient creation of uniform buffers from a series of structs.
///
/// In subsequent frames this will typically not allocate any new resources, thanks to our buffer pooling mechanism.
///
/// TODO(#1383): We could do this on a more complex stack allocator.
pub fn create_and_fill_uniform_buffer_batch<T: bytemuck::Pod>(
ctx: &RenderContext,
label: DebugLabel,
content: impl ExactSizeIterator<Item = T>,
) -> Vec<BindGroupEntry> {
#[allow(clippy::let_unit_value)]
let _ = UniformBufferAlignmentCheck::<T>::CHECK;

let num_buffers = content.len() as u64;
let element_size = std::mem::size_of::<T>() as u64;

assert!(
element_size > 0,
"Uniform buffer need to have a non-zero size"
);

let buffer = ctx.gpu_resources.buffers.alloc(
&ctx.device,
&crate::wgpu_resources::BufferDesc {
label,
size: num_buffers * element_size,
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
},
);

let mut staging_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::<T>(
&ctx.device,
&ctx.gpu_resources.buffers,
num_buffers as _,
);
staging_buffer.extend(content);
staging_buffer.copy_to_buffer(
ctx.active_frame
.frame_global_command_encoder
.lock()
.get_or_create(&ctx.device),
&buffer,
0,
);

(0..num_buffers)
.map(|i| BindGroupEntry::Buffer {
handle: buffer.handle,
offset: i * element_size,
size: Some(std::num::NonZeroU64::new(element_size).unwrap()),
})
.collect()
}

/// See [`create_and_fill_uniform_buffer_batch`].
pub fn create_and_fill_uniform_buffer<T: bytemuck::Pod>(
ctx: &RenderContext,
label: DebugLabel,
content: T,
) -> BindGroupEntry {
create_and_fill_uniform_buffer_batch(ctx, label, std::iter::once(content))
.into_iter()
.next()
.unwrap()
}
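A standalone usage example of `create_and_fill_uniform_buffer_batch` would need a full `RenderContext`, so the self-contained part worth demonstrating is the compile-time alignment check it relies on. The sketch below uses a hypothetical `ExampleUniforms` type; the real uniform structs additionally derive `bytemuck::Pod`/`Zeroable`, which is why all padding must be spelled out:

struct UniformBufferAlignmentCheck<T> {
    _marker: std::marker::PhantomData<T>,
}

impl<T> UniformBufferAlignmentCheck<T> {
    const CHECK: () = assert!(
        std::mem::align_of::<T>() >= 256,
        "Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`"
    );
}

/// Hypothetical uniform buffer contents, padded explicitly to 256 bytes
/// so no implicit padding bytes exist (a `Pod` requirement).
#[repr(C, align(256))]
#[derive(Clone, Copy)]
struct ExampleUniforms {
    color: [f32; 4],
    intensity: f32,
    _row_padding: [f32; 3],       // pad `intensity` out to a 16-byte row
    _end_padding: [[f32; 4]; 14], // pad the 32-byte payload out to 256 bytes
}

fn main() {
    // Referencing the associated const forces the assert to be evaluated at
    // compile time: a mis-aligned `T` becomes a build error, not a runtime panic.
    #[allow(clippy::let_unit_value)]
    let _ = UniformBufferAlignmentCheck::<ExampleUniforms>::CHECK;

    assert_eq!(std::mem::align_of::<ExampleUniforms>(), 256);
    assert_eq!(std::mem::size_of::<ExampleUniforms>(), 256);
}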
66 changes: 40 additions & 26 deletions crates/re_renderer/src/context.rs
@@ -1,6 +1,6 @@
use std::sync::Arc;

use parking_lot::Mutex;
use parking_lot::{Mutex, RwLock};
use type_map::concurrent::{self, TypeMap};

use crate::{
@@ -20,7 +20,7 @@ pub struct RenderContext {
pub queue: Arc<wgpu::Queue>,

pub(crate) shared_renderer_data: SharedRendererData,
pub(crate) renderers: Renderers,
pub(crate) renderers: RwLock<Renderers>,
pub(crate) resolver: RecommendedFileResolver,
#[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
pub(crate) err_tracker: std::sync::Arc<crate::error_tracker::ErrorTracker>,
@@ -155,14 +155,14 @@ impl RenderContext {
};

let mut resolver = crate::new_recommended_file_resolver();
let mut renderers = Renderers {
let mut renderers = RwLock::new(Renderers {
renderers: TypeMap::new(),
};
});

let mesh_manager = MeshManager::new(
device.clone(),
queue.clone(),
renderers.get_or_create(
renderers.get_mut().get_or_create(
&shared_renderer_data,
&mut gpu_resources,
&device,
@@ -193,8 +193,11 @@ impl RenderContext {

inflight_queue_submissions: Vec::new(),

active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(),
frame_index: 0 }
active_frame: ActiveFrameContext {
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
per_frame_data_helper: TypeMap::new(),
frame_index: 0
}
}
}

@@ -229,7 +232,7 @@ impl RenderContext {
self.cpu_write_gpu_read_belt.lock().after_queue_submit();

self.active_frame = ActiveFrameContext {
frame_global_command_encoder: None,
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
frame_index: self.active_frame.frame_index + 1,
per_frame_data_helper: TypeMap::new(),
};
@@ -303,7 +306,13 @@ impl RenderContext {
// Unmap all staging buffers.
self.cpu_write_gpu_read_belt.lock().before_queue_submit();

if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() {
if let Some(command_encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
let command_buffer = command_encoder.finish();

// TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
@@ -318,18 +327,38 @@ impl Drop for RenderContext {
fn drop(&mut self) {
// Close global command encoder if there is any pending.
// Not doing so before shutdown causes errors.
if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() {
if let Some(encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
encoder.finish();
}
}
}

pub struct FrameGlobalCommandEncoder(Option<wgpu::CommandEncoder>);

impl FrameGlobalCommandEncoder {
/// Gets or creates a command encoder that runs before all view builder encoders.
pub fn get_or_create(&mut self, device: &wgpu::Device) -> &mut wgpu::CommandEncoder {
self.0.get_or_insert_with(|| {
device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
.get(),
})
})
}
}

pub struct ActiveFrameContext {
/// Command encoder for all commands that should run before the view builders' command buffers are submitted.
///
/// This should be used for any gpu copy operation outside of a renderer or view builder.
/// (i.e. typically in [`crate::renderer::DrawData`] creation!)
frame_global_command_encoder: Option<wgpu::CommandEncoder>,
pub frame_global_command_encoder: Mutex<FrameGlobalCommandEncoder>,

/// Utility type map that will be cleared every frame.
pub per_frame_data_helper: TypeMap,
@@ -338,21 +367,6 @@ pub struct ActiveFrameContext {
frame_index: u64,
}

impl ActiveFrameContext {
/// Gets or creates a command encoder that runs before all view builder encoder.
pub fn frame_global_command_encoder(
&mut self,
device: &wgpu::Device,
) -> &mut wgpu::CommandEncoder {
self.frame_global_command_encoder.get_or_insert_with(|| {
device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
.get(),
})
})
}
}

/// Gets allocation size for a uniform buffer padded in a way that multiple can be put in a single wgpu buffer.
///
/// TODO(andreas): Once we have higher level buffer allocators this should be handled there.
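With the encoder behind a `Mutex` inside `ActiveFrameContext`, draw-data constructors only need a shared `&RenderContext` to record their pre-frame copies. A sketch of the same lock-then-get-or-create pattern, using `std::sync::Mutex` and a stand-in `Encoder` type so it runs without wgpu (names are illustrative):

use std::sync::Mutex;

struct Encoder {
    label: String,
}

struct FrameGlobalEncoder(Option<Encoder>);

impl FrameGlobalEncoder {
    /// Creates the encoder on first use; later calls within the frame reuse it.
    fn get_or_create(&mut self) -> &mut Encoder {
        self.0.get_or_insert_with(|| Encoder {
            label: "global \"before viewbuilder\" command encoder".to_owned(),
        })
    }
}

struct ActiveFrame {
    // Interior mutability is the point: callers get by with `&ActiveFrame`,
    // which lets the context hand out shared references during a frame.
    frame_global_command_encoder: Mutex<FrameGlobalEncoder>,
}

fn main() {
    let frame = ActiveFrame {
        frame_global_command_encoder: Mutex::new(FrameGlobalEncoder(None)),
    };

    // Typical call site: lock, get-or-create, record commands.
    let mut guard = frame.frame_global_command_encoder.lock().unwrap();
    let encoder = guard.get_or_create();
    assert_eq!(encoder.label, "global \"before viewbuilder\" command encoder");

    // At submission time the encoder is taken out, mirroring the
    // `.lock().0.take()` calls in the diff above.
    assert!(guard.0.take().is_some());
}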
15 changes: 5 additions & 10 deletions crates/re_renderer/src/global_bindings.rs
@@ -2,7 +2,7 @@ use crate::{
wgpu_buffer_types,
wgpu_resources::{
BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle,
GpuBuffer, GpuSamplerHandle, SamplerDesc, WgpuResourcePools,
GpuSamplerHandle, SamplerDesc, WgpuResourcePools,
},
};

@@ -13,7 +13,7 @@ use smallvec::smallvec;
///
/// Contains information that is constant for a single frame like camera.
/// (does not contain information that is special to a particular renderer)
#[repr(C)]
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
pub(crate) struct FrameUniformBuffer {
pub view_from_world: wgpu_buffer_types::Mat4x3,
@@ -46,8 +46,7 @@ pub(crate) struct FrameUniformBuffer {
pub auto_size_lines: f32,

/// Factor used to compute depth offsets, see `depth_offset.wgsl`.
pub depth_offset_factor: f32,
pub _padding: glam::Vec3,
pub depth_offset_factor: wgpu_buffer_types::F32RowPadded,
}

pub(crate) struct GlobalBindings {
@@ -128,19 +127,15 @@ impl GlobalBindings {
&self,
pools: &mut WgpuResourcePools,
device: &wgpu::Device,
frame_uniform_buffer: &GpuBuffer,
frame_uniform_buffer_binding: BindGroupEntry,
) -> GpuBindGroup {
pools.bind_groups.alloc(
device,
// Needs to be kept in sync with `global_bindings.wgsl` / `self.layout`
&BindGroupDesc {
label: "global bind group".into(),
entries: smallvec![
BindGroupEntry::Buffer {
handle: frame_uniform_buffer.handle,
offset: 0,
size: None,
},
frame_uniform_buffer_binding,
BindGroupEntry::Sampler(self.nearest_neighbor_sampler),
BindGroupEntry::Sampler(self.trilinear_sampler),
],
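`F32RowPadded` replaces the manual `f32` plus `glam::Vec3` padding pair above. Its actual definition lives in `wgpu_buffer_types` and is not part of this diff; the sketch below is a plausible reconstruction (field names assumed; requires bytemuck with the `derive` feature) of a wrapper that pads a lone `f32` out to the full 16-byte row that uniform buffer layout rules reserve for it:

use bytemuck::{Pod, Zeroable};

/// A single f32 padded to a 16-byte uniform-buffer "row".
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct F32RowPadded {
    value: f32,
    padding: [f32; 3], // explicit, so the type stays `Pod`
}

impl From<f32> for F32RowPadded {
    fn from(value: f32) -> Self {
        Self { value, padding: [0.0; 3] }
    }
}

fn main() {
    let row: F32RowPadded = 1.5f32.into();
    assert_eq!(std::mem::size_of::<F32RowPadded>(), 16);
    assert_eq!(row.value, 1.5);
}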
3 changes: 2 additions & 1 deletion crates/re_renderer/src/mesh.rs
@@ -137,8 +137,9 @@ pub(crate) mod gpu_data {
}

impl GpuMesh {
// TODO(andreas): Take read-only context here and make uploads happen on staging belt.
pub fn new(
pools: &mut WgpuResourcePools,
pools: &WgpuResourcePools,
texture_manager: &TextureManager2D,
mesh_bound_group_layout: GpuBindGroupLayoutHandle,
device: &wgpu::Device,
4 changes: 2 additions & 2 deletions crates/re_renderer/src/point_cloud_builder.rs
@@ -28,7 +28,7 @@ where
// TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance!
let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::<Color32>(
&ctx.device,
&mut ctx.gpu_resources.buffers,
&ctx.gpu_resources.buffers,
PointCloudDrawData::MAX_NUM_POINTS,
);

@@ -286,7 +286,7 @@ where
/// This must not be called more than once.
#[inline]
pub fn color(self, color: Color32) -> Self {
self.color.push(&color);
self.color.push(color);
self
}

3 changes: 2 additions & 1 deletion crates/re_renderer/src/renderer/compositor.rs
@@ -31,7 +31,8 @@ impl DrawData for CompositorDrawData {
impl CompositorDrawData {
pub fn new(ctx: &mut RenderContext, target: &GpuTexture) -> Self {
let pools = &mut ctx.gpu_resources;
let compositor = ctx.renderers.get_or_create::<_, Compositor>(
let mut renderers = ctx.renderers.write();
let compositor = renderers.get_or_create::<_, Compositor>(
&ctx.shared_renderer_data,
pools,
&ctx.device,
2 changes: 1 addition & 1 deletion crates/re_renderer/src/renderer/generic_skybox.rs
@@ -29,7 +29,7 @@ impl DrawData for GenericSkyboxDrawData {

impl GenericSkyboxDrawData {
pub fn new(ctx: &mut RenderContext) -> Self {
ctx.renderers.get_or_create::<_, GenericSkybox>(
ctx.renderers.write().get_or_create::<_, GenericSkybox>(
&ctx.shared_renderer_data,
&mut ctx.gpu_resources,
&ctx.device,
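The pattern in the last two files: `renderers` is now an `RwLock<Renderers>`, so lazily instantiating a renderer takes a brief write lock while pure lookups elsewhere can share a read lock. A self-contained sketch of that registry using `std` types and `Any` in place of the crate's `type_map` (names are illustrative):

use std::any::{Any, TypeId};
use std::collections::HashMap;
use std::sync::RwLock;

#[derive(Default)]
struct Renderers {
    renderers: HashMap<TypeId, Box<dyn Any + Send + Sync>>,
}

impl Renderers {
    /// Lazily instantiates a renderer of type `R` on first request.
    fn get_or_create<R: Default + Any + Send + Sync>(&mut self) -> &R {
        self.renderers
            .entry(TypeId::of::<R>())
            .or_insert_with(|| Box::new(R::default()))
            .downcast_ref::<R>()
            .expect("the map only ever stores `R` under `TypeId::of::<R>()`")
    }
}

#[derive(Default)]
struct GenericSkybox; // stand-in renderer

fn main() {
    let renderers = RwLock::new(Renderers::default());

    // First use needs a write lock because it may insert:
    {
        let mut guard = renderers.write().unwrap();
        let _skybox: &GenericSkybox = guard.get_or_create::<GenericSkybox>();
    }

    // Later, read-only access can be shared, which is the point of the split.
    assert!(renderers
        .read()
        .unwrap()
        .renderers
        .contains_key(&TypeId::of::<GenericSkybox>()));
}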