From aec0e87f6824398f873ea0c3d6b296692d8e61a0 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 27 Feb 2023 10:05:09 +0100 Subject: [PATCH] [re_renderer] Uniform buffer utility using `CpuWriteGpuReadBelt` (#1400) * utility function for uniform buffer from struct * Uniform buffers are now compile time forced to be 256 bytes aligned * CpuWriteGpuReadBelt no longer takes mutable buffer pool * Renderers and frame_global_command_encoder are now behind locks --------- Co-authored-by: Clement Rey --- .../src/allocator/cpu_write_gpu_read_belt.rs | 11 +- crates/re_renderer/src/allocator/mod.rs | 4 + .../src/allocator/uniform_buffer_fill.rs | 102 ++++++++++++++++++ crates/re_renderer/src/context.rs | 66 +++++++----- crates/re_renderer/src/global_bindings.rs | 15 +-- crates/re_renderer/src/mesh.rs | 3 +- crates/re_renderer/src/point_cloud_builder.rs | 4 +- crates/re_renderer/src/renderer/compositor.rs | 3 +- .../src/renderer/generic_skybox.rs | 2 +- crates/re_renderer/src/renderer/lines.rs | 64 ++++------- .../re_renderer/src/renderer/mesh_renderer.rs | 2 +- .../re_renderer/src/renderer/point_cloud.rs | 74 +++++-------- crates/re_renderer/src/renderer/rectangles.rs | 92 +++++----------- .../re_renderer/src/renderer/test_triangle.rs | 2 +- crates/re_renderer/src/view_builder.rs | 49 +++------ crates/re_renderer/src/wgpu_buffer_types.rs | 76 ++++++++++--- .../src/wgpu_resources/bind_group_pool.rs | 2 +- .../src/wgpu_resources/buffer_pool.rs | 2 +- .../src/wgpu_resources/texture_pool.rs | 2 +- 19 files changed, 322 insertions(+), 253 deletions(-) create mode 100644 crates/re_renderer/src/allocator/uniform_buffer_fill.rs diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index e85b69596b16..0af9ad8d2587 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -39,6 +39,7 @@ where /// Do *not* make this public as we need to guarantee that the memory is *never* read from! #[inline(always)] fn as_slice(&mut self) -> &mut [T] { + // TODO(andreas): Is this access slow given that it internally goes through a trait interface? Should we keep the pointer around? &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()] } @@ -47,13 +48,13 @@ where /// Panics if the data no longer fits into the buffer. #[inline] pub fn extend_from_slice(&mut self, elements: &[T]) { - self.as_slice().copy_from_slice(elements); + self.as_slice()[..elements.len()].copy_from_slice(elements); self.unwritten_element_range.start += elements.len(); } /// Pushes several elements into the buffer. /// - /// Panics if the data no longer fits into the buffer. + /// Extends until either running out of space or elements. #[inline] pub fn extend(&mut self, elements: impl Iterator) { let mut num_elements = 0; @@ -68,8 +69,8 @@ where /// /// Panics if the data no longer fits into the buffer. #[inline] - pub fn push(&mut self, element: &T) { - self.as_slice()[0] = *element; + pub fn push(&mut self, element: T) { + self.as_slice()[0] = element; self.unwritten_element_range.start += 1; } @@ -285,7 +286,7 @@ impl CpuWriteGpuReadBelt { pub fn allocate( &mut self, device: &wgpu::Device, - buffer_pool: &mut GpuBufferPool, + buffer_pool: &GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. 
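For reviewers unfamiliar with the belt: the buffer hands out a forward-only, write-only cursor over mapped memory, which is why `extend_from_slice` now copies into a sub-slice sized to the input and why `push` takes the element by value. A minimal, std-only sketch of that cursor pattern (the `WriteCursor` name is illustrative; the real type casts a mapped `wgpu::BufferViewMut` with bytemuck instead of owning a `Vec<T>`):

use std::ops::Range;

/// Illustrative stand-in for `CpuWriteGpuReadBuffer<T>`.
struct WriteCursor<T: Copy> {
    data: Vec<T>,
    /// Element range that has not been written yet; only ever shrinks from the front.
    unwritten_element_range: Range<usize>,
}

impl<T: Copy + Default> WriteCursor<T> {
    fn with_capacity(num_elements: usize) -> Self {
        Self {
            data: vec![T::default(); num_elements],
            unwritten_element_range: 0..num_elements,
        }
    }
}

impl<T: Copy> WriteCursor<T> {
    /// Write-only view of the remaining space. Never exposed publicly,
    /// since the (write-combined) memory must not be read back.
    fn as_slice(&mut self) -> &mut [T] {
        &mut self.data[self.unwritten_element_range.clone()]
    }

    /// Copies `elements` to the front of the remaining space.
    /// Panics if they no longer fit.
    fn extend_from_slice(&mut self, elements: &[T]) {
        self.as_slice()[..elements.len()].copy_from_slice(elements);
        self.unwritten_element_range.start += elements.len();
    }

    /// Writes a single element, taking it by value.
    fn push(&mut self, element: T) {
        self.as_slice()[0] = element;
        self.unwritten_element_range.start += 1;
    }
}

fn main() {
    let mut cursor = WriteCursor::<f32>::with_capacity(4);
    cursor.extend_from_slice(&[1.0, 2.0]);
    cursor.push(3.0);
    assert_eq!(cursor.unwritten_element_range, 3..4);
}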
diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs index cb4ab783fd09..2c0523348dee 100644 --- a/crates/re_renderer/src/allocator/mod.rs +++ b/crates/re_renderer/src/allocator/mod.rs @@ -4,5 +4,9 @@ //! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources. mod cpu_write_gpu_read_belt; +mod uniform_buffer_fill; pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; +pub use uniform_buffer_fill::{ + create_and_fill_uniform_buffer, create_and_fill_uniform_buffer_batch, +}; diff --git a/crates/re_renderer/src/allocator/uniform_buffer_fill.rs b/crates/re_renderer/src/allocator/uniform_buffer_fill.rs new file mode 100644 index 000000000000..4239dee69825 --- /dev/null +++ b/crates/re_renderer/src/allocator/uniform_buffer_fill.rs @@ -0,0 +1,102 @@ +pub use super::cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; + +use crate::{wgpu_resources::BindGroupEntry, DebugLabel, RenderContext}; + +struct UniformBufferAlignmentCheck { + pub _marker: std::marker::PhantomData, +} +impl UniformBufferAlignmentCheck { + /// wgpu requires uniform buffers to be aligned to up to 256 bytes. + /// + /// This is a property of device limits, see [`WebGPU` specification](https://www.w3.org/TR/webgpu/#limits). + /// Implementations are allowed to advertise a lower alignment requirement, however + /// 256 bytes is fairly common even in modern hardware and is even hardcoded for DX12. + /// + /// Technically this is only relevant when sub-allocating a buffer, as the wgpu backend + /// is internally forced to make sure that the start of any [`wgpu::Buffer`] with [`wgpu::BufferUsages::UNIFORM`] usage + /// has this alignment. Practically, ensuring this alignment everywhere + /// + /// Alternatively to enforcing this alignment on the type we could: + /// * only align on the gpu buffer + /// -> causes more fine grained `copy_buffer_to_buffer` calls on the gpu encoder + /// * only align on the [`CpuWriteGpuReadBuffer`] & gpu buffer + /// -> causes more complicated offset computation on [`CpuWriteGpuReadBuffer`] as well as either + /// holes at padding (-> undefined values & slow for write combined!) or complicated nulling of padding + /// + /// About the [`bytemuck::Pod`] requirement (dragged in by [`CpuWriteGpuReadBuffer`]): + /// [`bytemuck::Pod`] forces us to be explicit about padding as it doesn't allow invisible padding bytes! + /// We could drop this and thus make it easier to define uniform buffer types. + /// But this leads to more unsafe code, harder to avoid holes in write combined memory access + /// and potentially undefined values in the padding bytes on GPU. + const CHECK: () = assert!( + std::mem::align_of::() >= 256, + "Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`" + ); +} +/// Utility for fast & efficient creation of uniform buffers from a series of structs. +/// +/// For subsequent frames, this will automatically not allocate any resources (thanks to our buffer pooling mechanism). +/// +/// TODO(#1383): We could do this on a more complex stack allocator. 
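The alignment guard above boils down to a const-evaluated assertion that fires per concrete type once the associated const is referenced, turning a missing `#[repr(align(256))]` into a compile error instead of a runtime panic. A stripped-down, std-only sketch of the pattern (hypothetical `AlignCheck` and `upload_uniform_buffer` names):

struct AlignCheck<T> {
    _marker: std::marker::PhantomData<T>,
}

impl<T> AlignCheck<T> {
    // Evaluated at compile time for each concrete `T` the function below is used with.
    const CHECK: () = assert!(
        std::mem::align_of::<T>() >= 256,
        "Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`"
    );
}

fn upload_uniform_buffer<T>(_content: &T) {
    // Mentioning the associated const forces its evaluation (and thus the assert).
    #[allow(clippy::let_unit_value)]
    let _ = AlignCheck::<T>::CHECK;
    // ... the actual upload would happen here ...
}

#[repr(C, align(256))]
struct MyUniforms {
    scale: f32,
}

fn main() {
    upload_uniform_buffer(&MyUniforms { scale: 1.0 });
    // With a type that is not 256-byte aligned, the call above would fail to compile.
}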
+pub fn create_and_fill_uniform_buffer_batch( + ctx: &RenderContext, + label: DebugLabel, + content: impl ExactSizeIterator, +) -> Vec { + #[allow(clippy::let_unit_value)] + let _ = UniformBufferAlignmentCheck::::CHECK; + + let num_buffers = content.len() as u64; + let element_size = std::mem::size_of::() as u64; + + assert!( + element_size > 0, + "Uniform buffer need to have a non-zero size" + ); + + let buffer = ctx.gpu_resources.buffers.alloc( + &ctx.device, + &crate::wgpu_resources::BufferDesc { + label, + size: num_buffers * element_size, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }, + ); + + let mut staging_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( + &ctx.device, + &ctx.gpu_resources.buffers, + num_buffers as _, + ); + staging_buffer.extend(content); + staging_buffer.copy_to_buffer( + ctx.active_frame + .frame_global_command_encoder + .lock() + .get_or_create(&ctx.device), + &buffer, + 0, + ); + + (0..num_buffers) + .into_iter() + .map(|i| BindGroupEntry::Buffer { + handle: buffer.handle, + offset: i * element_size, + size: Some(std::num::NonZeroU64::new(element_size).unwrap()), + }) + .collect() +} + +/// See [`create_and_fill_uniform_buffer`]. +pub fn create_and_fill_uniform_buffer( + ctx: &mut RenderContext, + label: DebugLabel, + content: T, +) -> BindGroupEntry { + create_and_fill_uniform_buffer_batch(ctx, label, std::iter::once(content)) + .into_iter() + .next() + .unwrap() +} diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 150d51320a5d..6a985c714277 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use parking_lot::Mutex; +use parking_lot::{Mutex, RwLock}; use type_map::concurrent::{self, TypeMap}; use crate::{ @@ -20,7 +20,7 @@ pub struct RenderContext { pub queue: Arc, pub(crate) shared_renderer_data: SharedRendererData, - pub(crate) renderers: Renderers, + pub(crate) renderers: RwLock, pub(crate) resolver: RecommendedFileResolver, #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, @@ -155,14 +155,14 @@ impl RenderContext { }; let mut resolver = crate::new_recommended_file_resolver(); - let mut renderers = Renderers { + let mut renderers = RwLock::new(Renderers { renderers: TypeMap::new(), - }; + }); let mesh_manager = MeshManager::new( device.clone(), queue.clone(), - renderers.get_or_create( + renderers.get_mut().get_or_create( &shared_renderer_data, &mut gpu_resources, &device, @@ -193,8 +193,11 @@ impl RenderContext { inflight_queue_submissions: Vec::new(), - active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(), - frame_index: 0 } + active_frame: ActiveFrameContext { + frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)), + per_frame_data_helper: TypeMap::new(), + frame_index: 0 + } } } @@ -229,7 +232,7 @@ impl RenderContext { self.cpu_write_gpu_read_belt.lock().after_queue_submit(); self.active_frame = ActiveFrameContext { - frame_global_command_encoder: None, + frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)), frame_index: self.active_frame.frame_index + 1, per_frame_data_helper: TypeMap::new(), }; @@ -303,7 +306,13 @@ impl RenderContext { // Unmap all staging buffers. 
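The batch helper packs all structs back-to-back into one GPU buffer and returns one buffer binding per element; the offsets stay legal for uniform binding because `size_of::<T>()` of a `#[repr(C, align(256))]` type is itself a multiple of 256. A self-contained sketch of just that bookkeeping (the `BufferBinding` type stands in for re_renderer's `BindGroupEntry::Buffer`):

use std::num::NonZeroU64;

/// Illustrative stand-in for re_renderer's `BindGroupEntry::Buffer` variant.
struct BufferBinding {
    offset: u64,
    size: Option<NonZeroU64>,
}

/// Computes one binding per uniform-buffer element packed back-to-back.
fn batch_bindings<T>(num_elements: u64) -> Vec<BufferBinding> {
    let element_size = std::mem::size_of::<T>() as u64;
    assert!(element_size > 0, "Uniform buffers need to have a non-zero size");
    // `repr(C, align(256))` makes the size a multiple of 256, so every offset
    // satisfies wgpu's (up to) 256-byte uniform-buffer offset alignment.
    assert_eq!(element_size % 256, 0);

    (0..num_elements)
        .map(|i| BufferBinding {
            offset: i * element_size,
            size: NonZeroU64::new(element_size),
        })
        .collect()
}

#[repr(C, align(256))]
struct BatchUniforms {
    world_from_obj: [f32; 16],
}

fn main() {
    let bindings = batch_bindings::<BatchUniforms>(3);
    assert_eq!(bindings[1].offset, 256);
    assert_eq!(bindings[2].offset, 512);
    assert_eq!(bindings[0].size, NonZeroU64::new(256));
}

In the patch, these per-element bindings are what get spliced into each batch's `BindGroupDesc` in `lines.rs`, `point_cloud.rs`, and `rectangles.rs`.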
self.cpu_write_gpu_read_belt.lock().before_queue_submit(); - if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() { + if let Some(command_encoder) = self + .active_frame + .frame_global_command_encoder + .lock() + .0 + .take() + { let command_buffer = command_encoder.finish(); // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. @@ -318,18 +327,38 @@ impl Drop for RenderContext { fn drop(&mut self) { // Close global command encoder if there is any pending. // Not doing so before shutdown causes errors. - if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() { + if let Some(encoder) = self + .active_frame + .frame_global_command_encoder + .lock() + .0 + .take() + { encoder.finish(); } } } +pub struct FrameGlobalCommandEncoder(Option); + +impl FrameGlobalCommandEncoder { + /// Gets or creates a command encoder that runs before all view builder encoder. + pub fn get_or_create(&mut self, device: &wgpu::Device) -> &mut wgpu::CommandEncoder { + self.0.get_or_insert_with(|| { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") + .get(), + }) + }) + } +} + pub struct ActiveFrameContext { /// Command encoder for all commands that should go in before view builder are submitted. /// /// This should be used for any gpu copy operation outside of a renderer or view builder. /// (i.e. typically in [`crate::renderer::DrawData`] creation!) - frame_global_command_encoder: Option, + pub frame_global_command_encoder: Mutex, /// Utility type map that will be cleared every frame. pub per_frame_data_helper: TypeMap, @@ -338,21 +367,6 @@ pub struct ActiveFrameContext { frame_index: u64, } -impl ActiveFrameContext { - /// Gets or creates a command encoder that runs before all view builder encoder. - pub fn frame_global_command_encoder( - &mut self, - device: &wgpu::Device, - ) -> &mut wgpu::CommandEncoder { - self.frame_global_command_encoder.get_or_insert_with(|| { - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { - label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") - .get(), - }) - }) - } -} - /// Gets allocation size for a uniform buffer padded in a way that multiple can be put in a single wgpu buffer. /// /// TODO(andreas): Once we have higher level buffer allocators this should be handled there. diff --git a/crates/re_renderer/src/global_bindings.rs b/crates/re_renderer/src/global_bindings.rs index bf7729adcf0a..7ac1b2712a94 100644 --- a/crates/re_renderer/src/global_bindings.rs +++ b/crates/re_renderer/src/global_bindings.rs @@ -2,7 +2,7 @@ use crate::{ wgpu_buffer_types, wgpu_resources::{ BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, - GpuBuffer, GpuSamplerHandle, SamplerDesc, WgpuResourcePools, + GpuSamplerHandle, SamplerDesc, WgpuResourcePools, }, }; @@ -13,7 +13,7 @@ use smallvec::smallvec; /// /// Contains information that is constant for a single frame like camera. /// (does not contain information that is special to a particular renderer) -#[repr(C)] +#[repr(C, align(256))] #[derive(Clone, Copy, Zeroable, Pod)] pub(crate) struct FrameUniformBuffer { pub view_from_world: wgpu_buffer_types::Mat4x3, @@ -46,8 +46,7 @@ pub(crate) struct FrameUniformBuffer { pub auto_size_lines: f32, /// Factor used to compute depth offsets, see `depth_offset.wgsl`. 
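Putting the lazily created encoder behind a `Mutex` is what allows recording pre-view copies from a shared `&RenderContext`. A reduced sketch of the `get_or_create`/`take` life cycle, assuming `parking_lot` (as the crate already does) and a placeholder `Encoder` instead of `wgpu::CommandEncoder`:

use parking_lot::Mutex;

/// Placeholder for `wgpu::CommandEncoder` in this sketch.
struct Encoder {
    label: &'static str,
}

impl Encoder {
    fn finish(self) -> &'static str {
        self.label // a real encoder would return a wgpu::CommandBuffer here
    }
}

/// Mirrors the newtype from the patch: an encoder that is created on first use.
struct FrameGlobalCommandEncoder(Option<Encoder>);

impl FrameGlobalCommandEncoder {
    fn get_or_create(&mut self) -> &mut Encoder {
        self.0.get_or_insert_with(|| Encoder {
            label: "global \"before viewbuilder\" command encoder",
        })
    }
}

fn main() {
    // A shared `&RenderContext` is enough to record into it, thanks to the mutex.
    let encoder_slot = Mutex::new(FrameGlobalCommandEncoder(None));

    // The first caller creates the encoder, later callers reuse it:
    let _ = encoder_slot.lock().get_or_create().label;

    // At submission time the frame takes the encoder out and finishes it, if any:
    if let Some(encoder) = encoder_slot.lock().0.take() {
        let _command_buffer = encoder.finish();
    }
}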
- pub depth_offset_factor: f32, - pub _padding: glam::Vec3, + pub depth_offset_factor: wgpu_buffer_types::F32RowPadded, } pub(crate) struct GlobalBindings { @@ -128,7 +127,7 @@ impl GlobalBindings { &self, pools: &mut WgpuResourcePools, device: &wgpu::Device, - frame_uniform_buffer: &GpuBuffer, + frame_uniform_buffer_binding: BindGroupEntry, ) -> GpuBindGroup { pools.bind_groups.alloc( device, @@ -136,11 +135,7 @@ impl GlobalBindings { &BindGroupDesc { label: "global bind group".into(), entries: smallvec![ - BindGroupEntry::Buffer { - handle: frame_uniform_buffer.handle, - offset: 0, - size: None, - }, + frame_uniform_buffer_binding, BindGroupEntry::Sampler(self.nearest_neighbor_sampler), BindGroupEntry::Sampler(self.trilinear_sampler), ], diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs index 7f30c8fccb92..14a124bebaa2 100644 --- a/crates/re_renderer/src/mesh.rs +++ b/crates/re_renderer/src/mesh.rs @@ -137,8 +137,9 @@ pub(crate) mod gpu_data { } impl GpuMesh { + // TODO(andreas): Take read-only context here and make uploads happen on staging belt. pub fn new( - pools: &mut WgpuResourcePools, + pools: &WgpuResourcePools, texture_manager: &TextureManager2D, mesh_bound_group_layout: GpuBindGroupLayoutHandle, device: &wgpu::Device, diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index d13eb9105220..b3c4028194e3 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -28,7 +28,7 @@ where // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance! let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( &ctx.device, - &mut ctx.gpu_resources.buffers, + &ctx.gpu_resources.buffers, PointCloudDrawData::MAX_NUM_POINTS, ); @@ -286,7 +286,7 @@ where /// This mustn't call this more than once. 
#[inline] pub fn color(self, color: Color32) -> Self { - self.color.push(&color); + self.color.push(color); self } diff --git a/crates/re_renderer/src/renderer/compositor.rs b/crates/re_renderer/src/renderer/compositor.rs index 6ceae9fbd2e4..187ba68b05ed 100644 --- a/crates/re_renderer/src/renderer/compositor.rs +++ b/crates/re_renderer/src/renderer/compositor.rs @@ -31,7 +31,8 @@ impl DrawData for CompositorDrawData { impl CompositorDrawData { pub fn new(ctx: &mut RenderContext, target: &GpuTexture) -> Self { let pools = &mut ctx.gpu_resources; - let compositor = ctx.renderers.get_or_create::<_, Compositor>( + let mut renderers = ctx.renderers.write(); + let compositor = renderers.get_or_create::<_, Compositor>( &ctx.shared_renderer_data, pools, &ctx.device, diff --git a/crates/re_renderer/src/renderer/generic_skybox.rs b/crates/re_renderer/src/renderer/generic_skybox.rs index c0efe78817aa..1e84152f8fbf 100644 --- a/crates/re_renderer/src/renderer/generic_skybox.rs +++ b/crates/re_renderer/src/renderer/generic_skybox.rs @@ -29,7 +29,7 @@ impl DrawData for GenericSkyboxDrawData { impl GenericSkyboxDrawData { pub fn new(ctx: &mut RenderContext) -> Self { - ctx.renderers.get_or_create::<_, GenericSkybox>( + ctx.renderers.write().get_or_create::<_, GenericSkybox>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs index f8237709ebb6..29dc902da0c9 100644 --- a/crates/re_renderer/src/renderer/lines.rs +++ b/crates/re_renderer/src/renderer/lines.rs @@ -114,14 +114,14 @@ use bytemuck::Zeroable; use smallvec::smallvec; use crate::{ - context::uniform_buffer_allocation_size, + allocator::create_and_fill_uniform_buffer_batch, include_file, size::Size, view_builder::ViewBuilder, wgpu_resources::{ - BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, BufferDesc, GpuBindGroup, - GpuBindGroupLayoutHandle, GpuRenderPipelineHandle, PipelineLayoutDesc, PoolError, - RenderPipelineDesc, ShaderModuleDesc, TextureDesc, + BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, + GpuRenderPipelineHandle, PipelineLayoutDesc, PoolError, RenderPipelineDesc, + ShaderModuleDesc, TextureDesc, }, Color32, DebugLabel, }; @@ -160,10 +160,12 @@ pub mod gpu_data { static_assertions::assert_eq_size!(LineStripInfo, [u32; 2]); /// Uniform buffer that changes for every batch of line strips. 
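With `renderers` behind an `RwLock`, draw-data creation takes a short write lock to lazily instantiate a renderer, while drawing later only needs a read lock whose guard must be bound to a local before calling `get`. A simplified sketch of that access pattern over a plain `HashMap` type map (illustrative names; the real code uses `type_map` and threads GPU resources through `get_or_create`):

use std::any::{Any, TypeId};
use std::collections::HashMap;
use parking_lot::RwLock;

/// Simplified stand-in for re_renderer's `Renderers` type map.
#[derive(Default)]
struct Renderers {
    renderers: HashMap<TypeId, Box<dyn Any + Send + Sync>>,
}

impl Renderers {
    fn get_or_create<R: Default + Any + Send + Sync>(&mut self) -> &R {
        self.renderers
            .entry(TypeId::of::<R>())
            .or_insert_with(|| Box::new(R::default()))
            .downcast_ref::<R>()
            .unwrap()
    }

    fn get<R: Any>(&self) -> Option<&R> {
        self.renderers
            .get(&TypeId::of::<R>())
            .and_then(|r| r.downcast_ref())
    }
}

#[derive(Default)]
struct LineRenderer;

fn main() {
    let renderers = RwLock::new(Renderers::default());

    // Draw-data creation: hold the write lock only while lazily creating.
    {
        let mut renderers_write = renderers.write();
        let _line_renderer: &LineRenderer = renderers_write.get_or_create();
    }

    // Drawing: a read lock suffices. Bind the guard to a local first, so the
    // borrow of the renderer can outlive the `read()` temporary.
    let renderers_read = renderers.read();
    let _line_renderer = renderers_read.get::<LineRenderer>().expect("renderer exists");
}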
- #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct BatchUniformBuffer { pub world_from_obj: wgpu_buffer_types::Mat4, + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 4], } } @@ -299,7 +301,8 @@ impl LineDrawData { strips: &[LineStripInfo], batches: &[LineBatchInfo], ) -> Result { - let line_renderer = ctx.renderers.get_or_create::<_, LineRenderer>( + let mut renderers = ctx.renderers.write(); + let line_renderer = renderers.get_or_create::<_, LineRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -494,52 +497,29 @@ impl LineDrawData { // Process batches let mut batches_internal = Vec::with_capacity(batches.len()); { - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * batches.len() as u64; - let uniform_buffers = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "lines batch uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "lines batch uniform buffers".into(), + batches + .iter() + .map(|batch_info| gpu_data::BatchUniformBuffer { + world_from_obj: batch_info.world_from_obj.into(), + end_padding: Default::default(), + }), ); - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffers, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - let mut start_vertex_for_next_batch = 0; - for (i, batch_info) in batches.iter().enumerate() { - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! 
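The explicit `end_padding` rows make `size_of::<BatchUniformBuffer>()` exactly 256 bytes, which keeps `bytemuck::Pod` happy (no implicit padding) and makes every element offset in the batched buffer a valid uniform-buffer offset. A sketch of that row arithmetic with stand-in field types, assuming a bytemuck version whose `Pod` derive accepts `repr(align)` (as this patch relies on):

use bytemuck::{Pod, Zeroable};

/// One 16-byte "row" of pure padding (stand-in for wgpu_buffer_types::PaddingRow).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod, Default)]
struct PaddingRow {
    p: [f32; 4],
}

/// Stand-in for the lines `BatchUniformBuffer`: a 4x4 matrix occupies 4 rows,
/// so 16 - 4 = 12 padding rows bring the total to 16 rows = 256 bytes.
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct BatchUniformBuffer {
    world_from_obj: [[f32; 4]; 4], // stand-in for wgpu_buffer_types::Mat4
    end_padding: [PaddingRow; 16 - 4],
}

fn main() {
    assert_eq!(std::mem::size_of::<BatchUniformBuffer>(), 256);
    assert_eq!(std::mem::align_of::<BatchUniformBuffer>(), 256);

    let ubo = BatchUniformBuffer {
        world_from_obj: [[0.0; 4]; 4],
        end_padding: Default::default(),
    };
    // `Pod` lets the whole struct be copied byte-wise into the staging belt.
    assert_eq!(bytemuck::bytes_of(&ubo).len(), 256);
}

The same scheme gives `16 - 5` padding rows for the point cloud batch below, which additionally carries a flags row.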
- let offset = i * allocation_size_per_uniform_buffer as usize; + for (batch_info, uniform_buffer_binding) in + batches.iter().zip(uniform_buffer_bindings.into_iter()) + { let line_vertex_range_end = (start_vertex_for_next_batch + batch_info.line_vertex_count) .min(Self::MAX_NUM_VERTICES as u32); - staging_buffer - [offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::BatchUniformBuffer { - world_from_obj: batch_info.world_from_obj.into(), - })); - let bind_group = ctx.gpu_resources.bind_groups.alloc( &ctx.device, &BindGroupDesc { label: batch_info.label.clone(), - entries: smallvec![BindGroupEntry::Buffer { - handle: uniform_buffers.handle, - offset: offset as _, - size: NonZeroU64::new( - std::mem::size_of::() as _ - ), - }], + entries: smallvec![uniform_buffer_binding], layout: line_renderer.bind_group_layout_batch, }, &ctx.gpu_resources.bind_group_layouts, diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs index 14559b9f69e6..eb84b97976e3 100644 --- a/crates/re_renderer/src/renderer/mesh_renderer.rs +++ b/crates/re_renderer/src/renderer/mesh_renderer.rs @@ -120,7 +120,7 @@ impl MeshDrawData { pub fn new(ctx: &mut RenderContext, instances: &[MeshInstance]) -> anyhow::Result { crate::profile_function!(); - let _mesh_renderer = ctx.renderers.get_or_create::<_, MeshRenderer>( + let _mesh_renderer = ctx.renderers.write().get_or_create::<_, MeshRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 233911761fd2..70bfbbb69045 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -18,10 +18,7 @@ use std::{ ops::Range, }; -use crate::{ - context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel, - PointCloudBuilder, -}; +use crate::{allocator::create_and_fill_uniform_buffer_batch, DebugLabel, PointCloudBuilder}; use bitflags::bitflags; use bytemuck::Zeroable; use itertools::Itertools; @@ -56,7 +53,6 @@ bitflags! { } mod gpu_data { - use super::PointCloudBatchFlags; use crate::{wgpu_buffer_types, Size}; // Don't use `wgsl_buffer_types` since this data doesn't go into a buffer, so alignment rules don't apply like on buffers.. @@ -69,12 +65,14 @@ mod gpu_data { static_assertions::assert_eq_size!(PositionData, glam::Vec4); /// Uniform buffer that changes for every batch of line strips. 
- #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct BatchUniformBuffer { pub world_from_obj: wgpu_buffer_types::Mat4, - pub flags: PointCloudBatchFlags, - pub _padding: glam::Vec3, + + pub flags: wgpu_buffer_types::U32RowPadded, // PointCloudBatchFlags + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 5], } } @@ -155,7 +153,8 @@ impl PointCloudDrawData { ) -> Result { crate::profile_function!(); - let point_renderer = ctx.renderers.get_or_create::<_, PointCloudRenderer>( + let mut renderers = ctx.renderers.write(); + let point_renderer = renderers.get_or_create::<_, PointCloudRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -285,7 +284,10 @@ impl PointCloudDrawData { } builder.color_buffer.copy_to_texture( - ctx.active_frame.frame_global_command_encoder(&ctx.device), + ctx.active_frame + .frame_global_command_encoder + .lock() + .get_or_create(&ctx.device), wgpu::ImageCopyTexture { texture: &color_texture.texture, mip_level: 0, @@ -316,51 +318,27 @@ impl PointCloudDrawData { // Process batches let mut batches_internal = Vec::with_capacity(batches.len()); { - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * batches.len() as u64; - let uniform_buffers = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "point batch uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "point batch uniform buffers".into(), + batches + .iter() + .map(|batch_info| gpu_data::BatchUniformBuffer { + world_from_obj: batch_info.world_from_obj.into(), + flags: batch_info.flags.bits.into(), + end_padding: Default::default(), + }), ); - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffers, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - let mut start_point_for_next_batch = 0; - for (i, batch_info) in batches.iter().enumerate() { - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! - let offset = i * allocation_size_per_uniform_buffer as usize; - staging_buffer - [offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::BatchUniformBuffer { - world_from_obj: batch_info.world_from_obj.into(), - flags: batch_info.flags, - _padding: glam::Vec3::ZERO, - })); - + for (batch_info, uniform_buffer_binding) in + batches.iter().zip(uniform_buffer_bindings.into_iter()) + { let bind_group = ctx.gpu_resources.bind_groups.alloc( &ctx.device, &BindGroupDesc { label: batch_info.label.clone(), - entries: smallvec![BindGroupEntry::Buffer { - handle: uniform_buffers.handle, - offset: offset as _, - size: NonZeroU64::new( - std::mem::size_of::() as _ - ), - }], + entries: smallvec![uniform_buffer_binding], layout: point_renderer.bind_group_layout_batch, }, &ctx.gpu_resources.bind_group_layouts, diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs index 42f98378dceb..2a6d019bb263 100644 --- a/crates/re_renderer/src/renderer/rectangles.rs +++ b/crates/re_renderer/src/renderer/rectangles.rs @@ -11,18 +11,17 @@ //! 
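Storing the flags as a padded u32 row keeps the WGSL mirror simple: the shader reads only the first word of the row and the remaining three are explicit, zeroed padding. A small sketch of that wrapper; `FLAG_ENABLE_SHADING` is a hypothetical bit, not the real `PointCloudBatchFlags` value:

use bytemuck::{Pod, Zeroable};

/// A u32 in its own 16-byte row (stand-in for wgpu_buffer_types::U32RowPadded).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct U32RowPadded {
    v: u32,
    padding0: u32,
    padding1: u32,
    padding2: u32,
}

impl From<u32> for U32RowPadded {
    fn from(v: u32) -> Self {
        Self { v, padding0: 0, padding1: 0, padding2: 0 }
    }
}

/// Hypothetical flag bit; the real values live in `PointCloudBatchFlags`.
const FLAG_ENABLE_SHADING: u32 = 0b0001;

fn main() {
    // With bitflags this is the `batch_info.flags.bits.into()` conversion above.
    let flags: U32RowPadded = FLAG_ENABLE_SHADING.into();
    assert_eq!(flags.v, FLAG_ENABLE_SHADING);
    assert_eq!(std::mem::size_of::<U32RowPadded>(), 16);
    // Zeroed padding avoids uploading undefined bytes to the GPU.
    assert!(bytemuck::bytes_of(&flags)[4..].iter().all(|&b| b == 0));
}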
we are forced to have individual bind groups per rectangle and thus a draw call per rectangle. use smallvec::smallvec; -use std::num::NonZeroU64; use crate::{ - context::uniform_buffer_allocation_size, + allocator::create_and_fill_uniform_buffer_batch, depth_offset::DepthOffset, include_file, resource_managers::{GpuTexture2DHandle, ResourceManagerError}, view_builder::ViewBuilder, wgpu_resources::{ - BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, BufferDesc, GpuBindGroup, - GpuBindGroupLayoutHandle, GpuRenderPipelineHandle, PipelineLayoutDesc, RenderPipelineDesc, - SamplerDesc, ShaderModuleDesc, + BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, + GpuRenderPipelineHandle, PipelineLayoutDesc, RenderPipelineDesc, SamplerDesc, + ShaderModuleDesc, }, Rgba, }; @@ -36,14 +35,16 @@ mod gpu_data { use crate::wgpu_buffer_types; // Keep in sync with mirror in rectangle.wgsl - #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct UniformBuffer { - pub top_left_corner_position: wgpu_buffer_types::Vec3, - pub extent_u: wgpu_buffer_types::Vec3, + pub top_left_corner_position: wgpu_buffer_types::Vec3RowPadded, + pub extent_u: wgpu_buffer_types::Vec3RowPadded, pub extent_v: wgpu_buffer_types::Vec3Unpadded, pub depth_offset: f32, pub multiplicative_tint: crate::Rgba, + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 4], } } @@ -116,7 +117,8 @@ impl RectangleDrawData { ) -> Result { crate::profile_function!(); - let rectangle_renderer = ctx.renderers.get_or_create::<_, RectangleRenderer>( + let mut renderers = ctx.renderers.write(); + let rectangle_renderer = renderers.get_or_create::<_, RectangleRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -129,59 +131,23 @@ impl RectangleDrawData { }); } - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * rectangles.len() as u64; - - // Allocate all constant buffers at once. - // TODO(andreas): This should come from a per-frame allocator! - let uniform_buffer = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "rectangle uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "rectangle uniform buffers".into(), + rectangles.iter().map(|rectangle| gpu_data::UniformBuffer { + top_left_corner_position: rectangle.top_left_corner_position.into(), + extent_u: rectangle.extent_u.into(), + extent_v: rectangle.extent_v.into(), + depth_offset: rectangle.depth_offset as f32, + multiplicative_tint: rectangle.multiplicative_tint, + end_padding: Default::default(), + }), ); - // Fill staging buffer in a separate loop to avoid borrow checker issues - { - // TODO(andreas): This should come from a staging buffer. - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffer, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - - for (i, rectangle) in rectangles.iter().enumerate() { - let offset = i * allocation_size_per_uniform_buffer as usize; - - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! - // I.e. 
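Note the deliberate mix of padded and unpadded vec3s in the rectangle uniform buffer: `extent_v` stays 12 bytes so `depth_offset` completes the same 16-byte row, whereas the two earlier vec3s carry their own row padding. A sketch of that row accounting with stand-in field types (`[f32; 4]` replaces `crate::Rgba`):

use bytemuck::{Pod, Zeroable};

#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec3RowPadded { x: f32, y: f32, z: f32, padding: f32 }

/// 12-byte vec3 with no trailing padding: the next 4-byte field finishes the row.
#[repr(C)]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec3Unpadded { x: f32, y: f32, z: f32 }

#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod, Default)]
struct PaddingRow { p: [f32; 4] }

/// Stand-in for the rectangle `UniformBuffer`:
/// rows 0-1: padded vec3s, row 2: vec3 + f32, row 3: rgba, rows 4-15: padding.
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct UniformBuffer {
    top_left_corner_position: Vec3RowPadded,
    extent_u: Vec3RowPadded,
    extent_v: Vec3Unpadded,
    depth_offset: f32,             // shares a row with `extent_v`
    multiplicative_tint: [f32; 4], // stand-in for crate::Rgba
    end_padding: [PaddingRow; 16 - 4],
}

fn main() {
    assert_eq!(std::mem::size_of::<UniformBuffer>(), 256);
    let ubo = UniformBuffer {
        top_left_corner_position: Vec3RowPadded { x: 0.0, y: 1.0, z: 0.0, padding: 0.0 },
        extent_u: Vec3RowPadded { x: 1.0, y: 0.0, z: 0.0, padding: 0.0 },
        extent_v: Vec3Unpadded { x: 0.0, y: 0.0, z: 1.0 },
        depth_offset: 0.0,
        multiplicative_tint: [1.0; 4],
        end_padding: Default::default(),
    };
    // `depth_offset` occupies the 4 bytes directly after `extent_v` (bytes 44..48).
    assert_eq!(bytemuck::bytes_of(&ubo).len(), 256);
}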
this will crash randomly: - // - // let target_buffer = bytemuck::from_bytes_mut::( - // &mut staging_buffer[offset..(offset + uniform_buffer_size)], - // ); - // - // TODO(andreas): with our own staging buffers we could fix this very easily - - staging_buffer[offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::UniformBuffer { - top_left_corner_position: rectangle.top_left_corner_position.into(), - extent_u: rectangle.extent_u.into(), - extent_v: rectangle.extent_v.into(), - depth_offset: rectangle.depth_offset as f32, - multiplicative_tint: rectangle.multiplicative_tint, - })); - } - } - let mut bind_groups = Vec::with_capacity(rectangles.len()); - for (i, rectangle) in rectangles.iter().enumerate() { + for (rectangle, uniform_buffer) in + rectangles.iter().zip(uniform_buffer_bindings.into_iter()) + { let texture = ctx.texture_manager_2d.get(&rectangle.texture)?; let sampler = ctx.gpu_resources.samplers.get_or_create( @@ -211,13 +177,7 @@ impl RectangleDrawData { &BindGroupDesc { label: "rectangle".into(), entries: smallvec![ - BindGroupEntry::Buffer { - handle: uniform_buffer.handle, - offset: i as u64 * allocation_size_per_uniform_buffer, - size: NonZeroU64::new( - std::mem::size_of::() as u64 - ), - }, + uniform_buffer, BindGroupEntry::DefaultTextureView(texture.handle), BindGroupEntry::Sampler(sampler) ], diff --git a/crates/re_renderer/src/renderer/test_triangle.rs b/crates/re_renderer/src/renderer/test_triangle.rs index 2ac382562c09..2b5a627080e3 100644 --- a/crates/re_renderer/src/renderer/test_triangle.rs +++ b/crates/re_renderer/src/renderer/test_triangle.rs @@ -24,7 +24,7 @@ impl DrawData for TestTriangleDrawData { impl TestTriangleDrawData { pub fn new(ctx: &mut RenderContext) -> Self { - ctx.renderers.get_or_create::<_, TestTriangle>( + ctx.renderers.write().get_or_create::<_, TestTriangle>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 1f5981b278c6..defdeb80f770 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -3,13 +3,14 @@ use parking_lot::RwLock; use std::sync::Arc; use crate::{ + allocator::create_and_fill_uniform_buffer, context::RenderContext, global_bindings::FrameUniformBuffer, renderer::{ compositor::{Compositor, CompositorDrawData}, DrawData, Renderer, }, - wgpu_resources::{BufferDesc, GpuBindGroup, GpuTexture, TextureDesc}, + wgpu_resources::{GpuBindGroup, GpuTexture, TextureDesc}, DebugLabel, Rgba, Size, }; @@ -268,17 +269,6 @@ impl ViewBuilder { let tonemapping_draw_data = CompositorDrawData::new(ctx, &main_target_resolved); - // Setup frame uniform buffer - let frame_uniform_buffer = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: format!("{:?} - frame uniform buffer", config.name).into(), - size: std::mem::size_of::() as _, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, - ); - let aspect_ratio = config.resolution_in_pixel[0] as f32 / config.resolution_in_pixel[1] as f32; @@ -393,12 +383,11 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. 
- { - let mut frame_uniform_buffer_cpu = ctx - .cpu_write_gpu_read_belt - .lock() - .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); - frame_uniform_buffer_cpu.push(&FrameUniformBuffer { + // Setup frame uniform buffer + let frame_uniform_buffer = create_and_fill_uniform_buffer( + ctx, + format!("{:?} - frame uniform buffer", config.name).into(), + FrameUniformBuffer { view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), projection_from_view: projection_from_view.into(), projection_from_world: projection_from_world.into(), @@ -411,20 +400,14 @@ impl ViewBuilder { auto_size_points: auto_size_points.0, auto_size_lines: auto_size_lines.0, - depth_offset_factor, - _padding: glam::Vec3::ZERO, - }); - frame_uniform_buffer_cpu.copy_to_buffer( - ctx.active_frame.frame_global_command_encoder(&ctx.device), - &frame_uniform_buffer, - 0, - ); - } + depth_offset_factor: depth_offset_factor.into(), + }, + ); let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group( &mut ctx.gpu_resources, &ctx.device, - &frame_uniform_buffer, + frame_uniform_buffer, ); self.setup = Some(ViewTargetSetup { @@ -448,8 +431,8 @@ impl ViewBuilder { self.queued_draws.push(QueuedDraw { draw_func: Box::new(move |ctx, pass, draw_data| { - let renderer = ctx - .renderers + let renderers = ctx.renderers.read(); + let renderer = renderers .get::() .context("failed to retrieve renderer")?; let draw_data = draw_data @@ -556,10 +539,8 @@ impl ViewBuilder { pass.set_bind_group(0, &setup.bind_group_0, &[]); - let tonemapper = ctx - .renderers - .get::() - .context("get compositor")?; + let renderers = ctx.renderers.read(); + let tonemapper = renderers.get::().context("get compositor")?; tonemapper .draw(&ctx.gpu_resources, pass, &setup.compositor_draw_data) .context("composite into main view") diff --git a/crates/re_renderer/src/wgpu_buffer_types.rs b/crates/re_renderer/src/wgpu_buffer_types.rs index 4156bb6cc607..15686ce0491d 100644 --- a/crates/re_renderer/src/wgpu_buffer_types.rs +++ b/crates/re_renderer/src/wgpu_buffer_types.rs @@ -5,6 +5,48 @@ use bytemuck::{Pod, Zeroable}; +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod)] +pub struct F32RowPadded { + pub v: f32, + pub padding0: f32, + pub padding1: f32, + pub padding2: f32, +} + +impl From for F32RowPadded { + #[inline] + fn from(v: f32) -> Self { + F32RowPadded { + v, + padding0: 0.0, + padding1: 0.0, + padding2: 0.0, + } + } +} + +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod)] +pub struct U32RowPadded { + pub v: u32, + pub padding0: u32, + pub padding1: u32, + pub padding2: u32, +} + +impl From for U32RowPadded { + #[inline] + fn from(v: u32) -> Self { + U32RowPadded { + v, + padding0: 0, + padding1: 0, + padding2: 0, + } + } +} + #[repr(C, align(8))] #[derive(Clone, Copy, Zeroable, Pod)] pub struct Vec2 { @@ -21,17 +63,17 @@ impl From for Vec2 { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] -pub struct Vec2Padded { +pub struct Vec2RowPadded { pub x: f32, pub y: f32, pub padding0: f32, pub padding1: f32, } -impl From for Vec2Padded { +impl From for Vec2RowPadded { #[inline] fn from(v: glam::Vec2) -> Self { - Vec2Padded { + Vec2RowPadded { x: v.x, y: v.y, padding0: 0.0, @@ -42,17 +84,17 @@ impl From for Vec2Padded { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] -pub struct Vec3 { +pub struct Vec3RowPadded { pub x: f32, pub y: f32, pub z: f32, pub padding: f32, } -impl From for Vec3 { +impl From for Vec3RowPadded { #[inline] fn from(v: glam::Vec3) -> Self { - Vec3 { + Vec3RowPadded 
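The renames to `Vec2RowPadded`/`Vec3RowPadded` spell out that these types pad a value to a full 16-byte row, while the plain `Vec2` stays 8-byte aligned so two of them can still share a row. A tiny sketch of the difference (stand-in structs mirroring the ones added here):

use bytemuck::{Pod, Zeroable};

/// Two floats with 8-byte alignment: two of these can share a 16-byte row.
#[repr(C, align(8))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec2 { x: f32, y: f32 }

/// Two floats padded to a full 16-byte row (e.g. when followed by a vec3/vec4 in WGSL).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec2RowPadded { x: f32, y: f32, padding0: f32, padding1: f32 }

#[repr(C)]
#[derive(Clone, Copy, Zeroable, Pod)]
struct TwoPackedVec2s {
    a: Vec2,
    b: Vec2, // completes the row started by `a`
}

fn main() {
    let pair = TwoPackedVec2s {
        a: Vec2 { x: 0.0, y: 1.0 },
        b: Vec2 { x: 2.0, y: 3.0 },
    };
    assert_eq!(std::mem::size_of::<TwoPackedVec2s>(), 16); // both vec2s in one row
    assert_eq!(std::mem::size_of::<Vec2RowPadded>(), 16);  // a full row on its own
    assert_eq!(bytemuck::bytes_of(&pair).len(), 16);
}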
{ x: v.x, y: v.y, z: v.z, @@ -61,10 +103,10 @@ impl From for Vec3 { } } -impl From for Vec3 { +impl From for Vec3RowPadded { #[inline] fn from(v: glam::Vec3A) -> Self { - Vec3 { + Vec3RowPadded { x: v.x, y: v.y, z: v.z, @@ -160,10 +202,10 @@ impl From for Mat4 { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] pub struct Mat4x3 { - c0: Vec3, - c1: Vec3, - c2: Vec3, - c3: Vec3, + c0: Vec3RowPadded, + c1: Vec3RowPadded, + c2: Vec3RowPadded, + c3: Vec3RowPadded, } impl From for Mat4x3 { @@ -177,3 +219,13 @@ impl From for Mat4x3 { } } } + +/// A Vec4 of pure padding (i.e. 16 bytes of padding) +/// +/// Useful utility to pad uniform buffers out to a multiple of 16 rows, +/// (256 bytes is the alignment requirement for Uniform buffers) +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod, Default)] +pub struct PaddingRow { + p: [f32; 4], +} diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs index 8422ae97734a..7de882eaa7c1 100644 --- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs @@ -102,7 +102,7 @@ impl GpuBindGroupPool { /// Once ownership to the handle is given up, the bind group may be reclaimed in future frames. /// The handle also keeps alive any dependent resources. pub fn alloc( - &mut self, + &self, device: &wgpu::Device, desc: &BindGroupDesc, bind_group_layouts: &GpuBindGroupLayoutPool, diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index c6890749dc5c..7b96406aca95 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -59,7 +59,7 @@ impl GpuBufferPool { /// /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible /// either manually or using a higher level allocator. - pub fn alloc(&mut self, device: &wgpu::Device, desc: &BufferDesc) -> GpuBuffer { + pub fn alloc(&self, device: &wgpu::Device, desc: &BufferDesc) -> GpuBuffer { self.pool.alloc(desc, |desc| { device.create_buffer(&wgpu::BufferDescriptor { label: desc.label.get(), diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs index ad8c7f5c0d06..3cfdac02cf93 100644 --- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs @@ -81,7 +81,7 @@ pub struct GpuTexturePool { impl GpuTexturePool { /// Returns a reference-counted handle to a currently unused texture. /// Once ownership to the handle is given up, the texture may be reclaimed in future frames. - pub fn alloc(&mut self, device: &wgpu::Device, desc: &TextureDesc) -> GpuTexture { + pub fn alloc(&self, device: &wgpu::Device, desc: &TextureDesc) -> GpuTexture { crate::profile_function!(); self.pool.alloc(desc, |desc| { let texture = device.create_texture(&wgpu::TextureDescriptor {
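Allocating from the pools through `&self` instead of `&mut self` is what lets the staging belt and the uniform-buffer helper work against a shared `&RenderContext`; internally that implies interior mutability. A heavily reduced sketch of such a pool (illustrative `FakeResource`/`Handle` names; the real pools track resources per frame rather than via `Arc` counts alone):

use std::sync::Arc;
use parking_lot::Mutex;

/// Illustrative resource; in re_renderer this is a wgpu buffer, texture, or bind group.
struct FakeResource {
    label: String,
}

/// Reference-counted handle, as returned by the real pools.
type Handle = Arc<FakeResource>;

/// A pool that can allocate through a shared reference thanks to interior mutability.
#[derive(Default)]
struct ResourcePool {
    alive: Mutex<Vec<Handle>>,
}

impl ResourcePool {
    fn alloc(&self, label: &str) -> Handle {
        let resource = Arc::new(FakeResource { label: label.to_owned() });
        // The lock is only held for the bookkeeping, not for resource creation.
        self.alive.lock().push(resource.clone());
        resource
    }

    /// Frame maintenance: drop everything no longer referenced elsewhere.
    fn begin_frame(&self) {
        self.alive.lock().retain(|handle| Arc::strong_count(handle) > 1);
    }
}

fn main() {
    let pool = ResourcePool::default();

    let kept = pool.alloc("frame uniform buffer");
    {
        let _temporary = pool.alloc("lines batch uniform buffers");
    } // `_temporary` dropped here

    pool.begin_frame();
    assert_eq!(pool.alive.lock().len(), 1);
    assert_eq!(kept.label, "frame uniform buffer");
}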