diff --git a/crates/re_renderer/examples/2d.rs b/crates/re_renderer/examples/2d.rs
index 741a27f24539..047c37dfbe51 100644
--- a/crates/re_renderer/examples/2d.rs
+++ b/crates/re_renderer/examples/2d.rs
@@ -149,7 +149,7 @@ impl framework::Example for Render2D {
         // Moving the windows to a high dpi screen makes the second one bigger.
         // Also, it looks different under perspective projection.
         // The third point is automatic thickness which is determined by the point renderer implementation.
-        let mut point_cloud_builder = PointCloudBuilder::<()>::default();
+        let mut point_cloud_builder = PointCloudBuilder::<()>::new(re_ctx);
         point_cloud_builder
             .batch("points")
             .add_points_2d(
diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs
index 3022f19726d8..e0b09e1d830a 100644
--- a/crates/re_renderer/examples/framework.rs
+++ b/crates/re_renderer/examples/framework.rs
@@ -82,10 +82,11 @@ struct Application<E> {
     window: Window,
     surface: wgpu::Surface,
     surface_config: wgpu::SurfaceConfiguration,
-    re_ctx: RenderContext,
     time: Time,
 
     example: E,
+
+    re_ctx: RenderContext,
 }
 
 impl<E: Example + 'static> Application<E> {
@@ -277,6 +278,7 @@ impl<E: Example + 'static> Application<E> {
                         }
                     };
 
+                    self.re_ctx.before_submit();
                     self.re_ctx.queue.submit(
                         draw_results
                             .into_iter()
diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs
index a5f89d1ae242..2a663f30ca39 100644
--- a/crates/re_renderer/examples/multiview.rs
+++ b/crates/re_renderer/examples/multiview.rs
@@ -10,12 +10,11 @@ use rand::Rng;
 use re_renderer::{
     renderer::{
         GenericSkyboxDrawData, LineDrawData, LineStripFlags, MeshDrawData, MeshInstance,
-        PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudVertex,
         TestTriangleDrawData,
     },
     resource_managers::ResourceLifeTime,
     view_builder::{OrthographicCameraMode, Projection, TargetConfiguration, ViewBuilder},
-    Color32, LineStripSeriesBuilder, RenderContext, Rgba, Size,
+    Color32, LineStripSeriesBuilder, PointCloudBuilder, RenderContext, Rgba, Size,
 };
 use winit::event::{ElementState, VirtualKeyCode};
 
@@ -163,7 +162,8 @@ struct Multiview {
     mesh_instance_positions_and_colors: Vec<(glam::Vec3, Color32)>,
 
     // Want to have a large cloud of random points, but doing rng for all of them every frame is too slow
-    random_points: Vec<PointCloudVertex>,
+    random_points_positions: Vec<glam::Vec3>,
+    random_points_radii: Vec<Size>,
     random_points_colors: Vec<Color32>,
 }
 
@@ -188,17 +188,23 @@ impl Example for Multiview {
 
         let mut rnd = <rand::rngs::StdRng as rand::SeedableRng>::seed_from_u64(42);
         let random_point_range = -5.0_f32..5.0_f32;
-        let random_points = (0..500000)
-            .map(|_| PointCloudVertex {
-                position: glam::vec3(
+
+        let point_count = 500000;
+        let random_points_positions = (0..point_count)
+            .map(|_| {
+                glam::vec3(
                     rnd.gen_range(random_point_range.clone()),
                     rnd.gen_range(random_point_range.clone()),
                     rnd.gen_range(random_point_range.clone()),
-                ),
-                radius: Size::new_scene(rnd.gen_range(0.005..0.05)),
+                )
             })
             .collect_vec();
-        let random_points_colors = (0..500000).map(|_| random_color(&mut rnd)).collect_vec();
+        let random_points_radii = (0..point_count)
+            .map(|_| Size::new_scene(rnd.gen_range(0.005..0.05)))
+            .collect_vec();
+        let random_points_colors = (0..point_count)
+            .map(|_| random_color(&mut rnd))
+            .collect_vec();
 
         let model_mesh_instances = {
             let reader = std::io::Cursor::new(include_bytes!("rerun.obj.zip"));
@@ -232,7 +238,8 @@ impl Example for Multiview {
 
             model_mesh_instances,
             mesh_instance_positions_and_colors,
-            random_points,
+            random_points_positions,
+            random_points_radii,
             random_points_colors,
         }
     }
@@ -260,18 +267,19 @@ impl Example for Multiview {
         let triangle = TestTriangleDrawData::new(re_ctx);
         let skybox = GenericSkyboxDrawData::new(re_ctx);
         let lines = build_lines(re_ctx, seconds_since_startup);
-        let point_cloud = PointCloudDrawData::new(
-            re_ctx,
-            &self.random_points,
-            &self.random_points_colors,
-            &[PointCloudBatchInfo {
-                label: "Random points".into(),
-                world_from_obj: glam::Mat4::from_rotation_x(seconds_since_startup),
-                point_count: self.random_points.len() as _,
-                flags: PointCloudBatchFlags::ENABLE_SHADING,
-            }],
-        )
-        .unwrap();
+
+        let mut builder = PointCloudBuilder::<()>::new(re_ctx);
+        builder
+            .batch("Random Points")
+            .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup))
+            .add_points(
+                self.random_points_positions.len(),
+                self.random_points_positions.iter().cloned(),
+            )
+            .radii(self.random_points_radii.iter().cloned())
+            .colors(self.random_points_colors.iter().cloned());
+
+        let point_cloud = builder.to_draw_data(re_ctx).unwrap();
         let meshes = build_mesh_instances(
             re_ctx,
             &self.model_mesh_instances,
diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
new file mode 100644
index 000000000000..e85b69596b16
--- /dev/null
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -0,0 +1,417 @@
+use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc};
+
+use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool};
+
+/// A sub-allocated staging buffer that can be written to.
+///
+/// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it.
+///
+/// We do *not* allow reading from this buffer as it is typically write-combined memory.
+/// Reading would work, but it can be *very* slow.
+/// For details on why, see
+/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/)
+/// Note that the "vec like behavior" further encourages
+/// * not leaving holes
+/// * keeping writes sequential
+pub struct CpuWriteGpuReadBuffer<T: bytemuck::Pod + 'static> {
+    /// Write view into the relevant buffer portion.
+    ///
+    /// UNSAFE: The lifetime is transmuted to be `'static`.
+    /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk_buffer)!
+    write_view: wgpu::BufferViewMut<'static>,
+
+    /// Range in T elements in write_view that haven't been written yet.
+    unwritten_element_range: std::ops::Range<usize>,
+
+    chunk_buffer: GpuBuffer,
+    byte_offset_in_chunk_buffer: wgpu::BufferAddress,
+
+    /// Marker for the type whose alignment and size requirements are honored by `write_view`.
+    _type: std::marker::PhantomData<T>,
+}
+
+impl<T> CpuWriteGpuReadBuffer<T>
+where
+    T: bytemuck::Pod + 'static,
+{
+    /// Memory as slice of T.
+    ///
+    /// Do *not* make this public as we need to guarantee that the memory is *never* read from!
+    #[inline(always)]
+    fn as_slice(&mut self) -> &mut [T] {
+        &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()]
+    }
+
+    /// Pushes a slice of elements into the buffer.
+    ///
+    /// Panics if the data no longer fits into the buffer.
+    #[inline]
+    pub fn extend_from_slice(&mut self, elements: &[T]) {
+        self.as_slice().copy_from_slice(elements);
+        self.unwritten_element_range.start += elements.len();
+    }
+
+    /// Pushes several elements into the buffer.
+    ///
+    /// Panics if the data no longer fits into the buffer.
+    #[inline]
+    pub fn extend(&mut self, elements: impl Iterator<Item = T>) {
+        let mut num_elements = 0;
+        for (target, source) in self.as_slice().iter_mut().zip(elements) {
+            *target = source;
+            num_elements += 1;
+        }
+        self.unwritten_element_range.start += num_elements;
+    }
+
+    /// Pushes a single element into the buffer and advances the write pointer.
+    ///
+    /// Panics if the data no longer fits into the buffer.
+    #[inline]
+    pub fn push(&mut self, element: &T) {
+        self.as_slice()[0] = *element;
+        self.unwritten_element_range.start += 1;
+    }
+
+    /// The number of elements pushed into the buffer so far.
+    #[inline]
+    pub fn num_written(&self) -> usize {
+        self.unwritten_element_range.start
+    }
+
+    /// Copies the entire buffer to a texture and drops it.
+    pub fn copy_to_texture(
+        self,
+        encoder: &mut wgpu::CommandEncoder,
+        destination: wgpu::ImageCopyTexture<'_>,
+        bytes_per_row: Option<NonZeroU32>,
+        rows_per_image: Option<NonZeroU32>,
+        copy_size: wgpu::Extent3d,
+    ) {
+        encoder.copy_buffer_to_texture(
+            wgpu::ImageCopyBuffer {
+                buffer: &self.chunk_buffer,
+                layout: wgpu::ImageDataLayout {
+                    offset: self.byte_offset_in_chunk_buffer,
+                    bytes_per_row,
+                    rows_per_image,
+                },
+            },
+            destination,
+            copy_size,
+        );
+    }
+
+    /// Copies the entire buffer to another buffer and drops it.
+    pub fn copy_to_buffer(
+        mut self,
+        encoder: &mut wgpu::CommandEncoder,
+        destination: &GpuBuffer,
+        destination_offset: wgpu::BufferAddress,
+    ) {
+        encoder.copy_buffer_to_buffer(
+            &self.chunk_buffer,
+            self.byte_offset_in_chunk_buffer,
+            destination,
+            destination_offset,
+            self.write_view.deref_mut().len() as u64,
+        );
+    }
+}
+
+/// Internal chunk of the staging belt.
+struct Chunk {
+    buffer: GpuBuffer,
+
+    /// Starting at this offset the buffer is unused.
+    unused_offset: wgpu::BufferAddress,
+}
+
+impl Chunk {
+    fn remaining_capacity(&self) -> u64 {
+        self.buffer.size() - self.unused_offset
+    }
+
+    fn required_padding(write_view: &mut wgpu::BufferViewMut<'_>, alignment: u64) -> u64 {
+        // Use deref_mut explicitly because wgpu warns otherwise that read access is slow.
+        let ptr = write_view.deref_mut().as_ptr() as u64;
+        wgpu::util::align_to(ptr, alignment) - ptr
+    }
+
+    /// Caller needs to make sure that there is enough space plus potential padding.
+    fn allocate_aligned<T: bytemuck::Pod>(
+        &mut self,
+        num_elements: usize,
+        size_in_bytes: u64,
+        alignment: u64,
+    ) -> CpuWriteGpuReadBuffer<T> {
+        debug_assert!(num_elements * std::mem::size_of::<T>() <= size_in_bytes as usize);
+
+        // Optimistic first mapping attempt.
+        let mut start_offset = self.unused_offset;
+        let mut end_offset = start_offset + size_in_bytes;
+        let mut buffer_slice = self.buffer.slice(start_offset..end_offset);
+        let mut write_view = buffer_slice.get_mapped_range_mut();
+
+        // Check if it fulfills the requested alignment.
+        let required_padding = Self::required_padding(&mut write_view, alignment);
+        if required_padding != 0 {
+            // Undo mapping and try again with padding!
+            re_log::trace!(
+                "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}"
+            );
+
+            drop(write_view);
+
+            start_offset = self.unused_offset + required_padding;
+            end_offset = start_offset + size_in_bytes;
+            buffer_slice = self.buffer.slice(start_offset..end_offset);
+            write_view = buffer_slice.get_mapped_range_mut();
+
+            let required_padding = Self::required_padding(&mut write_view, alignment);
+            debug_assert_eq!(required_padding, 0);
+        }
+
+        debug_assert!(end_offset <= self.buffer.size());
+        self.unused_offset = end_offset;
+
+        #[allow(unsafe_code)]
+        // SAFETY:
+        // write_view has a lifetime dependency on the chunk's buffer - internally it holds a pointer to it!
+        //
+        // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it.
+        // Additionally, the buffer pool needs to ensure:
+        // * it can't drop buffers if there's still users
+        //      -> We assert on on that
+        // * buffers are never moved in memory
+        //      -> buffers are always owned by the pool and are always Arc.
+        //          This means it not allowed to move the buffer out.
+        //          (We could make them Pin<Arc<>> but this complicates things inside the BufferPool)
+        let write_view = unsafe {
+            std::mem::transmute::<wgpu::BufferViewMut<'_>, wgpu::BufferViewMut<'static>>(write_view)
+        };
+
+        CpuWriteGpuReadBuffer {
+            chunk_buffer: self.buffer.clone(),
+            byte_offset_in_chunk_buffer: start_offset,
+            write_view,
+            unwritten_element_range: 0..num_elements,
+            _type: std::marker::PhantomData,
+        }
+    }
+}
+
+/// Efficiently performs many buffer writes by sharing and reusing temporary buffers.
+///
+/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
+///
+/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs)
+/// However, there are some important differences:
+/// * can create buffers without yet knowing the target copy location
+/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!)
+/// * use of `re_renderer`'s resource pool
+/// * handles alignment in a defined manner
+///  (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508))
+pub struct CpuWriteGpuReadBelt {
+    /// Minimum size for new buffers.
+    chunk_size: u64,
+
+    /// Chunks which are CPU write at the moment.
+    active_chunks: Vec<Chunk>,
+
+    /// Chunks which are GPU read at the moment.
+    ///
+    /// I.e. they have scheduled transfers already; they are unmapped and one or more
+    /// command encoder has one or more `copy_buffer_to_buffer` commands with them
+    /// as source.
+    closed_chunks: Vec<Chunk>,
+
+    /// Chunks that are back from the GPU and ready to be mapped for write and put
+    /// into `active_chunks`.
+    free_chunks: Vec<Chunk>,
+
+    /// When closed chunks are mapped again, the map callback sends them here.
+    ///
+    /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full,
+    /// which means that in a single threaded situation (Web!) we might deadlock.
+    sender: mpsc::Sender<Chunk>,
+
+    /// Free chunks are received here to be put on `self.free_chunks`.
+    receiver: mpsc::Receiver<Chunk>,
+}
+
+impl CpuWriteGpuReadBelt {
+    /// All allocations of this allocator will be aligned to at least this size.
+    ///
+    /// Requiring a minimum alignment means we need to pad less often.
+    /// Also, it has the potential of making memcpy operations faster.
+    ///
+    /// Align to 4xf32. Should be enough for most usecases!
+    /// Needs to be larger or equal than [`wgpu::MAP_ALIGNMENT`].
+    /// For alignment requirements in `WebGPU` in general, refer to
+    /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment)
+    pub const MIN_ALIGNMENT: u64 = 16;
+
+    /// Create a cpu-write & gpu-read staging belt.
+    ///
+    /// The `chunk_size` is the unit of internal buffer allocation; writes will be
+    /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
+    /// chunk size should be:
+    ///
+    /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation;
+    /// * 1-4 times less than the total amount of data uploaded per submission
+    ///   (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and
+    /// * bigger is better, within these bounds.
+    ///
+    /// TODO(andreas): Adaptive chunk sizes
+    /// TODO(andreas): Shrinking after usage spikes?
+    pub fn new(chunk_size: wgpu::BufferSize) -> Self {
+        static_assertions::const_assert!(wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_ALIGNMENT);
+
+        let (sender, receiver) = mpsc::channel();
+        CpuWriteGpuReadBelt {
+            chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT),
+            active_chunks: Vec::new(),
+            closed_chunks: Vec::new(),
+            free_chunks: Vec::new(),
+            sender,
+            receiver,
+        }
+    }
+
+    /// Allocates a cpu writable buffer for `num_elements` instances of type `T`.
+    ///
+    /// Handles alignment requirements automatically, allowing arbitrarily aligned types without issues.
+    pub fn allocate<T: bytemuck::Pod>(
+        &mut self,
+        device: &wgpu::Device,
+        buffer_pool: &mut GpuBufferPool,
+        num_elements: usize,
+    ) -> CpuWriteGpuReadBuffer<T> {
+        // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string.
+        let alignment = (std::mem::align_of::<T>() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT);
+        // Pad out the size of the used buffer to a multiple of Self::MIN_ALIGNMENT.
+        // This increases our chance of having back to back allocations within a chunk.
+        let size = wgpu::util::align_to(
+            (std::mem::size_of::<T>() * num_elements) as wgpu::BufferAddress,
+            Self::MIN_ALIGNMENT,
+        );
+
+        // We need to be super careful with alignment since today wgpu
+        // has no guarantees on how pointers to mapped memory are aligned!
+        // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process!
+        //
+        // For details and (as of writing) ongoing discussion see https://github.com/gfx-rs/wgpu/issues/3508
+        //
+        // To work around this, we ask for a bigger size, so we can safely pad out
+        // if the returned pointer is not correctly aligned.
+        // (i.e. we will use _up to_ `required_size` bytes, but at least `size`)]
+        let maximum_padding = alignment - 1;
+        let max_required_size = size + maximum_padding;
+
+        // Try to find space in any of the active chunks first.
+        let mut chunk = if let Some(index) = self
+            .active_chunks
+            .iter_mut()
+            .position(|chunk| chunk.remaining_capacity() >= max_required_size)
+        {
+            self.active_chunks.swap_remove(index)
+        } else {
+            self.receive_chunks(); // ensure self.free_chunks is up to date
+
+            // Use a free chunk if possible, fall back to creating a new one if necessary.
+            if let Some(index) = self
+                .free_chunks
+                .iter()
+                .position(|chunk| chunk.remaining_capacity() >= max_required_size)
+            {
+                self.free_chunks.swap_remove(index)
+            } else {
+                // Allocation might be bigger than a chunk!
+                let buffer_size = wgpu::util::align_to(
+                    self.chunk_size.max(max_required_size),
+                    Self::MIN_ALIGNMENT,
+                );
+                // Happens relatively rarely, this is a noteworthy event!
+                re_log::debug!("Allocating new CpuWriteGpuReadBelt chunk of size {buffer_size}");
+                let buffer = buffer_pool.alloc(
+                    device,
+                    &BufferDesc {
+                        label: "CpuWriteGpuReadBelt buffer".into(),
+                        size: buffer_size,
+                        usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC,
+                        mapped_at_creation: true,
+                    },
+                );
+
+                Chunk {
+                    buffer,
+                    unused_offset: 0,
+                }
+            }
+        };
+
+        let cpu_buffer_view = chunk.allocate_aligned(num_elements, size, alignment);
+        self.active_chunks.push(chunk);
+        cpu_buffer_view
+    }
+
+    /// Prepare currently mapped buffers for use in a submission.
+    ///
+    /// This must be called before the command encoder(s) used in [`CpuWriteGpuReadBuffer`] copy operations are submitted.
+    ///
+    /// At this point, all the partially used staging buffers are closed (cannot be used for
+    /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done
+    /// copying the data from them.
+    pub fn before_queue_submit(&mut self) {
+        // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap
+        // https://github.com/gfx-rs/wgpu/issues/1468
+        // However, WebGPU does not support this!
+
+        for chunk in self.active_chunks.drain(..) {
+            chunk.buffer.unmap();
+            self.closed_chunks.push(chunk);
+        }
+    }
+
+    /// Recall all of the closed buffers back to be reused.
+    ///
+    /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`]
+    /// copy operations are submitted. Additional calls are harmless.
+    /// Not calling this as soon as possible may result in increased buffer memory usage.
+    pub fn after_queue_submit(&mut self) {
+        self.receive_chunks();
+
+        let sender = &self.sender;
+        for chunk in self.closed_chunks.drain(..) {
+            let sender = sender.clone();
+            chunk
+                .buffer
+                .clone()
+                .slice(..)
+                .map_async(wgpu::MapMode::Write, move |_| {
+                    let _ = sender.send(chunk);
+                });
+        }
+    }
+
+    /// Move all chunks that the GPU is done with (and are now mapped again)
+    /// from `self.receiver` to `self.free_chunks`.
+    fn receive_chunks(&mut self) {
+        while let Ok(mut chunk) = self.receiver.try_recv() {
+            chunk.unused_offset = 0;
+            self.free_chunks.push(chunk);
+        }
+    }
+}
+
+impl std::fmt::Debug for CpuWriteGpuReadBelt {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("StagingBelt")
+            .field("chunk_size", &self.chunk_size)
+            .field("active_chunks", &self.active_chunks.len())
+            .field("closed_chunks", &self.closed_chunks.len())
+            .field("free_chunks", &self.free_chunks.len())
+            .finish_non_exhaustive()
+    }
+}
diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs
new file mode 100644
index 000000000000..cb4ab783fd09
--- /dev/null
+++ b/crates/re_renderer/src/allocator/mod.rs
@@ -0,0 +1,8 @@
+//! High level GPU memory allocators.
+//!
+//! In contrast to the buffer pools in [`crate::wgpu_resources`], every allocator in here
+//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources.
+
+mod cpu_write_gpu_read_belt;
+
+pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer};
diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
index 81044cabb961..150d51320a5d 100644
--- a/crates/re_renderer/src/context.rs
+++ b/crates/re_renderer/src/context.rs
@@ -1,8 +1,10 @@
 use std::sync::Arc;
 
+use parking_lot::Mutex;
 use type_map::concurrent::{self, TypeMap};
 
 use crate::{
+    allocator::CpuWriteGpuReadBelt,
     config::RenderContextConfig,
     global_bindings::GlobalBindings,
     renderer::Renderer,
@@ -23,15 +25,19 @@ pub struct RenderContext {
     #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
     pub(crate) err_tracker: std::sync::Arc<crate::error_tracker::ErrorTracker>,
 
-    /// Utility type map that will be cleared every frame.
-    pub per_frame_data_helper: TypeMap,
-
-    pub gpu_resources: WgpuResourcePools,
     pub mesh_manager: MeshManager,
     pub texture_manager_2d: TextureManager2D,
+    pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,
 
-    // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ??
-    frame_index: u64,
+    /// List of unfinished queue submission via this context.
+    ///
+    /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`]
+    /// TODO(andreas): We rely on egui to to the "primary" submissions in re_viewer. It would be nice to take full control over all submissions.
+    inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,
+
+    pub active_frame: ActiveFrameContext,
+
+    pub gpu_resources: WgpuResourcePools, // Last due to drop order.
 }
 
 /// Immutable data that is shared between all [`Renderer`]
@@ -69,6 +75,34 @@ impl Renderers {
 }
 
 impl RenderContext {
+    /// Chunk size for our cpu->gpu buffer manager.
+    ///
+    /// For native: 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture)
+    /// For web (memory constraint!): 8MiB
+    #[cfg(not(target_arch = "wasm32"))]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 32);
+    #[cfg(target_arch = "wasm32")]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 8);
+
+    /// Limit maximum number of in flight submissions to this number.
+    ///
+    /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not
+    /// cause us to request more and more memory to prepare more and more submissions.
+    ///
+    /// Note that this is only indirectly related to number of buffered frames,
+    /// since buffered frames/blit strategy are all about the display<->gpu interface,
+    /// whereas this is about a *particular aspect* of the cpu<->gpu interface.
+    ///
+    /// Should be somewhere between 1-4, too high and we use up more memory and introduce latency,
+    /// too low and we may starve the GPU.
+    /// On the web we want to go lower because a lot more memory constraint.
+    #[cfg(not(target_arch = "wasm32"))]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 4;
+    #[cfg(target_arch = "wasm32")]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 2;
+
     pub fn new(
         device: Arc<wgpu::Device>,
         queue: Arc<wgpu::Queue>,
@@ -146,19 +180,41 @@ impl RenderContext {
 
             renderers,
 
-            per_frame_data_helper: TypeMap::new(),
-
             gpu_resources,
 
             mesh_manager,
             texture_manager_2d,
+            cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(Self::CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE.unwrap())),
 
             resolver,
 
             #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
             err_tracker,
 
-            frame_index: 0,
+            inflight_queue_submissions: Vec::new(),
+
+            active_frame: ActiveFrameContext { frame_global_command_encoder: None,             per_frame_data_helper: TypeMap::new(),
+                 frame_index: 0 }
+        }
+    }
+
+    fn poll_device(&mut self) {
+        crate::profile_function!();
+
+        // Ensure not too many queue submissions are in flight.
+        let num_submissions_to_wait_for = self
+            .inflight_queue_submissions
+            .len()
+            .saturating_sub(Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS);
+
+        if let Some(newest_submission_to_wait_for) = self
+            .inflight_queue_submissions
+            .drain(0..num_submissions_to_wait_for)
+            .last()
+        {
+            self.device.poll(wgpu::Maintain::WaitForSubmissionIndex(
+                newest_submission_to_wait_for,
+            ));
         }
     }
 
@@ -166,8 +222,18 @@ impl RenderContext {
     ///
     /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading.
     pub fn begin_frame(&mut self) {
-        self.frame_index += 1;
-        self.per_frame_data_helper.clear();
+        crate::profile_function!();
+
+        // Request used staging buffer back.
+        // TODO(andreas): If we'd control all submissions, we could move this directly after the submission which would be a bit better.
+        self.cpu_write_gpu_read_belt.lock().after_queue_submit();
+
+        self.active_frame = ActiveFrameContext {
+            frame_global_command_encoder: None,
+            frame_index: self.active_frame.frame_index + 1,
+            per_frame_data_helper: TypeMap::new(),
+        };
+        let frame_index = self.active_frame.frame_index;
 
         // Tick the error tracker so that it knows when to reset!
         // Note that we're ticking on begin_frame rather than raw frames, which
@@ -183,8 +249,8 @@ impl RenderContext {
             re_log::debug!(?modified_paths, "got some filesystem events");
         }
 
-        self.mesh_manager.begin_frame(self.frame_index);
-        self.texture_manager_2d.begin_frame(self.frame_index);
+        self.mesh_manager.begin_frame(frame_index);
+        self.texture_manager_2d.begin_frame(frame_index);
 
         {
             let WgpuResourcePools {
@@ -203,27 +269,87 @@ impl RenderContext {
             shader_modules.begin_frame(
                 &self.device,
                 &mut self.resolver,
-                self.frame_index,
+                frame_index,
                 &modified_paths,
             );
             render_pipelines.begin_frame(
                 &self.device,
-                self.frame_index,
+                frame_index,
                 shader_modules,
                 pipeline_layouts,
             );
 
             // Bind group maintenance must come before texture/buffer maintenance since it
             // registers texture/buffer use
-            bind_groups.begin_frame(self.frame_index, textures, buffers, samplers);
+            bind_groups.begin_frame(frame_index, textures, buffers, samplers);
 
-            textures.begin_frame(self.frame_index);
-            buffers.begin_frame(self.frame_index);
+            textures.begin_frame(frame_index);
+            buffers.begin_frame(frame_index);
 
-            pipeline_layouts.begin_frame(self.frame_index);
-            bind_group_layouts.begin_frame(self.frame_index);
-            samplers.begin_frame(self.frame_index);
+            pipeline_layouts.begin_frame(frame_index);
+            bind_group_layouts.begin_frame(frame_index);
+            samplers.begin_frame(frame_index);
         }
+
+        // Poll device *after* resource pool `begin_frame` since resource pools may each decide drop resources.
+        // Wgpu internally may then internally decide to let go of these buffers.
+        self.poll_device();
+    }
+
+    /// Call this at the end of a frame but before submitting command buffers (e.g. from [`crate::view_builder::ViewBuilder`])
+    pub fn before_submit(&mut self) {
+        crate::profile_function!();
+
+        // Unmap all staging buffers.
+        self.cpu_write_gpu_read_belt.lock().before_queue_submit();
+
+        if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() {
+            let command_buffer = command_encoder.finish();
+
+            // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
+            //                  How do we hook in there and make sure this buffer is submitted first?
+            self.inflight_queue_submissions
+                .push(self.queue.submit([command_buffer]));
+        }
+    }
+}
+
+impl Drop for RenderContext {
+    fn drop(&mut self) {
+        // Close global command encoder if there is any pending.
+        // Not doing so before shutdown causes errors.
+        if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() {
+            encoder.finish();
+        }
+    }
+}
+
+pub struct ActiveFrameContext {
+    /// Command encoder for all commands that should go in before view builder are submitted.
+    ///
+    /// This should be used for any gpu copy operation outside of a renderer or view builder.
+    /// (i.e. typically in [`crate::renderer::DrawData`] creation!)
+    frame_global_command_encoder: Option<wgpu::CommandEncoder>,
+
+    /// Utility type map that will be cleared every frame.
+    pub per_frame_data_helper: TypeMap,
+
+    /// Index of this frame. Is incremented for every render frame.
+    frame_index: u64,
+}
+
+impl ActiveFrameContext {
+    /// Gets or creates a command encoder that runs before all view builder encoder.
+    pub fn frame_global_command_encoder(
+        &mut self,
+        device: &wgpu::Device,
+    ) -> &mut wgpu::CommandEncoder {
+        self.frame_global_command_encoder.get_or_insert_with(|| {
+            device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
+                    .get(),
+            })
+        })
     }
 }
 
diff --git a/crates/re_renderer/src/lib.rs b/crates/re_renderer/src/lib.rs
index b89004ed22cf..dcbae79b06dc 100644
--- a/crates/re_renderer/src/lib.rs
+++ b/crates/re_renderer/src/lib.rs
@@ -14,6 +14,7 @@ pub mod renderer;
 pub mod resource_managers;
 pub mod view_builder;
 
+mod allocator;
 mod context;
 mod debug_label;
 mod depth_offset;
diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs
index da0edf449293..7f30c8fccb92 100644
--- a/crates/re_renderer/src/mesh.rs
+++ b/crates/re_renderer/src/mesh.rs
@@ -167,6 +167,7 @@ impl GpuMesh {
                     label: data.label.clone().push_str(" - vertices"),
                     size: vertex_buffer_combined_size,
                     usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
+                    mapped_at_creation: false,
                 },
             );
             let mut staging_buffer = queue
@@ -193,6 +194,7 @@ impl GpuMesh {
                     label: data.label.clone().push_str(" - indices"),
                     size: index_buffer_size,
                     usage: wgpu::BufferUsages::INDEX | wgpu::BufferUsages::COPY_DST,
+                    mapped_at_creation: false,
                 },
             );
             let mut staging_buffer = queue
@@ -216,6 +218,7 @@ impl GpuMesh {
                     label: data.label.clone().push_str(" - material uniforms"),
                     size: combined_buffers_size,
                     usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM,
+                    mapped_at_creation: false,
                 },
             );
 
diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs
index f265ae12e064..d13eb9105220 100644
--- a/crates/re_renderer/src/point_cloud_builder.rs
+++ b/crates/re_renderer/src/point_cloud_builder.rs
@@ -1,42 +1,45 @@
 use crate::{
+    allocator::CpuWriteGpuReadBuffer,
     renderer::{
         PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudDrawDataError,
         PointCloudVertex,
     },
-    Color32, DebugLabel, Size,
+    Color32, DebugLabel, RenderContext, Size,
 };
 
 /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`].
-///
-/// TODO(andreas): We could make significant optimizations here by making this builder capable
-/// of writing to a GPU readable memory location.
-/// This will require some ahead of time size limit, but should be feasible.
-/// But before that we first need to sort out cpu->gpu transfers better by providing staging buffers.
 pub struct PointCloudBuilder<PerPointUserData> {
     // Size of `point`/color`/`per_point_user_data` must be equal.
     pub vertices: Vec<PointCloudVertex>,
-    pub colors: Vec<Color32>,
+
+    pub(crate) color_buffer: CpuWriteGpuReadBuffer<Color32>,
     pub user_data: Vec<PerPointUserData>,
 
-    batches: Vec<PointCloudBatchInfo>,
+    pub(crate) batches: Vec<PointCloudBatchInfo>,
 }
 
-impl<PerPointUserData> Default for PointCloudBuilder<PerPointUserData> {
-    fn default() -> Self {
+impl<PerPointUserData> PointCloudBuilder<PerPointUserData>
+where
+    PerPointUserData: Default + Copy,
+{
+    pub fn new(ctx: &mut RenderContext) -> Self {
         const RESERVE_SIZE: usize = 512;
+
+        // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance!
+        let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::<Color32>(
+            &ctx.device,
+            &mut ctx.gpu_resources.buffers,
+            PointCloudDrawData::MAX_NUM_POINTS,
+        );
+
         Self {
             vertices: Vec::with_capacity(RESERVE_SIZE),
-            colors: Vec::with_capacity(RESERVE_SIZE),
+            color_buffer,
             user_data: Vec::with_capacity(RESERVE_SIZE),
             batches: Vec::with_capacity(16),
         }
     }
-}
 
-impl<PerPointUserData> PointCloudBuilder<PerPointUserData>
-where
-    PerPointUserData: Default + Copy,
-{
     /// Start of a new batch.
     #[inline]
     pub fn batch(
@@ -102,10 +105,10 @@ where
 
     /// Finalizes the builder and returns a point cloud draw data with all the points added so far.
     pub fn to_draw_data(
-        &self,
+        self,
         ctx: &mut crate::context::RenderContext,
     ) -> Result<PointCloudDrawData, PointCloudDrawDataError> {
-        PointCloudDrawData::new(ctx, &self.vertices, &self.colors, &self.batches)
+        PointCloudDrawData::new(ctx, self)
     }
 }
 
@@ -150,13 +153,14 @@ where
     /// Each time we `add_points`, or upon builder drop, make sure that we
     /// fill in any additional colors and user-data to have matched vectors.
     fn extend_defaults(&mut self) {
-        if self.0.colors.len() < self.0.vertices.len() {
-            self.0.colors.extend(
-                std::iter::repeat(Color32::WHITE).take(self.0.vertices.len() - self.0.colors.len()),
+        if self.0.color_buffer.num_written() < self.0.vertices.len() {
+            self.0.color_buffer.extend(
+                std::iter::repeat(Color32::WHITE)
+                    .take(self.0.vertices.len() - self.0.color_buffer.num_written()),
             );
         }
 
-        if self.0.user_data.len() < self.0.user_data.len() {
+        if self.0.user_data.len() < self.0.vertices.len() {
             self.0.user_data.extend(
                 std::iter::repeat(PerPointUserData::default())
                     .take(self.0.vertices.len() - self.0.user_data.len()),
@@ -186,7 +190,7 @@ where
 
         self.extend_defaults();
 
-        debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+        debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
         debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
 
         let old_size = self.0.vertices.len();
@@ -200,7 +204,6 @@ where
         let num_points = self.0.vertices.len() - old_size;
         self.batch_mut().point_count += num_points as u32;
 
-        self.0.colors.reserve(num_points);
         self.0.user_data.reserve(num_points);
 
         let new_range = old_size..self.0.vertices.len();
@@ -210,27 +213,28 @@ where
         PointsBuilder {
             vertices: &mut self.0.vertices[new_range],
             max_points,
-            colors: &mut self.0.colors,
+            colors: &mut self.0.color_buffer,
             user_data: &mut self.0.user_data,
         }
     }
 
     #[inline]
     pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> {
-        debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+        self.extend_defaults();
+
+        debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
         debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
 
         self.0.vertices.push(PointCloudVertex {
             position,
             radius: Size::AUTO,
         });
-        self.0.colors.push(Color32::WHITE);
-        self.0.user_data.push(PerPointUserData::default());
+        self.0.user_data.push(Default::default());
         self.batch_mut().point_count += 1;
 
         PointBuilder {
             vertex: self.0.vertices.last_mut().unwrap(),
-            color: self.0.colors.last_mut().unwrap(),
+            color: &mut self.0.color_buffer,
             user_data: self.0.user_data.last_mut().unwrap(),
         }
     }
@@ -265,7 +269,7 @@ where
 
 pub struct PointBuilder<'a, PerPointUserData> {
     vertex: &'a mut PointCloudVertex,
-    color: &'a mut Color32,
+    color: &'a mut CpuWriteGpuReadBuffer<Color32>,
     user_data: &'a mut PerPointUserData,
 }
 
@@ -279,9 +283,10 @@ where
         self
     }
 
+    /// This mustn't call this more than once.
     #[inline]
     pub fn color(self, color: Color32) -> Self {
-        *self.color = color;
+        self.color.push(&color);
         self
     }
 
@@ -298,7 +303,7 @@ pub struct PointsBuilder<'a, PerPointUserData> {
     // Colors and user-data are the Vec we append
     // the data to if provided.
     max_points: usize,
-    colors: &'a mut Vec<Color32>,
+    colors: &'a mut CpuWriteGpuReadBuffer<Color32>,
     user_data: &'a mut Vec<PerPointUserData>,
 }
 
@@ -308,6 +313,8 @@ where
 {
     /// Assigns radii to all points.
     ///
+    /// This mustn't call this more than once.
+    ///
     /// If the iterator doesn't cover all points, some will not be assigned.
     /// If the iterator provides more values than there are points, the extra values will be ignored.
     #[inline]
@@ -323,18 +330,22 @@ where
 
     /// Assigns colors to all points.
     ///
+    /// This mustn't call this more than once.
+    ///
     /// If the iterator doesn't cover all points, some will not be assigned.
     /// If the iterator provides more values than there are points, the extra values will be ignored.
     #[inline]
     pub fn colors(self, colors: impl Iterator<Item = Color32>) -> Self {
         crate::profile_function!();
         self.colors
-            .extend(colors.take(self.max_points - self.colors.len()));
+            .extend(colors.take(self.max_points - self.colors.num_written()));
         self
     }
 
     /// Assigns user data for all points in this builder.
     ///
+    /// This mustn't call this more than once.
+    ///
     /// User data is currently not available on the GPU.
     #[inline]
     pub fn user_data(self, data: impl Iterator<Item = PerPointUserData>) -> Self {
diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs
index 996fa897e69d..f8237709ebb6 100644
--- a/crates/re_renderer/src/renderer/lines.rs
+++ b/crates/re_renderer/src/renderer/lines.rs
@@ -503,6 +503,7 @@ impl LineDrawData {
                     label: "lines batch uniform buffers".into(),
                     size: combined_buffers_size,
                     usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+                    mapped_at_creation: false,
                 },
             );
 
diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs
index d9c7b734947b..14559b9f69e6 100644
--- a/crates/re_renderer/src/renderer/mesh_renderer.rs
+++ b/crates/re_renderer/src/renderer/mesh_renderer.rs
@@ -145,6 +145,7 @@ impl MeshDrawData {
                 label: "MeshDrawData instance buffer".into(),
                 size: instance_buffer_size,
                 usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
+                mapped_at_creation: false,
             },
         );
 
diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs
index 415a82550830..233911761fd2 100644
--- a/crates/re_renderer/src/renderer/point_cloud.rs
+++ b/crates/re_renderer/src/renderer/point_cloud.rs
@@ -19,7 +19,8 @@ use std::{
 };
 
 use crate::{
-    context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel,
+    context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel,
+    PointCloudBuilder,
 };
 use bitflags::bitflags;
 use bytemuck::Zeroable;
@@ -148,11 +149,9 @@ impl PointCloudDrawData {
     /// Number of vertices and colors has to be equal.
     ///
     /// If no batches are passed, all points are assumed to be in a single batch with identity transform.
-    pub fn new(
+    pub fn new<T>(
         ctx: &mut RenderContext,
-        vertices: &[PointCloudVertex],
-        colors: &[Color32],
-        batches: &[PointCloudBatchInfo],
+        builder: PointCloudBuilder<T>,
     ) -> Result<Self, PointCloudDrawDataError> {
         crate::profile_function!();
 
@@ -163,6 +162,9 @@ impl PointCloudDrawData {
             &mut ctx.resolver,
         );
 
+        let vertices = builder.vertices.as_slice();
+        let batches = builder.batches.as_slice();
+
         if vertices.is_empty() {
             return Ok(PointCloudDrawData {
                 bind_group_all_points: None,
@@ -194,23 +196,16 @@ impl PointCloudDrawData {
             0
         );
 
-        if vertices.len() != colors.len() {
-            return Err(PointCloudDrawDataError::NumberOfColorsNotEqualNumberOfVertices);
-        }
-
-        let (vertices, colors) = if vertices.len() >= Self::MAX_NUM_POINTS {
+        let vertices = if vertices.len() >= Self::MAX_NUM_POINTS {
             re_log::error_once!(
                 "Reached maximum number of supported points. Clamping down to {}, passed were {}.
  See also https://github.com/rerun-io/rerun/issues/957",
                 Self::MAX_NUM_POINTS,
                 vertices.len()
             );
-            (
-                &vertices[..Self::MAX_NUM_POINTS],
-                &colors[..Self::MAX_NUM_POINTS],
-            )
+            &vertices[..Self::MAX_NUM_POINTS]
         } else {
-            (vertices, colors)
+            vertices
         };
 
         // TODO(andreas): We want a "stack allocation" here that lives for one frame.
@@ -261,15 +256,6 @@ impl PointCloudDrawData {
                 .collect_vec()
         };
 
-        let color_staging = {
-            crate::profile_scope!("collect_colors");
-            colors
-                .iter()
-                .cloned()
-                .chain(std::iter::repeat(Color32::TRANSPARENT).take(num_points_zeroed))
-                .collect_vec()
-        };
-
         // Upload data from staging buffers to gpu.
         let size = wgpu::Extent3d {
             width: DATA_TEXTURE_SIZE,
@@ -298,26 +284,18 @@ impl PointCloudDrawData {
             );
         }
 
-        {
-            crate::profile_scope!("write_color_texture");
-            ctx.queue.write_texture(
-                wgpu::ImageCopyTexture {
-                    texture: &color_texture.texture,
-                    mip_level: 0,
-                    origin: wgpu::Origin3d::ZERO,
-                    aspect: wgpu::TextureAspect::All,
-                },
-                bytemuck::cast_slice(&color_staging),
-                wgpu::ImageDataLayout {
-                    offset: 0,
-                    bytes_per_row: NonZeroU32::new(
-                        DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32,
-                    ),
-                    rows_per_image: None,
-                },
-                size,
-            );
-        }
+        builder.color_buffer.copy_to_texture(
+            ctx.active_frame.frame_global_command_encoder(&ctx.device),
+            wgpu::ImageCopyTexture {
+                texture: &color_texture.texture,
+                mip_level: 0,
+                origin: wgpu::Origin3d::ZERO,
+                aspect: wgpu::TextureAspect::All,
+            },
+            NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32),
+            None,
+            size,
+        );
 
         let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc(
             &ctx.device,
@@ -347,6 +325,7 @@ impl PointCloudDrawData {
                     label: "point batch uniform buffers".into(),
                     size: combined_buffers_size,
                     usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+                    mapped_at_creation: false,
                 },
             );
 
diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs
index 5b325e13d24c..42f98378dceb 100644
--- a/crates/re_renderer/src/renderer/rectangles.rs
+++ b/crates/re_renderer/src/renderer/rectangles.rs
@@ -141,6 +141,7 @@ impl RectangleDrawData {
                 label: "rectangle uniform buffers".into(),
                 size: combined_buffers_size,
                 usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM,
+                mapped_at_creation: false,
             },
         );
 
diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs
index 0fcd491935da..1f5981b278c6 100644
--- a/crates/re_renderer/src/view_builder.rs
+++ b/crates/re_renderer/src/view_builder.rs
@@ -275,6 +275,7 @@ impl ViewBuilder {
                 label: format!("{:?} - frame uniform buffer", config.name).into(),
                 size: std::mem::size_of::<FrameUniformBuffer>() as _,
                 usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+                mapped_at_creation: false,
             },
         );
 
@@ -392,10 +393,12 @@ impl ViewBuilder {
         // Factor applied to depth offsets.
         let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower.
 
-        ctx.queue.write_buffer(
-            &frame_uniform_buffer,
-            0,
-            bytemuck::bytes_of(&FrameUniformBuffer {
+        {
+            let mut frame_uniform_buffer_cpu = ctx
+                .cpu_write_gpu_read_belt
+                .lock()
+                .allocate::<FrameUniformBuffer>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
+            frame_uniform_buffer_cpu.push(&FrameUniformBuffer {
                 view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
                 projection_from_view: projection_from_view.into(),
                 projection_from_world: projection_from_world.into(),
@@ -410,8 +413,13 @@ impl ViewBuilder {
 
                 depth_offset_factor,
                 _padding: glam::Vec3::ZERO,
-            }),
-        );
+            });
+            frame_uniform_buffer_cpu.copy_to_buffer(
+                ctx.active_frame.frame_global_command_encoder(&ctx.device),
+                &frame_uniform_buffer,
+                0,
+            );
+        }
 
         let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group(
             &mut ctx.gpu_resources,
diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs
index 668eee548b11..8422ae97734a 100644
--- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs
@@ -7,7 +7,7 @@ use crate::debug_label::DebugLabel;
 use super::{
     bind_group_layout_pool::{GpuBindGroupLayoutHandle, GpuBindGroupLayoutPool},
     buffer_pool::{GpuBuffer, GpuBufferHandle, GpuBufferPool},
-    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc},
+    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc},
     sampler_pool::{GpuSamplerHandle, GpuSamplerPool},
     texture_pool::{GpuTexture, GpuTextureHandle, GpuTexturePool},
 };
@@ -60,12 +60,16 @@ pub struct BindGroupDesc {
     pub layout: GpuBindGroupLayoutHandle,
 }
 
-impl SizedResourceDesc for BindGroupDesc {
+impl DynamicResourcesDesc for BindGroupDesc {
     fn resource_size_in_bytes(&self) -> u64 {
         // Size depends on gpu/driver (like with all resources).
         // We could guess something like a pointer per descriptor, but let's not pretend we know!
         0
     }
+
+    fn allow_reuse(&self) -> bool {
+        true
+    }
 }
 
 /// Resource pool for bind groups.
diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
index 21b837e53a2b..c6890749dc5c 100644
--- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
@@ -3,7 +3,7 @@ use std::hash::Hash;
 use crate::debug_label::DebugLabel;
 
 use super::{
-    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc},
+    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc},
     resource::PoolError,
 };
 
@@ -13,6 +13,7 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; }
 /// Once all instances are dropped, the buffer will be marked for reclamation in the following frame.
 pub type GpuBuffer = std::sync::Arc<DynamicResource<GpuBufferHandle, BufferDesc, wgpu::Buffer>>;
 
+/// Buffer creation descriptor.
 #[derive(Clone, Hash, PartialEq, Eq, Debug)]
 pub struct BufferDesc {
     /// Debug label of a buffer. This will show up in graphics debuggers for easy identification.
@@ -24,12 +25,26 @@ pub struct BufferDesc {
     /// Usages of a buffer. If the buffer is used in any way that isn't specified here, the operation
     /// will panic.
     pub usage: wgpu::BufferUsages,
+
+    /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`wgpu::BufferUsages::MAP_READ`] or
+    /// [`wgpu::BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation.
+    ///
+    /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later!
+    /// Furthermore, [`size`](#structfield.size) must be a multiple of [`wgpu::COPY_BUFFER_ALIGNMENT`].
+    pub mapped_at_creation: bool,
 }
 
-impl SizedResourceDesc for BufferDesc {
+impl DynamicResourcesDesc for BufferDesc {
     fn resource_size_in_bytes(&self) -> u64 {
         self.size
     }
+
+    fn allow_reuse(&self) -> bool {
+        // We can't re-use buffers that were mapped at creation since we don't know if the user
+        // unmapped the buffer.
+        // We could try to figure it out, but mapped-at-creation buffers should only be used by one of the dedicated allocators anyways!
+        !self.mapped_at_creation
+    }
 }
 
 #[derive(Default)]
@@ -39,7 +54,8 @@ pub struct GpuBufferPool {
 
 impl GpuBufferPool {
     /// Returns a reference-counted gpu buffer that is currently unused.
-    /// Once ownership is given up, the buffer may be reclaimed in future frames.
+    /// Once ownership is given up, the buffer may be reclaimed in future frames
+    /// unless `BufferDesc::mapped_at_creation` was true.
     ///
     /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible
     /// either manually or using a higher level allocator.
@@ -49,7 +65,7 @@ impl GpuBufferPool {
                 label: desc.label.get(),
                 size: desc.size,
                 usage: desc.usage,
-                mapped_at_creation: false,
+                mapped_at_creation: desc.mapped_at_creation,
             })
         })
     }
diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index 4d10fcc9f839..6da0f18b575a 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -12,17 +12,24 @@ use smallvec::SmallVec;
 
 use super::resource::PoolError;
 
-pub trait SizedResourceDesc {
+pub trait DynamicResourcesDesc {
     fn resource_size_in_bytes(&self) -> u64;
+
+    /// If true, a unused resources will be kept around for while and then re-used in following frames.
+    /// If false, it will be destroyed on [`DynamicResourcePool::begin_frame`].
+    fn allow_reuse(&self) -> bool;
 }
 
-pub struct DynamicResource<Handle, Desc, Res> {
+pub struct DynamicResource<Handle, Desc: Debug, Res> {
     pub inner: Res,
     pub creation_desc: Desc,
     pub handle: Handle,
 }
 
-impl<Handle, Desc, Res> std::ops::Deref for DynamicResource<Handle, Desc, Res> {
+impl<Handle, Desc, Res> std::ops::Deref for DynamicResource<Handle, Desc, Res>
+where
+    Desc: Debug,
+{
     type Target = Res;
 
     #[inline]
@@ -35,7 +42,7 @@ impl<Handle, Desc, Res> std::ops::Deref for DynamicResource<Handle, Desc, Res> {
 type AliveResourceMap<Handle, Desc, Res> =
     SlotMap<Handle, Option<Arc<DynamicResource<Handle, Desc, Res>>>>;
 
-struct DynamicResourcePoolProtectedState<Handle: Key, Desc, Res> {
+struct DynamicResourcePoolProtectedState<Handle: Key, Desc: Debug, Res> {
     /// All currently alive resources.
     /// We store any ref counted handle we give out in [`DynamicResourcePool::alloc`] here in order to keep it alive.
     /// Every [`DynamicResourcePool::begin_frame`] we check if the pool is now the only owner of the handle
@@ -51,7 +58,7 @@ struct DynamicResourcePoolProtectedState<Handle: Key, Desc, Res> {
 ///
 /// Unlike in [`super::static_resource_pool::StaticResourcePool`], a resource can not be uniquely
 /// identified by its description, as the same description can apply to several different resources.
-pub(super) struct DynamicResourcePool<Handle: Key, Desc, Res> {
+pub(super) struct DynamicResourcePool<Handle: Key, Desc: Debug, Res> {
     state: RwLock<DynamicResourcePoolProtectedState<Handle, Desc, Res>>,
 
     current_frame_index: u64,
@@ -59,7 +66,10 @@ pub(super) struct DynamicResourcePool<Handle: Key, Desc, Res> {
 }
 
 /// We cannot #derive(Default) as that would require Handle/Desc/Res to implement Default too.
-impl<Handle: Key, Desc, Res> Default for DynamicResourcePool<Handle, Desc, Res> {
+impl<Handle: Key, Desc, Res> Default for DynamicResourcePool<Handle, Desc, Res>
+where
+    Desc: Debug,
+{
     fn default() -> Self {
         Self {
             state: RwLock::new(DynamicResourcePoolProtectedState {
@@ -75,7 +85,7 @@ impl<Handle: Key, Desc, Res> Default for DynamicResourcePool<Handle, Desc, Res>
 impl<Handle, Desc, Res> DynamicResourcePool<Handle, Desc, Res>
 where
     Handle: Key,
-    Desc: Clone + Eq + Hash + Debug + SizedResourceDesc,
+    Desc: Clone + Eq + Hash + Debug + DynamicResourcesDesc,
 {
     pub fn alloc<F: FnOnce(&Desc) -> Res>(
         &self,
@@ -85,24 +95,27 @@ where
         let mut state = self.state.write();
 
         // First check if we can reclaim a resource we have around from a previous frame.
-        let inner_resource =
-            if let Entry::Occupied(mut entry) = state.last_frame_deallocated.entry(desc.clone()) {
-                re_log::trace!(?desc, "Reclaimed previously discarded resource",);
-
-                let handle = entry.get_mut().pop().unwrap();
-                if entry.get().is_empty() {
-                    entry.remove();
+        let inner_resource = (|| {
+            if desc.allow_reuse() {
+                if let Entry::Occupied(mut entry) = state.last_frame_deallocated.entry(desc.clone())
+                {
+                    re_log::trace!(?desc, "Reclaimed previously discarded resource");
+                    let inner_resource = entry.get_mut().pop().unwrap();
+                    if entry.get().is_empty() {
+                        entry.remove();
+                    }
+                    return inner_resource;
                 }
-                handle
+            }
             // Otherwise create a new resource
-            } else {
-                let resource = creation_func(desc);
-                self.total_resource_size_in_bytes.fetch_add(
-                    desc.resource_size_in_bytes(),
-                    std::sync::atomic::Ordering::Relaxed,
-                );
-                resource
-            };
+            re_log::debug!(?desc, "Allocated new resource");
+            let inner_resource = creation_func(desc);
+            self.total_resource_size_in_bytes.fetch_add(
+                desc.resource_size_in_bytes(),
+                std::sync::atomic::Ordering::Relaxed,
+            );
+            inner_resource
+        })();
 
         let handle = state.alive_resources.insert_with_key(|handle| {
             Some(Arc::new(DynamicResource {
@@ -195,6 +208,30 @@ where
             .load(std::sync::atomic::Ordering::Relaxed)
     }
 }
+impl<Handle, Desc, Res> Drop for DynamicResourcePool<Handle, Desc, Res>
+where
+    Handle: Key,
+    Desc: Debug,
+{
+    fn drop(&mut self) {
+        // TODO(andreas): We're failing this check currently on re_viewer's shutdown.
+        // This is primarily the case due to the way we store the render ctx itself and other things on egui-wgpu's paint callback resources
+        // We shouldn't do this as it makes a whole lot of other things cumbersome. Instead, we should store it directly on the `App`
+        // where we control the drop order.
+
+        // for (_, alive_resource) in self.state.read().alive_resources.iter() {
+        //     let alive_resource = alive_resource
+        //         .as_ref()
+        //         .expect("Alive resources should never be None");
+        //     let ref_count = Arc::strong_count(alive_resource);
+
+        //     assert!(ref_count == 1,
+        //             "Resource has still {} owner(s) at the time of pool destruction. Description desc was {:?}",
+        //             ref_count - 1,
+        //             &alive_resource.creation_desc);
+        // }
+    }
+}
 
 #[cfg(test)]
 mod tests {
@@ -206,17 +243,21 @@ mod tests {
         },
     };
 
-    use super::{DynamicResourcePool, SizedResourceDesc};
+    use super::{DynamicResourcePool, DynamicResourcesDesc};
 
     slotmap::new_key_type! { pub struct ConcreteHandle; }
 
     #[derive(Clone, PartialEq, Eq, Hash, Debug)]
     pub struct ConcreteResourceDesc(u32);
 
-    impl SizedResourceDesc for ConcreteResourceDesc {
+    impl DynamicResourcesDesc for ConcreteResourceDesc {
         fn resource_size_in_bytes(&self) -> u64 {
             1
         }
+
+        fn allow_reuse(&self) -> bool {
+            true
+        }
     }
 
     #[derive(Debug)]
diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs
index d4907b73b178..ad8c7f5c0d06 100644
--- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs
@@ -3,7 +3,7 @@ use std::hash::Hash;
 use crate::debug_label::DebugLabel;
 
 use super::{
-    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc},
+    dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc},
     resource::PoolError,
 };
 
@@ -46,7 +46,7 @@ pub struct TextureDesc {
     pub usage: wgpu::TextureUsages,
 }
 
-impl SizedResourceDesc for TextureDesc {
+impl DynamicResourcesDesc for TextureDesc {
     /// Number of bytes this texture is expected to take.
     ///
     /// The actual number might be both bigger (padding) and lower (gpu sided compression).
@@ -68,6 +68,10 @@ impl SizedResourceDesc for TextureDesc {
 
         size_in_bytes
     }
+
+    fn allow_reuse(&self) -> bool {
+        true
+    }
 }
 #[derive(Default)]
 pub struct GpuTexturePool {
diff --git a/crates/re_viewer/src/app.rs b/crates/re_viewer/src/app.rs
index 6d01a41916e5..d45afa2376de 100644
--- a/crates/re_viewer/src/app.rs
+++ b/crates/re_viewer/src/app.rs
@@ -491,6 +491,8 @@ impl eframe::App for App {
                             self.rx.source(),
                         );
                     }
+
+                    render_ctx.before_submit();
                 }
             });
 
diff --git a/crates/re_viewer/src/ui/space_view.rs b/crates/re_viewer/src/ui/space_view.rs
index f06bbae25b37..3e9c7d693ff1 100644
--- a/crates/re_viewer/src/ui/space_view.rs
+++ b/crates/re_viewer/src/ui/space_view.rs
@@ -176,7 +176,7 @@ impl SpaceView {
                     &self.space_path,
                     self.data_blueprint.data_blueprints_projected(),
                 );
-                let mut scene = view_spatial::SceneSpatial::default();
+                let mut scene = view_spatial::SceneSpatial::new(ctx.render_ctx);
                 scene.load(ctx, &query, &transforms, highlights);
                 self.view_state
                     .ui_spatial(ctx, ui, &self.space_path, scene, self.id, highlights);
diff --git a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs
index 18166e05d587..84376f4994c2 100644
--- a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs
+++ b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs
@@ -115,7 +115,6 @@ pub struct SceneSpatialUiData {
     pub images: Vec<Image>,
 }
 
-#[derive(Default)]
 pub struct SceneSpatial {
     pub annotation_map: AnnotationMap,
     pub primitives: SceneSpatialPrimitives,
@@ -146,6 +145,17 @@ fn instance_path_hash_if_interactive(
 pub type Keypoints = HashMap<(ClassId, i64), HashMap<KeypointId, glam::Vec3>>;
 
 impl SceneSpatial {
+    pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self {
+        Self {
+            annotation_map: Default::default(),
+            primitives: SceneSpatialPrimitives::new(re_ctx),
+            ui: Default::default(),
+            num_logged_2d_objects: Default::default(),
+            num_logged_3d_objects: Default::default(),
+            space_cameras: Default::default(),
+        }
+    }
+
     /// Loads all 3D objects into the scene according to the given query.
     pub(crate) fn load(
         &mut self,
diff --git a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs
index 56b8e323e58c..22a1fd28dd31 100644
--- a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs
+++ b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs
@@ -10,7 +10,6 @@ use super::MeshSource;
 /// TODO(andreas): Right now we're using `re_renderer` data structures for reading (bounding box & picking).
 ///                 In the future, this will be more limited as we're going to gpu staging data as soon as possible
 ///                 which is very slow to read. See [#594](https://github.com/rerun-io/rerun/pull/594)
-#[derive(Default)]
 pub struct SceneSpatialPrimitives {
     /// Estimated bounding box of all data in scene coordinates. Accumulated.
     pub(super) bounding_box: macaw::BoundingBox,
@@ -31,6 +30,17 @@ const AXIS_COLOR_Y: Color32 = Color32::from_rgb(0, 240, 0);
 const AXIS_COLOR_Z: Color32 = Color32::from_rgb(80, 80, 255);
 
 impl SceneSpatialPrimitives {
+    pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self {
+        Self {
+            bounding_box: macaw::BoundingBox::nothing(),
+            textured_rectangles_ids: Default::default(),
+            textured_rectangles: Default::default(),
+            line_strips: Default::default(),
+            points: PointCloudBuilder::new(re_ctx),
+            meshes: Default::default(),
+        }
+    }
+
     /// bounding box covering the rendered scene
     pub fn bounding_box(&self) -> macaw::BoundingBox {
         self.bounding_box
diff --git a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs
index 172341ac30b6..23be71806264 100644
--- a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs
+++ b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs
@@ -8,7 +8,11 @@ use re_data_store::EntityPath;
 use re_log_types::component_types::TensorTrait;
 use re_renderer::view_builder::TargetConfiguration;
 
-use super::{eye::Eye, scene::AdditionalPickingInfo, ViewSpatialState};
+use super::{
+    eye::Eye,
+    scene::{AdditionalPickingInfo, SceneSpatialUiData},
+    ViewSpatialState,
+};
 use crate::{
     misc::{HoveredSpace, Item, SelectionHighlight, SpaceViewHighlights},
     ui::{
@@ -301,43 +305,13 @@ fn view_2d_scrollable(
 
     // ------------------------------------------------------------------------
 
-    // Draw a re_renderer driven view.
-    // Camera & projection are configured to ingest space coordinates directly.
-    {
-        crate::profile_scope!("build command buffer for 2D view {}", space.to_string());
-
-        let Ok(target_config) = setup_target_config(
-            &painter,
-            space_from_ui,
-            space_from_pixel,
-            &space.to_string(),
-            state.auto_size_config(response.rect.size()),
-        ) else {
-            return response;
-        };
-
-        let Ok(callback) = create_scene_paint_callback(
-            ctx.render_ctx,
-            target_config, painter.clip_rect(),
-            &scene.primitives,
-            &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()),
-        ) else {
-            return response;
-        };
-
-        painter.add(callback);
-    }
-
-    // ------------------------------------------------------------------------
-
-    // Add egui driven labels on top of re_renderer content.
-    painter.extend(create_labels(
-        &mut scene,
+    let label_shapes = create_labels(
+        &mut scene.ui,
         ui_from_space,
         space_from_ui,
         parent_ui,
         highlights,
-    ));
+    );
 
     // ------------------------------------------------------------------------
 
@@ -452,6 +426,33 @@ fn view_2d_scrollable(
 
     // ------------------------------------------------------------------------
 
+    // Draw a re_renderer driven view.
+    // Camera & projection are configured to ingest space coordinates directly.
+    {
+        crate::profile_scope!("build command buffer for 2D view {}", space.to_string());
+
+        let Ok(target_config) = setup_target_config(
+            &painter,
+            space_from_ui,
+            space_from_pixel,
+            &space.to_string(),
+            state.auto_size_config(response.rect.size()),
+        ) else {
+            return response;
+        };
+
+        let Ok(callback) = create_scene_paint_callback(
+            ctx.render_ctx,
+            target_config, painter.clip_rect(),
+            scene.primitives,
+            &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()),
+        ) else {
+            return response;
+        };
+
+        painter.add(callback);
+    }
+
     project_onto_other_spaces(ctx, space, &response, &space_from_ui, depth_at_pointer);
     painter.extend(show_projections_from_3d_space(
         ctx,
@@ -460,11 +461,14 @@ fn view_2d_scrollable(
         &ui_from_space,
     ));
 
+    // Add egui driven labels on top of re_renderer content.
+    painter.extend(label_shapes);
+
     response
 }
 
 fn create_labels(
-    scene: &mut SceneSpatial,
+    scene_ui: &mut SceneSpatialUiData,
     ui_from_space: RectTransform,
     space_from_ui: RectTransform,
     parent_ui: &mut egui::Ui,
@@ -472,9 +476,9 @@ fn create_labels(
 ) -> Vec<Shape> {
     crate::profile_function!();
 
-    let mut label_shapes = Vec::with_capacity(scene.ui.labels_2d.len() * 2);
+    let mut label_shapes = Vec::with_capacity(scene_ui.labels_2d.len() * 2);
 
-    for label in &scene.ui.labels_2d {
+    for label in &scene_ui.labels_2d {
         let (wrap_width, text_anchor_pos) = match label.target {
             Label2DTarget::Rect(rect) => {
                 let rect_in_ui = ui_from_space.transform_rect(rect);
@@ -534,8 +538,7 @@ fn create_labels(
         label_shapes.push(Shape::rect_filled(bg_rect, 3.0, fill_color));
         label_shapes.push(Shape::galley(text_rect.center_top(), galley));
 
-        scene
-            .ui
+        scene_ui
             .pickable_ui_rects
             .push((space_from_ui.transform_rect(bg_rect), label.labled_instance));
     }
diff --git a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs
index b4d1ed2a2359..88590aba0cb7 100644
--- a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs
+++ b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs
@@ -473,7 +473,7 @@ pub fn view_3d(
         ui,
         eye,
         rect,
-        &scene,
+        scene,
         ctx.render_ctx,
         &space.to_string(),
         state.auto_size_config(rect.size()),
@@ -484,7 +484,7 @@ fn paint_view(
     ui: &mut egui::Ui,
     eye: Eye,
     rect: egui::Rect,
-    scene: &SceneSpatial,
+    scene: SceneSpatial,
     render_ctx: &mut RenderContext,
     name: &str,
     auto_size_config: re_renderer::AutoSizeConfig,
@@ -516,7 +516,7 @@ fn paint_view(
         render_ctx,
         target_config,
         rect,
-        &scene.primitives, &ScreenBackground::GenericSkybox)
+        scene.primitives, &ScreenBackground::GenericSkybox)
     else {
         return;
     };
diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs
index 2c9201982add..30a2758bb653 100644
--- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs
+++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs
@@ -18,7 +18,7 @@ pub fn create_scene_paint_callback(
     render_ctx: &mut RenderContext,
     target_config: TargetConfiguration,
     clip_rect: egui::Rect,
-    primitives: &SceneSpatialPrimitives,
+    primitives: SceneSpatialPrimitives,
     background: &ScreenBackground,
 ) -> anyhow::Result<egui::PaintCallback> {
     let pixels_from_point = target_config.pixels_from_point;
@@ -41,7 +41,7 @@ pub enum ScreenBackground {
 fn create_and_fill_view_builder(
     render_ctx: &mut RenderContext,
     target_config: TargetConfiguration,
-    primitives: &SceneSpatialPrimitives,
+    primitives: SceneSpatialPrimitives,
     background: &ScreenBackground,
 ) -> anyhow::Result<(wgpu::CommandBuffer, ViewBuilder)> {
     let mut view_builder = ViewBuilder::default();
@@ -85,6 +85,7 @@ fn renderer_paint_callback(
     let command_buffer = std::sync::Arc::new(Mutex::new(Some(command_buffer)));
 
     let composition_view_builder_map = render_ctx
+        .active_frame
         .per_frame_data_helper
         .entry::<ViewBuilderMap>()
         .or_insert_with(Default::default);
@@ -111,7 +112,10 @@ fn renderer_paint_callback(
                     //let clip_rect = info.clip_rect_in_pixels();
 
                     let ctx = paint_callback_resources.get::<RenderContext>().unwrap();
-                    ctx.per_frame_data_helper.get::<ViewBuilderMap>().unwrap()[view_builder_handle]
+                    ctx.active_frame
+                        .per_frame_data_helper
+                        .get::<ViewBuilderMap>()
+                        .unwrap()[view_builder_handle]
                         .composite(ctx, render_pass, screen_position)
                         .expect("Failed compositing view builder with main target.");
                 }),