From 137deba52c87c20eaa07891c15340de3d239dea5 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 11:47:01 +0100 Subject: [PATCH 01/21] Allow mapped_on_creation for wgpu buffer --- crates/re_renderer/src/mesh.rs | 3 + crates/re_renderer/src/renderer/lines.rs | 1 + .../re_renderer/src/renderer/mesh_renderer.rs | 1 + .../re_renderer/src/renderer/point_cloud.rs | 1 + crates/re_renderer/src/renderer/rectangles.rs | 1 + crates/re_renderer/src/view_builder.rs | 1 + .../src/wgpu_resources/bind_group_pool.rs | 8 ++- .../src/wgpu_resources/buffer_pool.rs | 27 ++++++-- .../wgpu_resources/dynamic_resource_pool.rs | 67 +++++++++++++------ .../src/wgpu_resources/texture_pool.rs | 8 ++- 10 files changed, 88 insertions(+), 30 deletions(-) diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs index bc08f6cc6548..aeb977c27476 100644 --- a/crates/re_renderer/src/mesh.rs +++ b/crates/re_renderer/src/mesh.rs @@ -167,6 +167,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - vertices"), size: vertex_buffer_combined_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -195,6 +196,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - indices"), size: index_buffer_size, usage: wgpu::BufferUsages::INDEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -224,6 +226,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - material uniforms"), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs index bb82c7f475ea..7405148cfc44 100644 --- a/crates/re_renderer/src/renderer/lines.rs +++ b/crates/re_renderer/src/renderer/lines.rs @@ -511,6 +511,7 @@ impl LineDrawData { label: "lines batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs index 1d312a5387ab..8d71b4f8b16c 100644 --- a/crates/re_renderer/src/renderer/mesh_renderer.rs +++ b/crates/re_renderer/src/renderer/mesh_renderer.rs @@ -145,6 +145,7 @@ impl MeshDrawData { label: "MeshDrawData instance buffer".into(), size: instance_buffer_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 164a8473b541..166b0fd781b6 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -357,6 +357,7 @@ impl PointCloudDrawData { label: "point batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs index fd79df19d6a5..71a84d4565d5 100644 --- a/crates/re_renderer/src/renderer/rectangles.rs +++ b/crates/re_renderer/src/renderer/rectangles.rs @@ -141,6 +141,7 @@ impl RectangleDrawData { label: "rectangle uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git 
a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index fdcec7cf146c..098422ae03cf 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -270,6 +270,7 @@ impl ViewBuilder { label: format!("{:?} - frame uniform buffer", config.name).into(), size: std::mem::size_of::() as _, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs index 1a746342090b..aee1112c9b08 100644 --- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs @@ -7,7 +7,7 @@ use crate::debug_label::DebugLabel; use super::{ bind_group_layout_pool::{GpuBindGroupLayoutHandle, GpuBindGroupLayoutPool}, buffer_pool::{GpuBufferHandle, GpuBufferHandleStrong, GpuBufferPool}, - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, sampler_pool::{GpuSamplerHandle, GpuSamplerPool}, texture_pool::{GpuTextureHandle, GpuTextureHandleStrong, GpuTexturePool}, @@ -66,12 +66,16 @@ pub struct BindGroupDesc { pub layout: GpuBindGroupLayoutHandle, } -impl SizedResourceDesc for BindGroupDesc { +impl DynamicResourcesDesc for BindGroupDesc { fn resource_size_in_bytes(&self) -> u64 { // Size depends on gpu/driver (like with all resources). // We could guess something like a pointer per descriptor, but let's not pretend we know! 0 } + + fn allow_reuse(&self) -> bool { + true + } } /// Resource pool for bind groups. diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index 16268afb6f7f..f62059d59c09 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -13,6 +13,7 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; } /// Once all strong handles are dropped, the bind group will be marked for reclamation in the following frame. pub type GpuBufferHandleStrong = std::sync::Arc; +/// Buffer creation descriptor. #[derive(Clone, Hash, PartialEq, Eq, Debug)] pub struct BufferDesc { /// Debug label of a buffer. This will show up in graphics debuggers for easy identification. @@ -24,12 +25,27 @@ pub struct BufferDesc { /// Usages of a buffer. If the buffer is used in any way that isn't specified here, the operation /// will panic. pub usage: wgpu::BufferUsages, + + /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`BufferUsages::MAP_READ`] or + /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation. + /// + /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later! + /// Furthermore, [`size`](#structfield.size) must be a multiple of + /// [`COPY_BUFFER_ALIGNMENT`]. 
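+ /// ([`COPY_BUFFER_ALIGNMENT`] is 4 bytes at the time of writing.)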
+    pub mapped_at_creation: bool,
 }
 
-impl SizedResourceDesc for BufferDesc {
+impl DynamicResourcesDesc for BufferDesc {
     fn resource_size_in_bytes(&self) -> u64 {
         self.size
     }
+
+    fn allow_reuse(&self) -> bool {
+        // We can't re-use buffers that were mapped at creation since we don't know if the user
+        // unmapped the buffer.
+        // We could try to figure it out, but mapped-at-creation buffers should only be used by one of the dedicated allocators anyways!
+        !self.mapped_at_creation
+    }
 }
 
 #[derive(Default)]
@@ -39,7 +55,8 @@ pub struct GpuBufferPool {
 impl GpuBufferPool {
     /// Returns a ref counted handle to a currently unused buffer.
-    /// Once ownership to the handle is given up, the buffer may be reclaimed in future frames.
+    /// Once ownership to the handle is given up, the buffer may be reclaimed in future frames,
+    /// unless `BufferDesc::mapped_at_creation` was true.
     ///
     /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible
     /// either manually or using a higher level allocator.
@@ -49,7 +66,7 @@ impl GpuBufferPool {
                 label: desc.label.get(),
                 size: desc.size,
                 usage: desc.usage,
-                mapped_at_creation: false,
+                mapped_at_creation: desc.mapped_at_creation,
             })
         })
     }
@@ -73,7 +90,7 @@ impl GpuBufferPool {
     }
 
     /// Internal method to retrieve a strong handle from a weak handle (used by [`super::GpuBindGroupPool`])
-    /// without inrementing the ref-count (note the returned reference!).
+    /// without incrementing the ref-count (note the returned reference!).
     pub(super) fn get_strong_handle(&self, handle: GpuBufferHandle) -> &GpuBufferHandleStrong {
         self.pool.get_strong_handle(handle)
     }
diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index 9edbaa52e1dd..2e7cb5d0de65 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -11,8 +11,12 @@ use smallvec::{smallvec, SmallVec};
 
 use super::resource::PoolError;
 
-pub trait SizedResourceDesc {
+pub trait DynamicResourcesDesc {
     fn resource_size_in_bytes(&self) -> u64;
+
+    /// If true, unused resources will be kept around for a while and then re-used in following frames.
+    /// If false, they will be destroyed on [`DynamicResourcePool::begin_frame`].
+    fn allow_reuse(&self) -> bool;
 }
 
 /// Generic resource pool for all resources that have varying contents beyond their description.
@@ -54,11 +58,15 @@ impl Default for DynamicResourcePool
 impl<Handle, Desc, Res> DynamicResourcePool<Handle, Desc, Res>
 where
     Handle: Key,
-    Desc: Clone + Eq + Hash + Debug + SizedResourceDesc,
+    Desc: Clone + Eq + Hash + Debug + DynamicResourcesDesc,
 {
-    pub fn alloc<F: FnOnce(&Desc) -> Res>(&mut self, desc: &Desc, creation_func: F) -> Arc<Handle> {
+    fn alloc_internal<F: FnOnce(&Desc) -> Res>(
+        &mut self,
+        desc: &Desc,
+        creation_func: F,
+    ) -> Arc<Handle> {
         // First check if we can reclaim a resource we have around from a previous frame.
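+        // (Descriptors with `allow_reuse() == false`, e.g. buffers mapped at creation, always take the fresh-allocation path below.)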
-        let handle =
+        if desc.allow_reuse() {
             if let Entry::Occupied(mut entry) = self.last_frame_deallocated.entry(desc.clone()) {
                 re_log::trace!(?desc, "Reclaimed previously discarded resource",);
@@ -66,14 +74,18 @@ where
                 if entry.get().is_empty() {
                     entry.remove();
                 }
-                handle
-            // Otherwise create a new resource
-            } else {
-                let resource = creation_func(desc);
-                self.total_resource_size_in_bytes += desc.resource_size_in_bytes();
-                Arc::new(self.resources.insert((desc.clone(), resource)))
-            };
+                return handle;
+            }
+        }
+
+        // Otherwise create a new resource
+        let resource = creation_func(desc);
+        self.total_resource_size_in_bytes += desc.resource_size_in_bytes();
+        Arc::new(self.resources.insert((desc.clone(), resource)))
+    }
 
+    pub fn alloc<F: FnOnce(&Desc) -> Res>(&mut self, desc: &Desc, creation_func: F) -> Arc<Handle> {
+        let handle = self.alloc_internal(desc, creation_func);
         self.alive_handles.insert(*handle, Arc::clone(&handle));
         handle
     }
@@ -102,10 +114,10 @@ where
             "Drained dangling resources from last frame",
         );
         for handle in handles {
-            if let Some((_, res)) = self.resources.remove(*handle) {
+            if let Some((desc, res)) = self.resources.remove(*handle) {
                 on_destroy_resource(&res);
+                self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
             }
-            self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
         }
     }
 
@@ -117,13 +129,20 @@ where
         self.alive_handles.retain(|handle, strong_handle| {
             if Arc::strong_count(strong_handle) == 1 {
                 let desc = &self.resources[handle].0;
-                match self.last_frame_deallocated.entry(desc.clone()) {
-                    Entry::Occupied(mut e) => {
-                        e.get_mut().push(Arc::clone(strong_handle));
-                    }
-                    Entry::Vacant(e) => {
-                        e.insert(smallvec![Arc::clone(strong_handle)]);
+
+                // If allowed, put it on the "last frame deallocated" list instead of destroying the resource immediately.
+                if desc.allow_reuse() {
+                    match self.last_frame_deallocated.entry(desc.clone()) {
+                        Entry::Occupied(mut e) => {
+                            e.get_mut().push(Arc::clone(strong_handle));
+                        }
+                        Entry::Vacant(e) => {
+                            e.insert(smallvec![Arc::clone(strong_handle)]);
+                        }
                     }
+                } else if let Some((desc, res)) = self.resources.remove(handle) {
+                    on_destroy_resource(&res);
+                    self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
                 }
                 false
             } else {
@@ -159,7 +178,7 @@ mod tests {
 
     use slotmap::Key;
 
-    use super::{DynamicResourcePool, SizedResourceDesc};
+    use super::{DynamicResourcePool, DynamicResourcesDesc};
     use crate::wgpu_resources::resource::PoolError;
 
     slotmap::new_key_type! 
{ pub struct ConcreteHandle; } @@ -167,10 +186,14 @@ mod tests { #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct ConcreteResourceDesc(u32); - impl SizedResourceDesc for ConcreteResourceDesc { + impl DynamicResourcesDesc for ConcreteResourceDesc { fn resource_size_in_bytes(&self) -> u64 { 1 } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Debug)] @@ -256,6 +279,8 @@ mod tests { } } + // TODO: Add test for resources without re-use + fn allocate_resources( descs: &[u32], pool: &mut DynamicResourcePool, diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs index 85ea6b008b35..f8b93d27af37 100644 --- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -45,7 +45,7 @@ pub struct TextureDesc { pub usage: wgpu::TextureUsages, } -impl SizedResourceDesc for TextureDesc { +impl DynamicResourcesDesc for TextureDesc { /// Number of bytes this texture is expected to take. /// /// The actual number might be both bigger (padding) and lower (gpu sided compression). @@ -67,6 +67,10 @@ impl SizedResourceDesc for TextureDesc { size_in_bytes } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Default)] pub struct GpuTexturePool { From ef93d724c33347f84fddf1c0e345071abd1ce374 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 13:48:26 +0100 Subject: [PATCH 02/21] Add CpuWriteGpuReadBelt and use it for frameuniform buffer as poc --- .../src/allocator/cpu_write_gpu_read_belt.rs | 392 ++++++++++++++++++ crates/re_renderer/src/allocator/mod.rs | 8 + crates/re_renderer/src/context.rs | 50 +++ crates/re_renderer/src/lib.rs | 1 + crates/re_renderer/src/view_builder.rs | 32 ++ .../src/wgpu_resources/buffer_pool.rs | 3 +- 6 files changed, 484 insertions(+), 2 deletions(-) create mode 100644 crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs create mode 100644 crates/re_renderer/src/allocator/mod.rs diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs new file mode 100644 index 000000000000..7a65a3bb6096 --- /dev/null +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -0,0 +1,392 @@ +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; + +/// Internal chunk of the staging belt. +struct Chunk { + buffer: GpuBufferHandleStrong, + size: usize, +} + +/// Chunk that the CPU is writing to currently. +struct ActiveChunk { + chunk: Chunk, + + /// Write view into the entire buffer. + /// + /// UNSAFE: The lifetime is transmuted to be `'static`. + /// In actuality it is tied to the lifetime of [`buffer`](#structfield.chunk.buffer)! + write_view: wgpu::BufferViewMut<'static>, + + /// Safety mechanism to track the number of uses of this buffer. + /// + /// As long it is bigger 1, we are NOT allowed to deactivate the chunk! + safety_usage_counter: Arc<()>, + + /// At what offset is [`write_view`](#structfield.write_view) unused. 
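+ /// (Everything before this offset has already been handed out; handed-out ranges never overlap.)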
+ unused_offset: usize, +} + +impl Chunk { + #[allow(unsafe_code)] + fn into_active(self, buffer_pool: &GpuBufferPool) -> ActiveChunk { + // Get the entire mapped slice up front because mainly because there is no way to know the memory alignment beforehand! + // After discussing this on the wgpu dev channel, consensus was reached that they should be 4xf32 aligned. + // TODO: Github issue link. + // + // It also safes us a bit of time since we don't have to run through all the checks on every `allocate` call, + // but this is likely negligible (citation needed). + // + // Wgpu isn't fond of that though since it ties the (compile time) lifetime of the returned `wgpu::BufferSlice` + // to the lifetime of the buffer (after all the mapping should never outlive the buffer!). + // Therefore, we need to workaround this a bit. + // + // Note that in the JavaScript/wasm api of WebGPU this is trivially possible, see + // https://www.w3.org/TR/webgpu/#dom-gpubuffer-getmappedrange + // (but it also ends up doing an extra memcpy to that's an unfair comparison) + + let wgpu_buffer = buffer_pool + .get_resource(&self.buffer) + .expect("Invalid buffer handle"); + let buffer_slice = wgpu_buffer.slice(..); + let write_view = buffer_slice.get_mapped_range_mut(); + + // SAFETY: + // Things we need to guarantee manually now: + // * Buffer needs to stay alive for the time CpuWriteGpuReadBuffer is held. + // -> We keep a reference count to `chunk.buffer` + // * Nobody holds a view into our memory by the time + // -> We use `safety_usage_counter` and panic if there is more than one owner when we close the buffer view + // * Returned ranges into `write_view` NEVER overlap. + // -> We track the unused_offset and never return any range ahead of it! + let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + + ActiveChunk { + chunk: self, + write_view, + unused_offset: 0, + safety_usage_counter: Arc::new(()), + } + } +} + +// TODO(andreas): Make wgpu::BufferMappedRange Send upstream +#[allow(unsafe_code)] +/// SAFETY: +/// TODO: Link to pr here doing so +unsafe impl std::marker::Send for ActiveChunk {} + +/// A suballocated staging buffer that can be written to. +/// +/// We do *not* allow reading from this buffer as it is typically write-combined memory. +/// Reading would work, but it can be *very* slow. +pub struct CpuWriteGpuReadBuffer { + write_only_memory: &'static mut [T], + + #[allow(dead_code)] + safety_usage_counter: Arc<()>, + + chunk_buffer: GpuBufferHandleStrong, + offset_in_chunk_buffer: usize, +} + +impl CpuWriteGpuReadBuffer +where + T: bytemuck::Pod + 'static, +{ + /// Writes several objects to the buffer at a given location. + /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement. + /// (panics otherwise) + /// + /// We do *not* allow reading from this buffer as it is typically write-combined memory. + /// Reading would work, but it can be *very* slow. + #[inline] + pub fn write(&mut self, elements: impl Iterator, num_elements_offset: usize) { + for (target, source) in self + .write_only_memory + .iter_mut() + .skip(num_elements_offset) + .zip(elements) + { + *target = source; + } + } + + /// Writes a single objects to the buffer at a given location. + /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement. 
+ /// (panics otherwise) + #[inline(always)] + pub fn write_single(&mut self, element: &T, num_elements_offset: usize) { + self.write_only_memory[num_elements_offset] = *element; + } + + // pub fn copy_to_texture( + // self, + // encoder: &mut wgpu::CommandEncoder, + // buffer_pool: &GpuBufferPool, + // destination: wgpu::ImageCopyTexture<'_>, + // bytes_per_row: Option, + // rows_per_image: Option, + // copy_size: wgpu::Extent3d, + // ) -> Result<(), PoolError> { + // let buffer = buffer_pool.get_resource(&self.chunk_buffer)?; + + // encoder.copy_buffer_to_texture( + // wgpu::ImageCopyBuffer { + // buffer, + // layout: wgpu::ImageDataLayout { + // offset: self.offset_in_chunk, + // bytes_per_row, + // rows_per_image, + // }, + // }, + // destination, + // copy_size, + // ); + + // Ok(()) + // } + + /// Consume this data view and copy it to another gpu buffer. + pub fn copy_to_buffer( + self, + encoder: &mut wgpu::CommandEncoder, + buffer_pool: &GpuBufferPool, + destination: &GpuBufferHandleStrong, + destination_offset: wgpu::BufferAddress, + ) -> Result<(), PoolError> { + encoder.copy_buffer_to_buffer( + buffer_pool.get_resource(&self.chunk_buffer)?, + self.offset_in_chunk_buffer as _, + buffer_pool.get_resource(destination)?, + destination_offset, + (self.write_only_memory.len() * std::mem::size_of::()) as u64, + ); + Ok(()) + } +} + +/// Efficiently performs many buffer writes by sharing and reusing temporary buffers. +/// +/// Internally it uses a ring-buffer of staging buffers that are sub-allocated. +/// +/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs) +/// However, there are some important differences: +/// * can create buffers without yet knowing the target copy location +/// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!) +/// * use of `re_renderer`'s resource pool +/// * handles alignment +pub struct CpuWriteGpuReadBelt { + /// Minimum size for new buffers. + chunk_size: usize, + + /// Chunks which are CPU write at the moment. + active_chunks: Vec, + + /// Chunks which are GPU read at the moment. + /// + /// I.e. they have scheduled transfers already; they are unmapped and one or more + /// command encoder has one or more `copy_buffer_to_buffer` commands with them + /// as source. + closed_chunks: Vec, + + /// Chunks that are back from the GPU and ready to be mapped for write and put + /// into `active_chunks`. + free_chunks: Vec, + + /// When closed chunks are mapped again, the map callback sends them here. + /// + /// Behind a mutex, so that our StagingBelt becomes Sync. + /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full, + /// which means that in a single threaded situation (Web!) we might deadlock. + sender: mpsc::Sender, + + /// Free chunks are received here to be put on `self.free_chunks`. + receiver: mpsc::Receiver, +} + +impl CpuWriteGpuReadBelt { + /// Create a new staging belt. + /// + /// The `chunk_size` is the unit of internal buffer allocation; writes will be + /// sub-allocated within each chunk. Therefore, for optimal use of memory, the + /// chunk size should be: + /// + /// * larger than the largest single [`StagingBelt::write_buffer()`] operation; + /// * 1-4 times less than the total amount of data uploaded per submission + /// (per [`StagingBelt::finish()`]); and + /// * bigger is better, within these bounds. 
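+ ///
+ /// For example (illustrative numbers only): a frame that uploads two 8 MiB buffers and nothing
+ /// larger is served well by a 16 MiB chunk size.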
+ /// + /// TODO(andreas): Adaptive chunk sizes + /// TODO(andreas): Shrinking after usage spikes? + pub fn new(chunk_size: usize) -> Self { + let (sender, receiver) = mpsc::channel(); + CpuWriteGpuReadBelt { + chunk_size, + active_chunks: Vec::new(), + closed_chunks: Vec::new(), + free_chunks: Vec::new(), + sender, + receiver, + } + } + + /// Allocates a cpu writable buffer for `num_elements` instances of T. + /// + /// Handles alignment requirements automatically which allows faster copy operations. + #[allow(unsafe_code)] + pub fn allocate( + &mut self, + device: &wgpu::Device, + buffer_pool: &mut GpuBufferPool, + num_elements: usize, + ) -> CpuWriteGpuReadBuffer { + let alignment = std::mem::align_of::().min(wgpu::MAP_ALIGNMENT as usize); + let size = std::mem::size_of::() * num_elements; + + // We need to be super careful with alignment since wgpu + // has no guarantees on how pointers to mapped memory are aligned! + + // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. + + // Try to find space in any of the active chunks first. + let mut active_chunk = if let Some(index) = + self.active_chunks.iter_mut().position(|active_chunk| { + size + (active_chunk.write_view.deref_mut().as_ptr() as usize + + active_chunk.unused_offset) + % alignment + <= active_chunk.chunk.size - active_chunk.unused_offset + }) { + self.active_chunks.swap_remove(index) + } else { + self.receive_chunks(); // ensure self.free_chunks is up to date + + // We don't know yet how aligned the mapped pointer is. So we need to ask for more memory! + let required_size = size + alignment - 1; + + // Use a free chunk if possible, fall back to creating a new one if necessary. + if let Some(index) = self + .free_chunks + .iter() + .position(|chunk| chunk.size >= required_size) + { + self.free_chunks.swap_remove(index).into_active(buffer_pool) + } else { + // Allocation might be bigger than a chunk. + let size = self.chunk_size.max(required_size); + + let buffer = buffer_pool.alloc( + device, + &BufferDesc { + label: "CpuWriteGpuReadBelt buffer".into(), + size: size as _, + usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: true, + }, + ); + + Chunk { buffer, size }.into_active(buffer_pool) + } + }; + + let alignment_padding = (active_chunk.write_view.deref_mut().as_ptr() as usize + + active_chunk.unused_offset) + % alignment; + let start_offset = active_chunk.unused_offset + alignment_padding; + let end_offset = start_offset + size; + + let write_only_memory = bytemuck::cast_slice_mut( + &mut active_chunk.write_view.deref_mut()[start_offset..end_offset], + ); + + // SAFETY: + // The slice we take out of the write view has a lifetime dependency on the write_view. + // This means it is self-referential. We handle this lifetime at runtime instead by + // + // See also `ActiveChunk::into_active`. + let write_only_memory = + unsafe { std::mem::transmute::<&mut [T], &'static mut [T]>(write_only_memory) }; + let cpu_buffer_view = CpuWriteGpuReadBuffer { + write_only_memory, + safety_usage_counter: active_chunk.safety_usage_counter.clone(), + chunk_buffer: active_chunk.chunk.buffer.clone(), + offset_in_chunk_buffer: start_offset, + }; + + self.active_chunks.push(active_chunk); + + cpu_buffer_view + } + + /// Prepare currently mapped buffers for use in a submission. + /// + /// This must be called before the command encoder(s) used in [`StagingBuffer`] copy operations are submitted. 
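+ /// (In this crate that is handled by `RenderContext::before_submit`, added in this patch.)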
+ /// + /// At this point, all the partially used staging buffers are closed (cannot be used for + /// further writes) until after [`StagingBelt::recall()`] is called *and* the GPU is done + /// copying the data from them. + pub fn before_queue_submit(&mut self, buffer_pool: &GpuBufferPool) { + // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap + // https://github.com/gfx-rs/wgpu/issues/1468 + // However, WebGPU does not support this! + + for active_chunk in self.active_chunks.drain(..) { + assert!(Arc::strong_count(&active_chunk.safety_usage_counter) == 1, + "Chunk still in use. All instances of `CpuWriteGpuReadBelt` need to be freed before submitting."); + + // Ensure write view is dropped before we unmap! + drop(active_chunk.write_view); + + buffer_pool + .get_resource(&active_chunk.chunk.buffer) + .expect("invalid buffer handle") + .unmap(); + self.closed_chunks.push(active_chunk.chunk); + } + } + + /// Recall all of the closed buffers back to be reused. + /// + /// This must only be called after the command encoder(s) used in [`StagingBuffer`] + /// copy operations are submitted. Additional calls are harmless. + /// Not calling this as soon as possible may result in increased buffer memory usage. + pub fn after_queue_submit(&mut self, buffer_pool: &GpuBufferPool) { + self.receive_chunks(); + + let sender = &self.sender; + for chunk in self.closed_chunks.drain(..) { + let sender = sender.clone(); + buffer_pool + .get_resource(&chunk.buffer) + .expect("invalid buffer handle") + .slice(..) + .map_async(wgpu::MapMode::Write, move |_| { + let _ = sender.send(chunk); + }); + } + } + + /// Move all chunks that the GPU is done with (and are now mapped again) + /// from `self.receiver` to `self.free_chunks`. + fn receive_chunks(&mut self) { + while let Ok(chunk) = self.receiver.try_recv() { + self.free_chunks.push(chunk); + } + } +} + +// impl fmt::Debug for StagingBelt { +// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// f.debug_struct("StagingBelt") +// .field("chunk_size", &self.chunk_size) +// .field("active_chunks", &self.active_chunks.len()) +// .field("closed_chunks", &self.closed_chunks.len()) +// .field("free_chunks", &self.free_chunks.len()) +// .finish_non_exhaustive() +// } +// } diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs new file mode 100644 index 000000000000..7ae3d967a7e0 --- /dev/null +++ b/crates/re_renderer/src/allocator/mod.rs @@ -0,0 +1,8 @@ +//! High level GPU memory allocators. +//! +//! In contrast to the buffer pools in [`crate::wgpu_resources`], every allocator in here +//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources. 
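+//!
+//! Rough usage sketch of the belt below (all names introduced in this patch): `allocate` a
+//! `CpuWriteGpuReadBuffer` from the `CpuWriteGpuReadBelt`, fill it on the CPU, schedule the upload
+//! with `copy_to_buffer`, and bracket queue submission with `before_queue_submit` / `after_queue_submit`.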
+ +mod cpu_write_gpu_read_belt; + +pub use cpu_write_gpu_read_belt::CpuWriteGpuReadBelt; diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index a0bfc4474ad0..5080acd7495b 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -1,8 +1,10 @@ use std::sync::Arc; +use parking_lot::Mutex; use type_map::concurrent::{self, TypeMap}; use crate::{ + allocator::CpuWriteGpuReadBelt, config::RenderContextConfig, global_bindings::GlobalBindings, renderer::Renderer, @@ -28,6 +30,13 @@ pub struct RenderContext { pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, + pub(crate) cpu_write_gpu_read_belt: Mutex, + + /// Command encoder for all commands that should go in before view builder are submitted. + /// + /// This should be used for any gpu copy operation outside of a renderer or view builder. + /// (i.e. typically in [`crate::renderer::DrawData`] creation!) + pub(crate) frame_global_command_encoder: wgpu::CommandEncoder, // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? frame_index: u64, @@ -137,6 +146,8 @@ impl RenderContext { let texture_manager_2d = TextureManager2D::new(device.clone(), queue.clone(), &mut gpu_resources.textures); + let frame_global_command_encoder = Self::create_frame_global_command_encoder(&device); + RenderContext { device, queue, @@ -148,6 +159,10 @@ impl RenderContext { mesh_manager, texture_manager_2d, + // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(1024 * 1024 * 32)), + + frame_global_command_encoder, resolver, @@ -158,6 +173,12 @@ impl RenderContext { } } + fn create_frame_global_command_encoder(device: &wgpu::Device) -> wgpu::CommandEncoder { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder").get(), + }) + } + /// Call this at the beginning of a new frame. /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. @@ -219,6 +240,35 @@ impl RenderContext { bind_group_layouts.begin_frame(self.frame_index); samplers.begin_frame(self.frame_index); } + + // Retrieve unused staging buffer. + self.cpu_write_gpu_read_belt + .lock() + .after_queue_submit(&self.gpu_resources.buffers); + } + + /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`] + pub fn before_submit(&mut self) { + // Unmap all staging buffers. + self.cpu_write_gpu_read_belt + .lock() + .before_queue_submit(&self.gpu_resources.buffers); + + let mut command_encoder = Self::create_frame_global_command_encoder(&self.device); + std::mem::swap(&mut self.frame_global_command_encoder, &mut command_encoder); + let command_buffer = command_encoder.finish(); + + // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. + // How do we hook in there and make sure this buffer is submitted first? + self.queue.submit([command_buffer]); + } +} + +impl Drop for RenderContext { + fn drop(&mut self) { + // Delete the CpuWriteGpuReadBelt beforehand since its active chunks may keep unsafe references into the buffer pool. 
+ // (if we don't do this and buffers get destroyed before the belt, wgpu will panic) + *self.cpu_write_gpu_read_belt.lock() = CpuWriteGpuReadBelt::new(0); } } diff --git a/crates/re_renderer/src/lib.rs b/crates/re_renderer/src/lib.rs index b89004ed22cf..dcbae79b06dc 100644 --- a/crates/re_renderer/src/lib.rs +++ b/crates/re_renderer/src/lib.rs @@ -14,6 +14,7 @@ pub mod renderer; pub mod resource_managers; pub mod view_builder; +mod allocator; mod context; mod debug_label; mod depth_offset; diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 098422ae03cf..c53ddce3b9bc 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -388,6 +388,38 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. + let mut frame_uniform_buffer_cpu = ctx + .cpu_write_gpu_read_belt + .lock() + .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); + frame_uniform_buffer_cpu.write_single( + &FrameUniformBuffer { + view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), + projection_from_view: projection_from_view.into(), + projection_from_world: projection_from_world.into(), + camera_position, + camera_forward, + tan_half_fov: tan_half_fov.into(), + pixel_world_size_from_camera_distance, + pixels_from_point: config.pixels_from_point, + + auto_size_points: auto_size_points.0, + auto_size_lines: auto_size_lines.0, + + depth_offset_factor, + _padding: glam::Vec3::ZERO, + }, + 0, + ); + frame_uniform_buffer_cpu + .copy_to_buffer( + &mut ctx.frame_global_command_encoder, + &ctx.gpu_resources.buffers, + &frame_uniform_buffer, + 0, + ) + .unwrap(); + ctx.queue.write_buffer( ctx.gpu_resources .buffers diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index f62059d59c09..7e3c42563aa1 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -30,8 +30,7 @@ pub struct BufferDesc { /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation. /// /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later! - /// Furthermore, [`size`](#structfield.size) must be a multiple of - /// [`COPY_BUFFER_ALIGNMENT`]. + /// Furthermore, [`size`](#structfield.size) must be a multiple of [`COPY_BUFFER_ALIGNMENT`]. pub mapped_at_creation: bool, } From a1c439e0111a5357e6e552e2c2bf0e4b9c66025b Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 19:54:00 +0100 Subject: [PATCH 03/21] Reduce amount of unsafe code in CpuWriteGpuReadBuffer. Better handling of alignment. Fix issues on render context shutdown --- crates/re_renderer/examples/framework.rs | 1 + .../src/allocator/cpu_write_gpu_read_belt.rs | 275 +++++++++--------- crates/re_renderer/src/context.rs | 95 +++--- crates/re_renderer/src/view_builder.rs | 2 +- 4 files changed, 195 insertions(+), 178 deletions(-) diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs index 37b76b505802..5b6bc13c38b1 100644 --- a/crates/re_renderer/examples/framework.rs +++ b/crates/re_renderer/examples/framework.rs @@ -278,6 +278,7 @@ impl Application { // drop the pass so we can finish() the main encoder! 
}; + self.re_ctx.before_submit(); self.re_ctx.queue.submit( view_cmd_buffers .into_iter() diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 7a65a3bb6096..30528384b8b5 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,114 +1,67 @@ -use std::{ - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use std::{ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; /// Internal chunk of the staging belt. struct Chunk { buffer: GpuBufferHandleStrong, - size: usize, -} - -/// Chunk that the CPU is writing to currently. -struct ActiveChunk { - chunk: Chunk, - - /// Write view into the entire buffer. - /// - /// UNSAFE: The lifetime is transmuted to be `'static`. - /// In actuality it is tied to the lifetime of [`buffer`](#structfield.chunk.buffer)! - write_view: wgpu::BufferViewMut<'static>, - - /// Safety mechanism to track the number of uses of this buffer. - /// - /// As long it is bigger 1, we are NOT allowed to deactivate the chunk! - safety_usage_counter: Arc<()>, + size: wgpu::BufferAddress, /// At what offset is [`write_view`](#structfield.write_view) unused. - unused_offset: usize, -} - -impl Chunk { - #[allow(unsafe_code)] - fn into_active(self, buffer_pool: &GpuBufferPool) -> ActiveChunk { - // Get the entire mapped slice up front because mainly because there is no way to know the memory alignment beforehand! - // After discussing this on the wgpu dev channel, consensus was reached that they should be 4xf32 aligned. - // TODO: Github issue link. - // - // It also safes us a bit of time since we don't have to run through all the checks on every `allocate` call, - // but this is likely negligible (citation needed). - // - // Wgpu isn't fond of that though since it ties the (compile time) lifetime of the returned `wgpu::BufferSlice` - // to the lifetime of the buffer (after all the mapping should never outlive the buffer!). - // Therefore, we need to workaround this a bit. - // - // Note that in the JavaScript/wasm api of WebGPU this is trivially possible, see - // https://www.w3.org/TR/webgpu/#dom-gpubuffer-getmappedrange - // (but it also ends up doing an extra memcpy to that's an unfair comparison) - - let wgpu_buffer = buffer_pool - .get_resource(&self.buffer) - .expect("Invalid buffer handle"); - let buffer_slice = wgpu_buffer.slice(..); - let write_view = buffer_slice.get_mapped_range_mut(); - - // SAFETY: - // Things we need to guarantee manually now: - // * Buffer needs to stay alive for the time CpuWriteGpuReadBuffer is held. - // -> We keep a reference count to `chunk.buffer` - // * Nobody holds a view into our memory by the time - // -> We use `safety_usage_counter` and panic if there is more than one owner when we close the buffer view - // * Returned ranges into `write_view` NEVER overlap. - // -> We track the unused_offset and never return any range ahead of it! 
-        let write_view = unsafe {
-            std::mem::transmute::<wgpu::BufferViewMut<'_>, wgpu::BufferViewMut<'static>>(write_view)
-        };
-
-        ActiveChunk {
-            chunk: self,
-            write_view,
-            unused_offset: 0,
-            safety_usage_counter: Arc::new(()),
-        }
-    }
-}
-
-// TODO(andreas): Make wgpu::BufferMappedRange Send upstream
-#[allow(unsafe_code)]
-/// SAFETY:
-/// TODO: Link to pr here doing so
-unsafe impl std::marker::Send for ActiveChunk {}
-
 /// A suballocated staging buffer that can be written to.
 ///
 /// We do *not* allow reading from this buffer as it is typically write-combined memory.
 /// Reading would work, but it can be *very* slow.
-pub struct CpuWriteGpuReadBuffer<T: bytemuck::Pod + 'static> {
-    write_only_memory: &'static mut [T],
-
-    #[allow(dead_code)]
-    safety_usage_counter: Arc<()>,
+pub struct CpuWriteGpuReadBuffer<T> {
+    /// Write view into the relevant buffer portion.
+    ///
+    /// UNSAFE: The lifetime is transmuted to be `'static`.
+    /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)!
+    write_view: wgpu::BufferViewMut<'static>,
 
     chunk_buffer: GpuBufferHandleStrong,
-    offset_in_chunk_buffer: usize,
+    offset_in_chunk_buffer: wgpu::BufferAddress,
+
+    /// Marker for the type whose alignment and size requirements are honored by `write_view`.
+    _type: std::marker::PhantomData<T>,
 }
 
 impl<T> CpuWriteGpuReadBuffer<T>
 where
     T: bytemuck::Pod + 'static,
 {
+    /// Memory as slice of T.
+    ///
+    /// Do *not* make this public as we need to guarantee that the memory is *never* read from!
+    #[inline(always)]
+    fn as_slice(&mut self) -> &mut [T] {
+        bytemuck::cast_slice_mut(&mut self.write_view)
+    }
+
+    /// Writes several objects to the buffer at a given location using a slice.
+    ///
     /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
     /// (panics otherwise)
+    #[inline]
+    pub fn write_slice(&mut self, elements: &[T], num_elements_offset: usize) {
+        self.as_slice()[num_elements_offset..num_elements_offset + elements.len()]
+            .copy_from_slice(elements);
+    }
+
+    /// Writes several objects to the buffer at a given location using an iterator.
     ///
-    /// We do *not* allow reading from this buffer as it is typically write-combined memory.
-    /// Reading would work, but it can be *very* slow.
+    /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
+    /// (panics otherwise)
     #[inline]
-    pub fn write(&mut self, elements: impl Iterator<Item = T>, num_elements_offset: usize) {
+    pub fn write_iterator(
+        &mut self,
+        elements: impl Iterator<Item = T>,
+        num_elements_offset: usize,
+    ) {
         for (target, source) in self
-            .write_only_memory
+            .as_slice()
             .iter_mut()
             .skip(num_elements_offset)
             .zip(elements)
@@ -118,11 +71,12 @@ where
     }
 
     /// Writes a single objects to the buffer at a given location.
+    ///
     /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
     /// (panics otherwise)
-    #[inline(always)]
+    #[inline]
     pub fn write_single(&mut self, element: &T, num_elements_offset: usize) {
-        self.write_only_memory[num_elements_offset] = *element;
+        self.as_slice()[num_elements_offset] = *element;
     }
 
     // pub fn copy_to_texture(
     //     self,
     //     encoder: &mut wgpu::CommandEncoder,
     //     buffer_pool: &GpuBufferPool,
     //     destination: wgpu::ImageCopyTexture<'_>,
     //     bytes_per_row: Option<NonZeroU32>,
     //     rows_per_image: Option<NonZeroU32>,
     //     copy_size: wgpu::Extent3d,
     // ) -> Result<(), PoolError> {
     //     let buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
 
     //     encoder.copy_buffer_to_texture(
     //         wgpu::ImageCopyBuffer {
     //             buffer,
     //             layout: wgpu::ImageDataLayout {
     //                 offset: self.offset_in_chunk,
     //                 bytes_per_row,
     //                 rows_per_image,
     //             },
     //         },
     //         destination,
     //         copy_size,
     //     );
 
     //     Ok(())
     // }
 
     /// Consume this data view and copy it to another gpu buffer.
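+    ///
+    /// (The copy is only recorded into `encoder` here; it executes once the corresponding command buffer is submitted.)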
pub fn copy_to_buffer( - self, + mut self, encoder: &mut wgpu::CommandEncoder, buffer_pool: &GpuBufferPool, destination: &GpuBufferHandleStrong, @@ -162,10 +116,10 @@ where ) -> Result<(), PoolError> { encoder.copy_buffer_to_buffer( buffer_pool.get_resource(&self.chunk_buffer)?, - self.offset_in_chunk_buffer as _, + self.offset_in_chunk_buffer, buffer_pool.get_resource(destination)?, destination_offset, - (self.write_only_memory.len() * std::mem::size_of::()) as u64, + self.write_view.deref_mut().len() as u64, ); Ok(()) } @@ -183,10 +137,10 @@ where /// * handles alignment pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. - chunk_size: usize, + chunk_size: wgpu::BufferSize, /// Chunks which are CPU write at the moment. - active_chunks: Vec, + active_chunks: Vec, /// Chunks which are GPU read at the moment. /// @@ -224,7 +178,7 @@ impl CpuWriteGpuReadBelt { /// /// TODO(andreas): Adaptive chunk sizes /// TODO(andreas): Shrinking after usage spikes? - pub fn new(chunk_size: usize) -> Self { + pub fn new(chunk_size: wgpu::BufferSize) -> Self { let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { chunk_size, @@ -246,79 +200,132 @@ impl CpuWriteGpuReadBelt { buffer_pool: &mut GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { - let alignment = std::mem::align_of::().min(wgpu::MAP_ALIGNMENT as usize); - let size = std::mem::size_of::() * num_elements; + let alignment = (std::mem::align_of::() as wgpu::BufferAddress).min(wgpu::MAP_ALIGNMENT); + let size = (std::mem::size_of::() * num_elements) as wgpu::BufferAddress; - // We need to be super careful with alignment since wgpu - // has no guarantees on how pointers to mapped memory are aligned! + // We need to be super careful with alignment since today wgpu + // has NO guarantees on how pointers to mapped memory are aligned! + // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process! + // + // See https://github.com/gfx-rs/wgpu/issues/3508 + // + // To work around this, we require a bigger size to begin with. + // + // TODO(andreas): Either fix the wgpu issue or come up with a more conservative strategy, + // where we first look for a buffer slice with `size` and then again with required_size if needed. + let required_size = size + alignment - 1; // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. // Try to find space in any of the active chunks first. - let mut active_chunk = if let Some(index) = - self.active_chunks.iter_mut().position(|active_chunk| { - size + (active_chunk.write_view.deref_mut().as_ptr() as usize - + active_chunk.unused_offset) - % alignment - <= active_chunk.chunk.size - active_chunk.unused_offset - }) { + let mut chunk = if let Some(index) = self + .active_chunks + .iter_mut() + .position(|chunk| chunk.size - chunk.unused_offset >= required_size) + { self.active_chunks.swap_remove(index) } else { self.receive_chunks(); // ensure self.free_chunks is up to date - // We don't know yet how aligned the mapped pointer is. So we need to ask for more memory! - let required_size = size + alignment - 1; - // Use a free chunk if possible, fall back to creating a new one if necessary. if let Some(index) = self .free_chunks .iter() .position(|chunk| chunk.size >= required_size) { - self.free_chunks.swap_remove(index).into_active(buffer_pool) + self.free_chunks.swap_remove(index) } else { // Allocation might be bigger than a chunk. 
- let size = self.chunk_size.max(required_size); + let size = self.chunk_size.get().max(required_size); let buffer = buffer_pool.alloc( device, &BufferDesc { label: "CpuWriteGpuReadBelt buffer".into(), - size: size as _, + size, usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: true, }, ); - Chunk { buffer, size }.into_active(buffer_pool) + Chunk { + buffer, + size, + unused_offset: 0, + } } }; - let alignment_padding = (active_chunk.write_view.deref_mut().as_ptr() as usize - + active_chunk.unused_offset) - % alignment; - let start_offset = active_chunk.unused_offset + alignment_padding; - let end_offset = start_offset + size; + let buffer = buffer_pool + .get_resource(&chunk.buffer) + .expect("invalid chunk buffer"); - let write_only_memory = bytemuck::cast_slice_mut( - &mut active_chunk.write_view.deref_mut()[start_offset..end_offset], - ); + // Allocate mapping from a chunk. + fn allocate_mapping<'a>( + chunk: &mut Chunk, + size: u64, + buffer: &'a wgpu::Buffer, + ) -> (u64, wgpu::BufferViewMut<'a>) { + let start_offset = chunk.unused_offset; + let end_offset = start_offset + size; - // SAFETY: - // The slice we take out of the write view has a lifetime dependency on the write_view. - // This means it is self-referential. We handle this lifetime at runtime instead by + debug_assert!(end_offset <= chunk.size); + chunk.unused_offset = end_offset; + + let buffer_slice = buffer.slice(start_offset..end_offset); + (start_offset, buffer_slice.get_mapped_range_mut()) + } + + // Allocate mapping from a chunk with alignment requirements. // - // See also `ActiveChunk::into_active`. - let write_only_memory = - unsafe { std::mem::transmute::<&mut [T], &'static mut [T]>(write_only_memory) }; + // Depending on how https://github.com/gfx-rs/wgpu/issues/3508 will be solved, this will become trivial + // as we will have knowledge of a full buffer mapping alignment beforehand. + // (we then should probably always align to 32 byte and don't allow types with even higher alignment requirements!) + fn allocate_chunk_mapping_with_alignment<'a>( + chunk: &mut Chunk, + size: u64, + buffer: &'a wgpu::Buffer, + alignment: u64, + ) -> (u64, wgpu::BufferViewMut<'a>) { + // First optimistically try without explicit padding. + let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); + let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + + if required_padding == 0 { + (start_offset, write_view) + } else { + // Undo mapping and try again with padding. + // We made sure earlier that the chunk has enough space for this case! + drop(write_view); + chunk.unused_offset = start_offset + required_padding; + + let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); + let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + debug_assert_eq!(required_padding, 0); + + (start_offset, write_view) + } + } + + let (start_offset, write_view) = + allocate_chunk_mapping_with_alignment(&mut chunk, size, buffer, alignment); + + // SAFETY: + // write_view has a lifetime dependency on the chunk's buffer. + // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. + // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. 
+ let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + let cpu_buffer_view = CpuWriteGpuReadBuffer { - write_only_memory, - safety_usage_counter: active_chunk.safety_usage_counter.clone(), - chunk_buffer: active_chunk.chunk.buffer.clone(), + chunk_buffer: chunk.buffer.clone(), offset_in_chunk_buffer: start_offset, + write_view, + _type: std::marker::PhantomData, }; - self.active_chunks.push(active_chunk); + self.active_chunks.push(chunk); cpu_buffer_view } @@ -335,18 +342,12 @@ impl CpuWriteGpuReadBelt { // https://github.com/gfx-rs/wgpu/issues/1468 // However, WebGPU does not support this! - for active_chunk in self.active_chunks.drain(..) { - assert!(Arc::strong_count(&active_chunk.safety_usage_counter) == 1, - "Chunk still in use. All instances of `CpuWriteGpuReadBelt` need to be freed before submitting."); - - // Ensure write view is dropped before we unmap! - drop(active_chunk.write_view); - + for chunk in self.active_chunks.drain(..) { buffer_pool - .get_resource(&active_chunk.chunk.buffer) + .get_resource(&chunk.buffer) .expect("invalid buffer handle") .unmap(); - self.closed_chunks.push(active_chunk.chunk); + self.closed_chunks.push(chunk); } } diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 5080acd7495b..a9cc96981d6d 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -32,14 +32,7 @@ pub struct RenderContext { pub texture_manager_2d: TextureManager2D, pub(crate) cpu_write_gpu_read_belt: Mutex, - /// Command encoder for all commands that should go in before view builder are submitted. - /// - /// This should be used for any gpu copy operation outside of a renderer or view builder. - /// (i.e. typically in [`crate::renderer::DrawData`] creation!) - pub(crate) frame_global_command_encoder: wgpu::CommandEncoder, - - // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? - frame_index: u64, + pub(crate) active_frame: ActiveFrameContext, } /// Immutable data that is shared between all [`Renderer`] @@ -146,8 +139,6 @@ impl RenderContext { let texture_manager_2d = TextureManager2D::new(device.clone(), queue.clone(), &mut gpu_resources.textures); - let frame_global_command_encoder = Self::create_frame_global_command_encoder(&device); - RenderContext { device, queue, @@ -160,30 +151,26 @@ impl RenderContext { mesh_manager, texture_manager_2d, // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) - cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(1024 * 1024 * 32)), - - frame_global_command_encoder, + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(wgpu::BufferSize::new(1024 * 1024 * 32).unwrap())), resolver, #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build err_tracker, - frame_index: 0, + active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 } } } - fn create_frame_global_command_encoder(device: &wgpu::Device) -> wgpu::CommandEncoder { - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { - label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder").get(), - }) - } - /// Call this at the beginning of a new frame. /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. 
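+ /// Also swaps in a fresh [`ActiveFrameContext`], incrementing the frame index.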
pub fn begin_frame(&mut self) { - self.frame_index += 1; + self.active_frame = ActiveFrameContext { + frame_global_command_encoder: None, + frame_index: self.active_frame.frame_index + 1, + }; + let frame_index = self.active_frame.frame_index; // Tick the error tracker so that it knows when to reset! // Note that we're ticking on begin_frame rather than raw frames, which @@ -199,8 +186,8 @@ impl RenderContext { re_log::debug!(?modified_paths, "got some filesystem events"); } - self.mesh_manager.begin_frame(self.frame_index); - self.texture_manager_2d.begin_frame(self.frame_index); + self.mesh_manager.begin_frame(frame_index); + self.texture_manager_2d.begin_frame(frame_index); { let WgpuResourcePools { @@ -219,26 +206,26 @@ impl RenderContext { shader_modules.begin_frame( &self.device, &mut self.resolver, - self.frame_index, + frame_index, &modified_paths, ); render_pipelines.begin_frame( &self.device, - self.frame_index, + frame_index, shader_modules, pipeline_layouts, ); // Bind group maintenance must come before texture/buffer maintenance since it // registers texture/buffer use - bind_groups.begin_frame(self.frame_index, textures, buffers, samplers); + bind_groups.begin_frame(frame_index, textures, buffers, samplers); - textures.begin_frame(self.frame_index); - buffers.begin_frame(self.frame_index); + textures.begin_frame(frame_index); + buffers.begin_frame(frame_index); - pipeline_layouts.begin_frame(self.frame_index); - bind_group_layouts.begin_frame(self.frame_index); - samplers.begin_frame(self.frame_index); + pipeline_layouts.begin_frame(frame_index); + bind_group_layouts.begin_frame(frame_index); + samplers.begin_frame(frame_index); } // Retrieve unused staging buffer. @@ -254,21 +241,49 @@ impl RenderContext { .lock() .before_queue_submit(&self.gpu_resources.buffers); - let mut command_encoder = Self::create_frame_global_command_encoder(&self.device); - std::mem::swap(&mut self.frame_global_command_encoder, &mut command_encoder); - let command_buffer = command_encoder.finish(); + if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() { + let command_buffer = command_encoder.finish(); - // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. - // How do we hook in there and make sure this buffer is submitted first? - self.queue.submit([command_buffer]); + // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. + // How do we hook in there and make sure this buffer is submitted first? + self.queue.submit([command_buffer]); + } } } impl Drop for RenderContext { fn drop(&mut self) { - // Delete the CpuWriteGpuReadBelt beforehand since its active chunks may keep unsafe references into the buffer pool. - // (if we don't do this and buffers get destroyed before the belt, wgpu will panic) - *self.cpu_write_gpu_read_belt.lock() = CpuWriteGpuReadBelt::new(0); + // Close global command encoder if there is any pending. + // Not doing so before shutdown causes errors. + if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() { + encoder.finish(); + } + } +} + +pub struct ActiveFrameContext { + /// Command encoder for all commands that should go in before view builder are submitted. + /// + /// This should be used for any gpu copy operation outside of a renderer or view builder. + /// (i.e. typically in [`crate::renderer::DrawData`] creation!) 
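+ ///
+ /// Lazily created via [`Self::frame_global_command_encoder`]; `before_submit` finishes and submits it.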
+ frame_global_command_encoder: Option, + + // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? + frame_index: u64, +} + +impl ActiveFrameContext { + /// Gets or creates a command encoder that runs before all view builder encoder. + pub fn frame_global_command_encoder( + &mut self, + device: &wgpu::Device, + ) -> &mut wgpu::CommandEncoder { + self.frame_global_command_encoder.get_or_insert_with(|| { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") + .get(), + }) + }) } } diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index c53ddce3b9bc..6a9b75abdcc1 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -413,7 +413,7 @@ impl ViewBuilder { ); frame_uniform_buffer_cpu .copy_to_buffer( - &mut ctx.frame_global_command_encoder, + ctx.active_frame.frame_global_command_encoder(&ctx.device), &ctx.gpu_resources.buffers, &frame_uniform_buffer, 0, From 847e3afaf6e51f45e1c38b76d0155bc4d3e5f5a3 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 19:58:05 +0100 Subject: [PATCH 04/21] debug fmt for CpuWriteGpuReadBelt for easier debugging --- .../src/allocator/cpu_write_gpu_read_belt.rs | 20 +++++++++---------- crates/re_renderer/src/context.rs | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 30528384b8b5..fe7f5b00bb58 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -381,13 +381,13 @@ impl CpuWriteGpuReadBelt { } } -// impl fmt::Debug for StagingBelt { -// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { -// f.debug_struct("StagingBelt") -// .field("chunk_size", &self.chunk_size) -// .field("active_chunks", &self.active_chunks.len()) -// .field("closed_chunks", &self.closed_chunks.len()) -// .field("free_chunks", &self.free_chunks.len()) -// .finish_non_exhaustive() -// } -// } +impl std::fmt::Debug for CpuWriteGpuReadBelt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StagingBelt") + .field("chunk_size", &self.chunk_size) + .field("active_chunks", &self.active_chunks.len()) + .field("closed_chunks", &self.closed_chunks.len()) + .field("free_chunks", &self.free_chunks.len()) + .finish_non_exhaustive() + } +} diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index a9cc96981d6d..66b09e0cd622 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -30,7 +30,7 @@ pub struct RenderContext { pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, - pub(crate) cpu_write_gpu_read_belt: Mutex, + pub cpu_write_gpu_read_belt: Mutex, pub(crate) active_frame: ActiveFrameContext, } From ceb2695c1ba0b3f3d175859c8ef626e8e95a5a3f Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 20:36:26 +0100 Subject: [PATCH 05/21] add missing profiler scopes & call to before_submit --- crates/re_renderer/src/context.rs | 4 ++++ crates/re_renderer/src/point_cloud_builder.rs | 5 ----- crates/re_viewer/src/app.rs | 2 ++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 
66b09e0cd622..77deb934f4ce 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -166,6 +166,8 @@ impl RenderContext { /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. pub fn begin_frame(&mut self) { + crate::profile_function!(); + self.active_frame = ActiveFrameContext { frame_global_command_encoder: None, frame_index: self.active_frame.frame_index + 1, @@ -236,6 +238,8 @@ impl RenderContext { /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`] pub fn before_submit(&mut self) { + crate::profile_function!(); + // Unmap all staging buffers. self.cpu_write_gpu_read_belt .lock() diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index f265ae12e064..e3d85c7ace80 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -7,11 +7,6 @@ use crate::{ }; /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`]. -/// -/// TODO(andreas): We could make significant optimizations here by making this builder capable -/// of writing to a GPU readable memory location. -/// This will require some ahead of time size limit, but should be feasible. -/// But before that we first need to sort out cpu->gpu transfers better by providing staging buffers. pub struct PointCloudBuilder { // Size of `point`/color`/`per_point_user_data` must be equal. pub vertices: Vec, diff --git a/crates/re_viewer/src/app.rs b/crates/re_viewer/src/app.rs index 52eeedcf20c7..a2ba4ec295a2 100644 --- a/crates/re_viewer/src/app.rs +++ b/crates/re_viewer/src/app.rs @@ -518,6 +518,8 @@ impl eframe::App for App { self.rx.source(), ); } + + render_ctx.before_submit(); } }); From 10c4a9f33aca485b031c3d7cded948b57964fe28 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 20:46:35 +0100 Subject: [PATCH 06/21] require point cloud builder for creating point cloud data --- crates/re_renderer/examples/multiview.rs | 52 +++++++++++-------- crates/re_renderer/src/point_cloud_builder.rs | 4 +- .../re_renderer/src/renderer/point_cloud.rs | 11 ++-- 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs index a5f89d1ae242..3275bf68e099 100644 --- a/crates/re_renderer/examples/multiview.rs +++ b/crates/re_renderer/examples/multiview.rs @@ -10,12 +10,11 @@ use rand::Rng; use re_renderer::{ renderer::{ GenericSkyboxDrawData, LineDrawData, LineStripFlags, MeshDrawData, MeshInstance, - PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudVertex, TestTriangleDrawData, }, resource_managers::ResourceLifeTime, view_builder::{OrthographicCameraMode, Projection, TargetConfiguration, ViewBuilder}, - Color32, LineStripSeriesBuilder, RenderContext, Rgba, Size, + Color32, LineStripSeriesBuilder, PointCloudBuilder, RenderContext, Rgba, Size, }; use winit::event::{ElementState, VirtualKeyCode}; @@ -163,7 +162,8 @@ struct Multiview { mesh_instance_positions_and_colors: Vec<(glam::Vec3, Color32)>, // Want to have a large cloud of random points, but doing rng for all of them every frame is too slow - random_points: Vec, + random_points_positions: Vec, + random_points_radii: Vec, random_points_colors: Vec, } @@ -188,17 +188,23 @@ impl Example for Multiview { let mut rnd = ::seed_from_u64(42); let random_point_range = -5.0_f32..5.0_f32; - let 
random_points = (0..500000) - .map(|_| PointCloudVertex { - position: glam::vec3( + + let point_count = 500000; + let random_points_positions = (0..point_count) + .map(|_| { + glam::vec3( rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), - ), - radius: Size::new_scene(rnd.gen_range(0.005..0.05)), + ) }) .collect_vec(); - let random_points_colors = (0..500000).map(|_| random_color(&mut rnd)).collect_vec(); + let random_points_radii = (0..point_count) + .map(|_| Size::new_scene(rnd.gen_range(0.005..0.05))) + .collect_vec(); + let random_points_colors = (0..point_count) + .map(|_| random_color(&mut rnd)) + .collect_vec(); let model_mesh_instances = { let reader = std::io::Cursor::new(include_bytes!("rerun.obj.zip")); @@ -232,7 +238,8 @@ impl Example for Multiview { model_mesh_instances, mesh_instance_positions_and_colors, - random_points, + random_points_positions, + random_points_radii, random_points_colors, } } @@ -260,18 +267,19 @@ impl Example for Multiview { let triangle = TestTriangleDrawData::new(re_ctx); let skybox = GenericSkyboxDrawData::new(re_ctx); let lines = build_lines(re_ctx, seconds_since_startup); - let point_cloud = PointCloudDrawData::new( - re_ctx, - &self.random_points, - &self.random_points_colors, - &[PointCloudBatchInfo { - label: "Random points".into(), - world_from_obj: glam::Mat4::from_rotation_x(seconds_since_startup), - point_count: self.random_points.len() as _, - flags: PointCloudBatchFlags::ENABLE_SHADING, - }], - ) - .unwrap(); + + let mut builder = PointCloudBuilder::<()>::default(); + builder + .batch("Random Points") + .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup)) + .add_points( + self.random_points_positions.len(), + self.random_points_positions.iter().cloned(), + ) + .radii(self.random_points_radii.iter().cloned()) + .colors(self.random_points_colors.iter().cloned()); + + let point_cloud = builder.to_draw_data(re_ctx).unwrap(); let meshes = build_mesh_instances( re_ctx, &self.model_mesh_instances, diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index e3d85c7ace80..1b64af4b3762 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -13,7 +13,7 @@ pub struct PointCloudBuilder { pub colors: Vec, pub user_data: Vec, - batches: Vec, + pub(crate) batches: Vec, } impl Default for PointCloudBuilder { @@ -100,7 +100,7 @@ where &self, ctx: &mut crate::context::RenderContext, ) -> Result { - PointCloudDrawData::new(ctx, &self.vertices, &self.colors, &self.batches) + PointCloudDrawData::new(ctx, self) } } diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 166b0fd781b6..5a37803d9562 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -20,6 +20,7 @@ use std::{ use crate::{ context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel, + PointCloudBuilder, }; use bitflags::bitflags; use bytemuck::Zeroable; @@ -148,11 +149,9 @@ impl PointCloudDrawData { /// Number of vertices and colors has to be equal. /// /// If no batches are passed, all points are assumed to be in a single batch with identity transform. 
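// For illustration, the builder-based call site that replaces the old
// slice-based constructor, as a hypothetical sketch (`re_ctx`, `positions`
// and `colors` are caller-side names, not from the patch):
//
//     let mut builder = PointCloudBuilder::<()>::default();
//     builder
//         .batch("sketch")
//         .add_points(positions.len(), positions.iter().cloned())
//         .colors(colors.iter().cloned());
//     let draw_data = builder.to_draw_data(re_ctx)?;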
- pub fn new( + pub fn new( ctx: &mut RenderContext, - vertices: &[PointCloudVertex], - colors: &[Color32], - batches: &[PointCloudBatchInfo], + builder: &PointCloudBuilder, ) -> Result { crate::profile_function!(); @@ -163,6 +162,10 @@ impl PointCloudDrawData { &mut ctx.resolver, ); + let vertices = builder.vertices.as_slice(); + let colors = builder.colors.as_slice(); + let batches = builder.batches.as_slice(); + if vertices.is_empty() { return Ok(PointCloudDrawData { bind_group_all_points: None, From 0226fb2f4b7c04f51d0157bbf8a664dd5cabfac3 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 21:52:07 +0100 Subject: [PATCH 07/21] Make `CpuWriteGpuReadBuffer` more like a Vec. POC usage in PointCloudBuilder --- crates/re_renderer/examples/2d.rs | 2 +- crates/re_renderer/examples/multiview.rs | 2 +- .../src/allocator/cpu_write_gpu_read_belt.rs | 119 ++++++++++-------- crates/re_renderer/src/allocator/mod.rs | 2 +- crates/re_renderer/src/point_cloud_builder.rs | 63 ++++++---- .../re_renderer/src/renderer/point_cloud.rs | 70 ++++------- crates/re_renderer/src/view_builder.rs | 35 +++--- crates/re_viewer/src/ui/space_view.rs | 2 +- .../src/ui/view_spatial/scene/mod.rs | 12 +- .../src/ui/view_spatial/scene/primitives.rs | 12 +- crates/re_viewer/src/ui/view_spatial/ui_2d.rs | 81 ++++++------ crates/re_viewer/src/ui/view_spatial/ui_3d.rs | 6 +- .../src/ui/view_spatial/ui_renderer_bridge.rs | 4 +- 13 files changed, 217 insertions(+), 193 deletions(-) diff --git a/crates/re_renderer/examples/2d.rs b/crates/re_renderer/examples/2d.rs index 741a27f24539..047c37dfbe51 100644 --- a/crates/re_renderer/examples/2d.rs +++ b/crates/re_renderer/examples/2d.rs @@ -149,7 +149,7 @@ impl framework::Example for Render2D { // Moving the windows to a high dpi screen makes the second one bigger. // Also, it looks different under perspective projection. // The third point is automatic thickness which is determined by the point renderer implementation. - let mut point_cloud_builder = PointCloudBuilder::<()>::default(); + let mut point_cloud_builder = PointCloudBuilder::<()>::new(re_ctx); point_cloud_builder .batch("points") .add_points_2d( diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs index 3275bf68e099..2a663f30ca39 100644 --- a/crates/re_renderer/examples/multiview.rs +++ b/crates/re_renderer/examples/multiview.rs @@ -268,7 +268,7 @@ impl Example for Multiview { let skybox = GenericSkyboxDrawData::new(re_ctx); let lines = build_lines(re_ctx, seconds_since_startup); - let mut builder = PointCloudBuilder::<()>::default(); + let mut builder = PointCloudBuilder::<()>::new(re_ctx); builder .batch("Random Points") .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup)) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index fe7f5b00bb58..dd339412d3b1 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,4 +1,4 @@ -use std::{ops::DerefMut, sync::mpsc}; +use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; @@ -11,7 +11,9 @@ struct Chunk { unused_offset: wgpu::BufferAddress, } -/// A suballocated staging buffer that can be written to. +/// A sub-allocated staging buffer that can be written to. 
+///
+/// Behaves a bit like a fixed sized `Vec` insofar as it keeps track of how many elements were written to it.
///
/// We do *not* allow reading from this buffer as it is typically write-combined memory.
/// Reading would work, but it can be *very* slow.
@@ -22,6 +24,9 @@ pub struct CpuWriteGpuReadBuffer {
/// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)!
write_view: wgpu::BufferViewMut<'static>,
+ /// How many elements of T were already pushed into this buffer.
+ write_counter: usize,
+
chunk_buffer: GpuBufferHandleStrong,
offset_in_chunk_buffer: wgpu::BufferAddress,
@@ -38,73 +43,72 @@ where
/// Do *not* make this public as we need to guarantee that the memory is *never* read from!
#[inline(always)]
fn as_slice(&mut self) -> &mut [T] {
- bytemuck::cast_slice_mut(&mut self.write_view)
+ &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.write_counter..]
}
- /// Writes several objects using to the buffer at a given location using a slice.
+ /// Pushes a slice of elements into the buffer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_slice(&mut self, elements: &[T], num_elements_offset: usize) {
- self.as_slice()[num_elements_offset..].copy_from_slice(elements);
+ pub fn extend_from_slice(&mut self, elements: &[T]) {
+ self.as_slice()[..elements.len()].copy_from_slice(elements);
+ self.write_counter += elements.len();
}
- /// Writes several objects using to the buffer at a given location using an iterator.
+ /// Pushes several elements into the buffer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_iterator(
- &mut self,
- elements: impl Iterator<Item = T>,
- num_elements_offset: usize,
- ) {
- for (target, source) in self
- .as_slice()
- .iter_mut()
- .skip(num_elements_offset)
- .zip(elements)
- {
+ pub fn extend(&mut self, elements: impl Iterator<Item = T>) {
+ let mut num_elements = 0;
+ for (target, source) in self.as_slice().iter_mut().zip(elements) {
*target = source;
+ num_elements += 1;
}
+ self.write_counter += num_elements;
}
- /// Writes a single objects to the buffer at a given location.
+ /// Pushes a single element into the buffer and advances the write pointer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_single(&mut self, element: &T, num_elements_offset: usize) {
- self.as_slice()[num_elements_offset] = *element;
+ pub fn push(&mut self, element: &T) {
+ *self.as_slice().first_mut().unwrap() = *element;
+ self.write_counter += 1;
}
- // pub fn copy_to_texture(
- // self,
- // encoder: &mut wgpu::CommandEncoder,
- // buffer_pool: &GpuBufferPool,
- // destination: wgpu::ImageCopyTexture<'_>,
- // bytes_per_row: Option<NonZeroU32>,
- // rows_per_image: Option<NonZeroU32>,
- // copy_size: wgpu::Extent3d,
- // ) -> Result<(), PoolError> {
- // let buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
-
- // encoder.copy_buffer_to_texture(
- // wgpu::ImageCopyBuffer {
- // buffer,
- // layout: wgpu::ImageDataLayout {
- // offset: self.offset_in_chunk,
- // bytes_per_row,
- // rows_per_image,
- // },
- // },
- // destination,
- // copy_size,
- // );
-
- // Ok(())
- // }
+ /// The number of elements pushed into the buffer so far.
+ #[inline]
+ pub fn num_written(&self) -> usize {
+ self.write_counter
+ }
+
+ pub fn copy_to_texture(
+ self,
+ encoder: &mut wgpu::CommandEncoder,
+ buffer_pool: &GpuBufferPool,
+ destination: wgpu::ImageCopyTexture<'_>,
+ bytes_per_row: Option<NonZeroU32>,
+ rows_per_image: Option<NonZeroU32>,
+ copy_size: wgpu::Extent3d,
+ ) -> Result<(), PoolError> {
+ let source_buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
+
+ encoder.copy_buffer_to_texture(
+ wgpu::ImageCopyBuffer {
+ buffer: source_buffer,
+ layout: wgpu::ImageDataLayout {
+ offset: self.offset_in_chunk_buffer,
+ bytes_per_row,
+ rows_per_image,
+ },
+ },
+ destination,
+ copy_size,
+ );
+
+ Ok(())
+ }
/// Consume this data view and copy it to another gpu buffer.
pub fn copy_to_buffer(
@@ -114,13 +118,16 @@ where
destination: &GpuBufferHandleStrong,
destination_offset: wgpu::BufferAddress,
) -> Result<(), PoolError> {
+ let source_buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
+
encoder.copy_buffer_to_buffer(
- buffer_pool.get_resource(&self.chunk_buffer)?,
+ source_buffer,
self.offset_in_chunk_buffer,
buffer_pool.get_resource(destination)?,
destination_offset,
self.write_view.deref_mut().len() as u64,
);
+
Ok(())
}
}
@@ -322,6 +329,7 @@ impl CpuWriteGpuReadBelt {
chunk_buffer: chunk.buffer.clone(),
offset_in_chunk_buffer: start_offset,
write_view,
+ write_counter: 0,
_type: std::marker::PhantomData,
};
@@ -375,7 +383,8 @@ impl CpuWriteGpuReadBelt {
/// Move all chunks that the GPU is done with (and are now mapped again)
/// from `self.receiver` to `self.free_chunks`.
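// For illustration, a minimal sketch of the Vec-like write API above
// (a hypothetical call site; assumes `belt: &mut CpuWriteGpuReadBelt`, a
// `device`, a `buffer_pool`, an `encoder` and a `target` buffer handle are
// in scope):
//
//     let mut staging = belt.allocate::<u32>(&device, &mut buffer_pool, 4);
//     staging.extend_from_slice(&[1, 2, 3]);
//     staging.push(&4);
//     debug_assert_eq!(staging.num_written(), 4);
//     staging.copy_to_buffer(&mut encoder, &buffer_pool, &target, 0)?;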
fn receive_chunks(&mut self) { - while let Ok(chunk) = self.receiver.try_recv() { + while let Ok(mut chunk) = self.receiver.try_recv() { + chunk.unused_offset = 0; self.free_chunks.push(chunk); } } diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs index 7ae3d967a7e0..cb4ab783fd09 100644 --- a/crates/re_renderer/src/allocator/mod.rs +++ b/crates/re_renderer/src/allocator/mod.rs @@ -5,4 +5,4 @@ mod cpu_write_gpu_read_belt; -pub use cpu_write_gpu_read_belt::CpuWriteGpuReadBelt; +pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index 1b64af4b3762..cdfd60666ea7 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -1,37 +1,45 @@ use crate::{ + allocator::CpuWriteGpuReadBuffer, renderer::{ PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudDrawDataError, PointCloudVertex, }, - Color32, DebugLabel, Size, + Color32, DebugLabel, RenderContext, Size, }; /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`]. pub struct PointCloudBuilder { // Size of `point`/color`/`per_point_user_data` must be equal. pub vertices: Vec, - pub colors: Vec, + + pub(crate) color_buffer: CpuWriteGpuReadBuffer, pub user_data: Vec, pub(crate) batches: Vec, } -impl Default for PointCloudBuilder { - fn default() -> Self { +impl PointCloudBuilder +where + PerPointUserData: Default + Copy, +{ + pub fn new(ctx: &mut RenderContext) -> Self { const RESERVE_SIZE: usize = 512; + + // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance! + let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( + &ctx.device, + &mut ctx.gpu_resources.buffers, + PointCloudDrawData::MAX_NUM_POINTS, + ); + Self { vertices: Vec::with_capacity(RESERVE_SIZE), - colors: Vec::with_capacity(RESERVE_SIZE), + color_buffer, user_data: Vec::with_capacity(RESERVE_SIZE), batches: Vec::with_capacity(16), } } -} -impl PointCloudBuilder -where - PerPointUserData: Default + Copy, -{ /// Start of a new batch. #[inline] pub fn batch( @@ -97,7 +105,7 @@ where /// Finalizes the builder and returns a point cloud draw data with all the points added so far. pub fn to_draw_data( - &self, + self, ctx: &mut crate::context::RenderContext, ) -> Result { PointCloudDrawData::new(ctx, self) @@ -145,9 +153,10 @@ where /// Each time we `add_points`, or upon builder drop, make sure that we /// fill in any additional colors and user-data to have matched vectors. 
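// For illustration (hypothetical call site): after
//
//     builder.batch("sketch").add_points(3, positions.iter().cloned());
//
// with no `.colors(...)` call, the next `add_points` call (or dropping the
// batch builder) pads the color buffer with three `Color32::WHITE` entries,
// keeping vertices, colors and user data in lockstep.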
fn extend_defaults(&mut self) {
- if self.0.colors.len() < self.0.vertices.len() {
- self.0.colors.extend(
- std::iter::repeat(Color32::WHITE).take(self.0.vertices.len() - self.0.colors.len()),
+ if self.0.color_buffer.num_written() < self.0.vertices.len() {
+ self.0.color_buffer.extend(
+ std::iter::repeat(Color32::WHITE)
+ .take(self.0.vertices.len() - self.0.color_buffer.num_written()),
);
}
@@ -181,7 +190,7 @@ where
self.extend_defaults();
- debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+ debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
let old_size = self.0.vertices.len();
@@ -195,7 +204,6 @@ where
let num_points = self.0.vertices.len() - old_size;
self.batch_mut().point_count += num_points as u32;
- self.0.colors.reserve(num_points);
self.0.user_data.reserve(num_points);
let new_range = old_size..self.0.vertices.len();
@@ -205,27 +213,25 @@ where
PointsBuilder {
vertices: &mut self.0.vertices[new_range],
max_points,
- colors: &mut self.0.colors,
+ colors: &mut self.0.color_buffer,
user_data: &mut self.0.user_data,
}
}
#[inline]
pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> {
- debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+ debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
self.0.vertices.push(PointCloudVertex {
position,
radius: Size::AUTO,
});
- self.0.colors.push(Color32::WHITE);
- self.0.user_data.push(PerPointUserData::default());
self.batch_mut().point_count += 1;
PointBuilder {
vertex: self.0.vertices.last_mut().unwrap(),
- color: self.0.colors.last_mut().unwrap(),
+ color: &mut self.0.color_buffer,
user_data: self.0.user_data.last_mut().unwrap(),
}
}
@@ -260,7 +266,7 @@ where
pub struct PointBuilder<'a, PerPointUserData> {
vertex: &'a mut PointCloudVertex,
- color: &'a mut Color32,
+ color: &'a mut CpuWriteGpuReadBuffer<Color32>,
user_data: &'a mut PerPointUserData,
}
@@ -274,9 +280,9 @@ where
self
}
+ /// This must not be called more than once.
#[inline]
pub fn color(self, color: Color32) -> Self {
- *self.color = color;
+ self.color.push(&color);
self
}
@@ -293,7 +300,7 @@ pub struct PointsBuilder<'a, PerPointUserData> {
// Colors and user-data are the Vec we append
// the data to if provided.
max_points: usize,
- colors: &'a mut Vec<Color32>,
+ colors: &'a mut CpuWriteGpuReadBuffer<Color32>,
user_data: &'a mut Vec<PerPointUserData>,
}
@@ -303,6 +310,8 @@ where
{
/// Assigns radii to all points.
///
+ /// This must not be called more than once.
+ ///
/// If the iterator doesn't cover all points, some will not be assigned.
/// If the iterator provides more values than there are points, the extra values will be ignored.
#[inline]
@@ -318,18 +327,22 @@ where
/// Assigns colors to all points.
///
+ /// This must not be called more than once.
+ ///
/// If the iterator doesn't cover all points, some will not be assigned.
/// If the iterator provides more values than there are points, the extra values will be ignored.
#[inline]
pub fn colors(self, colors: impl Iterator<Item = Color32>) -> Self {
crate::profile_function!();
self.colors
- .extend(colors.take(self.max_points - self.colors.len()));
+ .extend(colors.take(self.max_points - self.colors.num_written()));
self
}
/// Assigns user data for all points in this builder.
///
+ /// This must not be called more than once.
+ ///
/// User data is currently not available on the GPU.
#[inline] pub fn user_data(self, data: impl Iterator) -> Self { diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 5a37803d9562..a1aea58bc5aa 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -19,7 +19,7 @@ use std::{ }; use crate::{ - context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel, + context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel, PointCloudBuilder, }; use bitflags::bitflags; @@ -151,7 +151,7 @@ impl PointCloudDrawData { /// If no batches are passed, all points are assumed to be in a single batch with identity transform. pub fn new( ctx: &mut RenderContext, - builder: &PointCloudBuilder, + builder: PointCloudBuilder, ) -> Result { crate::profile_function!(); @@ -163,7 +163,6 @@ impl PointCloudDrawData { ); let vertices = builder.vertices.as_slice(); - let colors = builder.colors.as_slice(); let batches = builder.batches.as_slice(); if vertices.is_empty() { @@ -197,23 +196,16 @@ impl PointCloudDrawData { 0 ); - if vertices.len() != colors.len() { - return Err(PointCloudDrawDataError::NumberOfColorsNotEqualNumberOfVertices); - } - - let (vertices, colors) = if vertices.len() >= Self::MAX_NUM_POINTS { + let vertices = if vertices.len() >= Self::MAX_NUM_POINTS { re_log::error_once!( "Reached maximum number of supported points. Clamping down to {}, passed were {}. See also https://github.com/rerun-io/rerun/issues/957", Self::MAX_NUM_POINTS, vertices.len() ); - ( - &vertices[..Self::MAX_NUM_POINTS], - &colors[..Self::MAX_NUM_POINTS], - ) + &vertices[..Self::MAX_NUM_POINTS] } else { - (vertices, colors) + vertices }; // TODO(andreas): We want a "stack allocation" here that lives for one frame. @@ -264,15 +256,6 @@ impl PointCloudDrawData { .collect_vec() }; - let color_staging = { - crate::profile_scope!("collect_colors"); - colors - .iter() - .cloned() - .chain(std::iter::repeat(Color32::TRANSPARENT).take(num_points_zeroed)) - .collect_vec() - }; - // Upload data from staging buffers to gpu. 
let size = wgpu::Extent3d { width: DATA_TEXTURE_SIZE, @@ -308,28 +291,27 @@ impl PointCloudDrawData { { crate::profile_scope!("write_color_texture"); - ctx.queue.write_texture( - wgpu::ImageCopyTexture { - texture: &ctx - .gpu_resources - .textures - .get_resource(&color_texture) - .unwrap() - .texture, - mip_level: 0, - origin: wgpu::Origin3d::ZERO, - aspect: wgpu::TextureAspect::All, - }, - bytemuck::cast_slice(&color_staging), - wgpu::ImageDataLayout { - offset: 0, - bytes_per_row: NonZeroU32::new( - DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32, - ), - rows_per_image: None, - }, - size, - ); + builder + .color_buffer + .copy_to_texture( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + &ctx.gpu_resources.buffers, + wgpu::ImageCopyTexture { + texture: &ctx + .gpu_resources + .textures + .get_resource(&color_texture) + .unwrap() + .texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), + None, + size, + ) + .expect("invalid color cpu->gpu buffer"); } let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc( diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 6a9b75abdcc1..e0f00e3fcacf 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -392,25 +392,22 @@ impl ViewBuilder { .cpu_write_gpu_read_belt .lock() .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); - frame_uniform_buffer_cpu.write_single( - &FrameUniformBuffer { - view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), - projection_from_view: projection_from_view.into(), - projection_from_world: projection_from_world.into(), - camera_position, - camera_forward, - tan_half_fov: tan_half_fov.into(), - pixel_world_size_from_camera_distance, - pixels_from_point: config.pixels_from_point, - - auto_size_points: auto_size_points.0, - auto_size_lines: auto_size_lines.0, - - depth_offset_factor, - _padding: glam::Vec3::ZERO, - }, - 0, - ); + frame_uniform_buffer_cpu.push(&FrameUniformBuffer { + view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), + projection_from_view: projection_from_view.into(), + projection_from_world: projection_from_world.into(), + camera_position, + camera_forward, + tan_half_fov: tan_half_fov.into(), + pixel_world_size_from_camera_distance, + pixels_from_point: config.pixels_from_point, + + auto_size_points: auto_size_points.0, + auto_size_lines: auto_size_lines.0, + + depth_offset_factor, + _padding: glam::Vec3::ZERO, + }); frame_uniform_buffer_cpu .copy_to_buffer( ctx.active_frame.frame_global_command_encoder(&ctx.device), diff --git a/crates/re_viewer/src/ui/space_view.rs b/crates/re_viewer/src/ui/space_view.rs index f06bbae25b37..3e9c7d693ff1 100644 --- a/crates/re_viewer/src/ui/space_view.rs +++ b/crates/re_viewer/src/ui/space_view.rs @@ -176,7 +176,7 @@ impl SpaceView { &self.space_path, self.data_blueprint.data_blueprints_projected(), ); - let mut scene = view_spatial::SceneSpatial::default(); + let mut scene = view_spatial::SceneSpatial::new(ctx.render_ctx); scene.load(ctx, &query, &transforms, highlights); self.view_state .ui_spatial(ctx, ui, &self.space_path, scene, self.id, highlights); diff --git a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs index 18166e05d587..84376f4994c2 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs +++ 
b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs @@ -115,7 +115,6 @@ pub struct SceneSpatialUiData { pub images: Vec, } -#[derive(Default)] pub struct SceneSpatial { pub annotation_map: AnnotationMap, pub primitives: SceneSpatialPrimitives, @@ -146,6 +145,17 @@ fn instance_path_hash_if_interactive( pub type Keypoints = HashMap<(ClassId, i64), HashMap>; impl SceneSpatial { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + annotation_map: Default::default(), + primitives: SceneSpatialPrimitives::new(re_ctx), + ui: Default::default(), + num_logged_2d_objects: Default::default(), + num_logged_3d_objects: Default::default(), + space_cameras: Default::default(), + } + } + /// Loads all 3D objects into the scene according to the given query. pub(crate) fn load( &mut self, diff --git a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs index 56b8e323e58c..22a1fd28dd31 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs +++ b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs @@ -10,7 +10,6 @@ use super::MeshSource; /// TODO(andreas): Right now we're using `re_renderer` data structures for reading (bounding box & picking). /// In the future, this will be more limited as we're going to gpu staging data as soon as possible /// which is very slow to read. See [#594](https://github.com/rerun-io/rerun/pull/594) -#[derive(Default)] pub struct SceneSpatialPrimitives { /// Estimated bounding box of all data in scene coordinates. Accumulated. pub(super) bounding_box: macaw::BoundingBox, @@ -31,6 +30,17 @@ const AXIS_COLOR_Y: Color32 = Color32::from_rgb(0, 240, 0); const AXIS_COLOR_Z: Color32 = Color32::from_rgb(80, 80, 255); impl SceneSpatialPrimitives { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + bounding_box: macaw::BoundingBox::nothing(), + textured_rectangles_ids: Default::default(), + textured_rectangles: Default::default(), + line_strips: Default::default(), + points: PointCloudBuilder::new(re_ctx), + meshes: Default::default(), + } + } + /// bounding box covering the rendered scene pub fn bounding_box(&self) -> macaw::BoundingBox { self.bounding_box diff --git a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs index 172341ac30b6..23be71806264 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs @@ -8,7 +8,11 @@ use re_data_store::EntityPath; use re_log_types::component_types::TensorTrait; use re_renderer::view_builder::TargetConfiguration; -use super::{eye::Eye, scene::AdditionalPickingInfo, ViewSpatialState}; +use super::{ + eye::Eye, + scene::{AdditionalPickingInfo, SceneSpatialUiData}, + ViewSpatialState, +}; use crate::{ misc::{HoveredSpace, Item, SelectionHighlight, SpaceViewHighlights}, ui::{ @@ -301,43 +305,13 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ - // Draw a re_renderer driven view. - // Camera & projection are configured to ingest space coordinates directly. 
- { - crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); - - let Ok(target_config) = setup_target_config( - &painter, - space_from_ui, - space_from_pixel, - &space.to_string(), - state.auto_size_config(response.rect.size()), - ) else { - return response; - }; - - let Ok(callback) = create_scene_paint_callback( - ctx.render_ctx, - target_config, painter.clip_rect(), - &scene.primitives, - &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), - ) else { - return response; - }; - - painter.add(callback); - } - - // ------------------------------------------------------------------------ - - // Add egui driven labels on top of re_renderer content. - painter.extend(create_labels( - &mut scene, + let label_shapes = create_labels( + &mut scene.ui, ui_from_space, space_from_ui, parent_ui, highlights, - )); + ); // ------------------------------------------------------------------------ @@ -452,6 +426,33 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ + // Draw a re_renderer driven view. + // Camera & projection are configured to ingest space coordinates directly. + { + crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); + + let Ok(target_config) = setup_target_config( + &painter, + space_from_ui, + space_from_pixel, + &space.to_string(), + state.auto_size_config(response.rect.size()), + ) else { + return response; + }; + + let Ok(callback) = create_scene_paint_callback( + ctx.render_ctx, + target_config, painter.clip_rect(), + scene.primitives, + &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), + ) else { + return response; + }; + + painter.add(callback); + } + project_onto_other_spaces(ctx, space, &response, &space_from_ui, depth_at_pointer); painter.extend(show_projections_from_3d_space( ctx, @@ -460,11 +461,14 @@ fn view_2d_scrollable( &ui_from_space, )); + // Add egui driven labels on top of re_renderer content. 
+ painter.extend(label_shapes); + response } fn create_labels( - scene: &mut SceneSpatial, + scene_ui: &mut SceneSpatialUiData, ui_from_space: RectTransform, space_from_ui: RectTransform, parent_ui: &mut egui::Ui, @@ -472,9 +476,9 @@ fn create_labels( ) -> Vec { crate::profile_function!(); - let mut label_shapes = Vec::with_capacity(scene.ui.labels_2d.len() * 2); + let mut label_shapes = Vec::with_capacity(scene_ui.labels_2d.len() * 2); - for label in &scene.ui.labels_2d { + for label in &scene_ui.labels_2d { let (wrap_width, text_anchor_pos) = match label.target { Label2DTarget::Rect(rect) => { let rect_in_ui = ui_from_space.transform_rect(rect); @@ -534,8 +538,7 @@ fn create_labels( label_shapes.push(Shape::rect_filled(bg_rect, 3.0, fill_color)); label_shapes.push(Shape::galley(text_rect.center_top(), galley)); - scene - .ui + scene_ui .pickable_ui_rects .push((space_from_ui.transform_rect(bg_rect), label.labled_instance)); } diff --git a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs index b4d1ed2a2359..88590aba0cb7 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs @@ -473,7 +473,7 @@ pub fn view_3d( ui, eye, rect, - &scene, + scene, ctx.render_ctx, &space.to_string(), state.auto_size_config(rect.size()), @@ -484,7 +484,7 @@ fn paint_view( ui: &mut egui::Ui, eye: Eye, rect: egui::Rect, - scene: &SceneSpatial, + scene: SceneSpatial, render_ctx: &mut RenderContext, name: &str, auto_size_config: re_renderer::AutoSizeConfig, @@ -516,7 +516,7 @@ fn paint_view( render_ctx, target_config, rect, - &scene.primitives, &ScreenBackground::GenericSkybox) + scene.primitives, &ScreenBackground::GenericSkybox) else { return; }; diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs index 3579b11cfc74..3d2f24cc8765 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs @@ -17,7 +17,7 @@ pub fn create_scene_paint_callback( render_ctx: &mut RenderContext, target_config: TargetConfiguration, clip_rect: egui::Rect, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result { let pixels_from_point = target_config.pixels_from_point; @@ -39,7 +39,7 @@ pub enum ScreenBackground { fn create_and_fill_view_builder( render_ctx: &mut RenderContext, target_config: TargetConfiguration, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result<(wgpu::CommandBuffer, ViewBuilder)> { let mut view_builder = ViewBuilder::default(); From 1bf15f7726b47090457bd5bb46d2003bb42a1f96 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 23:20:41 +0100 Subject: [PATCH 08/21] add note on pitfalls of write combined memory --- .../src/allocator/cpu_write_gpu_read_belt.rs | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index dd339412d3b1..63e10ad65373 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -2,21 +2,17 @@ use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; -/// Internal 
chunk of the staging belt. -struct Chunk { - buffer: GpuBufferHandleStrong, - size: wgpu::BufferAddress, - - /// At what offset is [`write_view`](#structfield.write_view) unused. - unused_offset: wgpu::BufferAddress, -} - /// A sub-allocated staging buffer that can be written to. /// /// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it. /// /// We do *not* allow reading from this buffer as it is typically write-combined memory. /// Reading would work, but it can be *very* slow. +/// For details on why, see +/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/) +/// Note that the "vec like behavior" further encourages +/// * not leaving holes +/// * keeping writes sequential pub struct CpuWriteGpuReadBuffer { /// Write view into the relevant buffer portion. /// @@ -132,6 +128,15 @@ where } } +/// Internal chunk of the staging belt. +struct Chunk { + buffer: GpuBufferHandleStrong, + size: wgpu::BufferAddress, + + /// At what offset is [`write_view`](#structfield.write_view) unused. + unused_offset: wgpu::BufferAddress, +} + /// Efficiently performs many buffer writes by sharing and reusing temporary buffers. /// /// Internally it uses a ring-buffer of staging buffers that are sub-allocated. @@ -200,7 +205,6 @@ impl CpuWriteGpuReadBelt { /// Allocates a cpu writable buffer for `num_elements` instances of T. /// /// Handles alignment requirements automatically which allows faster copy operations. - #[allow(unsafe_code)] pub fn allocate( &mut self, device: &wgpu::Device, @@ -317,6 +321,7 @@ impl CpuWriteGpuReadBelt { let (start_offset, write_view) = allocate_chunk_mapping_with_alignment(&mut chunk, size, buffer, alignment); + #[allow(unsafe_code)] // SAFETY: // write_view has a lifetime dependency on the chunk's buffer. // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. From 09a1f20d2c7cf323dfe8eb414604757a6f79796a Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Tue, 21 Feb 2023 10:39:14 +0100 Subject: [PATCH 09/21] Introduce minimum alignment to CpuWriteGpuReadBelt, fix padding mistake, add log --- .../src/allocator/cpu_write_gpu_read_belt.rs | 50 +++++++++++++++---- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 63e10ad65373..b1219c2ca504 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -146,7 +146,8 @@ struct Chunk { /// * can create buffers without yet knowing the target copy location /// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!) /// * use of `re_renderer`'s resource pool -/// * handles alignment +/// * handles alignment in a defined manner +/// (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508)) pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. chunk_size: wgpu::BufferSize, @@ -177,6 +178,13 @@ pub struct CpuWriteGpuReadBelt { } impl CpuWriteGpuReadBelt { + /// All allocations of this allocator will be aligned to at least this size. + /// + /// Requiring a minimum alignment means we need to pad less often. + /// Also, it has the potential of making memcpy operations faster. 
+ /// Align to 4xf32. Should be enough for most usecases and is + pub const MIN_ALIGNMENT: u64 = 128; + /// Create a new staging belt. /// /// The `chunk_size` is the unit of internal buffer allocation; writes will be @@ -191,6 +199,8 @@ impl CpuWriteGpuReadBelt { /// TODO(andreas): Adaptive chunk sizes /// TODO(andreas): Shrinking after usage spikes? pub fn new(chunk_size: wgpu::BufferSize) -> Self { + static_assertions::const_assert!(wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_ALIGNMENT); + let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { chunk_size, @@ -204,15 +214,20 @@ impl CpuWriteGpuReadBelt { /// Allocates a cpu writable buffer for `num_elements` instances of T. /// - /// Handles alignment requirements automatically which allows faster copy operations. + /// Handles alignment requirements automatically, allowing arbitrarily aligned types and fast memcpy. pub fn allocate( &mut self, device: &wgpu::Device, buffer_pool: &mut GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { - let alignment = (std::mem::align_of::() as wgpu::BufferAddress).min(wgpu::MAP_ALIGNMENT); - let size = (std::mem::size_of::() * num_elements) as wgpu::BufferAddress; + // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. + let alignment = (std::mem::align_of::() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT); + // Padd out the size of the used buffer, so that within a started chunk we'll always hit Self::MIN_ALIGNMENT + let size = wgpu::util::align_to( + (std::mem::size_of::() * num_elements) as wgpu::BufferAddress, + Self::MIN_ALIGNMENT, + ); // We need to be super careful with alignment since today wgpu // has NO guarantees on how pointers to mapped memory are aligned! @@ -220,10 +235,9 @@ impl CpuWriteGpuReadBelt { // // See https://github.com/gfx-rs/wgpu/issues/3508 // - // To work around this, we require a bigger size to begin with. - // - // TODO(andreas): Either fix the wgpu issue or come up with a more conservative strategy, - // where we first look for a buffer slice with `size` and then again with required_size if needed. + // To work around this, we ask for a bigger size, so we can safely pad out + // *if* the returned pointer is not correctly aligned. + // (i.e. we will use _up to_ `required_size` bytes, but at least `size`) let required_size = size + alignment - 1; // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. @@ -246,9 +260,16 @@ impl CpuWriteGpuReadBelt { { self.free_chunks.swap_remove(index) } else { - // Allocation might be bigger than a chunk. - let size = self.chunk_size.get().max(required_size); + // (happens relatively rarely, this is a noteworthy event!) + re_log::debug!( + "Allocating new CpuWriteGpuReadBuffer chunk of size {required_size}" + ); + // Allocation might be bigger than a chunk. + let size = wgpu::util::align_to( + self.chunk_size.get().max(required_size), + wgpu::MAP_ALIGNMENT, + ); let buffer = buffer_pool.alloc( device, &BufferDesc { @@ -300,11 +321,18 @@ impl CpuWriteGpuReadBelt { ) -> (u64, wgpu::BufferViewMut<'a>) { // First optimistically try without explicit padding. let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); - let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + + // use deref_mut explicitly because wgpu warns otherwise that read access is slow. 
+ let ptr = write_view.deref_mut().as_ptr() as u64; + let required_padding = wgpu::util::align_to(ptr, alignment) - ptr; if required_padding == 0 { (start_offset, write_view) } else { + re_log::trace!( + "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" + ); + // Undo mapping and try again with padding. // We made sure earlier that the chunk has enough space for this case! drop(write_view); From 5728005bdb3b1b1cb990cc7fc8c40ffc8ba54251 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 17:43:19 +0100 Subject: [PATCH 10/21] cleanup, fixup alignment padding a bit --- .../src/allocator/cpu_write_gpu_read_belt.rs | 201 +++++++++--------- .../re_renderer/src/renderer/point_cloud.rs | 30 ++- crates/re_renderer/src/view_builder.rs | 48 ++--- 3 files changed, 131 insertions(+), 148 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 73cc8423997f..cb6e02705f15 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,6 +1,6 @@ use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; -use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, PoolError}; +use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool}; /// A sub-allocated staging buffer that can be written to. /// @@ -86,7 +86,7 @@ where bytes_per_row: Option, rows_per_image: Option, copy_size: wgpu::Extent3d, - ) -> Result<(), PoolError> { + ) { encoder.copy_buffer_to_texture( wgpu::ImageCopyBuffer { buffer: &self.chunk_buffer, @@ -99,8 +99,6 @@ where destination, copy_size, ); - - Ok(()) } /// Consume this data view and copy it to another gpu buffer. @@ -109,7 +107,7 @@ where encoder: &mut wgpu::CommandEncoder, destination: &GpuBuffer, destination_offset: wgpu::BufferAddress, - ) -> Result<(), PoolError> { + ) { encoder.copy_buffer_to_buffer( &self.chunk_buffer, self.offset_in_chunk_buffer, @@ -117,20 +115,82 @@ where destination_offset, self.write_view.deref_mut().len() as u64, ); - - Ok(()) } } /// Internal chunk of the staging belt. struct Chunk { buffer: GpuBuffer, - size: wgpu::BufferAddress, - /// At what offset is [`write_view`](#structfield.write_view) unused. + /// Starting at this offset the buffer is unused. unused_offset: wgpu::BufferAddress, } +impl Chunk { + fn remaining_capacity(&self) -> u64 { + self.buffer.size() - self.unused_offset + } + + fn required_padding(write_view: &mut wgpu::BufferViewMut<'_>, alignment: u64) -> u64 { + // Use deref_mut explicitly because wgpu warns otherwise that read access is slow. + let ptr = write_view.deref_mut().as_ptr() as u64; + wgpu::util::align_to(ptr, alignment) - ptr + } + + /// Caller needs to make sure that there is enough space plus potential padding. + fn allocate_aligned( + &mut self, + size_in_bytes: u64, + alignment: u64, + ) -> CpuWriteGpuReadBuffer { + // Optimistic first mapping attempt. + let mut start_offset = self.unused_offset; + let mut end_offset = start_offset + size_in_bytes; + let mut buffer_slice = self.buffer.slice(start_offset..end_offset); + let mut write_view = buffer_slice.get_mapped_range_mut(); + + // Check if it fullfills the requested alignment. + let required_padding = Self::required_padding(&mut write_view, alignment); + if required_padding != 0 { + // Undo mapping and try again with padding! 
+ re_log::trace!( + "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" + ); + + drop(write_view); + + start_offset = self.unused_offset + required_padding; + end_offset = start_offset + size_in_bytes; + buffer_slice = self.buffer.slice(start_offset..end_offset); + write_view = buffer_slice.get_mapped_range_mut(); + + let required_padding = Self::required_padding(&mut write_view, alignment); + debug_assert_eq!(required_padding, 0); + } + + debug_assert!(end_offset <= self.buffer.size()); + self.unused_offset = end_offset; + + #[allow(unsafe_code)] + // SAFETY: + // write_view has a lifetime dependency on the chunk's buffer. + // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. + // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. + // TODO: some more doc why this is here + let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + + CpuWriteGpuReadBuffer { + chunk_buffer: self.buffer.clone(), + offset_in_chunk_buffer: start_offset, + write_view, + write_counter: 0, + _type: std::marker::PhantomData, + } + } +} + /// Efficiently performs many buffer writes by sharing and reusing temporary buffers. /// /// Internally it uses a ring-buffer of staging buffers that are sub-allocated. @@ -144,7 +204,7 @@ struct Chunk { /// (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508)) pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. - chunk_size: wgpu::BufferSize, + chunk_size: u64, /// Chunks which are CPU write at the moment. active_chunks: Vec, @@ -176,10 +236,11 @@ impl CpuWriteGpuReadBelt { /// /// Requiring a minimum alignment means we need to pad less often. /// Also, it has the potential of making memcpy operations faster. - /// Align to 4xf32. Should be enough for most usecases and is + /// + /// Align to 4xf32. Should be enough for most usecases! pub const MIN_ALIGNMENT: u64 = 16; - /// Create a new staging belt. + /// Create a cpu-write & gpu-read staging belt. /// /// The `chunk_size` is the unit of internal buffer allocation; writes will be /// sub-allocated within each chunk. Therefore, for optimal use of memory, the @@ -197,7 +258,7 @@ impl CpuWriteGpuReadBelt { let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { - chunk_size, + chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT), active_chunks: Vec::new(), closed_chunks: Vec::new(), free_chunks: Vec::new(), @@ -206,10 +267,10 @@ impl CpuWriteGpuReadBelt { } } - /// Allocates a cpu writable buffer for `num_elements` instances of T. + /// Allocates a cpu writable buffer for `num_elements` instances of type `T`. /// - /// Handles alignment requirements automatically, allowing arbitrarily aligned types and fast memcpy. - pub fn allocate( + /// Handles alignment requirements automatically, allowing arbitrarily aligned types without issues. + pub fn allocate( &mut self, device: &wgpu::Device, buffer_pool: &mut GpuBufferPool, @@ -217,30 +278,30 @@ impl CpuWriteGpuReadBelt { ) -> CpuWriteGpuReadBuffer { // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. 
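// Worked example of the strategy below (illustrative numbers): with
// alignment = 16 and size = 48 we reserve max_required_size = 48 + 15 = 63
// bytes. If the mapped pointer comes back at an address with ptr % 16 == 4,
// then required_padding = align_to(ptr, 16) - ptr = 12 and the write view
// starts 12 bytes into the reservation, using 48 + 12 = 60 of the 63
// reserved bytes.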
let alignment = (std::mem::align_of::() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT); - // Padd out the size of the used buffer, so that within active chunk we'll always hit at least Self::MIN_ALIGNMENT + // Pad out the size of the used buffer to a multiple of Self::MIN_ALIGNMENT. + // This increases our chance of having back to back allocations within a chunk. let size = wgpu::util::align_to( (std::mem::size_of::() * num_elements) as wgpu::BufferAddress, Self::MIN_ALIGNMENT, ); // We need to be super careful with alignment since today wgpu - // has NO guarantees on how pointers to mapped memory are aligned! + // has no guarantees on how pointers to mapped memory are aligned! // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process! // - // See https://github.com/gfx-rs/wgpu/issues/3508 + // For details and (as of writing) ongoing discussion see https://github.com/gfx-rs/wgpu/issues/3508 // // To work around this, we ask for a bigger size, so we can safely pad out - // *if* the returned pointer is not correctly aligned. - // (i.e. we will use _up to_ `required_size` bytes, but at least `size`) - let required_size = size + alignment - 1; - - // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. + // if the returned pointer is not correctly aligned. + // (i.e. we will use _up to_ `required_size` bytes, but at least `size`)] + let maximum_padding = alignment - 1; + let max_required_size = size + maximum_padding; // Try to find space in any of the active chunks first. let mut chunk = if let Some(index) = self .active_chunks .iter_mut() - .position(|chunk| chunk.size - chunk.unused_offset >= required_size) + .position(|chunk| chunk.remaining_capacity() >= max_required_size) { self.active_chunks.swap_remove(index) } else { @@ -250,25 +311,22 @@ impl CpuWriteGpuReadBelt { if let Some(index) = self .free_chunks .iter() - .position(|chunk| chunk.size >= required_size) + .position(|chunk| chunk.remaining_capacity() >= max_required_size) { self.free_chunks.swap_remove(index) } else { - // (happens relatively rarely, this is a noteworthy event!) - re_log::debug!( - "Allocating new CpuWriteGpuReadBuffer chunk of size {required_size}" - ); - - // Allocation might be bigger than a chunk. - let size = wgpu::util::align_to( - self.chunk_size.get().max(required_size), - wgpu::MAP_ALIGNMENT, + // Allocation might be bigger than a chunk! + let buffer_size = wgpu::util::align_to( + self.chunk_size.max(max_required_size), + Self::MIN_ALIGNMENT, ); + // Happens relatively rarely, this is a noteworthy event! + re_log::debug!("Allocating new CpuWriteGpuReadBelt chunk of size {buffer_size}"); let buffer = buffer_pool.alloc( device, &BufferDesc { label: "CpuWriteGpuReadBelt buffer".into(), - size, + size: buffer_size, usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: true, }, @@ -276,82 +334,13 @@ impl CpuWriteGpuReadBelt { Chunk { buffer, - size, unused_offset: 0, } } }; - // Allocate mapping from a chunk. - fn allocate_mapping<'a>(chunk: &'a mut Chunk, size: u64) -> (u64, wgpu::BufferViewMut<'a>) { - let start_offset = chunk.unused_offset; - let end_offset = start_offset + size; - - debug_assert!(end_offset <= chunk.size); - chunk.unused_offset = end_offset; - - let buffer_slice = chunk.buffer.slice(start_offset..end_offset); - (start_offset, buffer_slice.get_mapped_range_mut()) - } - - // Allocate mapping from a chunk with alignment requirements. 
- // - // Depending on how https://github.com/gfx-rs/wgpu/issues/3508 will be solved, we can early out for certain alignments. - // TODO(andreas): Or just hardcode alignment for everything to `MIN_ALIGNMENT`? - fn allocate_chunk_mapping_with_alignment<'a>( - chunk: &'a mut Chunk, - size: u64, - alignment: u64, - ) -> (u64, wgpu::BufferViewMut<'a>) { - // First optimistically try without explicit padding. - let (start_offset, mut write_view) = allocate_mapping(chunk, size); - - // use deref_mut explicitly because wgpu warns otherwise that read access is slow. - let ptr = write_view.deref_mut().as_ptr() as u64; - let required_padding = wgpu::util::align_to(ptr, alignment) - ptr; - - // if required_padding == 0 { - (start_offset, write_view) - // } else { - // re_log::trace!( - // "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" - // ); - - // // Undo mapping and try again with padding. - // // We made sure earlier that the chunk has enough space for this case! - // drop(write_view); - // chunk.unused_offset = start_offset + required_padding; - - // let (start_offset, mut write_view) = allocate_mapping(chunk, size); - // let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. - // debug_assert_eq!(required_padding, 0); - - // (start_offset, write_view) - // } - } - - let (start_offset, write_view) = - allocate_chunk_mapping_with_alignment(&mut chunk, size, alignment); - - #[allow(unsafe_code)] - // SAFETY: - // write_view has a lifetime dependency on the chunk's buffer. - // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. - // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. 
- let write_view = unsafe { - std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) - }; - - let cpu_buffer_view = CpuWriteGpuReadBuffer { - chunk_buffer: chunk.buffer.clone(), - offset_in_chunk_buffer: start_offset, - write_view, - write_counter: 0, - _type: std::marker::PhantomData, - }; - + let cpu_buffer_view = chunk.allocate_aligned(size, alignment); self.active_chunks.push(chunk); - cpu_buffer_view } diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 7d9b8494cbce..233911761fd2 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -284,24 +284,18 @@ impl PointCloudDrawData { ); } - { - crate::profile_scope!("write_color_texture"); - builder - .color_buffer - .copy_to_texture( - ctx.active_frame.frame_global_command_encoder(&ctx.device), - wgpu::ImageCopyTexture { - texture: &color_texture.texture, - mip_level: 0, - origin: wgpu::Origin3d::ZERO, - aspect: wgpu::TextureAspect::All, - }, - NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), - None, - size, - ) - .expect("invalid color cpu->gpu buffer"); - } + builder.color_buffer.copy_to_texture( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + wgpu::ImageCopyTexture { + texture: &color_texture.texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), + None, + size, + ); let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc( &ctx.device, diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 849b99106629..6bafab2dbc74 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -393,33 +393,33 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. 
-        let mut frame_uniform_buffer_cpu = ctx
-            .cpu_write_gpu_read_belt
-            .lock()
-            .allocate::<FrameUniformBuffer>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
-        frame_uniform_buffer_cpu.push(&FrameUniformBuffer {
-            view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
-            projection_from_view: projection_from_view.into(),
-            projection_from_world: projection_from_world.into(),
-            camera_position,
-            camera_forward,
-            tan_half_fov: tan_half_fov.into(),
-            pixel_world_size_from_camera_distance,
-            pixels_from_point: config.pixels_from_point,
-
-            auto_size_points: auto_size_points.0,
-            auto_size_lines: auto_size_lines.0,
-
-            depth_offset_factor,
-            _padding: glam::Vec3::ZERO,
-        });
-        frame_uniform_buffer_cpu
-            .copy_to_buffer(
+        {
+            let mut frame_uniform_buffer_cpu = ctx
+                .cpu_write_gpu_read_belt
+                .lock()
+                .allocate::<FrameUniformBuffer>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
+            frame_uniform_buffer_cpu.push(&FrameUniformBuffer {
+                view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
+                projection_from_view: projection_from_view.into(),
+                projection_from_world: projection_from_world.into(),
+                camera_position,
+                camera_forward,
+                tan_half_fov: tan_half_fov.into(),
+                pixel_world_size_from_camera_distance,
+                pixels_from_point: config.pixels_from_point,
+
+                auto_size_points: auto_size_points.0,
+                auto_size_lines: auto_size_lines.0,
+
+                depth_offset_factor,
+                _padding: glam::Vec3::ZERO,
+            });
+            frame_uniform_buffer_cpu.copy_to_buffer(
                 ctx.active_frame.frame_global_command_encoder(&ctx.device),
                 &frame_uniform_buffer,
                 0,
-            )
-            .unwrap();
+            );
+        }

         ctx.queue.write_buffer(
             &frame_uniform_buffer,

From 7d544a79477df790dc1df50d092a1ecc5955db83 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 18:46:18 +0100
Subject: [PATCH 11/21] Limit number of in-flight queue submissions

---
 crates/re_renderer/src/context.rs | 43 ++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
index 166094007af6..ea4ad3fbc827 100644
--- a/crates/re_renderer/src/context.rs
+++ b/crates/re_renderer/src/context.rs
@@ -33,6 +33,12 @@ pub struct RenderContext {
     pub texture_manager_2d: TextureManager2D,
     pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,

+    /// List of unfinished queue submissions made via this context.
+    ///
+    /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`].
+    /// TODO(andreas): We rely on egui to do the "primary" submissions in re_viewer. It would be nice to take full control over all submissions.
+    pub inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,
+
     pub(crate) active_frame: ActiveFrameContext,
 }

@@ -71,6 +77,23 @@ impl Renderers {
 }

 impl RenderContext {
+    /// Limit the maximum number of in-flight submissions to this number.
+    ///
+    /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not
+    /// cause us to request more and more memory to prepare more and more submissions.
+    ///
+    /// Note that this is only indirectly related to the number of buffered frames,
+    /// since buffered frames/blit strategy are all about the display<->gpu interface,
+    /// whereas this is about a *particular aspect* of the cpu<->gpu interface.
+    ///
+    /// Should be somewhere between 1-4: too high and we use up more memory and introduce latency,
+    /// too low and we may starve the GPU.
+    /// On the web we want to go lower because we are a lot more memory constrained.
+    #[cfg(not(target_arch = "wasm32"))]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 4;
+    #[cfg(target_arch = "wasm32")]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 2;
+
     pub fn new(
         device: Arc<wgpu::Device>,
         queue: Arc<wgpu::Queue>,
@@ -162,10 +185,23 @@ impl RenderContext {
             #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
             err_tracker,

+            inflight_queue_submissions: Vec::new(),
+
             active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 }
         }
     }

+    fn poll_device(&mut self) {
+        crate::profile_function!();
+
+        // Ensure not too many queue submissions are in flight.
+        while self.inflight_queue_submissions.len() > Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS {
+            let submission_index = self.inflight_queue_submissions.remove(0); // Remove oldest
+            self.device
+                .poll(wgpu::Maintain::WaitForSubmissionIndex(submission_index));
+        }
+    }
+
     /// Call this at the beginning of a new frame.
     ///
     /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading.
     pub fn begin_frame(&mut self) {
@@ -235,6 +271,10 @@ impl RenderContext {
             samplers.begin_frame(frame_index);
         }

+        // Poll device *after* resource pool `begin_frame` since resource pools may each decide to drop resources.
+        // Wgpu may then internally decide to let go of these buffers.
+        self.poll_device();
+
         // Retrieve unused staging buffer.
         self.cpu_write_gpu_read_belt.lock().after_queue_submit();
     }
@@ -251,7 +291,8 @@

         // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
         //                How do we hook in there and make sure this buffer is submitted first?
-            self.queue.submit([command_buffer]);
+            self.inflight_queue_submissions
+                .push(self.queue.submit([command_buffer]));
         }
     }
 }

From cf57460f7a7e9d842c199683903376e3c4a6db20 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 18:46:49 +0100
Subject: [PATCH 12/21] different memory budget for web and native on staging belt

---
 crates/re_renderer/src/context.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
index ea4ad3fbc827..f71b28ad04e5 100644
--- a/crates/re_renderer/src/context.rs
+++ b/crates/re_renderer/src/context.rs
@@ -77,6 +77,17 @@ impl Renderers {
 }

 impl RenderContext {
+    /// Chunk size for our cpu->gpu buffer manager.
+    ///
+    /// For native: 32MiB chunk size (as big as, for instance, a 2048x1024 float4 texture).
+    /// For web (memory constrained!): 8MiB.
+    #[cfg(not(target_arch = "wasm32"))]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 32);
+    #[cfg(target_arch = "wasm32")]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 8);
+
     /// Limit the maximum number of in-flight submissions to this number.
/// /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not @@ -177,8 +188,7 @@ impl RenderContext { mesh_manager, texture_manager_2d, - // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) - cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(wgpu::BufferSize::new(1024 * 1024 * 32).unwrap())), + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(Self::CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE.unwrap())), resolver, From 9db6f70a186eaa82cbeb71c25fb884b1c96d4a5d Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 18:57:45 +0100 Subject: [PATCH 13/21] nicer poll_device implementation --- crates/re_renderer/src/context.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index f71b28ad04e5..9ea81cc20497 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -205,10 +205,19 @@ impl RenderContext { crate::profile_function!(); // Ensure not too many queue submissions are in flight. - while self.inflight_queue_submissions.len() > Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS { - let submission_index = self.inflight_queue_submissions.remove(0); // Remove oldest - self.device - .poll(wgpu::Maintain::WaitForSubmissionIndex(submission_index)); + let num_submissions_to_wait_for = self + .inflight_queue_submissions + .len() + .saturating_sub(Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS); + + if let Some(newest_submission_to_wait_for) = self + .inflight_queue_submissions + .drain(0..num_submissions_to_wait_for) + .last() + { + self.device.poll(wgpu::Maintain::WaitForSubmissionIndex( + newest_submission_to_wait_for, + )); } } From 1d30e9af3d38704365d648b5cea20ed8a5898c2a Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 19:07:17 +0100 Subject: [PATCH 14/21] move per_frame_data_helper into ActiveFrame --- .../src/allocator/cpu_write_gpu_read_belt.rs | 2 +- crates/re_renderer/src/context.rs | 28 +++++++++---------- .../src/ui/view_spatial/ui_renderer_bridge.rs | 6 +++- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index cb6e02705f15..ed79eef4671d 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -149,7 +149,7 @@ impl Chunk { let mut buffer_slice = self.buffer.slice(start_offset..end_offset); let mut write_view = buffer_slice.get_mapped_range_mut(); - // Check if it fullfills the requested alignment. + // Check if it fulfills the requested alignment. let required_padding = Self::required_padding(&mut write_view, alignment); if required_padding != 0 { // Undo mapping and try again with padding! diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 9ea81cc20497..1fc79c49d4eb 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -25,9 +25,6 @@ pub struct RenderContext { #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, - /// Utility type map that will be cleared every frame. 
-    pub per_frame_data_helper: TypeMap,
-
     pub gpu_resources: WgpuResourcePools,
     pub mesh_manager: MeshManager,
     pub texture_manager_2d: TextureManager2D,
     pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,
@@ -37,9 +34,9 @@ pub struct RenderContext {
     ///
     /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`].
     /// TODO(andreas): We rely on egui to do the "primary" submissions in re_viewer. It would be nice to take full control over all submissions.
-    pub inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,
+    inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,

-    pub(crate) active_frame: ActiveFrameContext,
+    pub active_frame: ActiveFrameContext,
 }

 /// Immutable data that is shared between all [`Renderer`]
@@ -182,8 +179,6 @@ impl RenderContext {

             renderers,

-            per_frame_data_helper: TypeMap::new(),
-
             gpu_resources,

             mesh_manager,
@@ -197,7 +192,8 @@

             inflight_queue_submissions: Vec::new(),

-            active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 }
+            active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(),
+                frame_index: 0 }
         }
     }

@@ -226,11 +222,15 @@
     /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading.
     pub fn begin_frame(&mut self) {
         crate::profile_function!();
-        self.per_frame_data_helper.clear();
+
+        // Request used staging buffer back.
+        // TODO(andreas): If we controlled all submissions, we could move this directly after the submission, which would be a bit better.
+        self.cpu_write_gpu_read_belt.lock().after_queue_submit();

         self.active_frame = ActiveFrameContext {
             frame_global_command_encoder: None,
             frame_index: self.active_frame.frame_index + 1,
+            per_frame_data_helper: TypeMap::new(),
         };

         let frame_index = self.active_frame.frame_index;
@@ -293,12 +293,9 @@
         // Poll device *after* resource pool `begin_frame` since resource pools may each decide to drop resources.
         // Wgpu may then internally decide to let go of these buffers.
         self.poll_device();
-
-        // Retrieve unused staging buffer.
-        self.cpu_write_gpu_read_belt.lock().after_queue_submit();
     }

-    /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`]
+    /// Call this at the end of a frame but before submitting command buffers (e.g. from [`crate::view_builder::ViewBuilder`])
     pub fn before_submit(&mut self) {
         crate::profile_function!();

@@ -333,7 +330,10 @@ pub struct ActiveFrameContext {
     /// (i.e. typically in [`crate::renderer::DrawData`] creation!)
     frame_global_command_encoder: Option<wgpu::CommandEncoder>,

-    // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ??
+    /// Utility type map that will be cleared every frame.
+    pub per_frame_data_helper: TypeMap,
+
+    /// Index of this frame. Is incremented for every render frame.
frame_index: u64, } diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs index ed3c6d5a4a76..30a2758bb653 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs @@ -85,6 +85,7 @@ fn renderer_paint_callback( let command_buffer = std::sync::Arc::new(Mutex::new(Some(command_buffer))); let composition_view_builder_map = render_ctx + .active_frame .per_frame_data_helper .entry::() .or_insert_with(Default::default); @@ -111,7 +112,10 @@ fn renderer_paint_callback( //let clip_rect = info.clip_rect_in_pixels(); let ctx = paint_callback_resources.get::().unwrap(); - ctx.per_frame_data_helper.get::().unwrap()[view_builder_handle] + ctx.active_frame + .per_frame_data_helper + .get::() + .unwrap()[view_builder_handle] .composite(ctx, render_pass, screen_position) .expect("Failed compositing view builder with main target."); }), From b7078c3da7b73724fed08b19dc48e035233f90dc Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 19:57:04 +0100 Subject: [PATCH 15/21] iterating on some doc and api aspects. Add resource ownership check at end of life of resource pool --- crates/re_renderer/examples/framework.rs | 3 +- .../src/allocator/cpu_write_gpu_read_belt.rs | 48 +++++++++++-------- crates/re_renderer/src/context.rs | 3 +- .../wgpu_resources/dynamic_resource_pool.rs | 35 ++++++++++++-- 4 files changed, 63 insertions(+), 26 deletions(-) diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs index f63f5e828eb0..e0b09e1d830a 100644 --- a/crates/re_renderer/examples/framework.rs +++ b/crates/re_renderer/examples/framework.rs @@ -82,10 +82,11 @@ struct Application { window: Window, surface: wgpu::Surface, surface_config: wgpu::SurfaceConfiguration, - re_ctx: RenderContext, time: Time, example: E, + + re_ctx: RenderContext, } impl Application { diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index ed79eef4671d..b614ad85ed46 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -17,14 +17,14 @@ pub struct CpuWriteGpuReadBuffer { /// Write view into the relevant buffer portion. /// /// UNSAFE: The lifetime is transmuted to be `'static`. - /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)! + /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk_buffer)! write_view: wgpu::BufferViewMut<'static>, - /// How many elements of T were already pushed into this buffer. - write_counter: usize, + /// Range in T elements in write_view that haven't been written yet. + unwritten_element_range: std::ops::Range, chunk_buffer: GpuBuffer, - offset_in_chunk_buffer: wgpu::BufferAddress, + byte_offset_in_chunk_buffer: wgpu::BufferAddress, /// Marker for the type whose alignment and size requirements are honored by `write_view`. _type: std::marker::PhantomData, @@ -39,7 +39,7 @@ where /// Do *not* make this public as we need to guarantee that the memory is *never* read from! #[inline(always)] fn as_slice(&mut self) -> &mut [T] { - &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.write_counter..] + &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()] } /// Pushes a slice of elements into the buffer. 
@@ -48,7 +48,7 @@ where
     #[inline]
     pub fn extend_from_slice(&mut self, elements: &[T]) {
         self.as_slice().copy_from_slice(elements);
-        self.write_counter += elements.len();
+        self.unwritten_element_range.start += elements.len();
     }

     /// Pushes several elements into the buffer.
@@ -61,7 +61,7 @@
             *target = source;
             num_elements += 1;
         }
-        self.write_counter += num_elements;
+        self.unwritten_element_range.start += num_elements;
     }

     /// Pushes a single element into the buffer and advances the write pointer.
@@ -69,16 +69,17 @@
     /// Panics if the data no longer fits into the buffer.
     #[inline]
     pub fn push(&mut self, element: &T) {
-        *self.as_slice().first_mut().unwrap() = *element;
-        self.write_counter += 1;
+        self.as_slice()[0] = *element;
+        self.unwritten_element_range.start += 1;
     }

     /// The number of elements pushed into the buffer so far.
     #[inline]
     pub fn num_written(&self) -> usize {
-        self.write_counter
+        self.unwritten_element_range.start
     }

+    /// Copies the entire buffer to a texture and drops it.
     pub fn copy_to_texture(
         self,
         encoder: &mut wgpu::CommandEncoder,
@@ -91,7 +92,7 @@
             wgpu::ImageCopyBuffer {
                 buffer: &self.chunk_buffer,
                 layout: wgpu::ImageDataLayout {
-                    offset: self.offset_in_chunk_buffer,
+                    offset: self.byte_offset_in_chunk_buffer,
                     bytes_per_row,
                     rows_per_image,
                 },
@@ -101,7 +102,7 @@
         );
     }

-    /// Consume this data view and copy it to another gpu buffer.
+    /// Copies the entire buffer to another buffer and drops it.
     pub fn copy_to_buffer(
         mut self,
         encoder: &mut wgpu::CommandEncoder,
@@ -110,7 +111,7 @@
     ) {
         encoder.copy_buffer_to_buffer(
             &self.chunk_buffer,
-            self.offset_in_chunk_buffer,
+            self.byte_offset_in_chunk_buffer,
             destination,
             destination_offset,
             self.write_view.deref_mut().len() as u64,
@@ -140,9 +141,12 @@ impl Chunk {
     /// Caller needs to make sure that there is enough space plus potential padding.
     fn allocate_aligned(
         &mut self,
+        num_elements: usize,
         size_in_bytes: u64,
         alignment: u64,
     ) -> CpuWriteGpuReadBuffer {
+        debug_assert!(num_elements * std::mem::size_of::<T>() <= size_in_bytes as usize);
+
         // Optimistic first mapping attempt.
         let mut start_offset = self.unused_offset;
         let mut end_offset = start_offset + size_in_bytes;
@@ -173,19 +177,25 @@

         #[allow(unsafe_code)]
         // SAFETY:
-        // write_view has a lifetime dependency on the chunk's buffer.
+        // write_view has a lifetime dependency on the chunk's buffer - internally it holds a pointer to it!
+        //
         // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it.
-        // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this.
+        // Additionally, the buffer pool needs to ensure:
+        // * it can't drop buffers while there are still users
+        //   -> We assert on that
+        // * buffers are never moved in memory
+        //   -> buffers are always owned by the pool and are always Arc.
+        //      This means it's not allowed to move the buffer out.
+ // (We could make them Pin> but this complicates things inside the BufferPool) let write_view = unsafe { std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) }; CpuWriteGpuReadBuffer { chunk_buffer: self.buffer.clone(), - offset_in_chunk_buffer: start_offset, + byte_offset_in_chunk_buffer: start_offset, write_view, - write_counter: 0, + unwritten_element_range: 0..num_elements, _type: std::marker::PhantomData, } } @@ -339,7 +349,7 @@ impl CpuWriteGpuReadBelt { } }; - let cpu_buffer_view = chunk.allocate_aligned(size, alignment); + let cpu_buffer_view = chunk.allocate_aligned(num_elements, size, alignment); self.active_chunks.push(chunk); cpu_buffer_view } diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 1fc79c49d4eb..150d51320a5d 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -25,7 +25,6 @@ pub struct RenderContext { #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, - pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, pub cpu_write_gpu_read_belt: Mutex, @@ -37,6 +36,8 @@ pub struct RenderContext { inflight_queue_submissions: Vec, pub active_frame: ActiveFrameContext, + + pub gpu_resources: WgpuResourcePools, // Last due to drop order. } /// Immutable data that is shared between all [`Renderer`] diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs index 5dd5d20e7969..b966fb1c2d1d 100644 --- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs @@ -20,13 +20,16 @@ pub trait DynamicResourcesDesc { fn allow_reuse(&self) -> bool; } -pub struct DynamicResource { +pub struct DynamicResource { pub inner: Res, pub creation_desc: Desc, pub handle: Handle, } -impl std::ops::Deref for DynamicResource { +impl std::ops::Deref for DynamicResource +where + Desc: Debug, +{ type Target = Res; #[inline] @@ -39,7 +42,7 @@ impl std::ops::Deref for DynamicResource { type AliveResourceMap = SlotMap>>>; -struct DynamicResourcePoolProtectedState { +struct DynamicResourcePoolProtectedState { /// All currently alive resources. /// We store any ref counted handle we give out in [`DynamicResourcePool::alloc`] here in order to keep it alive. /// Every [`DynamicResourcePool::begin_frame`] we check if the pool is now the only owner of the handle @@ -55,7 +58,7 @@ struct DynamicResourcePoolProtectedState { /// /// Unlike in [`super::static_resource_pool::StaticResourcePool`], a resource can not be uniquely /// identified by its description, as the same description can apply to several different resources. -pub(super) struct DynamicResourcePool { +pub(super) struct DynamicResourcePool { state: RwLock>, current_frame_index: u64, @@ -63,7 +66,10 @@ pub(super) struct DynamicResourcePool { } /// We cannot #derive(Default) as that would require Handle/Desc/Res to implement Default too. 
-impl Default for DynamicResourcePool {
+impl Default for DynamicResourcePool
+where
+    Desc: Debug,
+{
     fn default() -> Self {
         Self {
             state: RwLock::new(DynamicResourcePoolProtectedState {
@@ -202,6 +208,25 @@
             .load(std::sync::atomic::Ordering::Relaxed)
     }
 }
+impl Drop for DynamicResourcePool
+where
+    Handle: Key,
+    Desc: Debug,
+{
+    fn drop(&mut self) {
+        for (_, alive_resource) in self.state.read().alive_resources.iter() {
+            let alive_resource = alive_resource
+                .as_ref()
+                .expect("Alive resources should never be None");
+            let ref_count = Arc::strong_count(alive_resource);
+
+            assert!(ref_count == 1,
+                "Resource still has {} owners at the time of pool destruction. Its creation desc was {:?}",
+                ref_count - 1,
+                &alive_resource.creation_desc);
+        }
+    }
+}

 #[cfg(test)]
 mod tests {

From 4e00a03d93004db32fffc1314477dda23db34c57 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 20:12:03 +0100
Subject: [PATCH 16/21] commenting out DynamicResourcePool drop check

---
 .../wgpu_resources/dynamic_resource_pool.rs | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index b966fb1c2d1d..f41636ed81b4 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -214,17 +214,22 @@ where
     Desc: Debug,
 {
     fn drop(&mut self) {
-        for (_, alive_resource) in self.state.read().alive_resources.iter() {
-            let alive_resource = alive_resource
-                .as_ref()
-                .expect("Alive resources should never be None");
-            let ref_count = Arc::strong_count(alive_resource);
-
-            assert!(ref_count == 1,
-                "Resource still has {} owners at the time of pool destruction. Its creation desc was {:?}",
-                ref_count - 1,
-                &alive_resource.creation_desc);
-        }
+        // TODO(andreas): We're failing this check currently on re_viewer's shutdown.
+        // This is primarily the case due to the way we store the render ctx itself and other things on egui-wgpu's paint callback resources.
+        // We shouldn't do this as it makes a whole lot of other things cumbersome. Instead, we should store it directly on the `App`
+        // where we control the drop order.

+        // for (_, alive_resource) in self.state.read().alive_resources.iter() {
+        //     let alive_resource = alive_resource
+        //         .as_ref()
+        //         .expect("Alive resources should never be None");
+        //     let ref_count = Arc::strong_count(alive_resource);

+        //     assert!(ref_count == 1,
+        //         "Resource still has {} owner(s) at the time of pool destruction.
Its creation desc was {:?}",
+        //         ref_count - 1,
+        //         &alive_resource.creation_desc);
+        // }
     }
 }

From 053e100971c36ef14a85be4f250eb35070c961ce Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 20:20:00 +0100
Subject: [PATCH 17/21] fix missing extend_defaults as well as fix a bug in it

---
 crates/re_renderer/src/point_cloud_builder.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs
index cdfd60666ea7..d13eb9105220 100644
--- a/crates/re_renderer/src/point_cloud_builder.rs
+++ b/crates/re_renderer/src/point_cloud_builder.rs
@@ -160,7 +160,7 @@ where
             );
         }

-        if self.0.user_data.len() < self.0.user_data.len() {
+        if self.0.user_data.len() < self.0.vertices.len() {
             self.0.user_data.extend(
                 std::iter::repeat(PerPointUserData::default())
                     .take(self.0.vertices.len() - self.0.user_data.len()),
@@ -220,6 +220,8 @@ where

     #[inline]
     pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> {
+        self.extend_defaults();
+
         debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
         debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());

         self.0.vertices.push(PointCloudVertex {
             position,
             radius: Size::AUTO,
         });
+        self.0.user_data.push(Default::default());
         self.batch_mut().point_count += 1;

         PointBuilder {

From 93f320c632efe97dd4c8aeced1650d99d4aad4b3 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 08:55:59 +0100
Subject: [PATCH 18/21] doc string link fixes

---
 .../src/allocator/cpu_write_gpu_read_belt.rs | 12 ++++++------
 crates/re_renderer/src/wgpu_resources/buffer_pool.rs | 6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
index b614ad85ed46..7a979fc1cbc8 100644
--- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -208,7 +208,7 @@
 /// Based on [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs)
 /// However, there are some important differences:
 /// * can create buffers without yet knowing the target copy location
-/// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!)
+/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!)
 /// * use of `re_renderer`'s resource pool
 /// * handles alignment in a defined manner
 ///   (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508))
@@ -256,9 +256,9 @@ impl CpuWriteGpuReadBelt {
     /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
     /// chunk size should be:
     ///
-    /// * larger than the largest single [`StagingBelt::write_buffer()`] operation;
+    /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation;
     /// * 1-4 times less than the total amount of data uploaded per submission
-    ///   (per [`StagingBelt::finish()`]); and
+    ///   (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and
     /// * bigger is better, within these bounds.
     ///
     /// TODO(andreas): Adaptive chunk sizes
@@ -356,10 +356,10 @@ impl CpuWriteGpuReadBelt {

     /// Prepare currently mapped buffers for use in a submission.
     ///
-    /// This must be called before the command encoder(s) used in [`StagingBuffer`] copy operations are submitted.
+    /// This must be called before the command encoder(s) used in [`CpuWriteGpuReadBuffer`] copy operations are submitted.
     ///
     /// At this point, all the partially used staging buffers are closed (cannot be used for
-    /// further writes) until after [`StagingBelt::recall()`] is called *and* the GPU is done
+    /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done
     /// copying the data from them.
     pub fn before_queue_submit(&mut self) {
         // This would be a great use case for persistent memory mapping, i.e. mapping without the need to unmap
@@ -374,7 +374,7 @@

     /// Recall all of the closed buffers back to be reused.
     ///
-    /// This must only be called after the command encoder(s) used in [`StagingBuffer`]
+    /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`]
     /// copy operations are submitted. Additional calls are harmless.
     /// Not calling this as soon as possible may result in increased buffer memory usage.
     pub fn after_queue_submit(&mut self) {
diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
index 20f455e385cb..c6890749dc5c 100644
--- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
@@ -26,11 +26,11 @@ pub struct BufferDesc {
     /// will panic.
     pub usage: wgpu::BufferUsages,

-    /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`BufferUsages::MAP_READ`] or
-    /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation.
+    /// Allows a buffer to be mapped immediately after it is created. It does not have to be [`wgpu::BufferUsages::MAP_READ`] or
+    /// [`wgpu::BufferUsages::MAP_WRITE`]; all buffers are allowed to be mapped at creation.
     ///
     /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later!
-    /// Furthermore, [`size`](#structfield.size) must be a multiple of [`COPY_BUFFER_ALIGNMENT`].
+    /// Furthermore, [`size`](#structfield.size) must be a multiple of [`wgpu::COPY_BUFFER_ALIGNMENT`].
    pub mapped_at_creation: bool,
 }

From eee79a2e670170554fe748911cc15b59c36cf0a0 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 09:19:30 +0100
Subject: [PATCH 19/21] drop a todo

---
 crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index f41636ed81b4..6da0f18b575a 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -340,8 +340,6 @@ mod tests {
         }
     }

-    // TODO: Add test for resources without re-use
-
     fn allocate_resources(
         descs: &[u32],
         pool: &mut DynamicResourcePool,

From b05925541cd00d214a044ac668407d51799a7e69 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 09:22:32 +0100
Subject: [PATCH 20/21] fix writing to frame uniform buffer twice

---
 crates/re_renderer/src/view_builder.rs | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs
index 6bafab2dbc74..1f5981b278c6 100644
--- a/crates/re_renderer/src/view_builder.rs
+++ b/crates/re_renderer/src/view_builder.rs
@@ -421,27 +421,6 @@ impl ViewBuilder {
             );
         }

-        ctx.queue.write_buffer(
-            &frame_uniform_buffer,
-            0,
-            bytemuck::bytes_of(&FrameUniformBuffer {
-                view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
-                projection_from_view: projection_from_view.into(),
-                projection_from_world: projection_from_world.into(),
-                camera_position,
-                camera_forward,
-                tan_half_fov: tan_half_fov.into(),
-                pixel_world_size_from_camera_distance,
-                pixels_from_point: config.pixels_from_point,
-
-                auto_size_points: auto_size_points.0,
-                auto_size_lines: auto_size_lines.0,
-
-                depth_offset_factor,
-                _padding: glam::Vec3::ZERO,
-            }),
-        );
-
         let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group(
             &mut ctx.gpu_resources,
             &ctx.device,

From f193edb5dfc1b9425a426e4339f79f3b4dd8c1a9 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 11:22:56 +0100
Subject: [PATCH 21/21] comment fix & improvements

---
 crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
index 7a979fc1cbc8..e85b69596b16 100644
--- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -232,7 +232,6 @@ pub struct CpuWriteGpuReadBelt {

     /// When closed chunks are mapped again, the map callback sends them here.
     ///
-    /// Behind a mutex, so that our StagingBelt becomes Sync.
     /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full,
     /// which means that in a single threaded situation (Web!) we might deadlock.
     sender: mpsc::Sender<Chunk>,
@@ -248,6 +247,9 @@ impl CpuWriteGpuReadBelt {
     /// Also, it has the potential of making memcpy operations faster.
     ///
     /// Align to 4xf32. Should be enough for most use cases!
+    /// Needs to be larger than or equal to [`wgpu::MAP_ALIGNMENT`].
+    /// For alignment requirements in `WebGPU` in general, refer to
+    /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment)
    pub const MIN_ALIGNMENT: u64 = 16;

     /// Create a cpu-write & gpu-read staging belt.
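
To summarize how the pieces introduced by patches 11-21 are meant to interact at runtime, the sketch below walks through one frame from the point of view of an in-crate caller. It is illustrative only and not part of the patch series: `MyUniform` and `target_uniform_buffer` are hypothetical placeholders, and the calls shown assume exactly the signatures visible in the diffs above (`begin_frame`, `CpuWriteGpuReadBelt::allocate`, `CpuWriteGpuReadBuffer::copy_to_buffer`, `before_submit`).

    // Hypothetical per-frame flow; `MyUniform` stands in for any plain-old-data
    // value type (FrameUniformBuffer plays this role in view_builder.rs).
    fn render_one_frame(ctx: &mut RenderContext, target_uniform_buffer: &GpuBuffer) {
        // Reclaims mapped staging chunks (after_queue_submit), resets per-frame
        // state, and waits in poll_device until at most
        // MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS submissions remain on the queue.
        ctx.begin_frame();

        // Borrow a mapped, aligned cpu-write/gpu-read view into a belt chunk.
        let mut staging = ctx
            .cpu_write_gpu_read_belt
            .lock()
            .allocate::<MyUniform>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
        staging.push(&MyUniform::default());

        // Record the copy on the frame-global encoder. `staging` is consumed
        // here (copy_to_buffer takes self), so no writes can happen after the
        // copy has been scheduled.
        staging.copy_to_buffer(
            ctx.active_frame.frame_global_command_encoder(&ctx.device),
            target_uniform_buffer,
            0,
        );

        // Unmaps the partially used chunks (before_queue_submit) and submits
        // the frame-global encoder, pushing the returned wgpu::SubmissionIndex
        // onto inflight_queue_submissions for next frame's poll_device.
        ctx.before_submit();
    }

The key design point visible throughout the series is that the staging view's copy methods take `self` by value: once a copy is recorded, the mapped memory can no longer be written to, which is what allows `before_queue_submit` to safely unmap the belt's chunks wholesale.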