From 137deba52c87c20eaa07891c15340de3d239dea5 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 11:47:01 +0100 Subject: [PATCH 01/21] Allow mapped_on_creation for wgpu buffer --- crates/re_renderer/src/mesh.rs | 3 + crates/re_renderer/src/renderer/lines.rs | 1 + .../re_renderer/src/renderer/mesh_renderer.rs | 1 + .../re_renderer/src/renderer/point_cloud.rs | 1 + crates/re_renderer/src/renderer/rectangles.rs | 1 + crates/re_renderer/src/view_builder.rs | 1 + .../src/wgpu_resources/bind_group_pool.rs | 8 ++- .../src/wgpu_resources/buffer_pool.rs | 27 ++++++-- .../wgpu_resources/dynamic_resource_pool.rs | 67 +++++++++++++------ .../src/wgpu_resources/texture_pool.rs | 8 ++- 10 files changed, 88 insertions(+), 30 deletions(-) diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs index bc08f6cc6548..aeb977c27476 100644 --- a/crates/re_renderer/src/mesh.rs +++ b/crates/re_renderer/src/mesh.rs @@ -167,6 +167,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - vertices"), size: vertex_buffer_combined_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -195,6 +196,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - indices"), size: index_buffer_size, usage: wgpu::BufferUsages::INDEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -224,6 +226,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - material uniforms"), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs index bb82c7f475ea..7405148cfc44 100644 --- a/crates/re_renderer/src/renderer/lines.rs +++ b/crates/re_renderer/src/renderer/lines.rs @@ -511,6 +511,7 @@ impl LineDrawData { label: "lines batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs index 1d312a5387ab..8d71b4f8b16c 100644 --- a/crates/re_renderer/src/renderer/mesh_renderer.rs +++ b/crates/re_renderer/src/renderer/mesh_renderer.rs @@ -145,6 +145,7 @@ impl MeshDrawData { label: "MeshDrawData instance buffer".into(), size: instance_buffer_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 164a8473b541..166b0fd781b6 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -357,6 +357,7 @@ impl PointCloudDrawData { label: "point batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs index fd79df19d6a5..71a84d4565d5 100644 --- a/crates/re_renderer/src/renderer/rectangles.rs +++ b/crates/re_renderer/src/renderer/rectangles.rs @@ -141,6 +141,7 @@ impl RectangleDrawData { label: "rectangle uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git 
a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index fdcec7cf146c..098422ae03cf 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -270,6 +270,7 @@ impl ViewBuilder { label: format!("{:?} - frame uniform buffer", config.name).into(), size: std::mem::size_of::() as _, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs index 1a746342090b..aee1112c9b08 100644 --- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs @@ -7,7 +7,7 @@ use crate::debug_label::DebugLabel; use super::{ bind_group_layout_pool::{GpuBindGroupLayoutHandle, GpuBindGroupLayoutPool}, buffer_pool::{GpuBufferHandle, GpuBufferHandleStrong, GpuBufferPool}, - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, sampler_pool::{GpuSamplerHandle, GpuSamplerPool}, texture_pool::{GpuTextureHandle, GpuTextureHandleStrong, GpuTexturePool}, @@ -66,12 +66,16 @@ pub struct BindGroupDesc { pub layout: GpuBindGroupLayoutHandle, } -impl SizedResourceDesc for BindGroupDesc { +impl DynamicResourcesDesc for BindGroupDesc { fn resource_size_in_bytes(&self) -> u64 { // Size depends on gpu/driver (like with all resources). // We could guess something like a pointer per descriptor, but let's not pretend we know! 0 } + + fn allow_reuse(&self) -> bool { + true + } } /// Resource pool for bind groups. diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index 16268afb6f7f..f62059d59c09 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -13,6 +13,7 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; } /// Once all strong handles are dropped, the bind group will be marked for reclamation in the following frame. pub type GpuBufferHandleStrong = std::sync::Arc; +/// Buffer creation descriptor. #[derive(Clone, Hash, PartialEq, Eq, Debug)] pub struct BufferDesc { /// Debug label of a buffer. This will show up in graphics debuggers for easy identification. @@ -24,12 +25,27 @@ pub struct BufferDesc { /// Usages of a buffer. If the buffer is used in any way that isn't specified here, the operation /// will panic. pub usage: wgpu::BufferUsages, + + /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`BufferUsages::MAP_READ`] or + /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation. + /// + /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later! + /// Furthermore, [`size`](#structfield.size) must be a multiple of + /// [`COPY_BUFFER_ALIGNMENT`]. 
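+ /// ([`COPY_BUFFER_ALIGNMENT`] is 4 bytes at the time of writing.)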
+    pub mapped_at_creation: bool,
 }
 
-impl SizedResourceDesc for BufferDesc {
+impl DynamicResourcesDesc for BufferDesc {
     fn resource_size_in_bytes(&self) -> u64 {
         self.size
     }
+
+    fn allow_reuse(&self) -> bool {
+        // We can't re-use buffers that were mapped at creation since we don't know if the user
+        // unmapped the buffer.
+        // We could try to figure it out, but mapped-at-creation buffers should only be used by one of the dedicated allocators anyways!
+        !self.mapped_at_creation
+    }
 }
 
 #[derive(Default)]
@@ -39,7 +55,8 @@ pub struct GpuBufferPool {
 impl GpuBufferPool {
     /// Returns a ref counted handle to a currently unused buffer.
-    /// Once ownership to the handle is given up, the buffer may be reclaimed in future frames.
+    /// Once ownership to the handle is given up, the buffer may be reclaimed in future frames,
+    /// unless `BufferDesc::mapped_at_creation` was true.
     ///
     /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible
     /// either manually or using a higher level allocator.
@@ -49,7 +66,7 @@ impl GpuBufferPool {
                 label: desc.label.get(),
                 size: desc.size,
                 usage: desc.usage,
-                mapped_at_creation: false,
+                mapped_at_creation: desc.mapped_at_creation,
             })
         })
     }
@@ -73,7 +90,7 @@ impl GpuBufferPool {
     }
 
     /// Internal method to retrieve a strong handle from a weak handle (used by [`super::GpuBindGroupPool`])
-    /// without inrementing the ref-count (note the returned reference!).
+    /// without incrementing the ref-count (note the returned reference!).
     pub(super) fn get_strong_handle(&self, handle: GpuBufferHandle) -> &GpuBufferHandleStrong {
         self.pool.get_strong_handle(handle)
     }
diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index 9edbaa52e1dd..2e7cb5d0de65 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -11,8 +11,12 @@ use smallvec::{smallvec, SmallVec};
 
 use super::resource::PoolError;
 
-pub trait SizedResourceDesc {
+pub trait DynamicResourcesDesc {
     fn resource_size_in_bytes(&self) -> u64;
+
+    /// If true, unused resources will be kept around for a while and then re-used in following frames.
+    /// If false, they will be destroyed on [`DynamicResourcePool::begin_frame`].
+    fn allow_reuse(&self) -> bool;
 }
 
 /// Generic resource pool for all resources that have varying contents beyond their description.
@@ -54,11 +58,15 @@ impl Default for DynamicResourcePool
 impl<Handle, Desc, Res> DynamicResourcePool<Handle, Desc, Res>
 where
     Handle: Key,
-    Desc: Clone + Eq + Hash + Debug + SizedResourceDesc,
+    Desc: Clone + Eq + Hash + Debug + DynamicResourcesDesc,
 {
-    pub fn alloc<F: FnOnce(&Desc) -> Res>(&mut self, desc: &Desc, creation_func: F) -> Arc<Handle> {
+    fn alloc_internal<F: FnOnce(&Desc) -> Res>(
+        &mut self,
+        desc: &Desc,
+        creation_func: F,
+    ) -> Arc<Handle> {
         // First check if we can reclaim a resource we have around from a previous frame.
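+        // (Descriptors with `allow_reuse() == false`, e.g. buffers mapped at creation, always take the fresh-allocation path below.)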
-        let handle =
+        if desc.allow_reuse() {
             if let Entry::Occupied(mut entry) = self.last_frame_deallocated.entry(desc.clone()) {
                 re_log::trace!(?desc, "Reclaimed previously discarded resource",);
@@ -66,14 +74,18 @@ where
                 if entry.get().is_empty() {
                     entry.remove();
                 }
-                handle
-            // Otherwise create a new resource
-            } else {
-                let resource = creation_func(desc);
-                self.total_resource_size_in_bytes += desc.resource_size_in_bytes();
-                Arc::new(self.resources.insert((desc.clone(), resource)))
-            };
+                return handle;
+            }
+        }
+
+        // Otherwise create a new resource
+        let resource = creation_func(desc);
+        self.total_resource_size_in_bytes += desc.resource_size_in_bytes();
+        Arc::new(self.resources.insert((desc.clone(), resource)))
+    }
 
+    pub fn alloc<F: FnOnce(&Desc) -> Res>(&mut self, desc: &Desc, creation_func: F) -> Arc<Handle> {
+        let handle = self.alloc_internal(desc, creation_func);
         self.alive_handles.insert(*handle, Arc::clone(&handle));
         handle
     }
@@ -102,10 +114,10 @@ where
             "Drained dangling resources from last frame",
         );
         for handle in handles {
-            if let Some((_, res)) = self.resources.remove(*handle) {
+            if let Some((desc, res)) = self.resources.remove(*handle) {
                 on_destroy_resource(&res);
+                self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
             }
-            self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
         }
     }
 
@@ -117,13 +129,20 @@ where
         self.alive_handles.retain(|handle, strong_handle| {
             if Arc::strong_count(strong_handle) == 1 {
                 let desc = &self.resources[handle].0;
-                match self.last_frame_deallocated.entry(desc.clone()) {
-                    Entry::Occupied(mut e) => {
-                        e.get_mut().push(Arc::clone(strong_handle));
-                    }
-                    Entry::Vacant(e) => {
-                        e.insert(smallvec![Arc::clone(strong_handle)]);
+
+                // If allowed, put it on the "last frame deallocated" list instead of destroying the resource immediately.
+                if desc.allow_reuse() {
+                    match self.last_frame_deallocated.entry(desc.clone()) {
+                        Entry::Occupied(mut e) => {
+                            e.get_mut().push(Arc::clone(strong_handle));
+                        }
+                        Entry::Vacant(e) => {
+                            e.insert(smallvec![Arc::clone(strong_handle)]);
+                        }
                     }
+                } else if let Some((desc, res)) = self.resources.remove(handle) {
+                    on_destroy_resource(&res);
+                    self.total_resource_size_in_bytes -= desc.resource_size_in_bytes();
                 }
                 false
             } else {
@@ -159,7 +178,7 @@ mod tests {
 
     use slotmap::Key;
 
-    use super::{DynamicResourcePool, SizedResourceDesc};
+    use super::{DynamicResourcePool, DynamicResourcesDesc};
     use crate::wgpu_resources::resource::PoolError;
 
     slotmap::new_key_type! 
{ pub struct ConcreteHandle; } @@ -167,10 +186,14 @@ mod tests { #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct ConcreteResourceDesc(u32); - impl SizedResourceDesc for ConcreteResourceDesc { + impl DynamicResourcesDesc for ConcreteResourceDesc { fn resource_size_in_bytes(&self) -> u64 { 1 } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Debug)] @@ -256,6 +279,8 @@ mod tests { } } + // TODO: Add test for resources without re-use + fn allocate_resources( descs: &[u32], pool: &mut DynamicResourcePool, diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs index 85ea6b008b35..f8b93d27af37 100644 --- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -45,7 +45,7 @@ pub struct TextureDesc { pub usage: wgpu::TextureUsages, } -impl SizedResourceDesc for TextureDesc { +impl DynamicResourcesDesc for TextureDesc { /// Number of bytes this texture is expected to take. /// /// The actual number might be both bigger (padding) and lower (gpu sided compression). @@ -67,6 +67,10 @@ impl SizedResourceDesc for TextureDesc { size_in_bytes } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Default)] pub struct GpuTexturePool { From ef93d724c33347f84fddf1c0e345071abd1ce374 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 13:48:26 +0100 Subject: [PATCH 02/21] Add CpuWriteGpuReadBelt and use it for frameuniform buffer as poc --- .../src/allocator/cpu_write_gpu_read_belt.rs | 392 ++++++++++++++++++ crates/re_renderer/src/allocator/mod.rs | 8 + crates/re_renderer/src/context.rs | 50 +++ crates/re_renderer/src/lib.rs | 1 + crates/re_renderer/src/view_builder.rs | 32 ++ .../src/wgpu_resources/buffer_pool.rs | 3 +- 6 files changed, 484 insertions(+), 2 deletions(-) create mode 100644 crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs create mode 100644 crates/re_renderer/src/allocator/mod.rs diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs new file mode 100644 index 000000000000..7a65a3bb6096 --- /dev/null +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -0,0 +1,392 @@ +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; + +/// Internal chunk of the staging belt. +struct Chunk { + buffer: GpuBufferHandleStrong, + size: usize, +} + +/// Chunk that the CPU is writing to currently. +struct ActiveChunk { + chunk: Chunk, + + /// Write view into the entire buffer. + /// + /// UNSAFE: The lifetime is transmuted to be `'static`. + /// In actuality it is tied to the lifetime of [`buffer`](#structfield.chunk.buffer)! + write_view: wgpu::BufferViewMut<'static>, + + /// Safety mechanism to track the number of uses of this buffer. + /// + /// As long it is bigger 1, we are NOT allowed to deactivate the chunk! + safety_usage_counter: Arc<()>, + + /// At what offset is [`write_view`](#structfield.write_view) unused. 
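+ /// (Everything before this offset has already been handed out; handed-out ranges never overlap.)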
+ unused_offset: usize, +} + +impl Chunk { + #[allow(unsafe_code)] + fn into_active(self, buffer_pool: &GpuBufferPool) -> ActiveChunk { + // Get the entire mapped slice up front because mainly because there is no way to know the memory alignment beforehand! + // After discussing this on the wgpu dev channel, consensus was reached that they should be 4xf32 aligned. + // TODO: Github issue link. + // + // It also safes us a bit of time since we don't have to run through all the checks on every `allocate` call, + // but this is likely negligible (citation needed). + // + // Wgpu isn't fond of that though since it ties the (compile time) lifetime of the returned `wgpu::BufferSlice` + // to the lifetime of the buffer (after all the mapping should never outlive the buffer!). + // Therefore, we need to workaround this a bit. + // + // Note that in the JavaScript/wasm api of WebGPU this is trivially possible, see + // https://www.w3.org/TR/webgpu/#dom-gpubuffer-getmappedrange + // (but it also ends up doing an extra memcpy to that's an unfair comparison) + + let wgpu_buffer = buffer_pool + .get_resource(&self.buffer) + .expect("Invalid buffer handle"); + let buffer_slice = wgpu_buffer.slice(..); + let write_view = buffer_slice.get_mapped_range_mut(); + + // SAFETY: + // Things we need to guarantee manually now: + // * Buffer needs to stay alive for the time CpuWriteGpuReadBuffer is held. + // -> We keep a reference count to `chunk.buffer` + // * Nobody holds a view into our memory by the time + // -> We use `safety_usage_counter` and panic if there is more than one owner when we close the buffer view + // * Returned ranges into `write_view` NEVER overlap. + // -> We track the unused_offset and never return any range ahead of it! + let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + + ActiveChunk { + chunk: self, + write_view, + unused_offset: 0, + safety_usage_counter: Arc::new(()), + } + } +} + +// TODO(andreas): Make wgpu::BufferMappedRange Send upstream +#[allow(unsafe_code)] +/// SAFETY: +/// TODO: Link to pr here doing so +unsafe impl std::marker::Send for ActiveChunk {} + +/// A suballocated staging buffer that can be written to. +/// +/// We do *not* allow reading from this buffer as it is typically write-combined memory. +/// Reading would work, but it can be *very* slow. +pub struct CpuWriteGpuReadBuffer { + write_only_memory: &'static mut [T], + + #[allow(dead_code)] + safety_usage_counter: Arc<()>, + + chunk_buffer: GpuBufferHandleStrong, + offset_in_chunk_buffer: usize, +} + +impl CpuWriteGpuReadBuffer +where + T: bytemuck::Pod + 'static, +{ + /// Writes several objects to the buffer at a given location. + /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement. + /// (panics otherwise) + /// + /// We do *not* allow reading from this buffer as it is typically write-combined memory. + /// Reading would work, but it can be *very* slow. + #[inline] + pub fn write(&mut self, elements: impl Iterator, num_elements_offset: usize) { + for (target, source) in self + .write_only_memory + .iter_mut() + .skip(num_elements_offset) + .zip(elements) + { + *target = source; + } + } + + /// Writes a single objects to the buffer at a given location. + /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement. 
+ /// (panics otherwise) + #[inline(always)] + pub fn write_single(&mut self, element: &T, num_elements_offset: usize) { + self.write_only_memory[num_elements_offset] = *element; + } + + // pub fn copy_to_texture( + // self, + // encoder: &mut wgpu::CommandEncoder, + // buffer_pool: &GpuBufferPool, + // destination: wgpu::ImageCopyTexture<'_>, + // bytes_per_row: Option, + // rows_per_image: Option, + // copy_size: wgpu::Extent3d, + // ) -> Result<(), PoolError> { + // let buffer = buffer_pool.get_resource(&self.chunk_buffer)?; + + // encoder.copy_buffer_to_texture( + // wgpu::ImageCopyBuffer { + // buffer, + // layout: wgpu::ImageDataLayout { + // offset: self.offset_in_chunk, + // bytes_per_row, + // rows_per_image, + // }, + // }, + // destination, + // copy_size, + // ); + + // Ok(()) + // } + + /// Consume this data view and copy it to another gpu buffer. + pub fn copy_to_buffer( + self, + encoder: &mut wgpu::CommandEncoder, + buffer_pool: &GpuBufferPool, + destination: &GpuBufferHandleStrong, + destination_offset: wgpu::BufferAddress, + ) -> Result<(), PoolError> { + encoder.copy_buffer_to_buffer( + buffer_pool.get_resource(&self.chunk_buffer)?, + self.offset_in_chunk_buffer as _, + buffer_pool.get_resource(destination)?, + destination_offset, + (self.write_only_memory.len() * std::mem::size_of::()) as u64, + ); + Ok(()) + } +} + +/// Efficiently performs many buffer writes by sharing and reusing temporary buffers. +/// +/// Internally it uses a ring-buffer of staging buffers that are sub-allocated. +/// +/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs) +/// However, there are some important differences: +/// * can create buffers without yet knowing the target copy location +/// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!) +/// * use of `re_renderer`'s resource pool +/// * handles alignment +pub struct CpuWriteGpuReadBelt { + /// Minimum size for new buffers. + chunk_size: usize, + + /// Chunks which are CPU write at the moment. + active_chunks: Vec, + + /// Chunks which are GPU read at the moment. + /// + /// I.e. they have scheduled transfers already; they are unmapped and one or more + /// command encoder has one or more `copy_buffer_to_buffer` commands with them + /// as source. + closed_chunks: Vec, + + /// Chunks that are back from the GPU and ready to be mapped for write and put + /// into `active_chunks`. + free_chunks: Vec, + + /// When closed chunks are mapped again, the map callback sends them here. + /// + /// Behind a mutex, so that our StagingBelt becomes Sync. + /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full, + /// which means that in a single threaded situation (Web!) we might deadlock. + sender: mpsc::Sender, + + /// Free chunks are received here to be put on `self.free_chunks`. + receiver: mpsc::Receiver, +} + +impl CpuWriteGpuReadBelt { + /// Create a new staging belt. + /// + /// The `chunk_size` is the unit of internal buffer allocation; writes will be + /// sub-allocated within each chunk. Therefore, for optimal use of memory, the + /// chunk size should be: + /// + /// * larger than the largest single [`StagingBelt::write_buffer()`] operation; + /// * 1-4 times less than the total amount of data uploaded per submission + /// (per [`StagingBelt::finish()`]); and + /// * bigger is better, within these bounds. 
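+ ///
+ /// For example (illustrative numbers only): a frame that uploads two 8 MiB buffers and nothing
+ /// larger is served well by a 16 MiB chunk size.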
+ /// + /// TODO(andreas): Adaptive chunk sizes + /// TODO(andreas): Shrinking after usage spikes? + pub fn new(chunk_size: usize) -> Self { + let (sender, receiver) = mpsc::channel(); + CpuWriteGpuReadBelt { + chunk_size, + active_chunks: Vec::new(), + closed_chunks: Vec::new(), + free_chunks: Vec::new(), + sender, + receiver, + } + } + + /// Allocates a cpu writable buffer for `num_elements` instances of T. + /// + /// Handles alignment requirements automatically which allows faster copy operations. + #[allow(unsafe_code)] + pub fn allocate( + &mut self, + device: &wgpu::Device, + buffer_pool: &mut GpuBufferPool, + num_elements: usize, + ) -> CpuWriteGpuReadBuffer { + let alignment = std::mem::align_of::().min(wgpu::MAP_ALIGNMENT as usize); + let size = std::mem::size_of::() * num_elements; + + // We need to be super careful with alignment since wgpu + // has no guarantees on how pointers to mapped memory are aligned! + + // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. + + // Try to find space in any of the active chunks first. + let mut active_chunk = if let Some(index) = + self.active_chunks.iter_mut().position(|active_chunk| { + size + (active_chunk.write_view.deref_mut().as_ptr() as usize + + active_chunk.unused_offset) + % alignment + <= active_chunk.chunk.size - active_chunk.unused_offset + }) { + self.active_chunks.swap_remove(index) + } else { + self.receive_chunks(); // ensure self.free_chunks is up to date + + // We don't know yet how aligned the mapped pointer is. So we need to ask for more memory! + let required_size = size + alignment - 1; + + // Use a free chunk if possible, fall back to creating a new one if necessary. + if let Some(index) = self + .free_chunks + .iter() + .position(|chunk| chunk.size >= required_size) + { + self.free_chunks.swap_remove(index).into_active(buffer_pool) + } else { + // Allocation might be bigger than a chunk. + let size = self.chunk_size.max(required_size); + + let buffer = buffer_pool.alloc( + device, + &BufferDesc { + label: "CpuWriteGpuReadBelt buffer".into(), + size: size as _, + usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: true, + }, + ); + + Chunk { buffer, size }.into_active(buffer_pool) + } + }; + + let alignment_padding = (active_chunk.write_view.deref_mut().as_ptr() as usize + + active_chunk.unused_offset) + % alignment; + let start_offset = active_chunk.unused_offset + alignment_padding; + let end_offset = start_offset + size; + + let write_only_memory = bytemuck::cast_slice_mut( + &mut active_chunk.write_view.deref_mut()[start_offset..end_offset], + ); + + // SAFETY: + // The slice we take out of the write view has a lifetime dependency on the write_view. + // This means it is self-referential. We handle this lifetime at runtime instead by + // + // See also `ActiveChunk::into_active`. + let write_only_memory = + unsafe { std::mem::transmute::<&mut [T], &'static mut [T]>(write_only_memory) }; + let cpu_buffer_view = CpuWriteGpuReadBuffer { + write_only_memory, + safety_usage_counter: active_chunk.safety_usage_counter.clone(), + chunk_buffer: active_chunk.chunk.buffer.clone(), + offset_in_chunk_buffer: start_offset, + }; + + self.active_chunks.push(active_chunk); + + cpu_buffer_view + } + + /// Prepare currently mapped buffers for use in a submission. + /// + /// This must be called before the command encoder(s) used in [`StagingBuffer`] copy operations are submitted. 
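+ /// (In this crate that is handled by `RenderContext::before_submit`, added in this patch.)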
+ /// + /// At this point, all the partially used staging buffers are closed (cannot be used for + /// further writes) until after [`StagingBelt::recall()`] is called *and* the GPU is done + /// copying the data from them. + pub fn before_queue_submit(&mut self, buffer_pool: &GpuBufferPool) { + // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap + // https://github.com/gfx-rs/wgpu/issues/1468 + // However, WebGPU does not support this! + + for active_chunk in self.active_chunks.drain(..) { + assert!(Arc::strong_count(&active_chunk.safety_usage_counter) == 1, + "Chunk still in use. All instances of `CpuWriteGpuReadBelt` need to be freed before submitting."); + + // Ensure write view is dropped before we unmap! + drop(active_chunk.write_view); + + buffer_pool + .get_resource(&active_chunk.chunk.buffer) + .expect("invalid buffer handle") + .unmap(); + self.closed_chunks.push(active_chunk.chunk); + } + } + + /// Recall all of the closed buffers back to be reused. + /// + /// This must only be called after the command encoder(s) used in [`StagingBuffer`] + /// copy operations are submitted. Additional calls are harmless. + /// Not calling this as soon as possible may result in increased buffer memory usage. + pub fn after_queue_submit(&mut self, buffer_pool: &GpuBufferPool) { + self.receive_chunks(); + + let sender = &self.sender; + for chunk in self.closed_chunks.drain(..) { + let sender = sender.clone(); + buffer_pool + .get_resource(&chunk.buffer) + .expect("invalid buffer handle") + .slice(..) + .map_async(wgpu::MapMode::Write, move |_| { + let _ = sender.send(chunk); + }); + } + } + + /// Move all chunks that the GPU is done with (and are now mapped again) + /// from `self.receiver` to `self.free_chunks`. + fn receive_chunks(&mut self) { + while let Ok(chunk) = self.receiver.try_recv() { + self.free_chunks.push(chunk); + } + } +} + +// impl fmt::Debug for StagingBelt { +// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// f.debug_struct("StagingBelt") +// .field("chunk_size", &self.chunk_size) +// .field("active_chunks", &self.active_chunks.len()) +// .field("closed_chunks", &self.closed_chunks.len()) +// .field("free_chunks", &self.free_chunks.len()) +// .finish_non_exhaustive() +// } +// } diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs new file mode 100644 index 000000000000..7ae3d967a7e0 --- /dev/null +++ b/crates/re_renderer/src/allocator/mod.rs @@ -0,0 +1,8 @@ +//! High level GPU memory allocators. +//! +//! In contrast to the buffer pools in [`crate::wgpu_resources`], every allocator in here +//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources. 
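+//!
+//! Rough usage sketch of the belt below (all names introduced in this patch): `allocate` a
+//! `CpuWriteGpuReadBuffer` from the `CpuWriteGpuReadBelt`, fill it on the CPU, schedule the upload
+//! with `copy_to_buffer`, and bracket queue submission with `before_queue_submit` / `after_queue_submit`.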
+ +mod cpu_write_gpu_read_belt; + +pub use cpu_write_gpu_read_belt::CpuWriteGpuReadBelt; diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index a0bfc4474ad0..5080acd7495b 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -1,8 +1,10 @@ use std::sync::Arc; +use parking_lot::Mutex; use type_map::concurrent::{self, TypeMap}; use crate::{ + allocator::CpuWriteGpuReadBelt, config::RenderContextConfig, global_bindings::GlobalBindings, renderer::Renderer, @@ -28,6 +30,13 @@ pub struct RenderContext { pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, + pub(crate) cpu_write_gpu_read_belt: Mutex, + + /// Command encoder for all commands that should go in before view builder are submitted. + /// + /// This should be used for any gpu copy operation outside of a renderer or view builder. + /// (i.e. typically in [`crate::renderer::DrawData`] creation!) + pub(crate) frame_global_command_encoder: wgpu::CommandEncoder, // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? frame_index: u64, @@ -137,6 +146,8 @@ impl RenderContext { let texture_manager_2d = TextureManager2D::new(device.clone(), queue.clone(), &mut gpu_resources.textures); + let frame_global_command_encoder = Self::create_frame_global_command_encoder(&device); + RenderContext { device, queue, @@ -148,6 +159,10 @@ impl RenderContext { mesh_manager, texture_manager_2d, + // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(1024 * 1024 * 32)), + + frame_global_command_encoder, resolver, @@ -158,6 +173,12 @@ impl RenderContext { } } + fn create_frame_global_command_encoder(device: &wgpu::Device) -> wgpu::CommandEncoder { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder").get(), + }) + } + /// Call this at the beginning of a new frame. /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. @@ -219,6 +240,35 @@ impl RenderContext { bind_group_layouts.begin_frame(self.frame_index); samplers.begin_frame(self.frame_index); } + + // Retrieve unused staging buffer. + self.cpu_write_gpu_read_belt + .lock() + .after_queue_submit(&self.gpu_resources.buffers); + } + + /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`] + pub fn before_submit(&mut self) { + // Unmap all staging buffers. + self.cpu_write_gpu_read_belt + .lock() + .before_queue_submit(&self.gpu_resources.buffers); + + let mut command_encoder = Self::create_frame_global_command_encoder(&self.device); + std::mem::swap(&mut self.frame_global_command_encoder, &mut command_encoder); + let command_buffer = command_encoder.finish(); + + // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. + // How do we hook in there and make sure this buffer is submitted first? + self.queue.submit([command_buffer]); + } +} + +impl Drop for RenderContext { + fn drop(&mut self) { + // Delete the CpuWriteGpuReadBelt beforehand since its active chunks may keep unsafe references into the buffer pool. 
+ // (if we don't do this and buffers get destroyed before the belt, wgpu will panic) + *self.cpu_write_gpu_read_belt.lock() = CpuWriteGpuReadBelt::new(0); } } diff --git a/crates/re_renderer/src/lib.rs b/crates/re_renderer/src/lib.rs index b89004ed22cf..dcbae79b06dc 100644 --- a/crates/re_renderer/src/lib.rs +++ b/crates/re_renderer/src/lib.rs @@ -14,6 +14,7 @@ pub mod renderer; pub mod resource_managers; pub mod view_builder; +mod allocator; mod context; mod debug_label; mod depth_offset; diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 098422ae03cf..c53ddce3b9bc 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -388,6 +388,38 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. + let mut frame_uniform_buffer_cpu = ctx + .cpu_write_gpu_read_belt + .lock() + .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); + frame_uniform_buffer_cpu.write_single( + &FrameUniformBuffer { + view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), + projection_from_view: projection_from_view.into(), + projection_from_world: projection_from_world.into(), + camera_position, + camera_forward, + tan_half_fov: tan_half_fov.into(), + pixel_world_size_from_camera_distance, + pixels_from_point: config.pixels_from_point, + + auto_size_points: auto_size_points.0, + auto_size_lines: auto_size_lines.0, + + depth_offset_factor, + _padding: glam::Vec3::ZERO, + }, + 0, + ); + frame_uniform_buffer_cpu + .copy_to_buffer( + &mut ctx.frame_global_command_encoder, + &ctx.gpu_resources.buffers, + &frame_uniform_buffer, + 0, + ) + .unwrap(); + ctx.queue.write_buffer( ctx.gpu_resources .buffers diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index f62059d59c09..7e3c42563aa1 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -30,8 +30,7 @@ pub struct BufferDesc { /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation. /// /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later! - /// Furthermore, [`size`](#structfield.size) must be a multiple of - /// [`COPY_BUFFER_ALIGNMENT`]. + /// Furthermore, [`size`](#structfield.size) must be a multiple of [`COPY_BUFFER_ALIGNMENT`]. pub mapped_at_creation: bool, } From a1c439e0111a5357e6e552e2c2bf0e4b9c66025b Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 19:54:00 +0100 Subject: [PATCH 03/21] Reduce amount of unsafe code in CpuWriteGpuReadBuffer. Better handling of alignment. Fix issues on render context shutdown --- crates/re_renderer/examples/framework.rs | 1 + .../src/allocator/cpu_write_gpu_read_belt.rs | 275 +++++++++--------- crates/re_renderer/src/context.rs | 95 +++--- crates/re_renderer/src/view_builder.rs | 2 +- 4 files changed, 195 insertions(+), 178 deletions(-) diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs index 37b76b505802..5b6bc13c38b1 100644 --- a/crates/re_renderer/examples/framework.rs +++ b/crates/re_renderer/examples/framework.rs @@ -278,6 +278,7 @@ impl Application { // drop the pass so we can finish() the main encoder! 
}; + self.re_ctx.before_submit(); self.re_ctx.queue.submit( view_cmd_buffers .into_iter() diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 7a65a3bb6096..30528384b8b5 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,114 +1,67 @@ -use std::{ - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use std::{ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; /// Internal chunk of the staging belt. struct Chunk { buffer: GpuBufferHandleStrong, - size: usize, -} - -/// Chunk that the CPU is writing to currently. -struct ActiveChunk { - chunk: Chunk, - - /// Write view into the entire buffer. - /// - /// UNSAFE: The lifetime is transmuted to be `'static`. - /// In actuality it is tied to the lifetime of [`buffer`](#structfield.chunk.buffer)! - write_view: wgpu::BufferViewMut<'static>, - - /// Safety mechanism to track the number of uses of this buffer. - /// - /// As long it is bigger 1, we are NOT allowed to deactivate the chunk! - safety_usage_counter: Arc<()>, + size: wgpu::BufferAddress, /// At what offset is [`write_view`](#structfield.write_view) unused. - unused_offset: usize, -} - -impl Chunk { - #[allow(unsafe_code)] - fn into_active(self, buffer_pool: &GpuBufferPool) -> ActiveChunk { - // Get the entire mapped slice up front because mainly because there is no way to know the memory alignment beforehand! - // After discussing this on the wgpu dev channel, consensus was reached that they should be 4xf32 aligned. - // TODO: Github issue link. - // - // It also safes us a bit of time since we don't have to run through all the checks on every `allocate` call, - // but this is likely negligible (citation needed). - // - // Wgpu isn't fond of that though since it ties the (compile time) lifetime of the returned `wgpu::BufferSlice` - // to the lifetime of the buffer (after all the mapping should never outlive the buffer!). - // Therefore, we need to workaround this a bit. - // - // Note that in the JavaScript/wasm api of WebGPU this is trivially possible, see - // https://www.w3.org/TR/webgpu/#dom-gpubuffer-getmappedrange - // (but it also ends up doing an extra memcpy to that's an unfair comparison) - - let wgpu_buffer = buffer_pool - .get_resource(&self.buffer) - .expect("Invalid buffer handle"); - let buffer_slice = wgpu_buffer.slice(..); - let write_view = buffer_slice.get_mapped_range_mut(); - - // SAFETY: - // Things we need to guarantee manually now: - // * Buffer needs to stay alive for the time CpuWriteGpuReadBuffer is held. - // -> We keep a reference count to `chunk.buffer` - // * Nobody holds a view into our memory by the time - // -> We use `safety_usage_counter` and panic if there is more than one owner when we close the buffer view - // * Returned ranges into `write_view` NEVER overlap. - // -> We track the unused_offset and never return any range ahead of it! 
-        let write_view = unsafe {
-            std::mem::transmute::<wgpu::BufferViewMut<'_>, wgpu::BufferViewMut<'static>>(write_view)
-        };
-
-        ActiveChunk {
-            chunk: self,
-            write_view,
-            unused_offset: 0,
-            safety_usage_counter: Arc::new(()),
-        }
-    }
-}
-
-// TODO(andreas): Make wgpu::BufferMappedRange Send upstream
-#[allow(unsafe_code)]
-/// SAFETY:
-/// TODO: Link to pr here doing so
-unsafe impl std::marker::Send for ActiveChunk {}
-
 /// A suballocated staging buffer that can be written to.
 ///
 /// We do *not* allow reading from this buffer as it is typically write-combined memory.
 /// Reading would work, but it can be *very* slow.
-pub struct CpuWriteGpuReadBuffer<T: bytemuck::Pod + 'static> {
-    write_only_memory: &'static mut [T],
-
-    #[allow(dead_code)]
-    safety_usage_counter: Arc<()>,
+pub struct CpuWriteGpuReadBuffer<T> {
+    /// Write view into the relevant buffer portion.
+    ///
+    /// UNSAFE: The lifetime is transmuted to be `'static`.
+    /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)!
+    write_view: wgpu::BufferViewMut<'static>,
 
     chunk_buffer: GpuBufferHandleStrong,
-    offset_in_chunk_buffer: usize,
+    offset_in_chunk_buffer: wgpu::BufferAddress,
+
+    /// Marker for the type whose alignment and size requirements are honored by `write_view`.
+    _type: std::marker::PhantomData<T>,
 }
 
 impl<T> CpuWriteGpuReadBuffer<T>
 where
     T: bytemuck::Pod + 'static,
 {
+    /// Memory as slice of T.
+    ///
+    /// Do *not* make this public as we need to guarantee that the memory is *never* read from!
+    #[inline(always)]
+    fn as_slice(&mut self) -> &mut [T] {
+        bytemuck::cast_slice_mut(&mut self.write_view)
+    }
+
+    /// Writes several objects to the buffer at a given location using a slice.
+    ///
     /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
     /// (panics otherwise)
+    #[inline]
+    pub fn write_slice(&mut self, elements: &[T], num_elements_offset: usize) {
+        self.as_slice()[num_elements_offset..num_elements_offset + elements.len()]
+            .copy_from_slice(elements);
+    }
+
+    /// Writes several objects to the buffer at a given location using an iterator.
     ///
-    /// We do *not* allow reading from this buffer as it is typically write-combined memory.
-    /// Reading would work, but it can be *very* slow.
+    /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
+    /// (panics otherwise)
     #[inline]
-    pub fn write(&mut self, elements: impl Iterator<Item = T>, num_elements_offset: usize) {
+    pub fn write_iterator(
+        &mut self,
+        elements: impl Iterator<Item = T>,
+        num_elements_offset: usize,
+    ) {
         for (target, source) in self
-            .write_only_memory
+            .as_slice()
             .iter_mut()
             .skip(num_elements_offset)
             .zip(elements)
@@ -118,11 +71,12 @@ where
     }
 
     /// Writes a single objects to the buffer at a given location.
+    ///
     /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
     /// (panics otherwise)
-    #[inline(always)]
+    #[inline]
     pub fn write_single(&mut self, element: &T, num_elements_offset: usize) {
-        self.write_only_memory[num_elements_offset] = *element;
+        self.as_slice()[num_elements_offset] = *element;
     }
 
     // pub fn copy_to_texture(
     //     self,
     //     encoder: &mut wgpu::CommandEncoder,
     //     buffer_pool: &GpuBufferPool,
     //     destination: wgpu::ImageCopyTexture<'_>,
     //     bytes_per_row: Option<NonZeroU32>,
     //     rows_per_image: Option<NonZeroU32>,
     //     copy_size: wgpu::Extent3d,
     // ) -> Result<(), PoolError> {
     //     let buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
 
     //     encoder.copy_buffer_to_texture(
     //         wgpu::ImageCopyBuffer {
     //             buffer,
     //             layout: wgpu::ImageDataLayout {
     //                 offset: self.offset_in_chunk,
     //                 bytes_per_row,
     //                 rows_per_image,
     //             },
     //         },
     //         destination,
     //         copy_size,
     //     );
 
     //     Ok(())
     // }
 
     /// Consume this data view and copy it to another gpu buffer.
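+    ///
+    /// (The copy is only recorded into `encoder` here; it executes once the corresponding command buffer is submitted.)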
pub fn copy_to_buffer( - self, + mut self, encoder: &mut wgpu::CommandEncoder, buffer_pool: &GpuBufferPool, destination: &GpuBufferHandleStrong, @@ -162,10 +116,10 @@ where ) -> Result<(), PoolError> { encoder.copy_buffer_to_buffer( buffer_pool.get_resource(&self.chunk_buffer)?, - self.offset_in_chunk_buffer as _, + self.offset_in_chunk_buffer, buffer_pool.get_resource(destination)?, destination_offset, - (self.write_only_memory.len() * std::mem::size_of::()) as u64, + self.write_view.deref_mut().len() as u64, ); Ok(()) } @@ -183,10 +137,10 @@ where /// * handles alignment pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. - chunk_size: usize, + chunk_size: wgpu::BufferSize, /// Chunks which are CPU write at the moment. - active_chunks: Vec, + active_chunks: Vec, /// Chunks which are GPU read at the moment. /// @@ -224,7 +178,7 @@ impl CpuWriteGpuReadBelt { /// /// TODO(andreas): Adaptive chunk sizes /// TODO(andreas): Shrinking after usage spikes? - pub fn new(chunk_size: usize) -> Self { + pub fn new(chunk_size: wgpu::BufferSize) -> Self { let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { chunk_size, @@ -246,79 +200,132 @@ impl CpuWriteGpuReadBelt { buffer_pool: &mut GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { - let alignment = std::mem::align_of::().min(wgpu::MAP_ALIGNMENT as usize); - let size = std::mem::size_of::() * num_elements; + let alignment = (std::mem::align_of::() as wgpu::BufferAddress).min(wgpu::MAP_ALIGNMENT); + let size = (std::mem::size_of::() * num_elements) as wgpu::BufferAddress; - // We need to be super careful with alignment since wgpu - // has no guarantees on how pointers to mapped memory are aligned! + // We need to be super careful with alignment since today wgpu + // has NO guarantees on how pointers to mapped memory are aligned! + // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process! + // + // See https://github.com/gfx-rs/wgpu/issues/3508 + // + // To work around this, we require a bigger size to begin with. + // + // TODO(andreas): Either fix the wgpu issue or come up with a more conservative strategy, + // where we first look for a buffer slice with `size` and then again with required_size if needed. + let required_size = size + alignment - 1; // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. // Try to find space in any of the active chunks first. - let mut active_chunk = if let Some(index) = - self.active_chunks.iter_mut().position(|active_chunk| { - size + (active_chunk.write_view.deref_mut().as_ptr() as usize - + active_chunk.unused_offset) - % alignment - <= active_chunk.chunk.size - active_chunk.unused_offset - }) { + let mut chunk = if let Some(index) = self + .active_chunks + .iter_mut() + .position(|chunk| chunk.size - chunk.unused_offset >= required_size) + { self.active_chunks.swap_remove(index) } else { self.receive_chunks(); // ensure self.free_chunks is up to date - // We don't know yet how aligned the mapped pointer is. So we need to ask for more memory! - let required_size = size + alignment - 1; - // Use a free chunk if possible, fall back to creating a new one if necessary. if let Some(index) = self .free_chunks .iter() .position(|chunk| chunk.size >= required_size) { - self.free_chunks.swap_remove(index).into_active(buffer_pool) + self.free_chunks.swap_remove(index) } else { // Allocation might be bigger than a chunk. 
- let size = self.chunk_size.max(required_size); + let size = self.chunk_size.get().max(required_size); let buffer = buffer_pool.alloc( device, &BufferDesc { label: "CpuWriteGpuReadBelt buffer".into(), - size: size as _, + size, usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: true, }, ); - Chunk { buffer, size }.into_active(buffer_pool) + Chunk { + buffer, + size, + unused_offset: 0, + } } }; - let alignment_padding = (active_chunk.write_view.deref_mut().as_ptr() as usize - + active_chunk.unused_offset) - % alignment; - let start_offset = active_chunk.unused_offset + alignment_padding; - let end_offset = start_offset + size; + let buffer = buffer_pool + .get_resource(&chunk.buffer) + .expect("invalid chunk buffer"); - let write_only_memory = bytemuck::cast_slice_mut( - &mut active_chunk.write_view.deref_mut()[start_offset..end_offset], - ); + // Allocate mapping from a chunk. + fn allocate_mapping<'a>( + chunk: &mut Chunk, + size: u64, + buffer: &'a wgpu::Buffer, + ) -> (u64, wgpu::BufferViewMut<'a>) { + let start_offset = chunk.unused_offset; + let end_offset = start_offset + size; - // SAFETY: - // The slice we take out of the write view has a lifetime dependency on the write_view. - // This means it is self-referential. We handle this lifetime at runtime instead by + debug_assert!(end_offset <= chunk.size); + chunk.unused_offset = end_offset; + + let buffer_slice = buffer.slice(start_offset..end_offset); + (start_offset, buffer_slice.get_mapped_range_mut()) + } + + // Allocate mapping from a chunk with alignment requirements. // - // See also `ActiveChunk::into_active`. - let write_only_memory = - unsafe { std::mem::transmute::<&mut [T], &'static mut [T]>(write_only_memory) }; + // Depending on how https://github.com/gfx-rs/wgpu/issues/3508 will be solved, this will become trivial + // as we will have knowledge of a full buffer mapping alignment beforehand. + // (we then should probably always align to 32 byte and don't allow types with even higher alignment requirements!) + fn allocate_chunk_mapping_with_alignment<'a>( + chunk: &mut Chunk, + size: u64, + buffer: &'a wgpu::Buffer, + alignment: u64, + ) -> (u64, wgpu::BufferViewMut<'a>) { + // First optimistically try without explicit padding. + let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); + let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + + if required_padding == 0 { + (start_offset, write_view) + } else { + // Undo mapping and try again with padding. + // We made sure earlier that the chunk has enough space for this case! + drop(write_view); + chunk.unused_offset = start_offset + required_padding; + + let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); + let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + debug_assert_eq!(required_padding, 0); + + (start_offset, write_view) + } + } + + let (start_offset, write_view) = + allocate_chunk_mapping_with_alignment(&mut chunk, size, buffer, alignment); + + // SAFETY: + // write_view has a lifetime dependency on the chunk's buffer. + // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. + // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. 
+ let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + let cpu_buffer_view = CpuWriteGpuReadBuffer { - write_only_memory, - safety_usage_counter: active_chunk.safety_usage_counter.clone(), - chunk_buffer: active_chunk.chunk.buffer.clone(), + chunk_buffer: chunk.buffer.clone(), offset_in_chunk_buffer: start_offset, + write_view, + _type: std::marker::PhantomData, }; - self.active_chunks.push(active_chunk); + self.active_chunks.push(chunk); cpu_buffer_view } @@ -335,18 +342,12 @@ impl CpuWriteGpuReadBelt { // https://github.com/gfx-rs/wgpu/issues/1468 // However, WebGPU does not support this! - for active_chunk in self.active_chunks.drain(..) { - assert!(Arc::strong_count(&active_chunk.safety_usage_counter) == 1, - "Chunk still in use. All instances of `CpuWriteGpuReadBelt` need to be freed before submitting."); - - // Ensure write view is dropped before we unmap! - drop(active_chunk.write_view); - + for chunk in self.active_chunks.drain(..) { buffer_pool - .get_resource(&active_chunk.chunk.buffer) + .get_resource(&chunk.buffer) .expect("invalid buffer handle") .unmap(); - self.closed_chunks.push(active_chunk.chunk); + self.closed_chunks.push(chunk); } } diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 5080acd7495b..a9cc96981d6d 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -32,14 +32,7 @@ pub struct RenderContext { pub texture_manager_2d: TextureManager2D, pub(crate) cpu_write_gpu_read_belt: Mutex, - /// Command encoder for all commands that should go in before view builder are submitted. - /// - /// This should be used for any gpu copy operation outside of a renderer or view builder. - /// (i.e. typically in [`crate::renderer::DrawData`] creation!) - pub(crate) frame_global_command_encoder: wgpu::CommandEncoder, - - // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? - frame_index: u64, + pub(crate) active_frame: ActiveFrameContext, } /// Immutable data that is shared between all [`Renderer`] @@ -146,8 +139,6 @@ impl RenderContext { let texture_manager_2d = TextureManager2D::new(device.clone(), queue.clone(), &mut gpu_resources.textures); - let frame_global_command_encoder = Self::create_frame_global_command_encoder(&device); - RenderContext { device, queue, @@ -160,30 +151,26 @@ impl RenderContext { mesh_manager, texture_manager_2d, // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) - cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(1024 * 1024 * 32)), - - frame_global_command_encoder, + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(wgpu::BufferSize::new(1024 * 1024 * 32).unwrap())), resolver, #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build err_tracker, - frame_index: 0, + active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 } } } - fn create_frame_global_command_encoder(device: &wgpu::Device) -> wgpu::CommandEncoder { - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { - label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder").get(), - }) - } - /// Call this at the beginning of a new frame. /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. 
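+ /// Also swaps in a fresh [`ActiveFrameContext`], incrementing the frame index.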
pub fn begin_frame(&mut self) { - self.frame_index += 1; + self.active_frame = ActiveFrameContext { + frame_global_command_encoder: None, + frame_index: self.active_frame.frame_index + 1, + }; + let frame_index = self.active_frame.frame_index; // Tick the error tracker so that it knows when to reset! // Note that we're ticking on begin_frame rather than raw frames, which @@ -199,8 +186,8 @@ impl RenderContext { re_log::debug!(?modified_paths, "got some filesystem events"); } - self.mesh_manager.begin_frame(self.frame_index); - self.texture_manager_2d.begin_frame(self.frame_index); + self.mesh_manager.begin_frame(frame_index); + self.texture_manager_2d.begin_frame(frame_index); { let WgpuResourcePools { @@ -219,26 +206,26 @@ impl RenderContext { shader_modules.begin_frame( &self.device, &mut self.resolver, - self.frame_index, + frame_index, &modified_paths, ); render_pipelines.begin_frame( &self.device, - self.frame_index, + frame_index, shader_modules, pipeline_layouts, ); // Bind group maintenance must come before texture/buffer maintenance since it // registers texture/buffer use - bind_groups.begin_frame(self.frame_index, textures, buffers, samplers); + bind_groups.begin_frame(frame_index, textures, buffers, samplers); - textures.begin_frame(self.frame_index); - buffers.begin_frame(self.frame_index); + textures.begin_frame(frame_index); + buffers.begin_frame(frame_index); - pipeline_layouts.begin_frame(self.frame_index); - bind_group_layouts.begin_frame(self.frame_index); - samplers.begin_frame(self.frame_index); + pipeline_layouts.begin_frame(frame_index); + bind_group_layouts.begin_frame(frame_index); + samplers.begin_frame(frame_index); } // Retrieve unused staging buffer. @@ -254,21 +241,49 @@ impl RenderContext { .lock() .before_queue_submit(&self.gpu_resources.buffers); - let mut command_encoder = Self::create_frame_global_command_encoder(&self.device); - std::mem::swap(&mut self.frame_global_command_encoder, &mut command_encoder); - let command_buffer = command_encoder.finish(); + if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() { + let command_buffer = command_encoder.finish(); - // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. - // How do we hook in there and make sure this buffer is submitted first? - self.queue.submit([command_buffer]); + // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. + // How do we hook in there and make sure this buffer is submitted first? + self.queue.submit([command_buffer]); + } } } impl Drop for RenderContext { fn drop(&mut self) { - // Delete the CpuWriteGpuReadBelt beforehand since its active chunks may keep unsafe references into the buffer pool. - // (if we don't do this and buffers get destroyed before the belt, wgpu will panic) - *self.cpu_write_gpu_read_belt.lock() = CpuWriteGpuReadBelt::new(0); + // Close global command encoder if there is any pending. + // Not doing so before shutdown causes errors. + if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() { + encoder.finish(); + } + } +} + +pub struct ActiveFrameContext { + /// Command encoder for all commands that should go in before view builder are submitted. + /// + /// This should be used for any gpu copy operation outside of a renderer or view builder. + /// (i.e. typically in [`crate::renderer::DrawData`] creation!) 
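+ ///
+ /// Lazily created via [`Self::frame_global_command_encoder`]; `before_submit` finishes and submits it.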
+ frame_global_command_encoder: Option, + + // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? + frame_index: u64, +} + +impl ActiveFrameContext { + /// Gets or creates a command encoder that runs before all view builder encoder. + pub fn frame_global_command_encoder( + &mut self, + device: &wgpu::Device, + ) -> &mut wgpu::CommandEncoder { + self.frame_global_command_encoder.get_or_insert_with(|| { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") + .get(), + }) + }) } } diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index c53ddce3b9bc..6a9b75abdcc1 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -413,7 +413,7 @@ impl ViewBuilder { ); frame_uniform_buffer_cpu .copy_to_buffer( - &mut ctx.frame_global_command_encoder, + ctx.active_frame.frame_global_command_encoder(&ctx.device), &ctx.gpu_resources.buffers, &frame_uniform_buffer, 0, From 847e3afaf6e51f45e1c38b76d0155bc4d3e5f5a3 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 19:58:05 +0100 Subject: [PATCH 04/21] debug fmt for CpuWriteGpuReadBelt for easier debugging --- .../src/allocator/cpu_write_gpu_read_belt.rs | 20 +++++++++---------- crates/re_renderer/src/context.rs | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 30528384b8b5..fe7f5b00bb58 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -381,13 +381,13 @@ impl CpuWriteGpuReadBelt { } } -// impl fmt::Debug for StagingBelt { -// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { -// f.debug_struct("StagingBelt") -// .field("chunk_size", &self.chunk_size) -// .field("active_chunks", &self.active_chunks.len()) -// .field("closed_chunks", &self.closed_chunks.len()) -// .field("free_chunks", &self.free_chunks.len()) -// .finish_non_exhaustive() -// } -// } +impl std::fmt::Debug for CpuWriteGpuReadBelt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StagingBelt") + .field("chunk_size", &self.chunk_size) + .field("active_chunks", &self.active_chunks.len()) + .field("closed_chunks", &self.closed_chunks.len()) + .field("free_chunks", &self.free_chunks.len()) + .finish_non_exhaustive() + } +} diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index a9cc96981d6d..66b09e0cd622 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -30,7 +30,7 @@ pub struct RenderContext { pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, - pub(crate) cpu_write_gpu_read_belt: Mutex, + pub cpu_write_gpu_read_belt: Mutex, pub(crate) active_frame: ActiveFrameContext, } From ceb2695c1ba0b3f3d175859c8ef626e8e95a5a3f Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 20:36:26 +0100 Subject: [PATCH 05/21] add missing profiler scopes & call to before_submit --- crates/re_renderer/src/context.rs | 4 ++++ crates/re_renderer/src/point_cloud_builder.rs | 5 ----- crates/re_viewer/src/app.rs | 2 ++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 
66b09e0cd622..77deb934f4ce 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -166,6 +166,8 @@ impl RenderContext { /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. pub fn begin_frame(&mut self) { + crate::profile_function!(); + self.active_frame = ActiveFrameContext { frame_global_command_encoder: None, frame_index: self.active_frame.frame_index + 1, @@ -236,6 +238,8 @@ impl RenderContext { /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`] pub fn before_submit(&mut self) { + crate::profile_function!(); + // Unmap all staging buffers. self.cpu_write_gpu_read_belt .lock() diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index f265ae12e064..e3d85c7ace80 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -7,11 +7,6 @@ use crate::{ }; /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`]. -/// -/// TODO(andreas): We could make significant optimizations here by making this builder capable -/// of writing to a GPU readable memory location. -/// This will require some ahead of time size limit, but should be feasible. -/// But before that we first need to sort out cpu->gpu transfers better by providing staging buffers. pub struct PointCloudBuilder { // Size of `point`/color`/`per_point_user_data` must be equal. pub vertices: Vec, diff --git a/crates/re_viewer/src/app.rs b/crates/re_viewer/src/app.rs index 52eeedcf20c7..a2ba4ec295a2 100644 --- a/crates/re_viewer/src/app.rs +++ b/crates/re_viewer/src/app.rs @@ -518,6 +518,8 @@ impl eframe::App for App { self.rx.source(), ); } + + render_ctx.before_submit(); } }); From 10c4a9f33aca485b031c3d7cded948b57964fe28 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 20:46:35 +0100 Subject: [PATCH 06/21] require point cloud builder for creating point cloud data --- crates/re_renderer/examples/multiview.rs | 52 +++++++++++-------- crates/re_renderer/src/point_cloud_builder.rs | 4 +- .../re_renderer/src/renderer/point_cloud.rs | 11 ++-- 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs index a5f89d1ae242..3275bf68e099 100644 --- a/crates/re_renderer/examples/multiview.rs +++ b/crates/re_renderer/examples/multiview.rs @@ -10,12 +10,11 @@ use rand::Rng; use re_renderer::{ renderer::{ GenericSkyboxDrawData, LineDrawData, LineStripFlags, MeshDrawData, MeshInstance, - PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudVertex, TestTriangleDrawData, }, resource_managers::ResourceLifeTime, view_builder::{OrthographicCameraMode, Projection, TargetConfiguration, ViewBuilder}, - Color32, LineStripSeriesBuilder, RenderContext, Rgba, Size, + Color32, LineStripSeriesBuilder, PointCloudBuilder, RenderContext, Rgba, Size, }; use winit::event::{ElementState, VirtualKeyCode}; @@ -163,7 +162,8 @@ struct Multiview { mesh_instance_positions_and_colors: Vec<(glam::Vec3, Color32)>, // Want to have a large cloud of random points, but doing rng for all of them every frame is too slow - random_points: Vec, + random_points_positions: Vec, + random_points_radii: Vec, random_points_colors: Vec, } @@ -188,17 +188,23 @@ impl Example for Multiview { let mut rnd = ::seed_from_u64(42); let random_point_range = -5.0_f32..5.0_f32; - let 
random_points = (0..500000) - .map(|_| PointCloudVertex { - position: glam::vec3( + + let point_count = 500000; + let random_points_positions = (0..point_count) + .map(|_| { + glam::vec3( rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), - ), - radius: Size::new_scene(rnd.gen_range(0.005..0.05)), + ) }) .collect_vec(); - let random_points_colors = (0..500000).map(|_| random_color(&mut rnd)).collect_vec(); + let random_points_radii = (0..point_count) + .map(|_| Size::new_scene(rnd.gen_range(0.005..0.05))) + .collect_vec(); + let random_points_colors = (0..point_count) + .map(|_| random_color(&mut rnd)) + .collect_vec(); let model_mesh_instances = { let reader = std::io::Cursor::new(include_bytes!("rerun.obj.zip")); @@ -232,7 +238,8 @@ impl Example for Multiview { model_mesh_instances, mesh_instance_positions_and_colors, - random_points, + random_points_positions, + random_points_radii, random_points_colors, } } @@ -260,18 +267,19 @@ impl Example for Multiview { let triangle = TestTriangleDrawData::new(re_ctx); let skybox = GenericSkyboxDrawData::new(re_ctx); let lines = build_lines(re_ctx, seconds_since_startup); - let point_cloud = PointCloudDrawData::new( - re_ctx, - &self.random_points, - &self.random_points_colors, - &[PointCloudBatchInfo { - label: "Random points".into(), - world_from_obj: glam::Mat4::from_rotation_x(seconds_since_startup), - point_count: self.random_points.len() as _, - flags: PointCloudBatchFlags::ENABLE_SHADING, - }], - ) - .unwrap(); + + let mut builder = PointCloudBuilder::<()>::default(); + builder + .batch("Random Points") + .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup)) + .add_points( + self.random_points_positions.len(), + self.random_points_positions.iter().cloned(), + ) + .radii(self.random_points_radii.iter().cloned()) + .colors(self.random_points_colors.iter().cloned()); + + let point_cloud = builder.to_draw_data(re_ctx).unwrap(); let meshes = build_mesh_instances( re_ctx, &self.model_mesh_instances, diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index e3d85c7ace80..1b64af4b3762 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -13,7 +13,7 @@ pub struct PointCloudBuilder { pub colors: Vec, pub user_data: Vec, - batches: Vec, + pub(crate) batches: Vec, } impl Default for PointCloudBuilder { @@ -100,7 +100,7 @@ where &self, ctx: &mut crate::context::RenderContext, ) -> Result { - PointCloudDrawData::new(ctx, &self.vertices, &self.colors, &self.batches) + PointCloudDrawData::new(ctx, self) } } diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 166b0fd781b6..5a37803d9562 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -20,6 +20,7 @@ use std::{ use crate::{ context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel, + PointCloudBuilder, }; use bitflags::bitflags; use bytemuck::Zeroable; @@ -148,11 +149,9 @@ impl PointCloudDrawData { /// Number of vertices and colors has to be equal. /// /// If no batches are passed, all points are assumed to be in a single batch with identity transform. 
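// For illustration, the builder-based call site that replaces the old
// slice-based constructor, as a hypothetical sketch (`re_ctx`, `positions`
// and `colors` are caller-side names, not from the patch):
//
//     let mut builder = PointCloudBuilder::<()>::default();
//     builder
//         .batch("sketch")
//         .add_points(positions.len(), positions.iter().cloned())
//         .colors(colors.iter().cloned());
//     let draw_data = builder.to_draw_data(re_ctx)?;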
- pub fn new( + pub fn new( ctx: &mut RenderContext, - vertices: &[PointCloudVertex], - colors: &[Color32], - batches: &[PointCloudBatchInfo], + builder: &PointCloudBuilder, ) -> Result { crate::profile_function!(); @@ -163,6 +162,10 @@ impl PointCloudDrawData { &mut ctx.resolver, ); + let vertices = builder.vertices.as_slice(); + let colors = builder.colors.as_slice(); + let batches = builder.batches.as_slice(); + if vertices.is_empty() { return Ok(PointCloudDrawData { bind_group_all_points: None, From 0226fb2f4b7c04f51d0157bbf8a664dd5cabfac3 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 21:52:07 +0100 Subject: [PATCH 07/21] Make `CpuWriteGpuReadBuffer` more like a Vec. POC usage in PointCloudBuilder --- crates/re_renderer/examples/2d.rs | 2 +- crates/re_renderer/examples/multiview.rs | 2 +- .../src/allocator/cpu_write_gpu_read_belt.rs | 119 ++++++++++-------- crates/re_renderer/src/allocator/mod.rs | 2 +- crates/re_renderer/src/point_cloud_builder.rs | 63 ++++++---- .../re_renderer/src/renderer/point_cloud.rs | 70 ++++------- crates/re_renderer/src/view_builder.rs | 35 +++--- crates/re_viewer/src/ui/space_view.rs | 2 +- .../src/ui/view_spatial/scene/mod.rs | 12 +- .../src/ui/view_spatial/scene/primitives.rs | 12 +- crates/re_viewer/src/ui/view_spatial/ui_2d.rs | 81 ++++++------ crates/re_viewer/src/ui/view_spatial/ui_3d.rs | 6 +- .../src/ui/view_spatial/ui_renderer_bridge.rs | 4 +- 13 files changed, 217 insertions(+), 193 deletions(-) diff --git a/crates/re_renderer/examples/2d.rs b/crates/re_renderer/examples/2d.rs index 741a27f24539..047c37dfbe51 100644 --- a/crates/re_renderer/examples/2d.rs +++ b/crates/re_renderer/examples/2d.rs @@ -149,7 +149,7 @@ impl framework::Example for Render2D { // Moving the windows to a high dpi screen makes the second one bigger. // Also, it looks different under perspective projection. // The third point is automatic thickness which is determined by the point renderer implementation. - let mut point_cloud_builder = PointCloudBuilder::<()>::default(); + let mut point_cloud_builder = PointCloudBuilder::<()>::new(re_ctx); point_cloud_builder .batch("points") .add_points_2d( diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs index 3275bf68e099..2a663f30ca39 100644 --- a/crates/re_renderer/examples/multiview.rs +++ b/crates/re_renderer/examples/multiview.rs @@ -268,7 +268,7 @@ impl Example for Multiview { let skybox = GenericSkyboxDrawData::new(re_ctx); let lines = build_lines(re_ctx, seconds_since_startup); - let mut builder = PointCloudBuilder::<()>::default(); + let mut builder = PointCloudBuilder::<()>::new(re_ctx); builder .batch("Random Points") .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup)) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index fe7f5b00bb58..dd339412d3b1 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,4 +1,4 @@ -use std::{ops::DerefMut, sync::mpsc}; +use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; @@ -11,7 +11,9 @@ struct Chunk { unused_offset: wgpu::BufferAddress, } -/// A suballocated staging buffer that can be written to. +/// A sub-allocated staging buffer that can be written to. 
+///
+/// Behaves a bit like a fixed sized `Vec` insofar as it keeps track of how many elements were written to it.
///
/// We do *not* allow reading from this buffer as it is typically write-combined memory.
/// Reading would work, but it can be *very* slow.
@@ -22,6 +24,9 @@ pub struct CpuWriteGpuReadBuffer {
/// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)!
write_view: wgpu::BufferViewMut<'static>,
+ /// How many elements of T were already pushed into this buffer.
+ write_counter: usize,
+
chunk_buffer: GpuBufferHandleStrong,
offset_in_chunk_buffer: wgpu::BufferAddress,
@@ -38,73 +43,72 @@ where
/// Do *not* make this public as we need to guarantee that the memory is *never* read from!
#[inline(always)]
fn as_slice(&mut self) -> &mut [T] {
- bytemuck::cast_slice_mut(&mut self.write_view)
+ &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.write_counter..]
}
- /// Writes several objects using to the buffer at a given location using a slice.
+ /// Pushes a slice of elements into the buffer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_slice(&mut self, elements: &[T], num_elements_offset: usize) {
- self.as_slice()[num_elements_offset..].copy_from_slice(elements);
+ pub fn extend_from_slice(&mut self, elements: &[T]) {
+ self.as_slice()[..elements.len()].copy_from_slice(elements);
+ self.write_counter += elements.len();
}
- /// Writes several objects using to the buffer at a given location using an iterator.
+ /// Pushes several elements into the buffer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_iterator(
- &mut self,
- elements: impl Iterator<Item = T>,
- num_elements_offset: usize,
- ) {
- for (target, source) in self
- .as_slice()
- .iter_mut()
- .skip(num_elements_offset)
- .zip(elements)
- {
+ pub fn extend(&mut self, elements: impl Iterator<Item = T>) {
+ let mut num_elements = 0;
+ for (target, source) in self.as_slice().iter_mut().zip(elements) {
*target = source;
+ num_elements += 1;
}
+ self.write_counter += num_elements;
}
- /// Writes a single objects to the buffer at a given location.
+ /// Pushes a single element into the buffer and advances the write pointer.
///
- /// User is responsible for ensuring the element offset is valid with the element types's alignment requirement.
- /// (panics otherwise)
+ /// Panics if the data no longer fits into the buffer.
#[inline]
- pub fn write_single(&mut self, element: &T, num_elements_offset: usize) {
- self.as_slice()[num_elements_offset] = *element;
+ pub fn push(&mut self, element: &T) {
+ *self.as_slice().first_mut().unwrap() = *element;
+ self.write_counter += 1;
}
- // pub fn copy_to_texture(
- // self,
- // encoder: &mut wgpu::CommandEncoder,
- // buffer_pool: &GpuBufferPool,
- // destination: wgpu::ImageCopyTexture<'_>,
- // bytes_per_row: Option<NonZeroU32>,
- // rows_per_image: Option<NonZeroU32>,
- // copy_size: wgpu::Extent3d,
- // ) -> Result<(), PoolError> {
- // let buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
-
- // encoder.copy_buffer_to_texture(
- // wgpu::ImageCopyBuffer {
- // buffer,
- // layout: wgpu::ImageDataLayout {
- // offset: self.offset_in_chunk,
- // bytes_per_row,
- // rows_per_image,
- // },
- // },
- // destination,
- // copy_size,
- // );
-
- // Ok(())
- // }
+ /// The number of elements pushed into the buffer so far.
+ #[inline]
+ pub fn num_written(&self) -> usize {
+ self.write_counter
+ }
+
+ pub fn copy_to_texture(
+ self,
+ encoder: &mut wgpu::CommandEncoder,
+ buffer_pool: &GpuBufferPool,
+ destination: wgpu::ImageCopyTexture<'_>,
+ bytes_per_row: Option<NonZeroU32>,
+ rows_per_image: Option<NonZeroU32>,
+ copy_size: wgpu::Extent3d,
+ ) -> Result<(), PoolError> {
+ let source_buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
+
+ encoder.copy_buffer_to_texture(
+ wgpu::ImageCopyBuffer {
+ buffer: source_buffer,
+ layout: wgpu::ImageDataLayout {
+ offset: self.offset_in_chunk_buffer,
+ bytes_per_row,
+ rows_per_image,
+ },
+ },
+ destination,
+ copy_size,
+ );
+
+ Ok(())
+ }
/// Consume this data view and copy it to another gpu buffer.
pub fn copy_to_buffer(
@@ -114,13 +118,16 @@ where
destination: &GpuBufferHandleStrong,
destination_offset: wgpu::BufferAddress,
) -> Result<(), PoolError> {
+ let source_buffer = buffer_pool.get_resource(&self.chunk_buffer)?;
+
encoder.copy_buffer_to_buffer(
- buffer_pool.get_resource(&self.chunk_buffer)?,
+ source_buffer,
self.offset_in_chunk_buffer,
buffer_pool.get_resource(destination)?,
destination_offset,
self.write_view.deref_mut().len() as u64,
);
+
Ok(())
}
}
@@ -322,6 +329,7 @@ impl CpuWriteGpuReadBelt {
chunk_buffer: chunk.buffer.clone(),
offset_in_chunk_buffer: start_offset,
write_view,
+ write_counter: 0,
_type: std::marker::PhantomData,
};
@@ -375,7 +383,8 @@ impl CpuWriteGpuReadBelt {
/// Move all chunks that the GPU is done with (and are now mapped again)
/// from `self.receiver` to `self.free_chunks`.
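// For illustration, a minimal sketch of the Vec-like write API above
// (a hypothetical call site; assumes `belt: &mut CpuWriteGpuReadBelt`, a
// `device`, a `buffer_pool`, an `encoder` and a `target` buffer handle are
// in scope):
//
//     let mut staging = belt.allocate::<u32>(&device, &mut buffer_pool, 4);
//     staging.extend_from_slice(&[1, 2, 3]);
//     staging.push(&4);
//     debug_assert_eq!(staging.num_written(), 4);
//     staging.copy_to_buffer(&mut encoder, &buffer_pool, &target, 0)?;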
fn receive_chunks(&mut self) { - while let Ok(chunk) = self.receiver.try_recv() { + while let Ok(mut chunk) = self.receiver.try_recv() { + chunk.unused_offset = 0; self.free_chunks.push(chunk); } } diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs index 7ae3d967a7e0..cb4ab783fd09 100644 --- a/crates/re_renderer/src/allocator/mod.rs +++ b/crates/re_renderer/src/allocator/mod.rs @@ -5,4 +5,4 @@ mod cpu_write_gpu_read_belt; -pub use cpu_write_gpu_read_belt::CpuWriteGpuReadBelt; +pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index 1b64af4b3762..cdfd60666ea7 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -1,37 +1,45 @@ use crate::{ + allocator::CpuWriteGpuReadBuffer, renderer::{ PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudDrawDataError, PointCloudVertex, }, - Color32, DebugLabel, Size, + Color32, DebugLabel, RenderContext, Size, }; /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`]. pub struct PointCloudBuilder { // Size of `point`/color`/`per_point_user_data` must be equal. pub vertices: Vec, - pub colors: Vec, + + pub(crate) color_buffer: CpuWriteGpuReadBuffer, pub user_data: Vec, pub(crate) batches: Vec, } -impl Default for PointCloudBuilder { - fn default() -> Self { +impl PointCloudBuilder +where + PerPointUserData: Default + Copy, +{ + pub fn new(ctx: &mut RenderContext) -> Self { const RESERVE_SIZE: usize = 512; + + // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance! + let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( + &ctx.device, + &mut ctx.gpu_resources.buffers, + PointCloudDrawData::MAX_NUM_POINTS, + ); + Self { vertices: Vec::with_capacity(RESERVE_SIZE), - colors: Vec::with_capacity(RESERVE_SIZE), + color_buffer, user_data: Vec::with_capacity(RESERVE_SIZE), batches: Vec::with_capacity(16), } } -} -impl PointCloudBuilder -where - PerPointUserData: Default + Copy, -{ /// Start of a new batch. #[inline] pub fn batch( @@ -97,7 +105,7 @@ where /// Finalizes the builder and returns a point cloud draw data with all the points added so far. pub fn to_draw_data( - &self, + self, ctx: &mut crate::context::RenderContext, ) -> Result { PointCloudDrawData::new(ctx, self) @@ -145,9 +153,10 @@ where /// Each time we `add_points`, or upon builder drop, make sure that we /// fill in any additional colors and user-data to have matched vectors. 
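// For illustration (hypothetical call site): after
//
//     builder.batch("sketch").add_points(3, positions.iter().cloned());
//
// with no `.colors(...)` call, the next `add_points` call (or dropping the
// batch builder) pads the color buffer with three `Color32::WHITE` entries,
// keeping vertices, colors and user data in lockstep.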
fn extend_defaults(&mut self) {
- if self.0.colors.len() < self.0.vertices.len() {
- self.0.colors.extend(
- std::iter::repeat(Color32::WHITE).take(self.0.vertices.len() - self.0.colors.len()),
+ if self.0.color_buffer.num_written() < self.0.vertices.len() {
+ self.0.color_buffer.extend(
+ std::iter::repeat(Color32::WHITE)
+ .take(self.0.vertices.len() - self.0.color_buffer.num_written()),
);
}
@@ -181,7 +190,7 @@ where
self.extend_defaults();
- debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+ debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
let old_size = self.0.vertices.len();
@@ -195,7 +204,6 @@ where
let num_points = self.0.vertices.len() - old_size;
self.batch_mut().point_count += num_points as u32;
- self.0.colors.reserve(num_points);
self.0.user_data.reserve(num_points);
let new_range = old_size..self.0.vertices.len();
@@ -205,27 +213,25 @@ where
PointsBuilder {
vertices: &mut self.0.vertices[new_range],
max_points,
- colors: &mut self.0.colors,
+ colors: &mut self.0.color_buffer,
user_data: &mut self.0.user_data,
}
}
#[inline]
pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> {
- debug_assert_eq!(self.0.vertices.len(), self.0.colors.len());
+ debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());
self.0.vertices.push(PointCloudVertex {
position,
radius: Size::AUTO,
});
- self.0.colors.push(Color32::WHITE);
- self.0.user_data.push(PerPointUserData::default());
self.batch_mut().point_count += 1;
PointBuilder {
vertex: self.0.vertices.last_mut().unwrap(),
- color: self.0.colors.last_mut().unwrap(),
+ color: &mut self.0.color_buffer,
user_data: self.0.user_data.last_mut().unwrap(),
}
}
@@ -260,7 +266,7 @@ where
pub struct PointBuilder<'a, PerPointUserData> {
vertex: &'a mut PointCloudVertex,
- color: &'a mut Color32,
+ color: &'a mut CpuWriteGpuReadBuffer<Color32>,
user_data: &'a mut PerPointUserData,
}
@@ -274,9 +280,9 @@ where
self
}
+ /// This must not be called more than once.
#[inline]
pub fn color(self, color: Color32) -> Self {
- *self.color = color;
+ self.color.push(&color);
self
}
@@ -293,7 +300,7 @@ pub struct PointsBuilder<'a, PerPointUserData> {
// Colors and user-data are the Vec we append
// the data to if provided.
max_points: usize,
- colors: &'a mut Vec<Color32>,
+ colors: &'a mut CpuWriteGpuReadBuffer<Color32>,
user_data: &'a mut Vec<PerPointUserData>,
}
@@ -303,6 +310,8 @@ where
{
/// Assigns radii to all points.
///
+ /// This must not be called more than once.
+ ///
/// If the iterator doesn't cover all points, some will not be assigned.
/// If the iterator provides more values than there are points, the extra values will be ignored.
#[inline]
@@ -318,18 +327,22 @@ where
/// Assigns colors to all points.
///
+ /// This must not be called more than once.
+ ///
/// If the iterator doesn't cover all points, some will not be assigned.
/// If the iterator provides more values than there are points, the extra values will be ignored.
#[inline]
pub fn colors(self, colors: impl Iterator<Item = Color32>) -> Self {
crate::profile_function!();
self.colors
- .extend(colors.take(self.max_points - self.colors.len()));
+ .extend(colors.take(self.max_points - self.colors.num_written()));
self
}
/// Assigns user data for all points in this builder.
///
+ /// This must not be called more than once.
+ ///
/// User data is currently not available on the GPU.
#[inline] pub fn user_data(self, data: impl Iterator) -> Self { diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 5a37803d9562..a1aea58bc5aa 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -19,7 +19,7 @@ use std::{ }; use crate::{ - context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel, + context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel, PointCloudBuilder, }; use bitflags::bitflags; @@ -151,7 +151,7 @@ impl PointCloudDrawData { /// If no batches are passed, all points are assumed to be in a single batch with identity transform. pub fn new( ctx: &mut RenderContext, - builder: &PointCloudBuilder, + builder: PointCloudBuilder, ) -> Result { crate::profile_function!(); @@ -163,7 +163,6 @@ impl PointCloudDrawData { ); let vertices = builder.vertices.as_slice(); - let colors = builder.colors.as_slice(); let batches = builder.batches.as_slice(); if vertices.is_empty() { @@ -197,23 +196,16 @@ impl PointCloudDrawData { 0 ); - if vertices.len() != colors.len() { - return Err(PointCloudDrawDataError::NumberOfColorsNotEqualNumberOfVertices); - } - - let (vertices, colors) = if vertices.len() >= Self::MAX_NUM_POINTS { + let vertices = if vertices.len() >= Self::MAX_NUM_POINTS { re_log::error_once!( "Reached maximum number of supported points. Clamping down to {}, passed were {}. See also https://github.com/rerun-io/rerun/issues/957", Self::MAX_NUM_POINTS, vertices.len() ); - ( - &vertices[..Self::MAX_NUM_POINTS], - &colors[..Self::MAX_NUM_POINTS], - ) + &vertices[..Self::MAX_NUM_POINTS] } else { - (vertices, colors) + vertices }; // TODO(andreas): We want a "stack allocation" here that lives for one frame. @@ -264,15 +256,6 @@ impl PointCloudDrawData { .collect_vec() }; - let color_staging = { - crate::profile_scope!("collect_colors"); - colors - .iter() - .cloned() - .chain(std::iter::repeat(Color32::TRANSPARENT).take(num_points_zeroed)) - .collect_vec() - }; - // Upload data from staging buffers to gpu. 
let size = wgpu::Extent3d { width: DATA_TEXTURE_SIZE, @@ -308,28 +291,27 @@ impl PointCloudDrawData { { crate::profile_scope!("write_color_texture"); - ctx.queue.write_texture( - wgpu::ImageCopyTexture { - texture: &ctx - .gpu_resources - .textures - .get_resource(&color_texture) - .unwrap() - .texture, - mip_level: 0, - origin: wgpu::Origin3d::ZERO, - aspect: wgpu::TextureAspect::All, - }, - bytemuck::cast_slice(&color_staging), - wgpu::ImageDataLayout { - offset: 0, - bytes_per_row: NonZeroU32::new( - DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32, - ), - rows_per_image: None, - }, - size, - ); + builder + .color_buffer + .copy_to_texture( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + &ctx.gpu_resources.buffers, + wgpu::ImageCopyTexture { + texture: &ctx + .gpu_resources + .textures + .get_resource(&color_texture) + .unwrap() + .texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), + None, + size, + ) + .expect("invalid color cpu->gpu buffer"); } let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc( diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 6a9b75abdcc1..e0f00e3fcacf 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -392,25 +392,22 @@ impl ViewBuilder { .cpu_write_gpu_read_belt .lock() .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); - frame_uniform_buffer_cpu.write_single( - &FrameUniformBuffer { - view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), - projection_from_view: projection_from_view.into(), - projection_from_world: projection_from_world.into(), - camera_position, - camera_forward, - tan_half_fov: tan_half_fov.into(), - pixel_world_size_from_camera_distance, - pixels_from_point: config.pixels_from_point, - - auto_size_points: auto_size_points.0, - auto_size_lines: auto_size_lines.0, - - depth_offset_factor, - _padding: glam::Vec3::ZERO, - }, - 0, - ); + frame_uniform_buffer_cpu.push(&FrameUniformBuffer { + view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), + projection_from_view: projection_from_view.into(), + projection_from_world: projection_from_world.into(), + camera_position, + camera_forward, + tan_half_fov: tan_half_fov.into(), + pixel_world_size_from_camera_distance, + pixels_from_point: config.pixels_from_point, + + auto_size_points: auto_size_points.0, + auto_size_lines: auto_size_lines.0, + + depth_offset_factor, + _padding: glam::Vec3::ZERO, + }); frame_uniform_buffer_cpu .copy_to_buffer( ctx.active_frame.frame_global_command_encoder(&ctx.device), diff --git a/crates/re_viewer/src/ui/space_view.rs b/crates/re_viewer/src/ui/space_view.rs index f06bbae25b37..3e9c7d693ff1 100644 --- a/crates/re_viewer/src/ui/space_view.rs +++ b/crates/re_viewer/src/ui/space_view.rs @@ -176,7 +176,7 @@ impl SpaceView { &self.space_path, self.data_blueprint.data_blueprints_projected(), ); - let mut scene = view_spatial::SceneSpatial::default(); + let mut scene = view_spatial::SceneSpatial::new(ctx.render_ctx); scene.load(ctx, &query, &transforms, highlights); self.view_state .ui_spatial(ctx, ui, &self.space_path, scene, self.id, highlights); diff --git a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs index 18166e05d587..84376f4994c2 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs +++ 
b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs @@ -115,7 +115,6 @@ pub struct SceneSpatialUiData { pub images: Vec, } -#[derive(Default)] pub struct SceneSpatial { pub annotation_map: AnnotationMap, pub primitives: SceneSpatialPrimitives, @@ -146,6 +145,17 @@ fn instance_path_hash_if_interactive( pub type Keypoints = HashMap<(ClassId, i64), HashMap>; impl SceneSpatial { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + annotation_map: Default::default(), + primitives: SceneSpatialPrimitives::new(re_ctx), + ui: Default::default(), + num_logged_2d_objects: Default::default(), + num_logged_3d_objects: Default::default(), + space_cameras: Default::default(), + } + } + /// Loads all 3D objects into the scene according to the given query. pub(crate) fn load( &mut self, diff --git a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs index 56b8e323e58c..22a1fd28dd31 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs +++ b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs @@ -10,7 +10,6 @@ use super::MeshSource; /// TODO(andreas): Right now we're using `re_renderer` data structures for reading (bounding box & picking). /// In the future, this will be more limited as we're going to gpu staging data as soon as possible /// which is very slow to read. See [#594](https://github.com/rerun-io/rerun/pull/594) -#[derive(Default)] pub struct SceneSpatialPrimitives { /// Estimated bounding box of all data in scene coordinates. Accumulated. pub(super) bounding_box: macaw::BoundingBox, @@ -31,6 +30,17 @@ const AXIS_COLOR_Y: Color32 = Color32::from_rgb(0, 240, 0); const AXIS_COLOR_Z: Color32 = Color32::from_rgb(80, 80, 255); impl SceneSpatialPrimitives { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + bounding_box: macaw::BoundingBox::nothing(), + textured_rectangles_ids: Default::default(), + textured_rectangles: Default::default(), + line_strips: Default::default(), + points: PointCloudBuilder::new(re_ctx), + meshes: Default::default(), + } + } + /// bounding box covering the rendered scene pub fn bounding_box(&self) -> macaw::BoundingBox { self.bounding_box diff --git a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs index 172341ac30b6..23be71806264 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs @@ -8,7 +8,11 @@ use re_data_store::EntityPath; use re_log_types::component_types::TensorTrait; use re_renderer::view_builder::TargetConfiguration; -use super::{eye::Eye, scene::AdditionalPickingInfo, ViewSpatialState}; +use super::{ + eye::Eye, + scene::{AdditionalPickingInfo, SceneSpatialUiData}, + ViewSpatialState, +}; use crate::{ misc::{HoveredSpace, Item, SelectionHighlight, SpaceViewHighlights}, ui::{ @@ -301,43 +305,13 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ - // Draw a re_renderer driven view. - // Camera & projection are configured to ingest space coordinates directly. 
- { - crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); - - let Ok(target_config) = setup_target_config( - &painter, - space_from_ui, - space_from_pixel, - &space.to_string(), - state.auto_size_config(response.rect.size()), - ) else { - return response; - }; - - let Ok(callback) = create_scene_paint_callback( - ctx.render_ctx, - target_config, painter.clip_rect(), - &scene.primitives, - &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), - ) else { - return response; - }; - - painter.add(callback); - } - - // ------------------------------------------------------------------------ - - // Add egui driven labels on top of re_renderer content. - painter.extend(create_labels( - &mut scene, + let label_shapes = create_labels( + &mut scene.ui, ui_from_space, space_from_ui, parent_ui, highlights, - )); + ); // ------------------------------------------------------------------------ @@ -452,6 +426,33 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ + // Draw a re_renderer driven view. + // Camera & projection are configured to ingest space coordinates directly. + { + crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); + + let Ok(target_config) = setup_target_config( + &painter, + space_from_ui, + space_from_pixel, + &space.to_string(), + state.auto_size_config(response.rect.size()), + ) else { + return response; + }; + + let Ok(callback) = create_scene_paint_callback( + ctx.render_ctx, + target_config, painter.clip_rect(), + scene.primitives, + &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), + ) else { + return response; + }; + + painter.add(callback); + } + project_onto_other_spaces(ctx, space, &response, &space_from_ui, depth_at_pointer); painter.extend(show_projections_from_3d_space( ctx, @@ -460,11 +461,14 @@ fn view_2d_scrollable( &ui_from_space, )); + // Add egui driven labels on top of re_renderer content. 
+ painter.extend(label_shapes); + response } fn create_labels( - scene: &mut SceneSpatial, + scene_ui: &mut SceneSpatialUiData, ui_from_space: RectTransform, space_from_ui: RectTransform, parent_ui: &mut egui::Ui, @@ -472,9 +476,9 @@ fn create_labels( ) -> Vec { crate::profile_function!(); - let mut label_shapes = Vec::with_capacity(scene.ui.labels_2d.len() * 2); + let mut label_shapes = Vec::with_capacity(scene_ui.labels_2d.len() * 2); - for label in &scene.ui.labels_2d { + for label in &scene_ui.labels_2d { let (wrap_width, text_anchor_pos) = match label.target { Label2DTarget::Rect(rect) => { let rect_in_ui = ui_from_space.transform_rect(rect); @@ -534,8 +538,7 @@ fn create_labels( label_shapes.push(Shape::rect_filled(bg_rect, 3.0, fill_color)); label_shapes.push(Shape::galley(text_rect.center_top(), galley)); - scene - .ui + scene_ui .pickable_ui_rects .push((space_from_ui.transform_rect(bg_rect), label.labled_instance)); } diff --git a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs index b4d1ed2a2359..88590aba0cb7 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs @@ -473,7 +473,7 @@ pub fn view_3d( ui, eye, rect, - &scene, + scene, ctx.render_ctx, &space.to_string(), state.auto_size_config(rect.size()), @@ -484,7 +484,7 @@ fn paint_view( ui: &mut egui::Ui, eye: Eye, rect: egui::Rect, - scene: &SceneSpatial, + scene: SceneSpatial, render_ctx: &mut RenderContext, name: &str, auto_size_config: re_renderer::AutoSizeConfig, @@ -516,7 +516,7 @@ fn paint_view( render_ctx, target_config, rect, - &scene.primitives, &ScreenBackground::GenericSkybox) + scene.primitives, &ScreenBackground::GenericSkybox) else { return; }; diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs index 3579b11cfc74..3d2f24cc8765 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs @@ -17,7 +17,7 @@ pub fn create_scene_paint_callback( render_ctx: &mut RenderContext, target_config: TargetConfiguration, clip_rect: egui::Rect, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result { let pixels_from_point = target_config.pixels_from_point; @@ -39,7 +39,7 @@ pub enum ScreenBackground { fn create_and_fill_view_builder( render_ctx: &mut RenderContext, target_config: TargetConfiguration, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result<(wgpu::CommandBuffer, ViewBuilder)> { let mut view_builder = ViewBuilder::default(); From 1bf15f7726b47090457bd5bb46d2003bb42a1f96 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 20 Feb 2023 23:20:41 +0100 Subject: [PATCH 08/21] add note on pitfalls of write combined memory --- .../src/allocator/cpu_write_gpu_read_belt.rs | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index dd339412d3b1..63e10ad65373 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -2,21 +2,17 @@ use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; use crate::wgpu_resources::{BufferDesc, GpuBufferHandleStrong, GpuBufferPool, PoolError}; -/// Internal 
chunk of the staging belt. -struct Chunk { - buffer: GpuBufferHandleStrong, - size: wgpu::BufferAddress, - - /// At what offset is [`write_view`](#structfield.write_view) unused. - unused_offset: wgpu::BufferAddress, -} - /// A sub-allocated staging buffer that can be written to. /// /// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it. /// /// We do *not* allow reading from this buffer as it is typically write-combined memory. /// Reading would work, but it can be *very* slow. +/// For details on why, see +/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/) +/// Note that the "vec like behavior" further encourages +/// * not leaving holes +/// * keeping writes sequential pub struct CpuWriteGpuReadBuffer { /// Write view into the relevant buffer portion. /// @@ -132,6 +128,15 @@ where } } +/// Internal chunk of the staging belt. +struct Chunk { + buffer: GpuBufferHandleStrong, + size: wgpu::BufferAddress, + + /// At what offset is [`write_view`](#structfield.write_view) unused. + unused_offset: wgpu::BufferAddress, +} + /// Efficiently performs many buffer writes by sharing and reusing temporary buffers. /// /// Internally it uses a ring-buffer of staging buffers that are sub-allocated. @@ -200,7 +205,6 @@ impl CpuWriteGpuReadBelt { /// Allocates a cpu writable buffer for `num_elements` instances of T. /// /// Handles alignment requirements automatically which allows faster copy operations. - #[allow(unsafe_code)] pub fn allocate( &mut self, device: &wgpu::Device, @@ -317,6 +321,7 @@ impl CpuWriteGpuReadBelt { let (start_offset, write_view) = allocate_chunk_mapping_with_alignment(&mut chunk, size, buffer, alignment); + #[allow(unsafe_code)] // SAFETY: // write_view has a lifetime dependency on the chunk's buffer. // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. From 09a1f20d2c7cf323dfe8eb414604757a6f79796a Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Tue, 21 Feb 2023 10:39:14 +0100 Subject: [PATCH 09/21] Introduce minimum alignment to CpuWriteGpuReadBelt, fix padding mistake, add log --- .../src/allocator/cpu_write_gpu_read_belt.rs | 50 +++++++++++++++---- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 63e10ad65373..b1219c2ca504 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -146,7 +146,8 @@ struct Chunk { /// * can create buffers without yet knowing the target copy location /// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!) /// * use of `re_renderer`'s resource pool -/// * handles alignment +/// * handles alignment in a defined manner +/// (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508)) pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. chunk_size: wgpu::BufferSize, @@ -177,6 +178,13 @@ pub struct CpuWriteGpuReadBelt { } impl CpuWriteGpuReadBelt { + /// All allocations of this allocator will be aligned to at least this size. + /// + /// Requiring a minimum alignment means we need to pad less often. + /// Also, it has the potential of making memcpy operations faster. 
+ /// Align to 4xf32. Should be enough for most usecases and is + pub const MIN_ALIGNMENT: u64 = 128; + /// Create a new staging belt. /// /// The `chunk_size` is the unit of internal buffer allocation; writes will be @@ -191,6 +199,8 @@ impl CpuWriteGpuReadBelt { /// TODO(andreas): Adaptive chunk sizes /// TODO(andreas): Shrinking after usage spikes? pub fn new(chunk_size: wgpu::BufferSize) -> Self { + static_assertions::const_assert!(wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_ALIGNMENT); + let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { chunk_size, @@ -204,15 +214,20 @@ impl CpuWriteGpuReadBelt { /// Allocates a cpu writable buffer for `num_elements` instances of T. /// - /// Handles alignment requirements automatically which allows faster copy operations. + /// Handles alignment requirements automatically, allowing arbitrarily aligned types and fast memcpy. pub fn allocate( &mut self, device: &wgpu::Device, buffer_pool: &mut GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { - let alignment = (std::mem::align_of::() as wgpu::BufferAddress).min(wgpu::MAP_ALIGNMENT); - let size = (std::mem::size_of::() * num_elements) as wgpu::BufferAddress; + // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. + let alignment = (std::mem::align_of::() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT); + // Padd out the size of the used buffer, so that within a started chunk we'll always hit Self::MIN_ALIGNMENT + let size = wgpu::util::align_to( + (std::mem::size_of::() * num_elements) as wgpu::BufferAddress, + Self::MIN_ALIGNMENT, + ); // We need to be super careful with alignment since today wgpu // has NO guarantees on how pointers to mapped memory are aligned! @@ -220,10 +235,9 @@ impl CpuWriteGpuReadBelt { // // See https://github.com/gfx-rs/wgpu/issues/3508 // - // To work around this, we require a bigger size to begin with. - // - // TODO(andreas): Either fix the wgpu issue or come up with a more conservative strategy, - // where we first look for a buffer slice with `size` and then again with required_size if needed. + // To work around this, we ask for a bigger size, so we can safely pad out + // *if* the returned pointer is not correctly aligned. + // (i.e. we will use _up to_ `required_size` bytes, but at least `size`) let required_size = size + alignment - 1; // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. @@ -246,9 +260,16 @@ impl CpuWriteGpuReadBelt { { self.free_chunks.swap_remove(index) } else { - // Allocation might be bigger than a chunk. - let size = self.chunk_size.get().max(required_size); + // (happens relatively rarely, this is a noteworthy event!) + re_log::debug!( + "Allocating new CpuWriteGpuReadBuffer chunk of size {required_size}" + ); + // Allocation might be bigger than a chunk. + let size = wgpu::util::align_to( + self.chunk_size.get().max(required_size), + wgpu::MAP_ALIGNMENT, + ); let buffer = buffer_pool.alloc( device, &BufferDesc { @@ -300,11 +321,18 @@ impl CpuWriteGpuReadBelt { ) -> (u64, wgpu::BufferViewMut<'a>) { // First optimistically try without explicit padding. let (start_offset, mut write_view) = allocate_mapping(chunk, size, buffer); - let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. + + // use deref_mut explicitly because wgpu warns otherwise that read access is slow. 
+ let ptr = write_view.deref_mut().as_ptr() as u64; + let required_padding = wgpu::util::align_to(ptr, alignment) - ptr; if required_padding == 0 { (start_offset, write_view) } else { + re_log::trace!( + "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" + ); + // Undo mapping and try again with padding. // We made sure earlier that the chunk has enough space for this case! drop(write_view); From 5728005bdb3b1b1cb990cc7fc8c40ffc8ba54251 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 17:43:19 +0100 Subject: [PATCH 10/21] cleanup, fixup alignment padding a bit --- .../src/allocator/cpu_write_gpu_read_belt.rs | 201 +++++++++--------- .../re_renderer/src/renderer/point_cloud.rs | 30 ++- crates/re_renderer/src/view_builder.rs | 48 ++--- 3 files changed, 131 insertions(+), 148 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index 73cc8423997f..cb6e02705f15 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -1,6 +1,6 @@ use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; -use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, PoolError}; +use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool}; /// A sub-allocated staging buffer that can be written to. /// @@ -86,7 +86,7 @@ where bytes_per_row: Option, rows_per_image: Option, copy_size: wgpu::Extent3d, - ) -> Result<(), PoolError> { + ) { encoder.copy_buffer_to_texture( wgpu::ImageCopyBuffer { buffer: &self.chunk_buffer, @@ -99,8 +99,6 @@ where destination, copy_size, ); - - Ok(()) } /// Consume this data view and copy it to another gpu buffer. @@ -109,7 +107,7 @@ where encoder: &mut wgpu::CommandEncoder, destination: &GpuBuffer, destination_offset: wgpu::BufferAddress, - ) -> Result<(), PoolError> { + ) { encoder.copy_buffer_to_buffer( &self.chunk_buffer, self.offset_in_chunk_buffer, @@ -117,20 +115,82 @@ where destination_offset, self.write_view.deref_mut().len() as u64, ); - - Ok(()) } } /// Internal chunk of the staging belt. struct Chunk { buffer: GpuBuffer, - size: wgpu::BufferAddress, - /// At what offset is [`write_view`](#structfield.write_view) unused. + /// Starting at this offset the buffer is unused. unused_offset: wgpu::BufferAddress, } +impl Chunk { + fn remaining_capacity(&self) -> u64 { + self.buffer.size() - self.unused_offset + } + + fn required_padding(write_view: &mut wgpu::BufferViewMut<'_>, alignment: u64) -> u64 { + // Use deref_mut explicitly because wgpu warns otherwise that read access is slow. + let ptr = write_view.deref_mut().as_ptr() as u64; + wgpu::util::align_to(ptr, alignment) - ptr + } + + /// Caller needs to make sure that there is enough space plus potential padding. + fn allocate_aligned( + &mut self, + size_in_bytes: u64, + alignment: u64, + ) -> CpuWriteGpuReadBuffer { + // Optimistic first mapping attempt. + let mut start_offset = self.unused_offset; + let mut end_offset = start_offset + size_in_bytes; + let mut buffer_slice = self.buffer.slice(start_offset..end_offset); + let mut write_view = buffer_slice.get_mapped_range_mut(); + + // Check if it fullfills the requested alignment. + let required_padding = Self::required_padding(&mut write_view, alignment); + if required_padding != 0 { + // Undo mapping and try again with padding! 
+ re_log::trace!( + "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" + ); + + drop(write_view); + + start_offset = self.unused_offset + required_padding; + end_offset = start_offset + size_in_bytes; + buffer_slice = self.buffer.slice(start_offset..end_offset); + write_view = buffer_slice.get_mapped_range_mut(); + + let required_padding = Self::required_padding(&mut write_view, alignment); + debug_assert_eq!(required_padding, 0); + } + + debug_assert!(end_offset <= self.buffer.size()); + self.unused_offset = end_offset; + + #[allow(unsafe_code)] + // SAFETY: + // write_view has a lifetime dependency on the chunk's buffer. + // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. + // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. + // TODO: some more doc why this is here + let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + + CpuWriteGpuReadBuffer { + chunk_buffer: self.buffer.clone(), + offset_in_chunk_buffer: start_offset, + write_view, + write_counter: 0, + _type: std::marker::PhantomData, + } + } +} + /// Efficiently performs many buffer writes by sharing and reusing temporary buffers. /// /// Internally it uses a ring-buffer of staging buffers that are sub-allocated. @@ -144,7 +204,7 @@ struct Chunk { /// (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508)) pub struct CpuWriteGpuReadBelt { /// Minimum size for new buffers. - chunk_size: wgpu::BufferSize, + chunk_size: u64, /// Chunks which are CPU write at the moment. active_chunks: Vec, @@ -176,10 +236,11 @@ impl CpuWriteGpuReadBelt { /// /// Requiring a minimum alignment means we need to pad less often. /// Also, it has the potential of making memcpy operations faster. - /// Align to 4xf32. Should be enough for most usecases and is + /// + /// Align to 4xf32. Should be enough for most usecases! pub const MIN_ALIGNMENT: u64 = 16; - /// Create a new staging belt. + /// Create a cpu-write & gpu-read staging belt. /// /// The `chunk_size` is the unit of internal buffer allocation; writes will be /// sub-allocated within each chunk. Therefore, for optimal use of memory, the @@ -197,7 +258,7 @@ impl CpuWriteGpuReadBelt { let (sender, receiver) = mpsc::channel(); CpuWriteGpuReadBelt { - chunk_size, + chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT), active_chunks: Vec::new(), closed_chunks: Vec::new(), free_chunks: Vec::new(), @@ -206,10 +267,10 @@ impl CpuWriteGpuReadBelt { } } - /// Allocates a cpu writable buffer for `num_elements` instances of T. + /// Allocates a cpu writable buffer for `num_elements` instances of type `T`. /// - /// Handles alignment requirements automatically, allowing arbitrarily aligned types and fast memcpy. - pub fn allocate( + /// Handles alignment requirements automatically, allowing arbitrarily aligned types without issues. + pub fn allocate( &mut self, device: &wgpu::Device, buffer_pool: &mut GpuBufferPool, @@ -217,30 +278,30 @@ impl CpuWriteGpuReadBelt { ) -> CpuWriteGpuReadBuffer { // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. 
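// Worked example of the strategy below (illustrative numbers): with
// alignment = 16 and size = 48 we reserve max_required_size = 48 + 15 = 63
// bytes. If the mapped pointer comes back at an address with ptr % 16 == 4,
// then required_padding = align_to(ptr, 16) - ptr = 12 and the write view
// starts 12 bytes into the reservation, using 48 + 12 = 60 of the 63
// reserved bytes.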
let alignment = (std::mem::align_of::() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT); - // Padd out the size of the used buffer, so that within active chunk we'll always hit at least Self::MIN_ALIGNMENT + // Pad out the size of the used buffer to a multiple of Self::MIN_ALIGNMENT. + // This increases our chance of having back to back allocations within a chunk. let size = wgpu::util::align_to( (std::mem::size_of::() * num_elements) as wgpu::BufferAddress, Self::MIN_ALIGNMENT, ); // We need to be super careful with alignment since today wgpu - // has NO guarantees on how pointers to mapped memory are aligned! + // has no guarantees on how pointers to mapped memory are aligned! // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process! // - // See https://github.com/gfx-rs/wgpu/issues/3508 + // For details and (as of writing) ongoing discussion see https://github.com/gfx-rs/wgpu/issues/3508 // // To work around this, we ask for a bigger size, so we can safely pad out - // *if* the returned pointer is not correctly aligned. - // (i.e. we will use _up to_ `required_size` bytes, but at least `size`) - let required_size = size + alignment - 1; - - // We explicitly use `deref_mut` on write_view everywhere, since wgpu warns if we accidentally use `deref`. + // if the returned pointer is not correctly aligned. + // (i.e. we will use _up to_ `required_size` bytes, but at least `size`)] + let maximum_padding = alignment - 1; + let max_required_size = size + maximum_padding; // Try to find space in any of the active chunks first. let mut chunk = if let Some(index) = self .active_chunks .iter_mut() - .position(|chunk| chunk.size - chunk.unused_offset >= required_size) + .position(|chunk| chunk.remaining_capacity() >= max_required_size) { self.active_chunks.swap_remove(index) } else { @@ -250,25 +311,22 @@ impl CpuWriteGpuReadBelt { if let Some(index) = self .free_chunks .iter() - .position(|chunk| chunk.size >= required_size) + .position(|chunk| chunk.remaining_capacity() >= max_required_size) { self.free_chunks.swap_remove(index) } else { - // (happens relatively rarely, this is a noteworthy event!) - re_log::debug!( - "Allocating new CpuWriteGpuReadBuffer chunk of size {required_size}" - ); - - // Allocation might be bigger than a chunk. - let size = wgpu::util::align_to( - self.chunk_size.get().max(required_size), - wgpu::MAP_ALIGNMENT, + // Allocation might be bigger than a chunk! + let buffer_size = wgpu::util::align_to( + self.chunk_size.max(max_required_size), + Self::MIN_ALIGNMENT, ); + // Happens relatively rarely, this is a noteworthy event! + re_log::debug!("Allocating new CpuWriteGpuReadBelt chunk of size {buffer_size}"); let buffer = buffer_pool.alloc( device, &BufferDesc { label: "CpuWriteGpuReadBelt buffer".into(), - size, + size: buffer_size, usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: true, }, @@ -276,82 +334,13 @@ impl CpuWriteGpuReadBelt { Chunk { buffer, - size, unused_offset: 0, } } }; - // Allocate mapping from a chunk. - fn allocate_mapping<'a>(chunk: &'a mut Chunk, size: u64) -> (u64, wgpu::BufferViewMut<'a>) { - let start_offset = chunk.unused_offset; - let end_offset = start_offset + size; - - debug_assert!(end_offset <= chunk.size); - chunk.unused_offset = end_offset; - - let buffer_slice = chunk.buffer.slice(start_offset..end_offset); - (start_offset, buffer_slice.get_mapped_range_mut()) - } - - // Allocate mapping from a chunk with alignment requirements. 
- // - // Depending on how https://github.com/gfx-rs/wgpu/issues/3508 will be solved, we can early out for certain alignments. - // TODO(andreas): Or just hardcode alignment for everything to `MIN_ALIGNMENT`? - fn allocate_chunk_mapping_with_alignment<'a>( - chunk: &'a mut Chunk, - size: u64, - alignment: u64, - ) -> (u64, wgpu::BufferViewMut<'a>) { - // First optimistically try without explicit padding. - let (start_offset, mut write_view) = allocate_mapping(chunk, size); - - // use deref_mut explicitly because wgpu warns otherwise that read access is slow. - let ptr = write_view.deref_mut().as_ptr() as u64; - let required_padding = wgpu::util::align_to(ptr, alignment) - ptr; - - // if required_padding == 0 { - (start_offset, write_view) - // } else { - // re_log::trace!( - // "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" - // ); - - // // Undo mapping and try again with padding. - // // We made sure earlier that the chunk has enough space for this case! - // drop(write_view); - // chunk.unused_offset = start_offset + required_padding; - - // let (start_offset, mut write_view) = allocate_mapping(chunk, size); - // let required_padding = write_view.deref_mut().as_ptr() as u64 % alignment; // use deref_mut because wgpu warns otherwise that read access is slow. - // debug_assert_eq!(required_padding, 0); - - // (start_offset, write_view) - // } - } - - let (start_offset, write_view) = - allocate_chunk_mapping_with_alignment(&mut chunk, size, alignment); - - #[allow(unsafe_code)] - // SAFETY: - // write_view has a lifetime dependency on the chunk's buffer. - // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. - // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this. 
- let write_view = unsafe { - std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) - }; - - let cpu_buffer_view = CpuWriteGpuReadBuffer { - chunk_buffer: chunk.buffer.clone(), - offset_in_chunk_buffer: start_offset, - write_view, - write_counter: 0, - _type: std::marker::PhantomData, - }; - + let cpu_buffer_view = chunk.allocate_aligned(size, alignment); self.active_chunks.push(chunk); - cpu_buffer_view } diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 7d9b8494cbce..233911761fd2 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -284,24 +284,18 @@ impl PointCloudDrawData { ); } - { - crate::profile_scope!("write_color_texture"); - builder - .color_buffer - .copy_to_texture( - ctx.active_frame.frame_global_command_encoder(&ctx.device), - wgpu::ImageCopyTexture { - texture: &color_texture.texture, - mip_level: 0, - origin: wgpu::Origin3d::ZERO, - aspect: wgpu::TextureAspect::All, - }, - NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), - None, - size, - ) - .expect("invalid color cpu->gpu buffer"); - } + builder.color_buffer.copy_to_texture( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + wgpu::ImageCopyTexture { + texture: &color_texture.texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), + None, + size, + ); let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc( &ctx.device, diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 849b99106629..6bafab2dbc74 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -393,33 +393,33 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. 
-        let mut frame_uniform_buffer_cpu = ctx
-            .cpu_write_gpu_read_belt
-            .lock()
-            .allocate::<FrameUniformBuffer>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
-        frame_uniform_buffer_cpu.push(&FrameUniformBuffer {
-            view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
-            projection_from_view: projection_from_view.into(),
-            projection_from_world: projection_from_world.into(),
-            camera_position,
-            camera_forward,
-            tan_half_fov: tan_half_fov.into(),
-            pixel_world_size_from_camera_distance,
-            pixels_from_point: config.pixels_from_point,
-
-            auto_size_points: auto_size_points.0,
-            auto_size_lines: auto_size_lines.0,
-
-            depth_offset_factor,
-            _padding: glam::Vec3::ZERO,
-        });
-        frame_uniform_buffer_cpu
-            .copy_to_buffer(
+        {
+            let mut frame_uniform_buffer_cpu = ctx
+                .cpu_write_gpu_read_belt
+                .lock()
+                .allocate::<FrameUniformBuffer>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
+            frame_uniform_buffer_cpu.push(&FrameUniformBuffer {
+                view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
+                projection_from_view: projection_from_view.into(),
+                projection_from_world: projection_from_world.into(),
+                camera_position,
+                camera_forward,
+                tan_half_fov: tan_half_fov.into(),
+                pixel_world_size_from_camera_distance,
+                pixels_from_point: config.pixels_from_point,
+
+                auto_size_points: auto_size_points.0,
+                auto_size_lines: auto_size_lines.0,
+
+                depth_offset_factor,
+                _padding: glam::Vec3::ZERO,
+            });
+            frame_uniform_buffer_cpu.copy_to_buffer(
                 ctx.active_frame.frame_global_command_encoder(&ctx.device),
                 &frame_uniform_buffer,
                 0,
-            )
-            .unwrap();
+            );
+        }

         ctx.queue.write_buffer(
             &frame_uniform_buffer,

From 7d544a79477df790dc1df50d092a1ecc5955db83 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 18:46:18 +0100
Subject: [PATCH 11/21] Limit number of in-flight queue submissions

---
 crates/re_renderer/src/context.rs | 43 ++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
index 166094007af6..ea4ad3fbc827 100644
--- a/crates/re_renderer/src/context.rs
+++ b/crates/re_renderer/src/context.rs
@@ -33,6 +33,12 @@ pub struct RenderContext {
     pub texture_manager_2d: TextureManager2D,
     pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,

+    /// List of unfinished queue submissions made via this context.
+    ///
+    /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`].
+    /// TODO(andreas): We rely on egui to do the "primary" submissions in re_viewer. It would be nice to take full control over all submissions.
+    pub inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,
+
     pub(crate) active_frame: ActiveFrameContext,
 }

@@ -71,6 +77,23 @@ impl Renderers {
 }

 impl RenderContext {
+    /// Limit the maximum number of in-flight submissions to this number.
+    ///
+    /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not
+    /// cause us to request more and more memory to prepare more and more submissions.
+    ///
+    /// Note that this is only indirectly related to the number of buffered frames,
+    /// since buffered frames/blit strategy are all about the display<->gpu interface,
+    /// whereas this is about a *particular aspect* of the cpu<->gpu interface.
+    ///
+    /// Should be somewhere between 1-4: too high and we use up more memory and introduce latency,
+    /// too low and we may starve the GPU.
+    /// On the web we want to go lower because we are a lot more memory constrained.
+    #[cfg(not(target_arch = "wasm32"))]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 4;
+    #[cfg(target_arch = "wasm32")]
+    const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 2;
+
     pub fn new(
         device: Arc<wgpu::Device>,
         queue: Arc<wgpu::Queue>,
@@ -162,10 +185,23 @@ impl RenderContext {
             #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
             err_tracker,

+            inflight_queue_submissions: Vec::new(),
+
             active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 }
         }
     }

+    fn poll_device(&mut self) {
+        crate::profile_function!();
+
+        // Ensure not too many queue submissions are in flight.
+        while self.inflight_queue_submissions.len() > Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS {
+            let submission_index = self.inflight_queue_submissions.remove(0); // Remove oldest
+            self.device
+                .poll(wgpu::Maintain::WaitForSubmissionIndex(submission_index));
+        }
+    }
+
     /// Call this at the beginning of a new frame.
     ///
     /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading.
     pub fn begin_frame(&mut self) {
@@ -235,6 +271,10 @@ impl RenderContext {
             samplers.begin_frame(frame_index);
         }

+        // Poll device *after* resource pool `begin_frame` since resource pools may each decide to drop resources.
+        // Wgpu may then internally decide to let go of these buffers.
+        self.poll_device();
+
         // Retrieve unused staging buffer.
         self.cpu_write_gpu_read_belt.lock().after_queue_submit();
     }
@@ -251,7 +291,8 @@

         // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
         //                How do we hook in there and make sure this buffer is submitted first?
-            self.queue.submit([command_buffer]);
+            self.inflight_queue_submissions
+                .push(self.queue.submit([command_buffer]));
         }
     }
 }

From cf57460f7a7e9d842c199683903376e3c4a6db20 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 18:46:49 +0100
Subject: [PATCH 12/21] different memory budget for web and native on staging belt

---
 crates/re_renderer/src/context.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
index ea4ad3fbc827..f71b28ad04e5 100644
--- a/crates/re_renderer/src/context.rs
+++ b/crates/re_renderer/src/context.rs
@@ -77,6 +77,17 @@ impl Renderers {
 }

 impl RenderContext {
+    /// Chunk size for our cpu->gpu buffer manager.
+    ///
+    /// For native: 32MiB chunk size (as big as, for instance, a 2048x1024 float4 texture).
+    /// For web (memory constrained!): 8MiB.
+    #[cfg(not(target_arch = "wasm32"))]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 32);
+    #[cfg(target_arch = "wasm32")]
+    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
+        wgpu::BufferSize::new(1024 * 1024 * 8);
+
     /// Limit the maximum number of in-flight submissions to this number.
/// /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not @@ -177,8 +188,7 @@ impl RenderContext { mesh_manager, texture_manager_2d, - // 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) - cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(wgpu::BufferSize::new(1024 * 1024 * 32).unwrap())), + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(Self::CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE.unwrap())), resolver, From 9db6f70a186eaa82cbeb71c25fb884b1c96d4a5d Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 18:57:45 +0100 Subject: [PATCH 13/21] nicer poll_device implementation --- crates/re_renderer/src/context.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index f71b28ad04e5..9ea81cc20497 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -205,10 +205,19 @@ impl RenderContext { crate::profile_function!(); // Ensure not too many queue submissions are in flight. - while self.inflight_queue_submissions.len() > Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS { - let submission_index = self.inflight_queue_submissions.remove(0); // Remove oldest - self.device - .poll(wgpu::Maintain::WaitForSubmissionIndex(submission_index)); + let num_submissions_to_wait_for = self + .inflight_queue_submissions + .len() + .saturating_sub(Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS); + + if let Some(newest_submission_to_wait_for) = self + .inflight_queue_submissions + .drain(0..num_submissions_to_wait_for) + .last() + { + self.device.poll(wgpu::Maintain::WaitForSubmissionIndex( + newest_submission_to_wait_for, + )); } } From 1d30e9af3d38704365d648b5cea20ed8a5898c2a Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 19:07:17 +0100 Subject: [PATCH 14/21] move per_frame_data_helper into ActiveFrame --- .../src/allocator/cpu_write_gpu_read_belt.rs | 2 +- crates/re_renderer/src/context.rs | 28 +++++++++---------- .../src/ui/view_spatial/ui_renderer_bridge.rs | 6 +++- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index cb6e02705f15..ed79eef4671d 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -149,7 +149,7 @@ impl Chunk { let mut buffer_slice = self.buffer.slice(start_offset..end_offset); let mut write_view = buffer_slice.get_mapped_range_mut(); - // Check if it fullfills the requested alignment. + // Check if it fulfills the requested alignment. let required_padding = Self::required_padding(&mut write_view, alignment); if required_padding != 0 { // Undo mapping and try again with padding! diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 9ea81cc20497..1fc79c49d4eb 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -25,9 +25,6 @@ pub struct RenderContext { #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, - /// Utility type map that will be cleared every frame. 
-    pub per_frame_data_helper: TypeMap,
-
     pub gpu_resources: WgpuResourcePools,
     pub mesh_manager: MeshManager,
     pub texture_manager_2d: TextureManager2D,
     pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,
@@ -37,9 +34,9 @@ pub struct RenderContext {
     ///
     /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`].
     /// TODO(andreas): We rely on egui to do the "primary" submissions in re_viewer. It would be nice to take full control over all submissions.
-    pub inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,
+    inflight_queue_submissions: Vec<wgpu::SubmissionIndex>,

-    pub(crate) active_frame: ActiveFrameContext,
+    pub active_frame: ActiveFrameContext,
 }

 /// Immutable data that is shared between all [`Renderer`]
@@ -182,8 +179,6 @@ impl RenderContext {

             renderers,

-            per_frame_data_helper: TypeMap::new(),
-
             gpu_resources,

             mesh_manager,
@@ -197,7 +192,8 @@

             inflight_queue_submissions: Vec::new(),

-            active_frame: ActiveFrameContext { frame_global_command_encoder: None, frame_index: 0 }
+            active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(),
+                frame_index: 0 }
         }
     }

@@ -226,11 +222,15 @@
     /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading.
     pub fn begin_frame(&mut self) {
         crate::profile_function!();
-        self.per_frame_data_helper.clear();
+
+        // Request used staging buffer back.
+        // TODO(andreas): If we controlled all submissions, we could move this directly after the submission, which would be a bit better.
+        self.cpu_write_gpu_read_belt.lock().after_queue_submit();

         self.active_frame = ActiveFrameContext {
             frame_global_command_encoder: None,
             frame_index: self.active_frame.frame_index + 1,
+            per_frame_data_helper: TypeMap::new(),
         };

         let frame_index = self.active_frame.frame_index;
@@ -293,12 +293,9 @@
         // Poll device *after* resource pool `begin_frame` since resource pools may each decide to drop resources.
         // Wgpu may then internally decide to let go of these buffers.
         self.poll_device();
-
-        // Retrieve unused staging buffer.
-        self.cpu_write_gpu_read_belt.lock().after_queue_submit();
     }

-    /// Call this at the end of a frame but before submitting command buffers from [`crate::view_builder::ViewBuilder`]
+    /// Call this at the end of a frame but before submitting command buffers (e.g. from [`crate::view_builder::ViewBuilder`])
     pub fn before_submit(&mut self) {
         crate::profile_function!();

@@ -333,7 +330,10 @@ pub struct ActiveFrameContext {
     /// (i.e. typically in [`crate::renderer::DrawData`] creation!)
     frame_global_command_encoder: Option<wgpu::CommandEncoder>,

-    // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ??
+    /// Utility type map that will be cleared every frame.
+    pub per_frame_data_helper: TypeMap,
+
+    /// Index of this frame. Is incremented for every render frame.
frame_index: u64, } diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs index ed3c6d5a4a76..30a2758bb653 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs @@ -85,6 +85,7 @@ fn renderer_paint_callback( let command_buffer = std::sync::Arc::new(Mutex::new(Some(command_buffer))); let composition_view_builder_map = render_ctx + .active_frame .per_frame_data_helper .entry::() .or_insert_with(Default::default); @@ -111,7 +112,10 @@ fn renderer_paint_callback( //let clip_rect = info.clip_rect_in_pixels(); let ctx = paint_callback_resources.get::().unwrap(); - ctx.per_frame_data_helper.get::().unwrap()[view_builder_handle] + ctx.active_frame + .per_frame_data_helper + .get::() + .unwrap()[view_builder_handle] .composite(ctx, render_pass, screen_position) .expect("Failed compositing view builder with main target."); }), From b7078c3da7b73724fed08b19dc48e035233f90dc Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Wed, 22 Feb 2023 19:57:04 +0100 Subject: [PATCH 15/21] iterating on some doc and api aspects. Add resource ownership check at end of life of resource pool --- crates/re_renderer/examples/framework.rs | 3 +- .../src/allocator/cpu_write_gpu_read_belt.rs | 48 +++++++++++-------- crates/re_renderer/src/context.rs | 3 +- .../wgpu_resources/dynamic_resource_pool.rs | 35 ++++++++++++-- 4 files changed, 63 insertions(+), 26 deletions(-) diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs index f63f5e828eb0..e0b09e1d830a 100644 --- a/crates/re_renderer/examples/framework.rs +++ b/crates/re_renderer/examples/framework.rs @@ -82,10 +82,11 @@ struct Application { window: Window, surface: wgpu::Surface, surface_config: wgpu::SurfaceConfiguration, - re_ctx: RenderContext, time: Time, example: E, + + re_ctx: RenderContext, } impl Application { diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index ed79eef4671d..b614ad85ed46 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -17,14 +17,14 @@ pub struct CpuWriteGpuReadBuffer { /// Write view into the relevant buffer portion. /// /// UNSAFE: The lifetime is transmuted to be `'static`. - /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk.chunk_buffer)! + /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk_buffer)! write_view: wgpu::BufferViewMut<'static>, - /// How many elements of T were already pushed into this buffer. - write_counter: usize, + /// Range in T elements in write_view that haven't been written yet. + unwritten_element_range: std::ops::Range, chunk_buffer: GpuBuffer, - offset_in_chunk_buffer: wgpu::BufferAddress, + byte_offset_in_chunk_buffer: wgpu::BufferAddress, /// Marker for the type whose alignment and size requirements are honored by `write_view`. _type: std::marker::PhantomData, @@ -39,7 +39,7 @@ where /// Do *not* make this public as we need to guarantee that the memory is *never* read from! #[inline(always)] fn as_slice(&mut self) -> &mut [T] { - &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.write_counter..] + &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()] } /// Pushes a slice of elements into the buffer. 
@@ -48,7 +48,7 @@ where
     #[inline]
     pub fn extend_from_slice(&mut self, elements: &[T]) {
         self.as_slice().copy_from_slice(elements);
-        self.write_counter += elements.len();
+        self.unwritten_element_range.start += elements.len();
     }

     /// Pushes several elements into the buffer.
@@ -61,7 +61,7 @@
             *target = source;
             num_elements += 1;
         }
-        self.write_counter += num_elements;
+        self.unwritten_element_range.start += num_elements;
     }

     /// Pushes a single element into the buffer and advances the write pointer.
@@ -69,16 +69,17 @@
     /// Panics if the data no longer fits into the buffer.
     #[inline]
     pub fn push(&mut self, element: &T) {
-        *self.as_slice().first_mut().unwrap() = *element;
-        self.write_counter += 1;
+        self.as_slice()[0] = *element;
+        self.unwritten_element_range.start += 1;
     }

     /// The number of elements pushed into the buffer so far.
     #[inline]
     pub fn num_written(&self) -> usize {
-        self.write_counter
+        self.unwritten_element_range.start
     }

+    /// Copies the entire buffer to a texture and drops it.
     pub fn copy_to_texture(
         self,
         encoder: &mut wgpu::CommandEncoder,
@@ -91,7 +92,7 @@
             wgpu::ImageCopyBuffer {
                 buffer: &self.chunk_buffer,
                 layout: wgpu::ImageDataLayout {
-                    offset: self.offset_in_chunk_buffer,
+                    offset: self.byte_offset_in_chunk_buffer,
                     bytes_per_row,
                     rows_per_image,
                 },
@@ -101,7 +102,7 @@
         );
     }

-    /// Consume this data view and copy it to another gpu buffer.
+    /// Copies the entire buffer to another buffer and drops it.
     pub fn copy_to_buffer(
         mut self,
         encoder: &mut wgpu::CommandEncoder,
@@ -110,7 +111,7 @@
     ) {
         encoder.copy_buffer_to_buffer(
             &self.chunk_buffer,
-            self.offset_in_chunk_buffer,
+            self.byte_offset_in_chunk_buffer,
             destination,
             destination_offset,
             self.write_view.deref_mut().len() as u64,
@@ -140,9 +141,12 @@ impl Chunk {
     /// Caller needs to make sure that there is enough space plus potential padding.
     fn allocate_aligned(
         &mut self,
+        num_elements: usize,
         size_in_bytes: u64,
         alignment: u64,
     ) -> CpuWriteGpuReadBuffer {
+        debug_assert!(num_elements * std::mem::size_of::<T>() <= size_in_bytes as usize);
+
         // Optimistic first mapping attempt.
         let mut start_offset = self.unused_offset;
         let mut end_offset = start_offset + size_in_bytes;
@@ -173,19 +177,25 @@

         #[allow(unsafe_code)]
         // SAFETY:
-        // write_view has a lifetime dependency on the chunk's buffer.
+        // write_view has a lifetime dependency on the chunk's buffer - internally it holds a pointer to it!
+        //
         // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it.
-        // However, this also implies that the buffer pool is still alive! The renderer context needs to make sure of this.
+        // Additionally, the buffer pool needs to ensure:
+        // * it can't drop buffers while there are still users
+        //   -> We assert on that
+        // * buffers are never moved in memory
+        //   -> buffers are always owned by the pool and are always Arc.
+        //      This means it's not allowed to move the buffer out.
+ // (We could make them Pin> but this complicates things inside the BufferPool) let write_view = unsafe { std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) }; CpuWriteGpuReadBuffer { chunk_buffer: self.buffer.clone(), - offset_in_chunk_buffer: start_offset, + byte_offset_in_chunk_buffer: start_offset, write_view, - write_counter: 0, + unwritten_element_range: 0..num_elements, _type: std::marker::PhantomData, } } @@ -339,7 +349,7 @@ impl CpuWriteGpuReadBelt { } }; - let cpu_buffer_view = chunk.allocate_aligned(size, alignment); + let cpu_buffer_view = chunk.allocate_aligned(num_elements, size, alignment); self.active_chunks.push(chunk); cpu_buffer_view } diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 1fc79c49d4eb..150d51320a5d 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -25,7 +25,6 @@ pub struct RenderContext { #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, - pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, pub cpu_write_gpu_read_belt: Mutex, @@ -37,6 +36,8 @@ pub struct RenderContext { inflight_queue_submissions: Vec, pub active_frame: ActiveFrameContext, + + pub gpu_resources: WgpuResourcePools, // Last due to drop order. } /// Immutable data that is shared between all [`Renderer`] diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs index 5dd5d20e7969..b966fb1c2d1d 100644 --- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs @@ -20,13 +20,16 @@ pub trait DynamicResourcesDesc { fn allow_reuse(&self) -> bool; } -pub struct DynamicResource { +pub struct DynamicResource { pub inner: Res, pub creation_desc: Desc, pub handle: Handle, } -impl std::ops::Deref for DynamicResource { +impl std::ops::Deref for DynamicResource +where + Desc: Debug, +{ type Target = Res; #[inline] @@ -39,7 +42,7 @@ impl std::ops::Deref for DynamicResource { type AliveResourceMap = SlotMap>>>; -struct DynamicResourcePoolProtectedState { +struct DynamicResourcePoolProtectedState { /// All currently alive resources. /// We store any ref counted handle we give out in [`DynamicResourcePool::alloc`] here in order to keep it alive. /// Every [`DynamicResourcePool::begin_frame`] we check if the pool is now the only owner of the handle @@ -55,7 +58,7 @@ struct DynamicResourcePoolProtectedState { /// /// Unlike in [`super::static_resource_pool::StaticResourcePool`], a resource can not be uniquely /// identified by its description, as the same description can apply to several different resources. -pub(super) struct DynamicResourcePool { +pub(super) struct DynamicResourcePool { state: RwLock>, current_frame_index: u64, @@ -63,7 +66,10 @@ pub(super) struct DynamicResourcePool { } /// We cannot #derive(Default) as that would require Handle/Desc/Res to implement Default too. 
-impl Default for DynamicResourcePool {
+impl Default for DynamicResourcePool
+where
+    Desc: Debug,
+{
     fn default() -> Self {
         Self {
             state: RwLock::new(DynamicResourcePoolProtectedState {
@@ -202,6 +208,25 @@
             .load(std::sync::atomic::Ordering::Relaxed)
     }
 }
+impl Drop for DynamicResourcePool
+where
+    Handle: Key,
+    Desc: Debug,
+{
+    fn drop(&mut self) {
+        for (_, alive_resource) in self.state.read().alive_resources.iter() {
+            let alive_resource = alive_resource
+                .as_ref()
+                .expect("Alive resources should never be None");
+            let ref_count = Arc::strong_count(alive_resource);
+
+            assert!(ref_count == 1,
+                "Resource still has {} owners at the time of pool destruction. Its creation desc was {:?}",
+                ref_count - 1,
+                &alive_resource.creation_desc);
+        }
+    }
+}

 #[cfg(test)]
 mod tests {

From 4e00a03d93004db32fffc1314477dda23db34c57 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 20:12:03 +0100
Subject: [PATCH 16/21] commenting out DynamicResourcePool drop check

---
 .../wgpu_resources/dynamic_resource_pool.rs | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index b966fb1c2d1d..f41636ed81b4 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -214,17 +214,22 @@ where
     Desc: Debug,
 {
     fn drop(&mut self) {
-        for (_, alive_resource) in self.state.read().alive_resources.iter() {
-            let alive_resource = alive_resource
-                .as_ref()
-                .expect("Alive resources should never be None");
-            let ref_count = Arc::strong_count(alive_resource);
-
-            assert!(ref_count == 1,
-                "Resource still has {} owners at the time of pool destruction. Its creation desc was {:?}",
-                ref_count - 1,
-                &alive_resource.creation_desc);
-        }
+        // TODO(andreas): We're failing this check currently on re_viewer's shutdown.
+        // This is primarily the case due to the way we store the render ctx itself and other things on egui-wgpu's paint callback resources.
+        // We shouldn't do this as it makes a whole lot of other things cumbersome. Instead, we should store it directly on the `App`
+        // where we control the drop order.

+        // for (_, alive_resource) in self.state.read().alive_resources.iter() {
+        //     let alive_resource = alive_resource
+        //         .as_ref()
+        //         .expect("Alive resources should never be None");
+        //     let ref_count = Arc::strong_count(alive_resource);

+        //     assert!(ref_count == 1,
+        //         "Resource still has {} owner(s) at the time of pool destruction.
Its creation desc was {:?}",
+        //         ref_count - 1,
+        //         &alive_resource.creation_desc);
+        // }
     }
 }

From 053e100971c36ef14a85be4f250eb35070c961ce Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Wed, 22 Feb 2023 20:20:00 +0100
Subject: [PATCH 17/21] fix missing extend_defaults as well as fix a bug in it

---
 crates/re_renderer/src/point_cloud_builder.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs
index cdfd60666ea7..d13eb9105220 100644
--- a/crates/re_renderer/src/point_cloud_builder.rs
+++ b/crates/re_renderer/src/point_cloud_builder.rs
@@ -160,7 +160,7 @@ where
             );
         }

-        if self.0.user_data.len() < self.0.user_data.len() {
+        if self.0.user_data.len() < self.0.vertices.len() {
             self.0.user_data.extend(
                 std::iter::repeat(PerPointUserData::default())
                     .take(self.0.vertices.len() - self.0.user_data.len()),
@@ -220,6 +220,8 @@ where

     #[inline]
     pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> {
+        self.extend_defaults();
+
         debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written());
         debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len());

         self.0.vertices.push(PointCloudVertex {
             position,
             radius: Size::AUTO,
         });
+        self.0.user_data.push(Default::default());
         self.batch_mut().point_count += 1;

         PointBuilder {

From 93f320c632efe97dd4c8aeced1650d99d4aad4b3 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 08:55:59 +0100
Subject: [PATCH 18/21] doc string link fixes

---
 .../src/allocator/cpu_write_gpu_read_belt.rs | 12 ++++++------
 crates/re_renderer/src/wgpu_resources/buffer_pool.rs | 6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
index b614ad85ed46..7a979fc1cbc8 100644
--- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -208,7 +208,7 @@
 /// Based on [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs)
 /// However, there are some important differences:
 /// * can create buffers without yet knowing the target copy location
-/// * lifetime of returned buffers is independent of the [`StagingWriteBelt`] (allows working with several in parallel!)
+/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!)
 /// * use of `re_renderer`'s resource pool
 /// * handles alignment in a defined manner
 ///   (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508))
@@ -256,9 +256,9 @@ impl CpuWriteGpuReadBelt {
     /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
     /// chunk size should be:
     ///
-    /// * larger than the largest single [`StagingBelt::write_buffer()`] operation;
+    /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation;
     /// * 1-4 times less than the total amount of data uploaded per submission
-    ///   (per [`StagingBelt::finish()`]); and
+    ///   (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and
     /// * bigger is better, within these bounds.
     ///
     /// TODO(andreas): Adaptive chunk sizes
@@ -356,10 +356,10 @@ impl CpuWriteGpuReadBelt {

     /// Prepare currently mapped buffers for use in a submission.
     ///
-    /// This must be called before the command encoder(s) used in [`StagingBuffer`] copy operations are submitted.
+    /// This must be called before the command encoder(s) used in [`CpuWriteGpuReadBuffer`] copy operations are submitted.
     ///
     /// At this point, all the partially used staging buffers are closed (cannot be used for
-    /// further writes) until after [`StagingBelt::recall()`] is called *and* the GPU is done
+    /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done
     /// copying the data from them.
     pub fn before_queue_submit(&mut self) {
         // This would be a great use case for persistent memory mapping, i.e. mapping without the need to unmap
@@ -374,7 +374,7 @@

     /// Recall all of the closed buffers back to be reused.
     ///
-    /// This must only be called after the command encoder(s) used in [`StagingBuffer`]
+    /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`]
     /// copy operations are submitted. Additional calls are harmless.
     /// Not calling this as soon as possible may result in increased buffer memory usage.
     pub fn after_queue_submit(&mut self) {
diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
index 20f455e385cb..c6890749dc5c 100644
--- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs
@@ -26,11 +26,11 @@ pub struct BufferDesc {
     /// will panic.
     pub usage: wgpu::BufferUsages,

-    /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`BufferUsages::MAP_READ`] or
-    /// [`BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation.
+    /// Allows a buffer to be mapped immediately after it is created. It does not have to be [`wgpu::BufferUsages::MAP_READ`] or
+    /// [`wgpu::BufferUsages::MAP_WRITE`]; all buffers are allowed to be mapped at creation.
     ///
     /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later!
-    /// Furthermore, [`size`](#structfield.size) must be a multiple of [`COPY_BUFFER_ALIGNMENT`].
+    /// Furthermore, [`size`](#structfield.size) must be a multiple of [`wgpu::COPY_BUFFER_ALIGNMENT`].
    pub mapped_at_creation: bool,
 }

From eee79a2e670170554fe748911cc15b59c36cf0a0 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 09:19:30 +0100
Subject: [PATCH 19/21] drop a todo

---
 crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
index f41636ed81b4..6da0f18b575a 100644
--- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
+++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs
@@ -340,8 +340,6 @@ mod tests {
         }
     }

-    // TODO: Add test for resources without re-use
-
     fn allocate_resources(
         descs: &[u32],
         pool: &mut DynamicResourcePool,

From b05925541cd00d214a044ac668407d51799a7e69 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 09:22:32 +0100
Subject: [PATCH 20/21] fix writing to frame uniform buffer twice

---
 crates/re_renderer/src/view_builder.rs | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs
index 6bafab2dbc74..1f5981b278c6 100644
--- a/crates/re_renderer/src/view_builder.rs
+++ b/crates/re_renderer/src/view_builder.rs
@@ -421,27 +421,6 @@ impl ViewBuilder {
             );
         }

-        ctx.queue.write_buffer(
-            &frame_uniform_buffer,
-            0,
-            bytemuck::bytes_of(&FrameUniformBuffer {
-                view_from_world: glam::Affine3A::from_mat4(view_from_world).into(),
-                projection_from_view: projection_from_view.into(),
-                projection_from_world: projection_from_world.into(),
-                camera_position,
-                camera_forward,
-                tan_half_fov: tan_half_fov.into(),
-                pixel_world_size_from_camera_distance,
-                pixels_from_point: config.pixels_from_point,
-
-                auto_size_points: auto_size_points.0,
-                auto_size_lines: auto_size_lines.0,
-
-                depth_offset_factor,
-                _padding: glam::Vec3::ZERO,
-            }),
-        );
-
         let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group(
             &mut ctx.gpu_resources,
             &ctx.device,

From f193edb5dfc1b9425a426e4339f79f3b4dd8c1a9 Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 23 Feb 2023 11:22:56 +0100
Subject: [PATCH 21/21] comment fix & improvements

---
 crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
index 7a979fc1cbc8..e85b69596b16 100644
--- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -232,7 +232,6 @@ pub struct CpuWriteGpuReadBelt {

     /// When closed chunks are mapped again, the map callback sends them here.
     ///
-    /// Behind a mutex, so that our StagingBelt becomes Sync.
     /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full,
     /// which means that in a single threaded situation (Web!) we might deadlock.
     sender: mpsc::Sender<Chunk>,
@@ -248,6 +247,9 @@ impl CpuWriteGpuReadBelt {
     /// Also, it has the potential of making memcpy operations faster.
     ///
     /// Align to 4xf32. Should be enough for most use cases!
+    /// Needs to be larger than or equal to [`wgpu::MAP_ALIGNMENT`].
+    /// For alignment requirements in `WebGPU` in general, refer to
+    /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment)
    pub const MIN_ALIGNMENT: u64 = 16;

     /// Create a cpu-write & gpu-read staging belt.
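
To summarize how the pieces introduced by patches 11-21 are meant to interact at runtime, the sketch below walks through one frame from the point of view of an in-crate caller. It is illustrative only and not part of the patch series: `MyUniform` and `target_uniform_buffer` are hypothetical placeholders, and the calls shown assume exactly the signatures visible in the diffs above (`begin_frame`, `CpuWriteGpuReadBelt::allocate`, `CpuWriteGpuReadBuffer::copy_to_buffer`, `before_submit`).

    // Hypothetical per-frame flow; `MyUniform` stands in for any plain-old-data
    // value type (FrameUniformBuffer plays this role in view_builder.rs).
    fn render_one_frame(ctx: &mut RenderContext, target_uniform_buffer: &GpuBuffer) {
        // Reclaims mapped staging chunks (after_queue_submit), resets per-frame
        // state, and waits in poll_device until at most
        // MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS submissions remain on the queue.
        ctx.begin_frame();

        // Borrow a mapped, aligned cpu-write/gpu-read view into a belt chunk.
        let mut staging = ctx
            .cpu_write_gpu_read_belt
            .lock()
            .allocate::<MyUniform>(&ctx.device, &mut ctx.gpu_resources.buffers, 1);
        staging.push(&MyUniform::default());

        // Record the copy on the frame-global encoder. `staging` is consumed
        // here (copy_to_buffer takes self), so no writes can happen after the
        // copy has been scheduled.
        staging.copy_to_buffer(
            ctx.active_frame.frame_global_command_encoder(&ctx.device),
            target_uniform_buffer,
            0,
        );

        // Unmaps the partially used chunks (before_queue_submit) and submits
        // the frame-global encoder, pushing the returned wgpu::SubmissionIndex
        // onto inflight_queue_submissions for next frame's poll_device.
        ctx.before_submit();
    }

The key design point visible throughout the series is that the staging view's copy methods take `self` by value: once a copy is recorded, the mapped memory can no longer be written to, which is what allows `before_queue_submit` to safely unmap the belt's chunks wholesale.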