diff --git a/crates/re_renderer/examples/2d.rs b/crates/re_renderer/examples/2d.rs index 741a27f24539..047c37dfbe51 100644 --- a/crates/re_renderer/examples/2d.rs +++ b/crates/re_renderer/examples/2d.rs @@ -149,7 +149,7 @@ impl framework::Example for Render2D { // Moving the windows to a high dpi screen makes the second one bigger. // Also, it looks different under perspective projection. // The third point is automatic thickness which is determined by the point renderer implementation. - let mut point_cloud_builder = PointCloudBuilder::<()>::default(); + let mut point_cloud_builder = PointCloudBuilder::<()>::new(re_ctx); point_cloud_builder .batch("points") .add_points_2d( diff --git a/crates/re_renderer/examples/framework.rs b/crates/re_renderer/examples/framework.rs index 3022f19726d8..e0b09e1d830a 100644 --- a/crates/re_renderer/examples/framework.rs +++ b/crates/re_renderer/examples/framework.rs @@ -82,10 +82,11 @@ struct Application { window: Window, surface: wgpu::Surface, surface_config: wgpu::SurfaceConfiguration, - re_ctx: RenderContext, time: Time, example: E, + + re_ctx: RenderContext, } impl Application { @@ -277,6 +278,7 @@ impl Application { } }; + self.re_ctx.before_submit(); self.re_ctx.queue.submit( draw_results .into_iter() diff --git a/crates/re_renderer/examples/multiview.rs b/crates/re_renderer/examples/multiview.rs index a5f89d1ae242..2a663f30ca39 100644 --- a/crates/re_renderer/examples/multiview.rs +++ b/crates/re_renderer/examples/multiview.rs @@ -10,12 +10,11 @@ use rand::Rng; use re_renderer::{ renderer::{ GenericSkyboxDrawData, LineDrawData, LineStripFlags, MeshDrawData, MeshInstance, - PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudVertex, TestTriangleDrawData, }, resource_managers::ResourceLifeTime, view_builder::{OrthographicCameraMode, Projection, TargetConfiguration, ViewBuilder}, - Color32, LineStripSeriesBuilder, RenderContext, Rgba, Size, + Color32, LineStripSeriesBuilder, PointCloudBuilder, RenderContext, Rgba, Size, }; use winit::event::{ElementState, VirtualKeyCode}; @@ -163,7 +162,8 @@ struct Multiview { mesh_instance_positions_and_colors: Vec<(glam::Vec3, Color32)>, // Want to have a large cloud of random points, but doing rng for all of them every frame is too slow - random_points: Vec, + random_points_positions: Vec, + random_points_radii: Vec, random_points_colors: Vec, } @@ -188,17 +188,23 @@ impl Example for Multiview { let mut rnd = ::seed_from_u64(42); let random_point_range = -5.0_f32..5.0_f32; - let random_points = (0..500000) - .map(|_| PointCloudVertex { - position: glam::vec3( + + let point_count = 500000; + let random_points_positions = (0..point_count) + .map(|_| { + glam::vec3( rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), rnd.gen_range(random_point_range.clone()), - ), - radius: Size::new_scene(rnd.gen_range(0.005..0.05)), + ) }) .collect_vec(); - let random_points_colors = (0..500000).map(|_| random_color(&mut rnd)).collect_vec(); + let random_points_radii = (0..point_count) + .map(|_| Size::new_scene(rnd.gen_range(0.005..0.05))) + .collect_vec(); + let random_points_colors = (0..point_count) + .map(|_| random_color(&mut rnd)) + .collect_vec(); let model_mesh_instances = { let reader = std::io::Cursor::new(include_bytes!("rerun.obj.zip")); @@ -232,7 +238,8 @@ impl Example for Multiview { model_mesh_instances, mesh_instance_positions_and_colors, - random_points, + random_points_positions, + random_points_radii, random_points_colors, } } @@ -260,18 +267,19 @@ impl Example for Multiview { let triangle = TestTriangleDrawData::new(re_ctx); let skybox = GenericSkyboxDrawData::new(re_ctx); let lines = build_lines(re_ctx, seconds_since_startup); - let point_cloud = PointCloudDrawData::new( - re_ctx, - &self.random_points, - &self.random_points_colors, - &[PointCloudBatchInfo { - label: "Random points".into(), - world_from_obj: glam::Mat4::from_rotation_x(seconds_since_startup), - point_count: self.random_points.len() as _, - flags: PointCloudBatchFlags::ENABLE_SHADING, - }], - ) - .unwrap(); + + let mut builder = PointCloudBuilder::<()>::new(re_ctx); + builder + .batch("Random Points") + .world_from_obj(glam::Mat4::from_rotation_x(seconds_since_startup)) + .add_points( + self.random_points_positions.len(), + self.random_points_positions.iter().cloned(), + ) + .radii(self.random_points_radii.iter().cloned()) + .colors(self.random_points_colors.iter().cloned()); + + let point_cloud = builder.to_draw_data(re_ctx).unwrap(); let meshes = build_mesh_instances( re_ctx, &self.model_mesh_instances, diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs new file mode 100644 index 000000000000..e85b69596b16 --- /dev/null +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -0,0 +1,417 @@ +use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc}; + +use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool}; + +/// A sub-allocated staging buffer that can be written to. +/// +/// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it. +/// +/// We do *not* allow reading from this buffer as it is typically write-combined memory. +/// Reading would work, but it can be *very* slow. +/// For details on why, see +/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/) +/// Note that the "vec like behavior" further encourages +/// * not leaving holes +/// * keeping writes sequential +pub struct CpuWriteGpuReadBuffer { + /// Write view into the relevant buffer portion. + /// + /// UNSAFE: The lifetime is transmuted to be `'static`. + /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk_buffer)! + write_view: wgpu::BufferViewMut<'static>, + + /// Range in T elements in write_view that haven't been written yet. + unwritten_element_range: std::ops::Range, + + chunk_buffer: GpuBuffer, + byte_offset_in_chunk_buffer: wgpu::BufferAddress, + + /// Marker for the type whose alignment and size requirements are honored by `write_view`. + _type: std::marker::PhantomData, +} + +impl CpuWriteGpuReadBuffer +where + T: bytemuck::Pod + 'static, +{ + /// Memory as slice of T. + /// + /// Do *not* make this public as we need to guarantee that the memory is *never* read from! + #[inline(always)] + fn as_slice(&mut self) -> &mut [T] { + &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()] + } + + /// Pushes a slice of elements into the buffer. + /// + /// Panics if the data no longer fits into the buffer. + #[inline] + pub fn extend_from_slice(&mut self, elements: &[T]) { + self.as_slice().copy_from_slice(elements); + self.unwritten_element_range.start += elements.len(); + } + + /// Pushes several elements into the buffer. + /// + /// Panics if the data no longer fits into the buffer. + #[inline] + pub fn extend(&mut self, elements: impl Iterator) { + let mut num_elements = 0; + for (target, source) in self.as_slice().iter_mut().zip(elements) { + *target = source; + num_elements += 1; + } + self.unwritten_element_range.start += num_elements; + } + + /// Pushes a single element into the buffer and advances the write pointer. + /// + /// Panics if the data no longer fits into the buffer. + #[inline] + pub fn push(&mut self, element: &T) { + self.as_slice()[0] = *element; + self.unwritten_element_range.start += 1; + } + + /// The number of elements pushed into the buffer so far. + #[inline] + pub fn num_written(&self) -> usize { + self.unwritten_element_range.start + } + + /// Copies the entire buffer to a texture and drops it. + pub fn copy_to_texture( + self, + encoder: &mut wgpu::CommandEncoder, + destination: wgpu::ImageCopyTexture<'_>, + bytes_per_row: Option, + rows_per_image: Option, + copy_size: wgpu::Extent3d, + ) { + encoder.copy_buffer_to_texture( + wgpu::ImageCopyBuffer { + buffer: &self.chunk_buffer, + layout: wgpu::ImageDataLayout { + offset: self.byte_offset_in_chunk_buffer, + bytes_per_row, + rows_per_image, + }, + }, + destination, + copy_size, + ); + } + + /// Copies the entire buffer to another buffer and drops it. + pub fn copy_to_buffer( + mut self, + encoder: &mut wgpu::CommandEncoder, + destination: &GpuBuffer, + destination_offset: wgpu::BufferAddress, + ) { + encoder.copy_buffer_to_buffer( + &self.chunk_buffer, + self.byte_offset_in_chunk_buffer, + destination, + destination_offset, + self.write_view.deref_mut().len() as u64, + ); + } +} + +/// Internal chunk of the staging belt. +struct Chunk { + buffer: GpuBuffer, + + /// Starting at this offset the buffer is unused. + unused_offset: wgpu::BufferAddress, +} + +impl Chunk { + fn remaining_capacity(&self) -> u64 { + self.buffer.size() - self.unused_offset + } + + fn required_padding(write_view: &mut wgpu::BufferViewMut<'_>, alignment: u64) -> u64 { + // Use deref_mut explicitly because wgpu warns otherwise that read access is slow. + let ptr = write_view.deref_mut().as_ptr() as u64; + wgpu::util::align_to(ptr, alignment) - ptr + } + + /// Caller needs to make sure that there is enough space plus potential padding. + fn allocate_aligned( + &mut self, + num_elements: usize, + size_in_bytes: u64, + alignment: u64, + ) -> CpuWriteGpuReadBuffer { + debug_assert!(num_elements * std::mem::size_of::() <= size_in_bytes as usize); + + // Optimistic first mapping attempt. + let mut start_offset = self.unused_offset; + let mut end_offset = start_offset + size_in_bytes; + let mut buffer_slice = self.buffer.slice(start_offset..end_offset); + let mut write_view = buffer_slice.get_mapped_range_mut(); + + // Check if it fulfills the requested alignment. + let required_padding = Self::required_padding(&mut write_view, alignment); + if required_padding != 0 { + // Undo mapping and try again with padding! + re_log::trace!( + "CpuWriteGpuReadBuffer::allocate alignment requirement not fulfilled. Need to add {required_padding} for alignment of {alignment}" + ); + + drop(write_view); + + start_offset = self.unused_offset + required_padding; + end_offset = start_offset + size_in_bytes; + buffer_slice = self.buffer.slice(start_offset..end_offset); + write_view = buffer_slice.get_mapped_range_mut(); + + let required_padding = Self::required_padding(&mut write_view, alignment); + debug_assert_eq!(required_padding, 0); + } + + debug_assert!(end_offset <= self.buffer.size()); + self.unused_offset = end_offset; + + #[allow(unsafe_code)] + // SAFETY: + // write_view has a lifetime dependency on the chunk's buffer - internally it holds a pointer to it! + // + // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it. + // Additionally, the buffer pool needs to ensure: + // * it can't drop buffers if there's still users + // -> We assert on on that + // * buffers are never moved in memory + // -> buffers are always owned by the pool and are always Arc. + // This means it not allowed to move the buffer out. + // (We could make them Pin> but this complicates things inside the BufferPool) + let write_view = unsafe { + std::mem::transmute::, wgpu::BufferViewMut<'static>>(write_view) + }; + + CpuWriteGpuReadBuffer { + chunk_buffer: self.buffer.clone(), + byte_offset_in_chunk_buffer: start_offset, + write_view, + unwritten_element_range: 0..num_elements, + _type: std::marker::PhantomData, + } + } +} + +/// Efficiently performs many buffer writes by sharing and reusing temporary buffers. +/// +/// Internally it uses a ring-buffer of staging buffers that are sub-allocated. +/// +/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs) +/// However, there are some important differences: +/// * can create buffers without yet knowing the target copy location +/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!) +/// * use of `re_renderer`'s resource pool +/// * handles alignment in a defined manner +/// (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508)) +pub struct CpuWriteGpuReadBelt { + /// Minimum size for new buffers. + chunk_size: u64, + + /// Chunks which are CPU write at the moment. + active_chunks: Vec, + + /// Chunks which are GPU read at the moment. + /// + /// I.e. they have scheduled transfers already; they are unmapped and one or more + /// command encoder has one or more `copy_buffer_to_buffer` commands with them + /// as source. + closed_chunks: Vec, + + /// Chunks that are back from the GPU and ready to be mapped for write and put + /// into `active_chunks`. + free_chunks: Vec, + + /// When closed chunks are mapped again, the map callback sends them here. + /// + /// Note that we shouldn't use SyncSender since this can block the Sender if a buffer is full, + /// which means that in a single threaded situation (Web!) we might deadlock. + sender: mpsc::Sender, + + /// Free chunks are received here to be put on `self.free_chunks`. + receiver: mpsc::Receiver, +} + +impl CpuWriteGpuReadBelt { + /// All allocations of this allocator will be aligned to at least this size. + /// + /// Requiring a minimum alignment means we need to pad less often. + /// Also, it has the potential of making memcpy operations faster. + /// + /// Align to 4xf32. Should be enough for most usecases! + /// Needs to be larger or equal than [`wgpu::MAP_ALIGNMENT`]. + /// For alignment requirements in `WebGPU` in general, refer to + /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment) + pub const MIN_ALIGNMENT: u64 = 16; + + /// Create a cpu-write & gpu-read staging belt. + /// + /// The `chunk_size` is the unit of internal buffer allocation; writes will be + /// sub-allocated within each chunk. Therefore, for optimal use of memory, the + /// chunk size should be: + /// + /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation; + /// * 1-4 times less than the total amount of data uploaded per submission + /// (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and + /// * bigger is better, within these bounds. + /// + /// TODO(andreas): Adaptive chunk sizes + /// TODO(andreas): Shrinking after usage spikes? + pub fn new(chunk_size: wgpu::BufferSize) -> Self { + static_assertions::const_assert!(wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_ALIGNMENT); + + let (sender, receiver) = mpsc::channel(); + CpuWriteGpuReadBelt { + chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT), + active_chunks: Vec::new(), + closed_chunks: Vec::new(), + free_chunks: Vec::new(), + sender, + receiver, + } + } + + /// Allocates a cpu writable buffer for `num_elements` instances of type `T`. + /// + /// Handles alignment requirements automatically, allowing arbitrarily aligned types without issues. + pub fn allocate( + &mut self, + device: &wgpu::Device, + buffer_pool: &mut GpuBufferPool, + num_elements: usize, + ) -> CpuWriteGpuReadBuffer { + // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. + let alignment = (std::mem::align_of::() as wgpu::BufferAddress).max(Self::MIN_ALIGNMENT); + // Pad out the size of the used buffer to a multiple of Self::MIN_ALIGNMENT. + // This increases our chance of having back to back allocations within a chunk. + let size = wgpu::util::align_to( + (std::mem::size_of::() * num_elements) as wgpu::BufferAddress, + Self::MIN_ALIGNMENT, + ); + + // We need to be super careful with alignment since today wgpu + // has no guarantees on how pointers to mapped memory are aligned! + // For all we know, pointers might be 1 aligned, causing even a u32 write to crash the process! + // + // For details and (as of writing) ongoing discussion see https://github.com/gfx-rs/wgpu/issues/3508 + // + // To work around this, we ask for a bigger size, so we can safely pad out + // if the returned pointer is not correctly aligned. + // (i.e. we will use _up to_ `required_size` bytes, but at least `size`)] + let maximum_padding = alignment - 1; + let max_required_size = size + maximum_padding; + + // Try to find space in any of the active chunks first. + let mut chunk = if let Some(index) = self + .active_chunks + .iter_mut() + .position(|chunk| chunk.remaining_capacity() >= max_required_size) + { + self.active_chunks.swap_remove(index) + } else { + self.receive_chunks(); // ensure self.free_chunks is up to date + + // Use a free chunk if possible, fall back to creating a new one if necessary. + if let Some(index) = self + .free_chunks + .iter() + .position(|chunk| chunk.remaining_capacity() >= max_required_size) + { + self.free_chunks.swap_remove(index) + } else { + // Allocation might be bigger than a chunk! + let buffer_size = wgpu::util::align_to( + self.chunk_size.max(max_required_size), + Self::MIN_ALIGNMENT, + ); + // Happens relatively rarely, this is a noteworthy event! + re_log::debug!("Allocating new CpuWriteGpuReadBelt chunk of size {buffer_size}"); + let buffer = buffer_pool.alloc( + device, + &BufferDesc { + label: "CpuWriteGpuReadBelt buffer".into(), + size: buffer_size, + usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: true, + }, + ); + + Chunk { + buffer, + unused_offset: 0, + } + } + }; + + let cpu_buffer_view = chunk.allocate_aligned(num_elements, size, alignment); + self.active_chunks.push(chunk); + cpu_buffer_view + } + + /// Prepare currently mapped buffers for use in a submission. + /// + /// This must be called before the command encoder(s) used in [`CpuWriteGpuReadBuffer`] copy operations are submitted. + /// + /// At this point, all the partially used staging buffers are closed (cannot be used for + /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done + /// copying the data from them. + pub fn before_queue_submit(&mut self) { + // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap + // https://github.com/gfx-rs/wgpu/issues/1468 + // However, WebGPU does not support this! + + for chunk in self.active_chunks.drain(..) { + chunk.buffer.unmap(); + self.closed_chunks.push(chunk); + } + } + + /// Recall all of the closed buffers back to be reused. + /// + /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`] + /// copy operations are submitted. Additional calls are harmless. + /// Not calling this as soon as possible may result in increased buffer memory usage. + pub fn after_queue_submit(&mut self) { + self.receive_chunks(); + + let sender = &self.sender; + for chunk in self.closed_chunks.drain(..) { + let sender = sender.clone(); + chunk + .buffer + .clone() + .slice(..) + .map_async(wgpu::MapMode::Write, move |_| { + let _ = sender.send(chunk); + }); + } + } + + /// Move all chunks that the GPU is done with (and are now mapped again) + /// from `self.receiver` to `self.free_chunks`. + fn receive_chunks(&mut self) { + while let Ok(mut chunk) = self.receiver.try_recv() { + chunk.unused_offset = 0; + self.free_chunks.push(chunk); + } + } +} + +impl std::fmt::Debug for CpuWriteGpuReadBelt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StagingBelt") + .field("chunk_size", &self.chunk_size) + .field("active_chunks", &self.active_chunks.len()) + .field("closed_chunks", &self.closed_chunks.len()) + .field("free_chunks", &self.free_chunks.len()) + .finish_non_exhaustive() + } +} diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs new file mode 100644 index 000000000000..cb4ab783fd09 --- /dev/null +++ b/crates/re_renderer/src/allocator/mod.rs @@ -0,0 +1,8 @@ +//! High level GPU memory allocators. +//! +//! In contrast to the buffer pools in [`crate::wgpu_resources`], every allocator in here +//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources. + +mod cpu_write_gpu_read_belt; + +pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 81044cabb961..150d51320a5d 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -1,8 +1,10 @@ use std::sync::Arc; +use parking_lot::Mutex; use type_map::concurrent::{self, TypeMap}; use crate::{ + allocator::CpuWriteGpuReadBelt, config::RenderContextConfig, global_bindings::GlobalBindings, renderer::Renderer, @@ -23,15 +25,19 @@ pub struct RenderContext { #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, - /// Utility type map that will be cleared every frame. - pub per_frame_data_helper: TypeMap, - - pub gpu_resources: WgpuResourcePools, pub mesh_manager: MeshManager, pub texture_manager_2d: TextureManager2D, + pub cpu_write_gpu_read_belt: Mutex, - // TODO(andreas): Add frame/lifetime statistics, shared resources (e.g. "global" uniform buffer), ?? - frame_index: u64, + /// List of unfinished queue submission via this context. + /// + /// This is currently only about submissions we do via the global encoder in [`ActiveFrameContext`] + /// TODO(andreas): We rely on egui to to the "primary" submissions in re_viewer. It would be nice to take full control over all submissions. + inflight_queue_submissions: Vec, + + pub active_frame: ActiveFrameContext, + + pub gpu_resources: WgpuResourcePools, // Last due to drop order. } /// Immutable data that is shared between all [`Renderer`] @@ -69,6 +75,34 @@ impl Renderers { } impl RenderContext { + /// Chunk size for our cpu->gpu buffer manager. + /// + /// For native: 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture) + /// For web (memory constraint!): 8MiB + #[cfg(not(target_arch = "wasm32"))] + const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option = + wgpu::BufferSize::new(1024 * 1024 * 32); + #[cfg(target_arch = "wasm32")] + const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option = + wgpu::BufferSize::new(1024 * 1024 * 8); + + /// Limit maximum number of in flight submissions to this number. + /// + /// By limiting the number of submissions we have on the queue we ensure that GPU stalls do not + /// cause us to request more and more memory to prepare more and more submissions. + /// + /// Note that this is only indirectly related to number of buffered frames, + /// since buffered frames/blit strategy are all about the display<->gpu interface, + /// whereas this is about a *particular aspect* of the cpu<->gpu interface. + /// + /// Should be somewhere between 1-4, too high and we use up more memory and introduce latency, + /// too low and we may starve the GPU. + /// On the web we want to go lower because a lot more memory constraint. + #[cfg(not(target_arch = "wasm32"))] + const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 4; + #[cfg(target_arch = "wasm32")] + const MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS: usize = 2; + pub fn new( device: Arc, queue: Arc, @@ -146,19 +180,41 @@ impl RenderContext { renderers, - per_frame_data_helper: TypeMap::new(), - gpu_resources, mesh_manager, texture_manager_2d, + cpu_write_gpu_read_belt: Mutex::new(CpuWriteGpuReadBelt::new(Self::CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE.unwrap())), resolver, #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build err_tracker, - frame_index: 0, + inflight_queue_submissions: Vec::new(), + + active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(), + frame_index: 0 } + } + } + + fn poll_device(&mut self) { + crate::profile_function!(); + + // Ensure not too many queue submissions are in flight. + let num_submissions_to_wait_for = self + .inflight_queue_submissions + .len() + .saturating_sub(Self::MAX_NUM_INFLIGHT_QUEUE_SUBMISSIONS); + + if let Some(newest_submission_to_wait_for) = self + .inflight_queue_submissions + .drain(0..num_submissions_to_wait_for) + .last() + { + self.device.poll(wgpu::Maintain::WaitForSubmissionIndex( + newest_submission_to_wait_for, + )); } } @@ -166,8 +222,18 @@ impl RenderContext { /// /// Updates internal book-keeping, frame allocators and executes delayed events like shader reloading. pub fn begin_frame(&mut self) { - self.frame_index += 1; - self.per_frame_data_helper.clear(); + crate::profile_function!(); + + // Request used staging buffer back. + // TODO(andreas): If we'd control all submissions, we could move this directly after the submission which would be a bit better. + self.cpu_write_gpu_read_belt.lock().after_queue_submit(); + + self.active_frame = ActiveFrameContext { + frame_global_command_encoder: None, + frame_index: self.active_frame.frame_index + 1, + per_frame_data_helper: TypeMap::new(), + }; + let frame_index = self.active_frame.frame_index; // Tick the error tracker so that it knows when to reset! // Note that we're ticking on begin_frame rather than raw frames, which @@ -183,8 +249,8 @@ impl RenderContext { re_log::debug!(?modified_paths, "got some filesystem events"); } - self.mesh_manager.begin_frame(self.frame_index); - self.texture_manager_2d.begin_frame(self.frame_index); + self.mesh_manager.begin_frame(frame_index); + self.texture_manager_2d.begin_frame(frame_index); { let WgpuResourcePools { @@ -203,27 +269,87 @@ impl RenderContext { shader_modules.begin_frame( &self.device, &mut self.resolver, - self.frame_index, + frame_index, &modified_paths, ); render_pipelines.begin_frame( &self.device, - self.frame_index, + frame_index, shader_modules, pipeline_layouts, ); // Bind group maintenance must come before texture/buffer maintenance since it // registers texture/buffer use - bind_groups.begin_frame(self.frame_index, textures, buffers, samplers); + bind_groups.begin_frame(frame_index, textures, buffers, samplers); - textures.begin_frame(self.frame_index); - buffers.begin_frame(self.frame_index); + textures.begin_frame(frame_index); + buffers.begin_frame(frame_index); - pipeline_layouts.begin_frame(self.frame_index); - bind_group_layouts.begin_frame(self.frame_index); - samplers.begin_frame(self.frame_index); + pipeline_layouts.begin_frame(frame_index); + bind_group_layouts.begin_frame(frame_index); + samplers.begin_frame(frame_index); } + + // Poll device *after* resource pool `begin_frame` since resource pools may each decide drop resources. + // Wgpu internally may then internally decide to let go of these buffers. + self.poll_device(); + } + + /// Call this at the end of a frame but before submitting command buffers (e.g. from [`crate::view_builder::ViewBuilder`]) + pub fn before_submit(&mut self) { + crate::profile_function!(); + + // Unmap all staging buffers. + self.cpu_write_gpu_read_belt.lock().before_queue_submit(); + + if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() { + let command_buffer = command_encoder.finish(); + + // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. + // How do we hook in there and make sure this buffer is submitted first? + self.inflight_queue_submissions + .push(self.queue.submit([command_buffer])); + } + } +} + +impl Drop for RenderContext { + fn drop(&mut self) { + // Close global command encoder if there is any pending. + // Not doing so before shutdown causes errors. + if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() { + encoder.finish(); + } + } +} + +pub struct ActiveFrameContext { + /// Command encoder for all commands that should go in before view builder are submitted. + /// + /// This should be used for any gpu copy operation outside of a renderer or view builder. + /// (i.e. typically in [`crate::renderer::DrawData`] creation!) + frame_global_command_encoder: Option, + + /// Utility type map that will be cleared every frame. + pub per_frame_data_helper: TypeMap, + + /// Index of this frame. Is incremented for every render frame. + frame_index: u64, +} + +impl ActiveFrameContext { + /// Gets or creates a command encoder that runs before all view builder encoder. + pub fn frame_global_command_encoder( + &mut self, + device: &wgpu::Device, + ) -> &mut wgpu::CommandEncoder { + self.frame_global_command_encoder.get_or_insert_with(|| { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") + .get(), + }) + }) } } diff --git a/crates/re_renderer/src/lib.rs b/crates/re_renderer/src/lib.rs index b89004ed22cf..dcbae79b06dc 100644 --- a/crates/re_renderer/src/lib.rs +++ b/crates/re_renderer/src/lib.rs @@ -14,6 +14,7 @@ pub mod renderer; pub mod resource_managers; pub mod view_builder; +mod allocator; mod context; mod debug_label; mod depth_offset; diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs index da0edf449293..7f30c8fccb92 100644 --- a/crates/re_renderer/src/mesh.rs +++ b/crates/re_renderer/src/mesh.rs @@ -167,6 +167,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - vertices"), size: vertex_buffer_combined_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -193,6 +194,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - indices"), size: index_buffer_size, usage: wgpu::BufferUsages::INDEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); let mut staging_buffer = queue @@ -216,6 +218,7 @@ impl GpuMesh { label: data.label.clone().push_str(" - material uniforms"), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index f265ae12e064..d13eb9105220 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -1,42 +1,45 @@ use crate::{ + allocator::CpuWriteGpuReadBuffer, renderer::{ PointCloudBatchFlags, PointCloudBatchInfo, PointCloudDrawData, PointCloudDrawDataError, PointCloudVertex, }, - Color32, DebugLabel, Size, + Color32, DebugLabel, RenderContext, Size, }; /// Builder for point clouds, making it easy to create [`crate::renderer::PointCloudDrawData`]. -/// -/// TODO(andreas): We could make significant optimizations here by making this builder capable -/// of writing to a GPU readable memory location. -/// This will require some ahead of time size limit, but should be feasible. -/// But before that we first need to sort out cpu->gpu transfers better by providing staging buffers. pub struct PointCloudBuilder { // Size of `point`/color`/`per_point_user_data` must be equal. pub vertices: Vec, - pub colors: Vec, + + pub(crate) color_buffer: CpuWriteGpuReadBuffer, pub user_data: Vec, - batches: Vec, + pub(crate) batches: Vec, } -impl Default for PointCloudBuilder { - fn default() -> Self { +impl PointCloudBuilder +where + PerPointUserData: Default + Copy, +{ + pub fn new(ctx: &mut RenderContext) -> Self { const RESERVE_SIZE: usize = 512; + + // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance! + let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( + &ctx.device, + &mut ctx.gpu_resources.buffers, + PointCloudDrawData::MAX_NUM_POINTS, + ); + Self { vertices: Vec::with_capacity(RESERVE_SIZE), - colors: Vec::with_capacity(RESERVE_SIZE), + color_buffer, user_data: Vec::with_capacity(RESERVE_SIZE), batches: Vec::with_capacity(16), } } -} -impl PointCloudBuilder -where - PerPointUserData: Default + Copy, -{ /// Start of a new batch. #[inline] pub fn batch( @@ -102,10 +105,10 @@ where /// Finalizes the builder and returns a point cloud draw data with all the points added so far. pub fn to_draw_data( - &self, + self, ctx: &mut crate::context::RenderContext, ) -> Result { - PointCloudDrawData::new(ctx, &self.vertices, &self.colors, &self.batches) + PointCloudDrawData::new(ctx, self) } } @@ -150,13 +153,14 @@ where /// Each time we `add_points`, or upon builder drop, make sure that we /// fill in any additional colors and user-data to have matched vectors. fn extend_defaults(&mut self) { - if self.0.colors.len() < self.0.vertices.len() { - self.0.colors.extend( - std::iter::repeat(Color32::WHITE).take(self.0.vertices.len() - self.0.colors.len()), + if self.0.color_buffer.num_written() < self.0.vertices.len() { + self.0.color_buffer.extend( + std::iter::repeat(Color32::WHITE) + .take(self.0.vertices.len() - self.0.color_buffer.num_written()), ); } - if self.0.user_data.len() < self.0.user_data.len() { + if self.0.user_data.len() < self.0.vertices.len() { self.0.user_data.extend( std::iter::repeat(PerPointUserData::default()) .take(self.0.vertices.len() - self.0.user_data.len()), @@ -186,7 +190,7 @@ where self.extend_defaults(); - debug_assert_eq!(self.0.vertices.len(), self.0.colors.len()); + debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written()); debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len()); let old_size = self.0.vertices.len(); @@ -200,7 +204,6 @@ where let num_points = self.0.vertices.len() - old_size; self.batch_mut().point_count += num_points as u32; - self.0.colors.reserve(num_points); self.0.user_data.reserve(num_points); let new_range = old_size..self.0.vertices.len(); @@ -210,27 +213,28 @@ where PointsBuilder { vertices: &mut self.0.vertices[new_range], max_points, - colors: &mut self.0.colors, + colors: &mut self.0.color_buffer, user_data: &mut self.0.user_data, } } #[inline] pub fn add_point(&mut self, position: glam::Vec3) -> PointBuilder<'_, PerPointUserData> { - debug_assert_eq!(self.0.vertices.len(), self.0.colors.len()); + self.extend_defaults(); + + debug_assert_eq!(self.0.vertices.len(), self.0.color_buffer.num_written()); debug_assert_eq!(self.0.vertices.len(), self.0.user_data.len()); self.0.vertices.push(PointCloudVertex { position, radius: Size::AUTO, }); - self.0.colors.push(Color32::WHITE); - self.0.user_data.push(PerPointUserData::default()); + self.0.user_data.push(Default::default()); self.batch_mut().point_count += 1; PointBuilder { vertex: self.0.vertices.last_mut().unwrap(), - color: self.0.colors.last_mut().unwrap(), + color: &mut self.0.color_buffer, user_data: self.0.user_data.last_mut().unwrap(), } } @@ -265,7 +269,7 @@ where pub struct PointBuilder<'a, PerPointUserData> { vertex: &'a mut PointCloudVertex, - color: &'a mut Color32, + color: &'a mut CpuWriteGpuReadBuffer, user_data: &'a mut PerPointUserData, } @@ -279,9 +283,10 @@ where self } + /// This mustn't call this more than once. #[inline] pub fn color(self, color: Color32) -> Self { - *self.color = color; + self.color.push(&color); self } @@ -298,7 +303,7 @@ pub struct PointsBuilder<'a, PerPointUserData> { // Colors and user-data are the Vec we append // the data to if provided. max_points: usize, - colors: &'a mut Vec, + colors: &'a mut CpuWriteGpuReadBuffer, user_data: &'a mut Vec, } @@ -308,6 +313,8 @@ where { /// Assigns radii to all points. /// + /// This mustn't call this more than once. + /// /// If the iterator doesn't cover all points, some will not be assigned. /// If the iterator provides more values than there are points, the extra values will be ignored. #[inline] @@ -323,18 +330,22 @@ where /// Assigns colors to all points. /// + /// This mustn't call this more than once. + /// /// If the iterator doesn't cover all points, some will not be assigned. /// If the iterator provides more values than there are points, the extra values will be ignored. #[inline] pub fn colors(self, colors: impl Iterator) -> Self { crate::profile_function!(); self.colors - .extend(colors.take(self.max_points - self.colors.len())); + .extend(colors.take(self.max_points - self.colors.num_written())); self } /// Assigns user data for all points in this builder. /// + /// This mustn't call this more than once. + /// /// User data is currently not available on the GPU. #[inline] pub fn user_data(self, data: impl Iterator) -> Self { diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs index 996fa897e69d..f8237709ebb6 100644 --- a/crates/re_renderer/src/renderer/lines.rs +++ b/crates/re_renderer/src/renderer/lines.rs @@ -503,6 +503,7 @@ impl LineDrawData { label: "lines batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs index d9c7b734947b..14559b9f69e6 100644 --- a/crates/re_renderer/src/renderer/mesh_renderer.rs +++ b/crates/re_renderer/src/renderer/mesh_renderer.rs @@ -145,6 +145,7 @@ impl MeshDrawData { label: "MeshDrawData instance buffer".into(), size: instance_buffer_size, usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 415a82550830..233911761fd2 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -19,7 +19,8 @@ use std::{ }; use crate::{ - context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, Color32, DebugLabel, + context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel, + PointCloudBuilder, }; use bitflags::bitflags; use bytemuck::Zeroable; @@ -148,11 +149,9 @@ impl PointCloudDrawData { /// Number of vertices and colors has to be equal. /// /// If no batches are passed, all points are assumed to be in a single batch with identity transform. - pub fn new( + pub fn new( ctx: &mut RenderContext, - vertices: &[PointCloudVertex], - colors: &[Color32], - batches: &[PointCloudBatchInfo], + builder: PointCloudBuilder, ) -> Result { crate::profile_function!(); @@ -163,6 +162,9 @@ impl PointCloudDrawData { &mut ctx.resolver, ); + let vertices = builder.vertices.as_slice(); + let batches = builder.batches.as_slice(); + if vertices.is_empty() { return Ok(PointCloudDrawData { bind_group_all_points: None, @@ -194,23 +196,16 @@ impl PointCloudDrawData { 0 ); - if vertices.len() != colors.len() { - return Err(PointCloudDrawDataError::NumberOfColorsNotEqualNumberOfVertices); - } - - let (vertices, colors) = if vertices.len() >= Self::MAX_NUM_POINTS { + let vertices = if vertices.len() >= Self::MAX_NUM_POINTS { re_log::error_once!( "Reached maximum number of supported points. Clamping down to {}, passed were {}. See also https://github.com/rerun-io/rerun/issues/957", Self::MAX_NUM_POINTS, vertices.len() ); - ( - &vertices[..Self::MAX_NUM_POINTS], - &colors[..Self::MAX_NUM_POINTS], - ) + &vertices[..Self::MAX_NUM_POINTS] } else { - (vertices, colors) + vertices }; // TODO(andreas): We want a "stack allocation" here that lives for one frame. @@ -261,15 +256,6 @@ impl PointCloudDrawData { .collect_vec() }; - let color_staging = { - crate::profile_scope!("collect_colors"); - colors - .iter() - .cloned() - .chain(std::iter::repeat(Color32::TRANSPARENT).take(num_points_zeroed)) - .collect_vec() - }; - // Upload data from staging buffers to gpu. let size = wgpu::Extent3d { width: DATA_TEXTURE_SIZE, @@ -298,26 +284,18 @@ impl PointCloudDrawData { ); } - { - crate::profile_scope!("write_color_texture"); - ctx.queue.write_texture( - wgpu::ImageCopyTexture { - texture: &color_texture.texture, - mip_level: 0, - origin: wgpu::Origin3d::ZERO, - aspect: wgpu::TextureAspect::All, - }, - bytemuck::cast_slice(&color_staging), - wgpu::ImageDataLayout { - offset: 0, - bytes_per_row: NonZeroU32::new( - DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32, - ), - rows_per_image: None, - }, - size, - ); - } + builder.color_buffer.copy_to_texture( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + wgpu::ImageCopyTexture { + texture: &color_texture.texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32), + None, + size, + ); let bind_group_all_points = ctx.gpu_resources.bind_groups.alloc( &ctx.device, @@ -347,6 +325,7 @@ impl PointCloudDrawData { label: "point batch uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs index 5b325e13d24c..42f98378dceb 100644 --- a/crates/re_renderer/src/renderer/rectangles.rs +++ b/crates/re_renderer/src/renderer/rectangles.rs @@ -141,6 +141,7 @@ impl RectangleDrawData { label: "rectangle uniform buffers".into(), size: combined_buffers_size, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, + mapped_at_creation: false, }, ); diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 0fcd491935da..1f5981b278c6 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -275,6 +275,7 @@ impl ViewBuilder { label: format!("{:?} - frame uniform buffer", config.name).into(), size: std::mem::size_of::() as _, usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, }, ); @@ -392,10 +393,12 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. - ctx.queue.write_buffer( - &frame_uniform_buffer, - 0, - bytemuck::bytes_of(&FrameUniformBuffer { + { + let mut frame_uniform_buffer_cpu = ctx + .cpu_write_gpu_read_belt + .lock() + .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); + frame_uniform_buffer_cpu.push(&FrameUniformBuffer { view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), projection_from_view: projection_from_view.into(), projection_from_world: projection_from_world.into(), @@ -410,8 +413,13 @@ impl ViewBuilder { depth_offset_factor, _padding: glam::Vec3::ZERO, - }), - ); + }); + frame_uniform_buffer_cpu.copy_to_buffer( + ctx.active_frame.frame_global_command_encoder(&ctx.device), + &frame_uniform_buffer, + 0, + ); + } let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group( &mut ctx.gpu_resources, diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs index 668eee548b11..8422ae97734a 100644 --- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs @@ -7,7 +7,7 @@ use crate::debug_label::DebugLabel; use super::{ bind_group_layout_pool::{GpuBindGroupLayoutHandle, GpuBindGroupLayoutPool}, buffer_pool::{GpuBuffer, GpuBufferHandle, GpuBufferPool}, - dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc}, sampler_pool::{GpuSamplerHandle, GpuSamplerPool}, texture_pool::{GpuTexture, GpuTextureHandle, GpuTexturePool}, }; @@ -60,12 +60,16 @@ pub struct BindGroupDesc { pub layout: GpuBindGroupLayoutHandle, } -impl SizedResourceDesc for BindGroupDesc { +impl DynamicResourcesDesc for BindGroupDesc { fn resource_size_in_bytes(&self) -> u64 { // Size depends on gpu/driver (like with all resources). // We could guess something like a pointer per descriptor, but let's not pretend we know! 0 } + + fn allow_reuse(&self) -> bool { + true + } } /// Resource pool for bind groups. diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index 21b837e53a2b..c6890749dc5c 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -13,6 +13,7 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; } /// Once all instances are dropped, the buffer will be marked for reclamation in the following frame. pub type GpuBuffer = std::sync::Arc>; +/// Buffer creation descriptor. #[derive(Clone, Hash, PartialEq, Eq, Debug)] pub struct BufferDesc { /// Debug label of a buffer. This will show up in graphics debuggers for easy identification. @@ -24,12 +25,26 @@ pub struct BufferDesc { /// Usages of a buffer. If the buffer is used in any way that isn't specified here, the operation /// will panic. pub usage: wgpu::BufferUsages, + + /// Allows a buffer to be mapped immediately after they are made. It does not have to be [`wgpu::BufferUsages::MAP_READ`] or + /// [`wgpu::BufferUsages::MAP_WRITE`], all buffers are allowed to be mapped at creation. + /// + /// *WARNING*: If this is `true`, the pool won't be able to reclaim the buffer later! + /// Furthermore, [`size`](#structfield.size) must be a multiple of [`wgpu::COPY_BUFFER_ALIGNMENT`]. + pub mapped_at_creation: bool, } -impl SizedResourceDesc for BufferDesc { +impl DynamicResourcesDesc for BufferDesc { fn resource_size_in_bytes(&self) -> u64 { self.size } + + fn allow_reuse(&self) -> bool { + // We can't re-use buffers that were mapped at creation since we don't know if the user + // unmapped the buffer. + // We could try to figure it out, but mapped-at-creation buffers should only be used by one of the dedicated allocators anyways! + !self.mapped_at_creation + } } #[derive(Default)] @@ -39,7 +54,8 @@ pub struct GpuBufferPool { impl GpuBufferPool { /// Returns a reference-counted gpu buffer that is currently unused. - /// Once ownership is given up, the buffer may be reclaimed in future frames. + /// Once ownership is given up, the buffer may be reclaimed in future frames + /// unless `BufferDesc::mapped_at_creation` was true. /// /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible /// either manually or using a higher level allocator. @@ -49,7 +65,7 @@ impl GpuBufferPool { label: desc.label.get(), size: desc.size, usage: desc.usage, - mapped_at_creation: false, + mapped_at_creation: desc.mapped_at_creation, }) }) } diff --git a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs index 4d10fcc9f839..6da0f18b575a 100644 --- a/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/dynamic_resource_pool.rs @@ -12,17 +12,24 @@ use smallvec::SmallVec; use super::resource::PoolError; -pub trait SizedResourceDesc { +pub trait DynamicResourcesDesc { fn resource_size_in_bytes(&self) -> u64; + + /// If true, a unused resources will be kept around for while and then re-used in following frames. + /// If false, it will be destroyed on [`DynamicResourcePool::begin_frame`]. + fn allow_reuse(&self) -> bool; } -pub struct DynamicResource { +pub struct DynamicResource { pub inner: Res, pub creation_desc: Desc, pub handle: Handle, } -impl std::ops::Deref for DynamicResource { +impl std::ops::Deref for DynamicResource +where + Desc: Debug, +{ type Target = Res; #[inline] @@ -35,7 +42,7 @@ impl std::ops::Deref for DynamicResource { type AliveResourceMap = SlotMap>>>; -struct DynamicResourcePoolProtectedState { +struct DynamicResourcePoolProtectedState { /// All currently alive resources. /// We store any ref counted handle we give out in [`DynamicResourcePool::alloc`] here in order to keep it alive. /// Every [`DynamicResourcePool::begin_frame`] we check if the pool is now the only owner of the handle @@ -51,7 +58,7 @@ struct DynamicResourcePoolProtectedState { /// /// Unlike in [`super::static_resource_pool::StaticResourcePool`], a resource can not be uniquely /// identified by its description, as the same description can apply to several different resources. -pub(super) struct DynamicResourcePool { +pub(super) struct DynamicResourcePool { state: RwLock>, current_frame_index: u64, @@ -59,7 +66,10 @@ pub(super) struct DynamicResourcePool { } /// We cannot #derive(Default) as that would require Handle/Desc/Res to implement Default too. -impl Default for DynamicResourcePool { +impl Default for DynamicResourcePool +where + Desc: Debug, +{ fn default() -> Self { Self { state: RwLock::new(DynamicResourcePoolProtectedState { @@ -75,7 +85,7 @@ impl Default for DynamicResourcePool impl DynamicResourcePool where Handle: Key, - Desc: Clone + Eq + Hash + Debug + SizedResourceDesc, + Desc: Clone + Eq + Hash + Debug + DynamicResourcesDesc, { pub fn alloc Res>( &self, @@ -85,24 +95,27 @@ where let mut state = self.state.write(); // First check if we can reclaim a resource we have around from a previous frame. - let inner_resource = - if let Entry::Occupied(mut entry) = state.last_frame_deallocated.entry(desc.clone()) { - re_log::trace!(?desc, "Reclaimed previously discarded resource",); - - let handle = entry.get_mut().pop().unwrap(); - if entry.get().is_empty() { - entry.remove(); + let inner_resource = (|| { + if desc.allow_reuse() { + if let Entry::Occupied(mut entry) = state.last_frame_deallocated.entry(desc.clone()) + { + re_log::trace!(?desc, "Reclaimed previously discarded resource"); + let inner_resource = entry.get_mut().pop().unwrap(); + if entry.get().is_empty() { + entry.remove(); + } + return inner_resource; } - handle + } // Otherwise create a new resource - } else { - let resource = creation_func(desc); - self.total_resource_size_in_bytes.fetch_add( - desc.resource_size_in_bytes(), - std::sync::atomic::Ordering::Relaxed, - ); - resource - }; + re_log::debug!(?desc, "Allocated new resource"); + let inner_resource = creation_func(desc); + self.total_resource_size_in_bytes.fetch_add( + desc.resource_size_in_bytes(), + std::sync::atomic::Ordering::Relaxed, + ); + inner_resource + })(); let handle = state.alive_resources.insert_with_key(|handle| { Some(Arc::new(DynamicResource { @@ -195,6 +208,30 @@ where .load(std::sync::atomic::Ordering::Relaxed) } } +impl Drop for DynamicResourcePool +where + Handle: Key, + Desc: Debug, +{ + fn drop(&mut self) { + // TODO(andreas): We're failing this check currently on re_viewer's shutdown. + // This is primarily the case due to the way we store the render ctx itself and other things on egui-wgpu's paint callback resources + // We shouldn't do this as it makes a whole lot of other things cumbersome. Instead, we should store it directly on the `App` + // where we control the drop order. + + // for (_, alive_resource) in self.state.read().alive_resources.iter() { + // let alive_resource = alive_resource + // .as_ref() + // .expect("Alive resources should never be None"); + // let ref_count = Arc::strong_count(alive_resource); + + // assert!(ref_count == 1, + // "Resource has still {} owner(s) at the time of pool destruction. Description desc was {:?}", + // ref_count - 1, + // &alive_resource.creation_desc); + // } + } +} #[cfg(test)] mod tests { @@ -206,17 +243,21 @@ mod tests { }, }; - use super::{DynamicResourcePool, SizedResourceDesc}; + use super::{DynamicResourcePool, DynamicResourcesDesc}; slotmap::new_key_type! { pub struct ConcreteHandle; } #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct ConcreteResourceDesc(u32); - impl SizedResourceDesc for ConcreteResourceDesc { + impl DynamicResourcesDesc for ConcreteResourceDesc { fn resource_size_in_bytes(&self) -> u64 { 1 } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Debug)] diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs index d4907b73b178..ad8c7f5c0d06 100644 --- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs @@ -3,7 +3,7 @@ use std::hash::Hash; use crate::debug_label::DebugLabel; use super::{ - dynamic_resource_pool::{DynamicResource, DynamicResourcePool, SizedResourceDesc}, + dynamic_resource_pool::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc}, resource::PoolError, }; @@ -46,7 +46,7 @@ pub struct TextureDesc { pub usage: wgpu::TextureUsages, } -impl SizedResourceDesc for TextureDesc { +impl DynamicResourcesDesc for TextureDesc { /// Number of bytes this texture is expected to take. /// /// The actual number might be both bigger (padding) and lower (gpu sided compression). @@ -68,6 +68,10 @@ impl SizedResourceDesc for TextureDesc { size_in_bytes } + + fn allow_reuse(&self) -> bool { + true + } } #[derive(Default)] pub struct GpuTexturePool { diff --git a/crates/re_viewer/src/app.rs b/crates/re_viewer/src/app.rs index 6d01a41916e5..d45afa2376de 100644 --- a/crates/re_viewer/src/app.rs +++ b/crates/re_viewer/src/app.rs @@ -491,6 +491,8 @@ impl eframe::App for App { self.rx.source(), ); } + + render_ctx.before_submit(); } }); diff --git a/crates/re_viewer/src/ui/space_view.rs b/crates/re_viewer/src/ui/space_view.rs index f06bbae25b37..3e9c7d693ff1 100644 --- a/crates/re_viewer/src/ui/space_view.rs +++ b/crates/re_viewer/src/ui/space_view.rs @@ -176,7 +176,7 @@ impl SpaceView { &self.space_path, self.data_blueprint.data_blueprints_projected(), ); - let mut scene = view_spatial::SceneSpatial::default(); + let mut scene = view_spatial::SceneSpatial::new(ctx.render_ctx); scene.load(ctx, &query, &transforms, highlights); self.view_state .ui_spatial(ctx, ui, &self.space_path, scene, self.id, highlights); diff --git a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs index 18166e05d587..84376f4994c2 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/mod.rs +++ b/crates/re_viewer/src/ui/view_spatial/scene/mod.rs @@ -115,7 +115,6 @@ pub struct SceneSpatialUiData { pub images: Vec, } -#[derive(Default)] pub struct SceneSpatial { pub annotation_map: AnnotationMap, pub primitives: SceneSpatialPrimitives, @@ -146,6 +145,17 @@ fn instance_path_hash_if_interactive( pub type Keypoints = HashMap<(ClassId, i64), HashMap>; impl SceneSpatial { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + annotation_map: Default::default(), + primitives: SceneSpatialPrimitives::new(re_ctx), + ui: Default::default(), + num_logged_2d_objects: Default::default(), + num_logged_3d_objects: Default::default(), + space_cameras: Default::default(), + } + } + /// Loads all 3D objects into the scene according to the given query. pub(crate) fn load( &mut self, diff --git a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs index 56b8e323e58c..22a1fd28dd31 100644 --- a/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs +++ b/crates/re_viewer/src/ui/view_spatial/scene/primitives.rs @@ -10,7 +10,6 @@ use super::MeshSource; /// TODO(andreas): Right now we're using `re_renderer` data structures for reading (bounding box & picking). /// In the future, this will be more limited as we're going to gpu staging data as soon as possible /// which is very slow to read. See [#594](https://github.com/rerun-io/rerun/pull/594) -#[derive(Default)] pub struct SceneSpatialPrimitives { /// Estimated bounding box of all data in scene coordinates. Accumulated. pub(super) bounding_box: macaw::BoundingBox, @@ -31,6 +30,17 @@ const AXIS_COLOR_Y: Color32 = Color32::from_rgb(0, 240, 0); const AXIS_COLOR_Z: Color32 = Color32::from_rgb(80, 80, 255); impl SceneSpatialPrimitives { + pub fn new(re_ctx: &mut re_renderer::RenderContext) -> Self { + Self { + bounding_box: macaw::BoundingBox::nothing(), + textured_rectangles_ids: Default::default(), + textured_rectangles: Default::default(), + line_strips: Default::default(), + points: PointCloudBuilder::new(re_ctx), + meshes: Default::default(), + } + } + /// bounding box covering the rendered scene pub fn bounding_box(&self) -> macaw::BoundingBox { self.bounding_box diff --git a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs index 172341ac30b6..23be71806264 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_2d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_2d.rs @@ -8,7 +8,11 @@ use re_data_store::EntityPath; use re_log_types::component_types::TensorTrait; use re_renderer::view_builder::TargetConfiguration; -use super::{eye::Eye, scene::AdditionalPickingInfo, ViewSpatialState}; +use super::{ + eye::Eye, + scene::{AdditionalPickingInfo, SceneSpatialUiData}, + ViewSpatialState, +}; use crate::{ misc::{HoveredSpace, Item, SelectionHighlight, SpaceViewHighlights}, ui::{ @@ -301,43 +305,13 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ - // Draw a re_renderer driven view. - // Camera & projection are configured to ingest space coordinates directly. - { - crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); - - let Ok(target_config) = setup_target_config( - &painter, - space_from_ui, - space_from_pixel, - &space.to_string(), - state.auto_size_config(response.rect.size()), - ) else { - return response; - }; - - let Ok(callback) = create_scene_paint_callback( - ctx.render_ctx, - target_config, painter.clip_rect(), - &scene.primitives, - &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), - ) else { - return response; - }; - - painter.add(callback); - } - - // ------------------------------------------------------------------------ - - // Add egui driven labels on top of re_renderer content. - painter.extend(create_labels( - &mut scene, + let label_shapes = create_labels( + &mut scene.ui, ui_from_space, space_from_ui, parent_ui, highlights, - )); + ); // ------------------------------------------------------------------------ @@ -452,6 +426,33 @@ fn view_2d_scrollable( // ------------------------------------------------------------------------ + // Draw a re_renderer driven view. + // Camera & projection are configured to ingest space coordinates directly. + { + crate::profile_scope!("build command buffer for 2D view {}", space.to_string()); + + let Ok(target_config) = setup_target_config( + &painter, + space_from_ui, + space_from_pixel, + &space.to_string(), + state.auto_size_config(response.rect.size()), + ) else { + return response; + }; + + let Ok(callback) = create_scene_paint_callback( + ctx.render_ctx, + target_config, painter.clip_rect(), + scene.primitives, + &ScreenBackground::ClearColor(parent_ui.visuals().extreme_bg_color.into()), + ) else { + return response; + }; + + painter.add(callback); + } + project_onto_other_spaces(ctx, space, &response, &space_from_ui, depth_at_pointer); painter.extend(show_projections_from_3d_space( ctx, @@ -460,11 +461,14 @@ fn view_2d_scrollable( &ui_from_space, )); + // Add egui driven labels on top of re_renderer content. + painter.extend(label_shapes); + response } fn create_labels( - scene: &mut SceneSpatial, + scene_ui: &mut SceneSpatialUiData, ui_from_space: RectTransform, space_from_ui: RectTransform, parent_ui: &mut egui::Ui, @@ -472,9 +476,9 @@ fn create_labels( ) -> Vec { crate::profile_function!(); - let mut label_shapes = Vec::with_capacity(scene.ui.labels_2d.len() * 2); + let mut label_shapes = Vec::with_capacity(scene_ui.labels_2d.len() * 2); - for label in &scene.ui.labels_2d { + for label in &scene_ui.labels_2d { let (wrap_width, text_anchor_pos) = match label.target { Label2DTarget::Rect(rect) => { let rect_in_ui = ui_from_space.transform_rect(rect); @@ -534,8 +538,7 @@ fn create_labels( label_shapes.push(Shape::rect_filled(bg_rect, 3.0, fill_color)); label_shapes.push(Shape::galley(text_rect.center_top(), galley)); - scene - .ui + scene_ui .pickable_ui_rects .push((space_from_ui.transform_rect(bg_rect), label.labled_instance)); } diff --git a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs index b4d1ed2a2359..88590aba0cb7 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_3d.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_3d.rs @@ -473,7 +473,7 @@ pub fn view_3d( ui, eye, rect, - &scene, + scene, ctx.render_ctx, &space.to_string(), state.auto_size_config(rect.size()), @@ -484,7 +484,7 @@ fn paint_view( ui: &mut egui::Ui, eye: Eye, rect: egui::Rect, - scene: &SceneSpatial, + scene: SceneSpatial, render_ctx: &mut RenderContext, name: &str, auto_size_config: re_renderer::AutoSizeConfig, @@ -516,7 +516,7 @@ fn paint_view( render_ctx, target_config, rect, - &scene.primitives, &ScreenBackground::GenericSkybox) + scene.primitives, &ScreenBackground::GenericSkybox) else { return; }; diff --git a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs index 2c9201982add..30a2758bb653 100644 --- a/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs +++ b/crates/re_viewer/src/ui/view_spatial/ui_renderer_bridge.rs @@ -18,7 +18,7 @@ pub fn create_scene_paint_callback( render_ctx: &mut RenderContext, target_config: TargetConfiguration, clip_rect: egui::Rect, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result { let pixels_from_point = target_config.pixels_from_point; @@ -41,7 +41,7 @@ pub enum ScreenBackground { fn create_and_fill_view_builder( render_ctx: &mut RenderContext, target_config: TargetConfiguration, - primitives: &SceneSpatialPrimitives, + primitives: SceneSpatialPrimitives, background: &ScreenBackground, ) -> anyhow::Result<(wgpu::CommandBuffer, ViewBuilder)> { let mut view_builder = ViewBuilder::default(); @@ -85,6 +85,7 @@ fn renderer_paint_callback( let command_buffer = std::sync::Arc::new(Mutex::new(Some(command_buffer))); let composition_view_builder_map = render_ctx + .active_frame .per_frame_data_helper .entry::() .or_insert_with(Default::default); @@ -111,7 +112,10 @@ fn renderer_paint_callback( //let clip_rect = info.clip_rect_in_pixels(); let ctx = paint_callback_resources.get::().unwrap(); - ctx.per_frame_data_helper.get::().unwrap()[view_builder_handle] + ctx.active_frame + .per_frame_data_helper + .get::() + .unwrap()[view_builder_handle] .composite(ctx, render_pass, screen_position) .expect("Failed compositing view builder with main target."); }),