From aec0e87f6824398f873ea0c3d6b296692d8e61a0 Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Mon, 27 Feb 2023 10:05:09 +0100 Subject: [PATCH] [re_renderer] Uniform buffer utility using `CpuWriteGpuReadBelt` (#1400) * utility function for uniform buffer from struct * Uniform buffers are now compile time forced to be 256 bytes aligned * CpuWriteGpuReadBelt no longer takes mutable buffer pool * Renderers and frame_global_command_encoder are now behind locks --------- Co-authored-by: Clement Rey --- .../src/allocator/cpu_write_gpu_read_belt.rs | 11 +- crates/re_renderer/src/allocator/mod.rs | 4 + .../src/allocator/uniform_buffer_fill.rs | 102 ++++++++++++++++++ crates/re_renderer/src/context.rs | 66 +++++++----- crates/re_renderer/src/global_bindings.rs | 15 +-- crates/re_renderer/src/mesh.rs | 3 +- crates/re_renderer/src/point_cloud_builder.rs | 4 +- crates/re_renderer/src/renderer/compositor.rs | 3 +- .../src/renderer/generic_skybox.rs | 2 +- crates/re_renderer/src/renderer/lines.rs | 64 ++++------- .../re_renderer/src/renderer/mesh_renderer.rs | 2 +- .../re_renderer/src/renderer/point_cloud.rs | 74 +++++-------- crates/re_renderer/src/renderer/rectangles.rs | 92 +++++----------- .../re_renderer/src/renderer/test_triangle.rs | 2 +- crates/re_renderer/src/view_builder.rs | 49 +++------ crates/re_renderer/src/wgpu_buffer_types.rs | 76 ++++++++++--- .../src/wgpu_resources/bind_group_pool.rs | 2 +- .../src/wgpu_resources/buffer_pool.rs | 2 +- .../src/wgpu_resources/texture_pool.rs | 2 +- 19 files changed, 322 insertions(+), 253 deletions(-) create mode 100644 crates/re_renderer/src/allocator/uniform_buffer_fill.rs diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs index e85b69596b16..0af9ad8d2587 100644 --- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs +++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs @@ -39,6 +39,7 @@ where /// Do *not* make this public as we need to guarantee that the memory is *never* read from! #[inline(always)] fn as_slice(&mut self) -> &mut [T] { + // TODO(andreas): Is this access slow given that it internally goes through a trait interface? Should we keep the pointer around? &mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()] } @@ -47,13 +48,13 @@ where /// Panics if the data no longer fits into the buffer. #[inline] pub fn extend_from_slice(&mut self, elements: &[T]) { - self.as_slice().copy_from_slice(elements); + self.as_slice()[..elements.len()].copy_from_slice(elements); self.unwritten_element_range.start += elements.len(); } /// Pushes several elements into the buffer. /// - /// Panics if the data no longer fits into the buffer. + /// Extends until either running out of space or elements. #[inline] pub fn extend(&mut self, elements: impl Iterator) { let mut num_elements = 0; @@ -68,8 +69,8 @@ where /// /// Panics if the data no longer fits into the buffer. #[inline] - pub fn push(&mut self, element: &T) { - self.as_slice()[0] = *element; + pub fn push(&mut self, element: T) { + self.as_slice()[0] = element; self.unwritten_element_range.start += 1; } @@ -285,7 +286,7 @@ impl CpuWriteGpuReadBelt { pub fn allocate( &mut self, device: &wgpu::Device, - buffer_pool: &mut GpuBufferPool, + buffer_pool: &GpuBufferPool, num_elements: usize, ) -> CpuWriteGpuReadBuffer { // Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string. 
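For reviewers unfamiliar with the belt: the buffer hands out a forward-only, write-only cursor over mapped memory, which is why `extend_from_slice` now copies into a sub-slice sized to the input and why `push` takes the element by value. A minimal, std-only sketch of that cursor pattern (the `WriteCursor` name is illustrative; the real type casts a mapped `wgpu::BufferViewMut` with bytemuck instead of owning a `Vec<T>`):

use std::ops::Range;

/// Illustrative stand-in for `CpuWriteGpuReadBuffer<T>`.
struct WriteCursor<T: Copy> {
    data: Vec<T>,
    /// Element range that has not been written yet; only ever shrinks from the front.
    unwritten_element_range: Range<usize>,
}

impl<T: Copy + Default> WriteCursor<T> {
    fn with_capacity(num_elements: usize) -> Self {
        Self {
            data: vec![T::default(); num_elements],
            unwritten_element_range: 0..num_elements,
        }
    }
}

impl<T: Copy> WriteCursor<T> {
    /// Write-only view of the remaining space. Never exposed publicly,
    /// since the (write-combined) memory must not be read back.
    fn as_slice(&mut self) -> &mut [T] {
        &mut self.data[self.unwritten_element_range.clone()]
    }

    /// Copies `elements` to the front of the remaining space.
    /// Panics if they no longer fit.
    fn extend_from_slice(&mut self, elements: &[T]) {
        self.as_slice()[..elements.len()].copy_from_slice(elements);
        self.unwritten_element_range.start += elements.len();
    }

    /// Writes a single element, taking it by value.
    fn push(&mut self, element: T) {
        self.as_slice()[0] = element;
        self.unwritten_element_range.start += 1;
    }
}

fn main() {
    let mut cursor = WriteCursor::<f32>::with_capacity(4);
    cursor.extend_from_slice(&[1.0, 2.0]);
    cursor.push(3.0);
    assert_eq!(cursor.unwritten_element_range, 3..4);
}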
diff --git a/crates/re_renderer/src/allocator/mod.rs b/crates/re_renderer/src/allocator/mod.rs index cb4ab783fd09..2c0523348dee 100644 --- a/crates/re_renderer/src/allocator/mod.rs +++ b/crates/re_renderer/src/allocator/mod.rs @@ -4,5 +4,9 @@ //! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources. mod cpu_write_gpu_read_belt; +mod uniform_buffer_fill; pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; +pub use uniform_buffer_fill::{ + create_and_fill_uniform_buffer, create_and_fill_uniform_buffer_batch, +}; diff --git a/crates/re_renderer/src/allocator/uniform_buffer_fill.rs b/crates/re_renderer/src/allocator/uniform_buffer_fill.rs new file mode 100644 index 000000000000..4239dee69825 --- /dev/null +++ b/crates/re_renderer/src/allocator/uniform_buffer_fill.rs @@ -0,0 +1,102 @@ +pub use super::cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer}; + +use crate::{wgpu_resources::BindGroupEntry, DebugLabel, RenderContext}; + +struct UniformBufferAlignmentCheck { + pub _marker: std::marker::PhantomData, +} +impl UniformBufferAlignmentCheck { + /// wgpu requires uniform buffers to be aligned to up to 256 bytes. + /// + /// This is a property of device limits, see [`WebGPU` specification](https://www.w3.org/TR/webgpu/#limits). + /// Implementations are allowed to advertise a lower alignment requirement, however + /// 256 bytes is fairly common even in modern hardware and is even hardcoded for DX12. + /// + /// Technically this is only relevant when sub-allocating a buffer, as the wgpu backend + /// is internally forced to make sure that the start of any [`wgpu::Buffer`] with [`wgpu::BufferUsages::UNIFORM`] usage + /// has this alignment. Practically, ensuring this alignment everywhere + /// + /// Alternatively to enforcing this alignment on the type we could: + /// * only align on the gpu buffer + /// -> causes more fine grained `copy_buffer_to_buffer` calls on the gpu encoder + /// * only align on the [`CpuWriteGpuReadBuffer`] & gpu buffer + /// -> causes more complicated offset computation on [`CpuWriteGpuReadBuffer`] as well as either + /// holes at padding (-> undefined values & slow for write combined!) or complicated nulling of padding + /// + /// About the [`bytemuck::Pod`] requirement (dragged in by [`CpuWriteGpuReadBuffer`]): + /// [`bytemuck::Pod`] forces us to be explicit about padding as it doesn't allow invisible padding bytes! + /// We could drop this and thus make it easier to define uniform buffer types. + /// But this leads to more unsafe code, harder to avoid holes in write combined memory access + /// and potentially undefined values in the padding bytes on GPU. + const CHECK: () = assert!( + std::mem::align_of::() >= 256, + "Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`" + ); +} +/// Utility for fast & efficient creation of uniform buffers from a series of structs. +/// +/// For subsequent frames, this will automatically not allocate any resources (thanks to our buffer pooling mechanism). +/// +/// TODO(#1383): We could do this on a more complex stack allocator. 
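The alignment guard above boils down to a const-evaluated assertion that fires per concrete type once the associated const is referenced, turning a missing `#[repr(align(256))]` into a compile error instead of a runtime panic. A stripped-down, std-only sketch of the pattern (hypothetical `AlignCheck` and `upload_uniform_buffer` names):

struct AlignCheck<T> {
    _marker: std::marker::PhantomData<T>,
}

impl<T> AlignCheck<T> {
    // Evaluated at compile time for each concrete `T` the function below is used with.
    const CHECK: () = assert!(
        std::mem::align_of::<T>() >= 256,
        "Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`"
    );
}

fn upload_uniform_buffer<T>(_content: &T) {
    // Mentioning the associated const forces its evaluation (and thus the assert).
    #[allow(clippy::let_unit_value)]
    let _ = AlignCheck::<T>::CHECK;
    // ... the actual upload would happen here ...
}

#[repr(C, align(256))]
struct MyUniforms {
    scale: f32,
}

fn main() {
    upload_uniform_buffer(&MyUniforms { scale: 1.0 });
    // With a type that is not 256-byte aligned, the call above would fail to compile.
}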
+pub fn create_and_fill_uniform_buffer_batch( + ctx: &RenderContext, + label: DebugLabel, + content: impl ExactSizeIterator, +) -> Vec { + #[allow(clippy::let_unit_value)] + let _ = UniformBufferAlignmentCheck::::CHECK; + + let num_buffers = content.len() as u64; + let element_size = std::mem::size_of::() as u64; + + assert!( + element_size > 0, + "Uniform buffer need to have a non-zero size" + ); + + let buffer = ctx.gpu_resources.buffers.alloc( + &ctx.device, + &crate::wgpu_resources::BufferDesc { + label, + size: num_buffers * element_size, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }, + ); + + let mut staging_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( + &ctx.device, + &ctx.gpu_resources.buffers, + num_buffers as _, + ); + staging_buffer.extend(content); + staging_buffer.copy_to_buffer( + ctx.active_frame + .frame_global_command_encoder + .lock() + .get_or_create(&ctx.device), + &buffer, + 0, + ); + + (0..num_buffers) + .into_iter() + .map(|i| BindGroupEntry::Buffer { + handle: buffer.handle, + offset: i * element_size, + size: Some(std::num::NonZeroU64::new(element_size).unwrap()), + }) + .collect() +} + +/// See [`create_and_fill_uniform_buffer`]. +pub fn create_and_fill_uniform_buffer( + ctx: &mut RenderContext, + label: DebugLabel, + content: T, +) -> BindGroupEntry { + create_and_fill_uniform_buffer_batch(ctx, label, std::iter::once(content)) + .into_iter() + .next() + .unwrap() +} diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs index 150d51320a5d..6a985c714277 100644 --- a/crates/re_renderer/src/context.rs +++ b/crates/re_renderer/src/context.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use parking_lot::Mutex; +use parking_lot::{Mutex, RwLock}; use type_map::concurrent::{self, TypeMap}; use crate::{ @@ -20,7 +20,7 @@ pub struct RenderContext { pub queue: Arc, pub(crate) shared_renderer_data: SharedRendererData, - pub(crate) renderers: Renderers, + pub(crate) renderers: RwLock, pub(crate) resolver: RecommendedFileResolver, #[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build pub(crate) err_tracker: std::sync::Arc, @@ -155,14 +155,14 @@ impl RenderContext { }; let mut resolver = crate::new_recommended_file_resolver(); - let mut renderers = Renderers { + let mut renderers = RwLock::new(Renderers { renderers: TypeMap::new(), - }; + }); let mesh_manager = MeshManager::new( device.clone(), queue.clone(), - renderers.get_or_create( + renderers.get_mut().get_or_create( &shared_renderer_data, &mut gpu_resources, &device, @@ -193,8 +193,11 @@ impl RenderContext { inflight_queue_submissions: Vec::new(), - active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(), - frame_index: 0 } + active_frame: ActiveFrameContext { + frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)), + per_frame_data_helper: TypeMap::new(), + frame_index: 0 + } } } @@ -229,7 +232,7 @@ impl RenderContext { self.cpu_write_gpu_read_belt.lock().after_queue_submit(); self.active_frame = ActiveFrameContext { - frame_global_command_encoder: None, + frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)), frame_index: self.active_frame.frame_index + 1, per_frame_data_helper: TypeMap::new(), }; @@ -303,7 +306,13 @@ impl RenderContext { // Unmap all staging buffers. 
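The batch helper packs all structs back-to-back into one GPU buffer and returns one buffer binding per element; the offsets stay legal for uniform binding because `size_of::<T>()` of a `#[repr(C, align(256))]` type is itself a multiple of 256. A self-contained sketch of just that bookkeeping (the `BufferBinding` type stands in for re_renderer's `BindGroupEntry::Buffer`):

use std::num::NonZeroU64;

/// Illustrative stand-in for re_renderer's `BindGroupEntry::Buffer` variant.
struct BufferBinding {
    offset: u64,
    size: Option<NonZeroU64>,
}

/// Computes one binding per uniform-buffer element packed back-to-back.
fn batch_bindings<T>(num_elements: u64) -> Vec<BufferBinding> {
    let element_size = std::mem::size_of::<T>() as u64;
    assert!(element_size > 0, "Uniform buffers need to have a non-zero size");
    // `repr(C, align(256))` makes the size a multiple of 256, so every offset
    // satisfies wgpu's (up to) 256-byte uniform-buffer offset alignment.
    assert_eq!(element_size % 256, 0);

    (0..num_elements)
        .map(|i| BufferBinding {
            offset: i * element_size,
            size: NonZeroU64::new(element_size),
        })
        .collect()
}

#[repr(C, align(256))]
struct BatchUniforms {
    world_from_obj: [f32; 16],
}

fn main() {
    let bindings = batch_bindings::<BatchUniforms>(3);
    assert_eq!(bindings[1].offset, 256);
    assert_eq!(bindings[2].offset, 512);
    assert_eq!(bindings[0].size, NonZeroU64::new(256));
}

In the patch, these per-element bindings are what get spliced into each batch's `BindGroupDesc` in `lines.rs`, `point_cloud.rs`, and `rectangles.rs`.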
self.cpu_write_gpu_read_belt.lock().before_queue_submit(); - if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() { + if let Some(command_encoder) = self + .active_frame + .frame_global_command_encoder + .lock() + .0 + .take() + { let command_buffer = command_encoder.finish(); // TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe. @@ -318,18 +327,38 @@ impl Drop for RenderContext { fn drop(&mut self) { // Close global command encoder if there is any pending. // Not doing so before shutdown causes errors. - if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() { + if let Some(encoder) = self + .active_frame + .frame_global_command_encoder + .lock() + .0 + .take() + { encoder.finish(); } } } +pub struct FrameGlobalCommandEncoder(Option); + +impl FrameGlobalCommandEncoder { + /// Gets or creates a command encoder that runs before all view builder encoder. + pub fn get_or_create(&mut self, device: &wgpu::Device) -> &mut wgpu::CommandEncoder { + self.0.get_or_insert_with(|| { + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") + .get(), + }) + }) + } +} + pub struct ActiveFrameContext { /// Command encoder for all commands that should go in before view builder are submitted. /// /// This should be used for any gpu copy operation outside of a renderer or view builder. /// (i.e. typically in [`crate::renderer::DrawData`] creation!) - frame_global_command_encoder: Option, + pub frame_global_command_encoder: Mutex, /// Utility type map that will be cleared every frame. pub per_frame_data_helper: TypeMap, @@ -338,21 +367,6 @@ pub struct ActiveFrameContext { frame_index: u64, } -impl ActiveFrameContext { - /// Gets or creates a command encoder that runs before all view builder encoder. - pub fn frame_global_command_encoder( - &mut self, - device: &wgpu::Device, - ) -> &mut wgpu::CommandEncoder { - self.frame_global_command_encoder.get_or_insert_with(|| { - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { - label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder") - .get(), - }) - }) - } -} - /// Gets allocation size for a uniform buffer padded in a way that multiple can be put in a single wgpu buffer. /// /// TODO(andreas): Once we have higher level buffer allocators this should be handled there. diff --git a/crates/re_renderer/src/global_bindings.rs b/crates/re_renderer/src/global_bindings.rs index bf7729adcf0a..7ac1b2712a94 100644 --- a/crates/re_renderer/src/global_bindings.rs +++ b/crates/re_renderer/src/global_bindings.rs @@ -2,7 +2,7 @@ use crate::{ wgpu_buffer_types, wgpu_resources::{ BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, - GpuBuffer, GpuSamplerHandle, SamplerDesc, WgpuResourcePools, + GpuSamplerHandle, SamplerDesc, WgpuResourcePools, }, }; @@ -13,7 +13,7 @@ use smallvec::smallvec; /// /// Contains information that is constant for a single frame like camera. /// (does not contain information that is special to a particular renderer) -#[repr(C)] +#[repr(C, align(256))] #[derive(Clone, Copy, Zeroable, Pod)] pub(crate) struct FrameUniformBuffer { pub view_from_world: wgpu_buffer_types::Mat4x3, @@ -46,8 +46,7 @@ pub(crate) struct FrameUniformBuffer { pub auto_size_lines: f32, /// Factor used to compute depth offsets, see `depth_offset.wgsl`. 
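Putting the lazily created encoder behind a `Mutex` is what allows recording pre-view copies from a shared `&RenderContext`. A reduced sketch of the `get_or_create`/`take` life cycle, assuming `parking_lot` (as the crate already does) and a placeholder `Encoder` instead of `wgpu::CommandEncoder`:

use parking_lot::Mutex;

/// Placeholder for `wgpu::CommandEncoder` in this sketch.
struct Encoder {
    label: &'static str,
}

impl Encoder {
    fn finish(self) -> &'static str {
        self.label // a real encoder would return a wgpu::CommandBuffer here
    }
}

/// Mirrors the newtype from the patch: an encoder that is created on first use.
struct FrameGlobalCommandEncoder(Option<Encoder>);

impl FrameGlobalCommandEncoder {
    fn get_or_create(&mut self) -> &mut Encoder {
        self.0.get_or_insert_with(|| Encoder {
            label: "global \"before viewbuilder\" command encoder",
        })
    }
}

fn main() {
    // A shared `&RenderContext` is enough to record into it, thanks to the mutex.
    let encoder_slot = Mutex::new(FrameGlobalCommandEncoder(None));

    // The first caller creates the encoder, later callers reuse it:
    let _ = encoder_slot.lock().get_or_create().label;

    // At submission time the frame takes the encoder out and finishes it, if any:
    if let Some(encoder) = encoder_slot.lock().0.take() {
        let _command_buffer = encoder.finish();
    }
}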
- pub depth_offset_factor: f32, - pub _padding: glam::Vec3, + pub depth_offset_factor: wgpu_buffer_types::F32RowPadded, } pub(crate) struct GlobalBindings { @@ -128,7 +127,7 @@ impl GlobalBindings { &self, pools: &mut WgpuResourcePools, device: &wgpu::Device, - frame_uniform_buffer: &GpuBuffer, + frame_uniform_buffer_binding: BindGroupEntry, ) -> GpuBindGroup { pools.bind_groups.alloc( device, @@ -136,11 +135,7 @@ impl GlobalBindings { &BindGroupDesc { label: "global bind group".into(), entries: smallvec![ - BindGroupEntry::Buffer { - handle: frame_uniform_buffer.handle, - offset: 0, - size: None, - }, + frame_uniform_buffer_binding, BindGroupEntry::Sampler(self.nearest_neighbor_sampler), BindGroupEntry::Sampler(self.trilinear_sampler), ], diff --git a/crates/re_renderer/src/mesh.rs b/crates/re_renderer/src/mesh.rs index 7f30c8fccb92..14a124bebaa2 100644 --- a/crates/re_renderer/src/mesh.rs +++ b/crates/re_renderer/src/mesh.rs @@ -137,8 +137,9 @@ pub(crate) mod gpu_data { } impl GpuMesh { + // TODO(andreas): Take read-only context here and make uploads happen on staging belt. pub fn new( - pools: &mut WgpuResourcePools, + pools: &WgpuResourcePools, texture_manager: &TextureManager2D, mesh_bound_group_layout: GpuBindGroupLayoutHandle, device: &wgpu::Device, diff --git a/crates/re_renderer/src/point_cloud_builder.rs b/crates/re_renderer/src/point_cloud_builder.rs index d13eb9105220..b3c4028194e3 100644 --- a/crates/re_renderer/src/point_cloud_builder.rs +++ b/crates/re_renderer/src/point_cloud_builder.rs @@ -28,7 +28,7 @@ where // TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance! let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::( &ctx.device, - &mut ctx.gpu_resources.buffers, + &ctx.gpu_resources.buffers, PointCloudDrawData::MAX_NUM_POINTS, ); @@ -286,7 +286,7 @@ where /// This mustn't call this more than once. 
#[inline] pub fn color(self, color: Color32) -> Self { - self.color.push(&color); + self.color.push(color); self } diff --git a/crates/re_renderer/src/renderer/compositor.rs b/crates/re_renderer/src/renderer/compositor.rs index 6ceae9fbd2e4..187ba68b05ed 100644 --- a/crates/re_renderer/src/renderer/compositor.rs +++ b/crates/re_renderer/src/renderer/compositor.rs @@ -31,7 +31,8 @@ impl DrawData for CompositorDrawData { impl CompositorDrawData { pub fn new(ctx: &mut RenderContext, target: &GpuTexture) -> Self { let pools = &mut ctx.gpu_resources; - let compositor = ctx.renderers.get_or_create::<_, Compositor>( + let mut renderers = ctx.renderers.write(); + let compositor = renderers.get_or_create::<_, Compositor>( &ctx.shared_renderer_data, pools, &ctx.device, diff --git a/crates/re_renderer/src/renderer/generic_skybox.rs b/crates/re_renderer/src/renderer/generic_skybox.rs index c0efe78817aa..1e84152f8fbf 100644 --- a/crates/re_renderer/src/renderer/generic_skybox.rs +++ b/crates/re_renderer/src/renderer/generic_skybox.rs @@ -29,7 +29,7 @@ impl DrawData for GenericSkyboxDrawData { impl GenericSkyboxDrawData { pub fn new(ctx: &mut RenderContext) -> Self { - ctx.renderers.get_or_create::<_, GenericSkybox>( + ctx.renderers.write().get_or_create::<_, GenericSkybox>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/renderer/lines.rs b/crates/re_renderer/src/renderer/lines.rs index f8237709ebb6..29dc902da0c9 100644 --- a/crates/re_renderer/src/renderer/lines.rs +++ b/crates/re_renderer/src/renderer/lines.rs @@ -114,14 +114,14 @@ use bytemuck::Zeroable; use smallvec::smallvec; use crate::{ - context::uniform_buffer_allocation_size, + allocator::create_and_fill_uniform_buffer_batch, include_file, size::Size, view_builder::ViewBuilder, wgpu_resources::{ - BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, BufferDesc, GpuBindGroup, - GpuBindGroupLayoutHandle, GpuRenderPipelineHandle, PipelineLayoutDesc, PoolError, - RenderPipelineDesc, ShaderModuleDesc, TextureDesc, + BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, + GpuRenderPipelineHandle, PipelineLayoutDesc, PoolError, RenderPipelineDesc, + ShaderModuleDesc, TextureDesc, }, Color32, DebugLabel, }; @@ -160,10 +160,12 @@ pub mod gpu_data { static_assertions::assert_eq_size!(LineStripInfo, [u32; 2]); /// Uniform buffer that changes for every batch of line strips. 
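With `renderers` behind an `RwLock`, draw-data creation takes a short write lock to lazily instantiate a renderer, while drawing later only needs a read lock whose guard must be bound to a local before calling `get`. A simplified sketch of that access pattern over a plain `HashMap` type map (illustrative names; the real code uses `type_map` and threads GPU resources through `get_or_create`):

use std::any::{Any, TypeId};
use std::collections::HashMap;
use parking_lot::RwLock;

/// Simplified stand-in for re_renderer's `Renderers` type map.
#[derive(Default)]
struct Renderers {
    renderers: HashMap<TypeId, Box<dyn Any + Send + Sync>>,
}

impl Renderers {
    fn get_or_create<R: Default + Any + Send + Sync>(&mut self) -> &R {
        self.renderers
            .entry(TypeId::of::<R>())
            .or_insert_with(|| Box::new(R::default()))
            .downcast_ref::<R>()
            .unwrap()
    }

    fn get<R: Any>(&self) -> Option<&R> {
        self.renderers
            .get(&TypeId::of::<R>())
            .and_then(|r| r.downcast_ref())
    }
}

#[derive(Default)]
struct LineRenderer;

fn main() {
    let renderers = RwLock::new(Renderers::default());

    // Draw-data creation: hold the write lock only while lazily creating.
    {
        let mut renderers_write = renderers.write();
        let _line_renderer: &LineRenderer = renderers_write.get_or_create();
    }

    // Drawing: a read lock suffices. Bind the guard to a local first, so the
    // borrow of the renderer can outlive the `read()` temporary.
    let renderers_read = renderers.read();
    let _line_renderer = renderers_read.get::<LineRenderer>().expect("renderer exists");
}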
- #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct BatchUniformBuffer { pub world_from_obj: wgpu_buffer_types::Mat4, + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 4], } } @@ -299,7 +301,8 @@ impl LineDrawData { strips: &[LineStripInfo], batches: &[LineBatchInfo], ) -> Result { - let line_renderer = ctx.renderers.get_or_create::<_, LineRenderer>( + let mut renderers = ctx.renderers.write(); + let line_renderer = renderers.get_or_create::<_, LineRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -494,52 +497,29 @@ impl LineDrawData { // Process batches let mut batches_internal = Vec::with_capacity(batches.len()); { - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * batches.len() as u64; - let uniform_buffers = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "lines batch uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "lines batch uniform buffers".into(), + batches + .iter() + .map(|batch_info| gpu_data::BatchUniformBuffer { + world_from_obj: batch_info.world_from_obj.into(), + end_padding: Default::default(), + }), ); - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffers, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - let mut start_vertex_for_next_batch = 0; - for (i, batch_info) in batches.iter().enumerate() { - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! 
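The explicit `end_padding` rows make `size_of::<BatchUniformBuffer>()` exactly 256 bytes, which keeps `bytemuck::Pod` happy (no implicit padding) and makes every element offset in the batched buffer a valid uniform-buffer offset. A sketch of that row arithmetic with stand-in field types, assuming a bytemuck version whose `Pod` derive accepts `repr(align)` (as this patch relies on):

use bytemuck::{Pod, Zeroable};

/// One 16-byte "row" of pure padding (stand-in for wgpu_buffer_types::PaddingRow).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod, Default)]
struct PaddingRow {
    p: [f32; 4],
}

/// Stand-in for the lines `BatchUniformBuffer`: a 4x4 matrix occupies 4 rows,
/// so 16 - 4 = 12 padding rows bring the total to 16 rows = 256 bytes.
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct BatchUniformBuffer {
    world_from_obj: [[f32; 4]; 4], // stand-in for wgpu_buffer_types::Mat4
    end_padding: [PaddingRow; 16 - 4],
}

fn main() {
    assert_eq!(std::mem::size_of::<BatchUniformBuffer>(), 256);
    assert_eq!(std::mem::align_of::<BatchUniformBuffer>(), 256);

    let ubo = BatchUniformBuffer {
        world_from_obj: [[0.0; 4]; 4],
        end_padding: Default::default(),
    };
    // `Pod` lets the whole struct be copied byte-wise into the staging belt.
    assert_eq!(bytemuck::bytes_of(&ubo).len(), 256);
}

The same scheme gives `16 - 5` padding rows for the point cloud batch below, which additionally carries a flags row.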
- let offset = i * allocation_size_per_uniform_buffer as usize; + for (batch_info, uniform_buffer_binding) in + batches.iter().zip(uniform_buffer_bindings.into_iter()) + { let line_vertex_range_end = (start_vertex_for_next_batch + batch_info.line_vertex_count) .min(Self::MAX_NUM_VERTICES as u32); - staging_buffer - [offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::BatchUniformBuffer { - world_from_obj: batch_info.world_from_obj.into(), - })); - let bind_group = ctx.gpu_resources.bind_groups.alloc( &ctx.device, &BindGroupDesc { label: batch_info.label.clone(), - entries: smallvec![BindGroupEntry::Buffer { - handle: uniform_buffers.handle, - offset: offset as _, - size: NonZeroU64::new( - std::mem::size_of::() as _ - ), - }], + entries: smallvec![uniform_buffer_binding], layout: line_renderer.bind_group_layout_batch, }, &ctx.gpu_resources.bind_group_layouts, diff --git a/crates/re_renderer/src/renderer/mesh_renderer.rs b/crates/re_renderer/src/renderer/mesh_renderer.rs index 14559b9f69e6..eb84b97976e3 100644 --- a/crates/re_renderer/src/renderer/mesh_renderer.rs +++ b/crates/re_renderer/src/renderer/mesh_renderer.rs @@ -120,7 +120,7 @@ impl MeshDrawData { pub fn new(ctx: &mut RenderContext, instances: &[MeshInstance]) -> anyhow::Result { crate::profile_function!(); - let _mesh_renderer = ctx.renderers.get_or_create::<_, MeshRenderer>( + let _mesh_renderer = ctx.renderers.write().get_or_create::<_, MeshRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/renderer/point_cloud.rs b/crates/re_renderer/src/renderer/point_cloud.rs index 233911761fd2..70bfbbb69045 100644 --- a/crates/re_renderer/src/renderer/point_cloud.rs +++ b/crates/re_renderer/src/renderer/point_cloud.rs @@ -18,10 +18,7 @@ use std::{ ops::Range, }; -use crate::{ - context::uniform_buffer_allocation_size, wgpu_resources::BufferDesc, DebugLabel, - PointCloudBuilder, -}; +use crate::{allocator::create_and_fill_uniform_buffer_batch, DebugLabel, PointCloudBuilder}; use bitflags::bitflags; use bytemuck::Zeroable; use itertools::Itertools; @@ -56,7 +53,6 @@ bitflags! { } mod gpu_data { - use super::PointCloudBatchFlags; use crate::{wgpu_buffer_types, Size}; // Don't use `wgsl_buffer_types` since this data doesn't go into a buffer, so alignment rules don't apply like on buffers.. @@ -69,12 +65,14 @@ mod gpu_data { static_assertions::assert_eq_size!(PositionData, glam::Vec4); /// Uniform buffer that changes for every batch of line strips. 
- #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct BatchUniformBuffer { pub world_from_obj: wgpu_buffer_types::Mat4, - pub flags: PointCloudBatchFlags, - pub _padding: glam::Vec3, + + pub flags: wgpu_buffer_types::U32RowPadded, // PointCloudBatchFlags + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 5], } } @@ -155,7 +153,8 @@ impl PointCloudDrawData { ) -> Result { crate::profile_function!(); - let point_renderer = ctx.renderers.get_or_create::<_, PointCloudRenderer>( + let mut renderers = ctx.renderers.write(); + let point_renderer = renderers.get_or_create::<_, PointCloudRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -285,7 +284,10 @@ impl PointCloudDrawData { } builder.color_buffer.copy_to_texture( - ctx.active_frame.frame_global_command_encoder(&ctx.device), + ctx.active_frame + .frame_global_command_encoder + .lock() + .get_or_create(&ctx.device), wgpu::ImageCopyTexture { texture: &color_texture.texture, mip_level: 0, @@ -316,51 +318,27 @@ impl PointCloudDrawData { // Process batches let mut batches_internal = Vec::with_capacity(batches.len()); { - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * batches.len() as u64; - let uniform_buffers = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "point batch uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "point batch uniform buffers".into(), + batches + .iter() + .map(|batch_info| gpu_data::BatchUniformBuffer { + world_from_obj: batch_info.world_from_obj.into(), + flags: batch_info.flags.bits.into(), + end_padding: Default::default(), + }), ); - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffers, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - let mut start_point_for_next_batch = 0; - for (i, batch_info) in batches.iter().enumerate() { - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! - let offset = i * allocation_size_per_uniform_buffer as usize; - staging_buffer - [offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::BatchUniformBuffer { - world_from_obj: batch_info.world_from_obj.into(), - flags: batch_info.flags, - _padding: glam::Vec3::ZERO, - })); - + for (batch_info, uniform_buffer_binding) in + batches.iter().zip(uniform_buffer_bindings.into_iter()) + { let bind_group = ctx.gpu_resources.bind_groups.alloc( &ctx.device, &BindGroupDesc { label: batch_info.label.clone(), - entries: smallvec![BindGroupEntry::Buffer { - handle: uniform_buffers.handle, - offset: offset as _, - size: NonZeroU64::new( - std::mem::size_of::() as _ - ), - }], + entries: smallvec![uniform_buffer_binding], layout: point_renderer.bind_group_layout_batch, }, &ctx.gpu_resources.bind_group_layouts, diff --git a/crates/re_renderer/src/renderer/rectangles.rs b/crates/re_renderer/src/renderer/rectangles.rs index 42f98378dceb..2a6d019bb263 100644 --- a/crates/re_renderer/src/renderer/rectangles.rs +++ b/crates/re_renderer/src/renderer/rectangles.rs @@ -11,18 +11,17 @@ //! 
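Storing the flags as a padded u32 row keeps the WGSL mirror simple: the shader reads only the first word of the row and the remaining three are explicit, zeroed padding. A small sketch of that wrapper; `FLAG_ENABLE_SHADING` is a hypothetical bit, not the real `PointCloudBatchFlags` value:

use bytemuck::{Pod, Zeroable};

/// A u32 in its own 16-byte row (stand-in for wgpu_buffer_types::U32RowPadded).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct U32RowPadded {
    v: u32,
    padding0: u32,
    padding1: u32,
    padding2: u32,
}

impl From<u32> for U32RowPadded {
    fn from(v: u32) -> Self {
        Self { v, padding0: 0, padding1: 0, padding2: 0 }
    }
}

/// Hypothetical flag bit; the real values live in `PointCloudBatchFlags`.
const FLAG_ENABLE_SHADING: u32 = 0b0001;

fn main() {
    // With bitflags this is the `batch_info.flags.bits.into()` conversion above.
    let flags: U32RowPadded = FLAG_ENABLE_SHADING.into();
    assert_eq!(flags.v, FLAG_ENABLE_SHADING);
    assert_eq!(std::mem::size_of::<U32RowPadded>(), 16);
    // Zeroed padding avoids uploading undefined bytes to the GPU.
    assert!(bytemuck::bytes_of(&flags)[4..].iter().all(|&b| b == 0));
}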
we are forced to have individual bind groups per rectangle and thus a draw call per rectangle. use smallvec::smallvec; -use std::num::NonZeroU64; use crate::{ - context::uniform_buffer_allocation_size, + allocator::create_and_fill_uniform_buffer_batch, depth_offset::DepthOffset, include_file, resource_managers::{GpuTexture2DHandle, ResourceManagerError}, view_builder::ViewBuilder, wgpu_resources::{ - BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, BufferDesc, GpuBindGroup, - GpuBindGroupLayoutHandle, GpuRenderPipelineHandle, PipelineLayoutDesc, RenderPipelineDesc, - SamplerDesc, ShaderModuleDesc, + BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle, + GpuRenderPipelineHandle, PipelineLayoutDesc, RenderPipelineDesc, SamplerDesc, + ShaderModuleDesc, }, Rgba, }; @@ -36,14 +35,16 @@ mod gpu_data { use crate::wgpu_buffer_types; // Keep in sync with mirror in rectangle.wgsl - #[repr(C)] + #[repr(C, align(256))] #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] pub struct UniformBuffer { - pub top_left_corner_position: wgpu_buffer_types::Vec3, - pub extent_u: wgpu_buffer_types::Vec3, + pub top_left_corner_position: wgpu_buffer_types::Vec3RowPadded, + pub extent_u: wgpu_buffer_types::Vec3RowPadded, pub extent_v: wgpu_buffer_types::Vec3Unpadded, pub depth_offset: f32, pub multiplicative_tint: crate::Rgba, + + pub end_padding: [wgpu_buffer_types::PaddingRow; 16 - 4], } } @@ -116,7 +117,8 @@ impl RectangleDrawData { ) -> Result { crate::profile_function!(); - let rectangle_renderer = ctx.renderers.get_or_create::<_, RectangleRenderer>( + let mut renderers = ctx.renderers.write(); + let rectangle_renderer = renderers.get_or_create::<_, RectangleRenderer>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, @@ -129,59 +131,23 @@ impl RectangleDrawData { }); } - let allocation_size_per_uniform_buffer = - uniform_buffer_allocation_size::(&ctx.device); - let combined_buffers_size = allocation_size_per_uniform_buffer * rectangles.len() as u64; - - // Allocate all constant buffers at once. - // TODO(andreas): This should come from a per-frame allocator! - let uniform_buffer = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: "rectangle uniform buffers".into(), - size: combined_buffers_size, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::UNIFORM, - mapped_at_creation: false, - }, + let uniform_buffer_bindings = create_and_fill_uniform_buffer_batch( + ctx, + "rectangle uniform buffers".into(), + rectangles.iter().map(|rectangle| gpu_data::UniformBuffer { + top_left_corner_position: rectangle.top_left_corner_position.into(), + extent_u: rectangle.extent_u.into(), + extent_v: rectangle.extent_v.into(), + depth_offset: rectangle.depth_offset as f32, + multiplicative_tint: rectangle.multiplicative_tint, + end_padding: Default::default(), + }), ); - // Fill staging buffer in a separate loop to avoid borrow checker issues - { - // TODO(andreas): This should come from a staging buffer. - let mut staging_buffer = ctx - .queue - .write_buffer_with( - &uniform_buffer, - 0, - NonZeroU64::new(combined_buffers_size).unwrap(), - ) - .unwrap(); // Fails only if mapping is bigger than buffer size. - - for (i, rectangle) in rectangles.iter().enumerate() { - let offset = i * allocation_size_per_uniform_buffer as usize; - - // CAREFUL: Memory from `write_buffer_with` may not be aligned, causing bytemuck to fail at runtime if we use it to cast the memory to a slice! - // I.e. 
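Note the deliberate mix of padded and unpadded vec3s in the rectangle uniform buffer: `extent_v` stays 12 bytes so `depth_offset` completes the same 16-byte row, whereas the two earlier vec3s carry their own row padding. A sketch of that row accounting with stand-in field types (`[f32; 4]` replaces `crate::Rgba`):

use bytemuck::{Pod, Zeroable};

#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec3RowPadded { x: f32, y: f32, z: f32, padding: f32 }

/// 12-byte vec3 with no trailing padding: the next 4-byte field finishes the row.
#[repr(C)]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec3Unpadded { x: f32, y: f32, z: f32 }

#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod, Default)]
struct PaddingRow { p: [f32; 4] }

/// Stand-in for the rectangle `UniformBuffer`:
/// rows 0-1: padded vec3s, row 2: vec3 + f32, row 3: rgba, rows 4-15: padding.
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct UniformBuffer {
    top_left_corner_position: Vec3RowPadded,
    extent_u: Vec3RowPadded,
    extent_v: Vec3Unpadded,
    depth_offset: f32,             // shares a row with `extent_v`
    multiplicative_tint: [f32; 4], // stand-in for crate::Rgba
    end_padding: [PaddingRow; 16 - 4],
}

fn main() {
    assert_eq!(std::mem::size_of::<UniformBuffer>(), 256);
    let ubo = UniformBuffer {
        top_left_corner_position: Vec3RowPadded { x: 0.0, y: 1.0, z: 0.0, padding: 0.0 },
        extent_u: Vec3RowPadded { x: 1.0, y: 0.0, z: 0.0, padding: 0.0 },
        extent_v: Vec3Unpadded { x: 0.0, y: 0.0, z: 1.0 },
        depth_offset: 0.0,
        multiplicative_tint: [1.0; 4],
        end_padding: Default::default(),
    };
    // `depth_offset` occupies the 4 bytes directly after `extent_v` (bytes 44..48).
    assert_eq!(bytemuck::bytes_of(&ubo).len(), 256);
}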
this will crash randomly: - // - // let target_buffer = bytemuck::from_bytes_mut::( - // &mut staging_buffer[offset..(offset + uniform_buffer_size)], - // ); - // - // TODO(andreas): with our own staging buffers we could fix this very easily - - staging_buffer[offset..(offset + std::mem::size_of::())] - .copy_from_slice(bytemuck::bytes_of(&gpu_data::UniformBuffer { - top_left_corner_position: rectangle.top_left_corner_position.into(), - extent_u: rectangle.extent_u.into(), - extent_v: rectangle.extent_v.into(), - depth_offset: rectangle.depth_offset as f32, - multiplicative_tint: rectangle.multiplicative_tint, - })); - } - } - let mut bind_groups = Vec::with_capacity(rectangles.len()); - for (i, rectangle) in rectangles.iter().enumerate() { + for (rectangle, uniform_buffer) in + rectangles.iter().zip(uniform_buffer_bindings.into_iter()) + { let texture = ctx.texture_manager_2d.get(&rectangle.texture)?; let sampler = ctx.gpu_resources.samplers.get_or_create( @@ -211,13 +177,7 @@ impl RectangleDrawData { &BindGroupDesc { label: "rectangle".into(), entries: smallvec![ - BindGroupEntry::Buffer { - handle: uniform_buffer.handle, - offset: i as u64 * allocation_size_per_uniform_buffer, - size: NonZeroU64::new( - std::mem::size_of::() as u64 - ), - }, + uniform_buffer, BindGroupEntry::DefaultTextureView(texture.handle), BindGroupEntry::Sampler(sampler) ], diff --git a/crates/re_renderer/src/renderer/test_triangle.rs b/crates/re_renderer/src/renderer/test_triangle.rs index 2ac382562c09..2b5a627080e3 100644 --- a/crates/re_renderer/src/renderer/test_triangle.rs +++ b/crates/re_renderer/src/renderer/test_triangle.rs @@ -24,7 +24,7 @@ impl DrawData for TestTriangleDrawData { impl TestTriangleDrawData { pub fn new(ctx: &mut RenderContext) -> Self { - ctx.renderers.get_or_create::<_, TestTriangle>( + ctx.renderers.write().get_or_create::<_, TestTriangle>( &ctx.shared_renderer_data, &mut ctx.gpu_resources, &ctx.device, diff --git a/crates/re_renderer/src/view_builder.rs b/crates/re_renderer/src/view_builder.rs index 1f5981b278c6..defdeb80f770 100644 --- a/crates/re_renderer/src/view_builder.rs +++ b/crates/re_renderer/src/view_builder.rs @@ -3,13 +3,14 @@ use parking_lot::RwLock; use std::sync::Arc; use crate::{ + allocator::create_and_fill_uniform_buffer, context::RenderContext, global_bindings::FrameUniformBuffer, renderer::{ compositor::{Compositor, CompositorDrawData}, DrawData, Renderer, }, - wgpu_resources::{BufferDesc, GpuBindGroup, GpuTexture, TextureDesc}, + wgpu_resources::{GpuBindGroup, GpuTexture, TextureDesc}, DebugLabel, Rgba, Size, }; @@ -268,17 +269,6 @@ impl ViewBuilder { let tonemapping_draw_data = CompositorDrawData::new(ctx, &main_target_resolved); - // Setup frame uniform buffer - let frame_uniform_buffer = ctx.gpu_resources.buffers.alloc( - &ctx.device, - &BufferDesc { - label: format!("{:?} - frame uniform buffer", config.name).into(), - size: std::mem::size_of::() as _, - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }, - ); - let aspect_ratio = config.resolution_in_pixel[0] as f32 / config.resolution_in_pixel[1] as f32; @@ -393,12 +383,11 @@ impl ViewBuilder { // Factor applied to depth offsets. let depth_offset_factor = 1.0e-08; // Value determined by experimentation. Quite close to the f32 machine epsilon but a bit lower. 
- { - let mut frame_uniform_buffer_cpu = ctx - .cpu_write_gpu_read_belt - .lock() - .allocate::(&ctx.device, &mut ctx.gpu_resources.buffers, 1); - frame_uniform_buffer_cpu.push(&FrameUniformBuffer { + // Setup frame uniform buffer + let frame_uniform_buffer = create_and_fill_uniform_buffer( + ctx, + format!("{:?} - frame uniform buffer", config.name).into(), + FrameUniformBuffer { view_from_world: glam::Affine3A::from_mat4(view_from_world).into(), projection_from_view: projection_from_view.into(), projection_from_world: projection_from_world.into(), @@ -411,20 +400,14 @@ impl ViewBuilder { auto_size_points: auto_size_points.0, auto_size_lines: auto_size_lines.0, - depth_offset_factor, - _padding: glam::Vec3::ZERO, - }); - frame_uniform_buffer_cpu.copy_to_buffer( - ctx.active_frame.frame_global_command_encoder(&ctx.device), - &frame_uniform_buffer, - 0, - ); - } + depth_offset_factor: depth_offset_factor.into(), + }, + ); let bind_group_0 = ctx.shared_renderer_data.global_bindings.create_bind_group( &mut ctx.gpu_resources, &ctx.device, - &frame_uniform_buffer, + frame_uniform_buffer, ); self.setup = Some(ViewTargetSetup { @@ -448,8 +431,8 @@ impl ViewBuilder { self.queued_draws.push(QueuedDraw { draw_func: Box::new(move |ctx, pass, draw_data| { - let renderer = ctx - .renderers + let renderers = ctx.renderers.read(); + let renderer = renderers .get::() .context("failed to retrieve renderer")?; let draw_data = draw_data @@ -556,10 +539,8 @@ impl ViewBuilder { pass.set_bind_group(0, &setup.bind_group_0, &[]); - let tonemapper = ctx - .renderers - .get::() - .context("get compositor")?; + let renderers = ctx.renderers.read(); + let tonemapper = renderers.get::().context("get compositor")?; tonemapper .draw(&ctx.gpu_resources, pass, &setup.compositor_draw_data) .context("composite into main view") diff --git a/crates/re_renderer/src/wgpu_buffer_types.rs b/crates/re_renderer/src/wgpu_buffer_types.rs index 4156bb6cc607..15686ce0491d 100644 --- a/crates/re_renderer/src/wgpu_buffer_types.rs +++ b/crates/re_renderer/src/wgpu_buffer_types.rs @@ -5,6 +5,48 @@ use bytemuck::{Pod, Zeroable}; +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod)] +pub struct F32RowPadded { + pub v: f32, + pub padding0: f32, + pub padding1: f32, + pub padding2: f32, +} + +impl From for F32RowPadded { + #[inline] + fn from(v: f32) -> Self { + F32RowPadded { + v, + padding0: 0.0, + padding1: 0.0, + padding2: 0.0, + } + } +} + +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod)] +pub struct U32RowPadded { + pub v: u32, + pub padding0: u32, + pub padding1: u32, + pub padding2: u32, +} + +impl From for U32RowPadded { + #[inline] + fn from(v: u32) -> Self { + U32RowPadded { + v, + padding0: 0, + padding1: 0, + padding2: 0, + } + } +} + #[repr(C, align(8))] #[derive(Clone, Copy, Zeroable, Pod)] pub struct Vec2 { @@ -21,17 +63,17 @@ impl From for Vec2 { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] -pub struct Vec2Padded { +pub struct Vec2RowPadded { pub x: f32, pub y: f32, pub padding0: f32, pub padding1: f32, } -impl From for Vec2Padded { +impl From for Vec2RowPadded { #[inline] fn from(v: glam::Vec2) -> Self { - Vec2Padded { + Vec2RowPadded { x: v.x, y: v.y, padding0: 0.0, @@ -42,17 +84,17 @@ impl From for Vec2Padded { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] -pub struct Vec3 { +pub struct Vec3RowPadded { pub x: f32, pub y: f32, pub z: f32, pub padding: f32, } -impl From for Vec3 { +impl From for Vec3RowPadded { #[inline] fn from(v: glam::Vec3) -> Self { - Vec3 { + Vec3RowPadded 
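The renames to `Vec2RowPadded`/`Vec3RowPadded` spell out that these types pad a value to a full 16-byte row, while the plain `Vec2` stays 8-byte aligned so two of them can still share a row. A tiny sketch of the difference (stand-in structs mirroring the ones added here):

use bytemuck::{Pod, Zeroable};

/// Two floats with 8-byte alignment: two of these can share a 16-byte row.
#[repr(C, align(8))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec2 { x: f32, y: f32 }

/// Two floats padded to a full 16-byte row (e.g. when followed by a vec3/vec4 in WGSL).
#[repr(C, align(16))]
#[derive(Clone, Copy, Zeroable, Pod)]
struct Vec2RowPadded { x: f32, y: f32, padding0: f32, padding1: f32 }

#[repr(C)]
#[derive(Clone, Copy, Zeroable, Pod)]
struct TwoPackedVec2s {
    a: Vec2,
    b: Vec2, // completes the row started by `a`
}

fn main() {
    let pair = TwoPackedVec2s {
        a: Vec2 { x: 0.0, y: 1.0 },
        b: Vec2 { x: 2.0, y: 3.0 },
    };
    assert_eq!(std::mem::size_of::<TwoPackedVec2s>(), 16); // both vec2s in one row
    assert_eq!(std::mem::size_of::<Vec2RowPadded>(), 16);  // a full row on its own
    assert_eq!(bytemuck::bytes_of(&pair).len(), 16);
}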
{ x: v.x, y: v.y, z: v.z, @@ -61,10 +103,10 @@ impl From for Vec3 { } } -impl From for Vec3 { +impl From for Vec3RowPadded { #[inline] fn from(v: glam::Vec3A) -> Self { - Vec3 { + Vec3RowPadded { x: v.x, y: v.y, z: v.z, @@ -160,10 +202,10 @@ impl From for Mat4 { #[repr(C, align(16))] #[derive(Clone, Copy, Zeroable, Pod)] pub struct Mat4x3 { - c0: Vec3, - c1: Vec3, - c2: Vec3, - c3: Vec3, + c0: Vec3RowPadded, + c1: Vec3RowPadded, + c2: Vec3RowPadded, + c3: Vec3RowPadded, } impl From for Mat4x3 { @@ -177,3 +219,13 @@ impl From for Mat4x3 { } } } + +/// A Vec4 of pure padding (i.e. 16 bytes of padding) +/// +/// Useful utility to pad uniform buffers out to a multiple of 16 rows, +/// (256 bytes is the alignment requirement for Uniform buffers) +#[repr(C, align(16))] +#[derive(Clone, Copy, Zeroable, Pod, Default)] +pub struct PaddingRow { + p: [f32; 4], +} diff --git a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs index 8422ae97734a..7de882eaa7c1 100644 --- a/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/bind_group_pool.rs @@ -102,7 +102,7 @@ impl GpuBindGroupPool { /// Once ownership to the handle is given up, the bind group may be reclaimed in future frames. /// The handle also keeps alive any dependent resources. pub fn alloc( - &mut self, + &self, device: &wgpu::Device, desc: &BindGroupDesc, bind_group_layouts: &GpuBindGroupLayoutPool, diff --git a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs index c6890749dc5c..7b96406aca95 100644 --- a/crates/re_renderer/src/wgpu_resources/buffer_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/buffer_pool.rs @@ -59,7 +59,7 @@ impl GpuBufferPool { /// /// For more efficient allocation (faster, less fragmentation) you should sub-allocate buffers whenever possible /// either manually or using a higher level allocator. - pub fn alloc(&mut self, device: &wgpu::Device, desc: &BufferDesc) -> GpuBuffer { + pub fn alloc(&self, device: &wgpu::Device, desc: &BufferDesc) -> GpuBuffer { self.pool.alloc(desc, |desc| { device.create_buffer(&wgpu::BufferDescriptor { label: desc.label.get(), diff --git a/crates/re_renderer/src/wgpu_resources/texture_pool.rs b/crates/re_renderer/src/wgpu_resources/texture_pool.rs index ad8c7f5c0d06..3cfdac02cf93 100644 --- a/crates/re_renderer/src/wgpu_resources/texture_pool.rs +++ b/crates/re_renderer/src/wgpu_resources/texture_pool.rs @@ -81,7 +81,7 @@ pub struct GpuTexturePool { impl GpuTexturePool { /// Returns a reference-counted handle to a currently unused texture. /// Once ownership to the handle is given up, the texture may be reclaimed in future frames. - pub fn alloc(&mut self, device: &wgpu::Device, desc: &TextureDesc) -> GpuTexture { + pub fn alloc(&self, device: &wgpu::Device, desc: &TextureDesc) -> GpuTexture { crate::profile_function!(); self.pool.alloc(desc, |desc| { let texture = device.create_texture(&wgpu::TextureDescriptor {
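Allocating from the pools through `&self` instead of `&mut self` is what lets the staging belt and the uniform-buffer helper work against a shared `&RenderContext`; internally that implies interior mutability. A heavily reduced sketch of such a pool (illustrative `FakeResource`/`Handle` names; the real pools track resources per frame rather than via `Arc` counts alone):

use std::sync::Arc;
use parking_lot::Mutex;

/// Illustrative resource; in re_renderer this is a wgpu buffer, texture, or bind group.
struct FakeResource {
    label: String,
}

/// Reference-counted handle, as returned by the real pools.
type Handle = Arc<FakeResource>;

/// A pool that can allocate through a shared reference thanks to interior mutability.
#[derive(Default)]
struct ResourcePool {
    alive: Mutex<Vec<Handle>>,
}

impl ResourcePool {
    fn alloc(&self, label: &str) -> Handle {
        let resource = Arc::new(FakeResource { label: label.to_owned() });
        // The lock is only held for the bookkeeping, not for resource creation.
        self.alive.lock().push(resource.clone());
        resource
    }

    /// Frame maintenance: drop everything no longer referenced elsewhere.
    fn begin_frame(&self) {
        self.alive.lock().retain(|handle| Arc::strong_count(handle) > 1);
    }
}

fn main() {
    let pool = ResourcePool::default();

    let kept = pool.alloc("frame uniform buffer");
    {
        let _temporary = pool.alloc("lines batch uniform buffers");
    } // `_temporary` dropped here

    pool.begin_frame();
    assert_eq!(pool.alive.lock().len(), 1);
    assert_eq!(kept.label, "frame uniform buffer");
}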