From 5ec55b2ca7b40f026363fca7787bdeea4ff7689d Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:17 -0700 Subject: [PATCH 01/11] [ET-VK] Rename `StorageBuffer` to `StagingBuffer` 1. Tensors accessible to both CPU and GPU use `StorageBuffer`. 2. Tensors accessible to GPU only use `vTensorStorage`. Since `StorageBuffer` is only used for [staging buffers](https://vulkan-tutorial.com/Vertex_buffers/Staging_buffer), i.e., buffers accessible to both CPU and GPU, `StagingBuffer` is a fitting name for (1) that decreases confusion between (1) and (2). Differential Revision: [D62049779](https://our.internmc.facebook.com/intern/diff/D62049779/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 2 +- backends/vulkan/runtime/api/api.h | 2 +- .../{StorageBuffer.h => StagingBuffer.h} | 14 +++++++------- backends/vulkan/runtime/graph/ComputeGraph.cpp | 4 ++-- backends/vulkan/runtime/graph/ComputeGraph.h | 2 +- backends/vulkan/runtime/graph/containers/Value.h | 8 ++++---- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 8 ++++---- backends/vulkan/runtime/graph/ops/PrepackNode.h | 2 +- .../runtime/graph/ops/utils/BindingUtils.cpp | 2 +- .../runtime/graph/ops/utils/BindingUtils.h | 2 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 6 +++--- .../runtime/graph/ops/utils/StagingUtils.h | 6 +++--- backends/vulkan/test/utils/test_utils.cpp | 6 +++--- backends/vulkan/test/utils/test_utils.h | 10 +++++----- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- .../vulkan/tools/gpuinfo/include/architecture.h | 6 +++--- backends/vulkan/tools/gpuinfo/include/buffers.h | 8 ++++---- backends/vulkan/tools/gpuinfo/include/textures.h | 4 ++-- 18 files changed, 54 insertions(+), 54 deletions(-) rename backends/vulkan/runtime/api/containers/{StorageBuffer.h => StagingBuffer.h} (83%) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 4d2a854de3b..e13293c3098 100644 --- 
a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -189,7 +189,7 @@ Context* context() { const vkapi::DescriptorPoolConfig descriptor_pool_config{ VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount + VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStagingBufferCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount 32u, // descriptorPileSizes diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0e..0f496a4af8a 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h similarity index 83% rename from backends/vulkan/runtime/api/containers/StorageBuffer.h rename to backends/vulkan/runtime/api/containers/StagingBuffer.h index 17c34706057..ab650c09a43 100644 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -17,7 +17,7 @@ namespace vkcompute { namespace api { -class StorageBuffer final { +class StagingBuffer final { private: Context* context_p_; vkapi::ScalarType dtype_; @@ -26,7 +26,7 @@ class StorageBuffer final { vkapi::VulkanBuffer vulkan_buffer_; public: - StorageBuffer( + StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, const size_t numel, @@ -39,13 +39,13 @@ class StorageBuffer final { nbytes_, gpuonly)) {} - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; - StorageBuffer(StorageBuffer&&) = default; - 
StorageBuffer& operator=(StorageBuffer&&) = default; + StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; - ~StorageBuffer() { + ~StagingBuffer() { context_p_->register_buffer_cleanup(vulkan_buffer_); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 9fa0091b298..6c3ec88eaa7 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,7 +38,7 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) @@ -236,7 +236,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 5740d24a448..9b04b08a70e 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -58,7 +58,7 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h 
index ba82213c6f8..6e03bbd4a21 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -53,7 +53,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -108,7 +108,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -152,7 +152,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -247,7 +247,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920dd..a9c2f6c9b6a 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,14 +45,14 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +60,7 @@ api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; @@ -70,7 +70,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963fd..3e713303c3d 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace225..2cfb34a052e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } 
void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da7..eed39a97979 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 294e36b9a86..9cb715e202a 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -73,7 +73,7 @@ void memcpy_from_mapping( void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); mapping.invalidate(); @@ -81,7 +81,7 @@ void copy_ptr_to_staging( } void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); @@ -89,7 +89,7 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); uint8_t* data_ptr = mapping.template data(); memset(data_ptr, 0, staging.nbytes()); diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index cabc17f30ee..f16c52ecf33 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -18,14 +18,14 @@ namespace vkcompute { void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes); void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes); -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); // // Functions to get shaders diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4a3a41d6c72..4c2972419d0 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -112,7 +112,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer) { + api::StagingBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -411,7 +411,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( + api::StagingBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { diff 
--git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index c8af5470862..3bc12c472db 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -37,13 +37,13 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); @@ -85,7 +85,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer); + api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( api::Context* const context, @@ -126,7 +126,7 @@ void record_reference_matmul( // inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -164,7 +164,7 @@ inline std::vector extract_vtensor(api::vTensor& vten) { } inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 3d172f490cf..f3c60a21376 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ 
-336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -407,7 +407,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer( + StagingBuffer staging_buffer( context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2073,7 +2073,7 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in( + StagingBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); @@ -2084,7 +2084,7 @@ void round_trip_test( data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out( + StagingBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - 
StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c3..20c6254e1a0 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. 
auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index c8cf93c4a12..31137b11eea 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index 7679f11b0ca..c9ff133f1ec 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer out_buf( + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; From d3987b83dbe25cfd76629247ab1409ccff511256 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:20 -0700 Subject: [PATCH 02/11] [ET-VK] Remove unused Allocator function TSIA Differential Revision: [D62049777](https://our.internmc.facebook.com/intern/diff/D62049777/) [ghstack-poisoned] --- backends/vulkan/runtime/vk_api/memory/Allocator.cpp | 11 ----------- backends/vulkan/runtime/vk_api/memory/Allocator.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0b..f7428f12b67 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -162,17 +162,6 @@ VulkanBuffer Allocator::create_storage_buffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb4..6d8ee09ae5d 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -67,8 +67,6 @@ class 
Allocator final { const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ From 634033368f4d7c36e8137f924815a85163f47e2e Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:23 -0700 Subject: [PATCH 03/11] [ET-VK] Simplify Allocator's buffer creation methods There are ~4 lines of duplicated code but IMO, the distinct function names make the code much clearer. Differential Revision: [D62049778](https://our.internmc.facebook.com/intern/diff/D62049778/) [ghstack-poisoned] --- .../runtime/api/containers/StagingBuffer.h | 8 ++-- .../vulkan/runtime/api/containers/Tensor.cpp | 2 +- .../runtime/vk_api/memory/Allocator.cpp | 40 +++++++++---------- .../vulkan/runtime/vk_api/memory/Allocator.h | 3 +- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index ab650c09a43..a24728470b0 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -29,15 +29,13 @@ class StagingBuffer final { StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) + const size_t numel) : context_p_(context_p), dtype_(dtype), numel_(numel), nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 578898ad194..7b9d30ef658 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ 
b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -540,7 +540,7 @@ vkapi::VulkanBuffer allocate_buffer( } return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, allocate_memory); + element_size(dtype) * numel, allocate_memory); } vTensorStorage::vTensorStorage( diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index f7428f12b67..b990cf6a119 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -132,9 +132,27 @@ VulkanImage Allocator::create_image( allocate_memory); } +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing + // the data from this buffer. 
+ alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); +} + VulkanBuffer Allocator::create_storage_buffer( const VkDeviceSize size, - const bool gpu_only, const bool allocate_memory) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -142,22 +160,6 @@ VulkanBuffer Allocator::create_storage_buffer( alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing - // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. 
- VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } - return VulkanBuffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } @@ -170,9 +172,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 6d8ee09ae5d..7d02ffe54e3 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -62,9 +62,10 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); /* From 341c5a7c95a7c10827c47d255d33238f6b16ce96 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 16:31:57 -0700 Subject: [PATCH 04/11] Update base for Update on "[ET-VK] Simplify Allocator's buffer creation methods" There are ~4 lines of duplicated code but IMO, the distinct function names make the code much clearer. 
Differential Revision: [D62049778](https://our.internmc.facebook.com/intern/diff/D62049778/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index e13293c3098..4d2a854de3b 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -189,7 +189,7 @@ Context* context() { const vkapi::DescriptorPoolConfig descriptor_pool_config{ VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStagingBufferCount + VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount 32u, // descriptorPileSizes From a17af04a3464259f34c2e293763de98916ba38e2 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 16:31:58 -0700 Subject: [PATCH 05/11] [ET-VK] Persistently map staging buffers `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped on every inference, which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load.
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- .../runtime/api/containers/StagingBuffer.h | 11 +++++++++-- backends/vulkan/runtime/graph/ComputeGraph.cpp | 11 +++++++---- backends/vulkan/runtime/graph/ComputeGraph.h | 5 ++++- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 9 +++++++-- .../runtime/graph/ops/utils/StagingUtils.cpp | 8 ++------ .../vulkan/runtime/vk_api/memory/Allocator.cpp | 3 ++- backends/vulkan/runtime/vk_api/memory/Buffer.cpp | 15 ++++++++++++++- backends/vulkan/runtime/vk_api/memory/Buffer.h | 4 ++-- backends/vulkan/test/utils/test_utils.cpp | 4 ++-- backends/vulkan/test/utils/test_utils.h | 4 ++-- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- 11 files changed, 59 insertions(+), 31 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index a24728470b0..f457878bc0e 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -24,18 +24,21 @@ class StagingBuffer final { size_t numel_; size_t nbytes_; vkapi::VulkanBuffer vulkan_buffer_; + vkapi::MemoryMap memory_map_; public: StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, - const size_t numel) + const size_t numel, + const vkapi::MemoryAccessType access) : context_p_(context_p), dtype_(dtype), numel_(numel), nbytes_(element_size(dtype_) * numel_), vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + memory_map_(vulkan_buffer_, access) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; @@ -55,6 +58,10 @@ class StagingBuffer final { return vulkan_buffer_; } + inline vkapi::MemoryMap& mapping() { + return memory_map_; + } + inline size_t numel() { return numel_; } diff --git 
a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..f46623c375c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -233,10 +233,11 @@ ValueRef ComputeGraph::add_tensorref( ValueRef ComputeGraph::add_staging( const vkapi::ScalarType dtype, - const size_t numel) { + const size_t numel, + const vkapi::MemoryAccessType access) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel, access)); return idx; } @@ -269,7 +270,8 @@ ValueRef ComputeGraph::set_input_tensor( // For texture storage, the buffer size needs to account for the zero // padding applied by unused texel elements. size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); + ValueRef staging_idx = + add_staging(dtype, buf_numel, vkapi::MemoryAccessType::WRITE); add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back({idx, staging_idx}); return staging_idx; @@ -286,7 +288,8 @@ ValueRef ComputeGraph::set_output_tensor( // For texture storage, the buffer size needs to account for the zero // padding applied by unused texel elements. size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); + ValueRef staging_idx = + add_staging(dtype, buf_numel, vkapi::MemoryAccessType::READ); // We only run this when the tensor is non-empty. When the underlying // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to // tensor, we will not be able to bind the node for execution. 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..3fe7b104199 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -406,7 +406,10 @@ class ComputeGraph final { * use memory that is visible to both the CPU and GPU, and therefore is used * as a intermediary when transferring data between the CPU and GPU. */ - ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel); + ValueRef add_staging( + const vkapi::ScalarType dtype, + const size_t numel, + const vkapi::MemoryAccessType access); ValueRef add_none(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index a9c2f6c9b6a..d1b75a52c9e 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -52,7 +52,11 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + api::StagingBuffer staging( + graph->context(), + packed->dtype(), + numel, + vkapi::MemoryAccessType::WRITE); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +64,8 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging( + graph->context(), tref->dtype, numel, vkapi::MemoryAccessType::WRITE); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 9cb715e202a..ab18f233fbe 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -75,18 +75,14 @@ void copy_ptr_to_staging( const void* src, api::StagingBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); + memcpy_to_mapping(src, staging.mapping(), nbytes, staging.dtype()); } void copy_staging_to_ptr( api::StagingBuffer& staging, void* dst, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); + memcpy_from_mapping(staging.mapping(), dst, nbytes, staging.dtype()); } void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp 
b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index b990cf6a119..e814063fa90 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -142,7 +142,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { // Staging buffers are accessed by both the CPU and GPU, so set the // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; alloc_create_info.preferredFlags = diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 366b45a5e41..8593806788b 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -158,7 +158,7 @@ MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access) } } -MemoryMap::MemoryMap(MemoryMap&& other) noexcept +MemoryMap::MemoryMap(MemoryMap&& other) : access_(other.access_), allocator_(other.allocator_), allocation_(other.allocation_), @@ -168,6 +168,19 @@ MemoryMap::MemoryMap(MemoryMap&& other) noexcept other.data_ = nullptr; } +MemoryMap& MemoryMap::operator=(MemoryMap&& other) { + access_ = other.access_; + allocator_ = other.allocator_; + allocation_ = other.allocation_; + data_ = other.data_; + data_len_ = other.data_len_; + + other.allocation_ = VK_NULL_HANDLE; + other.data_ = nullptr; + + return *this; +} + MemoryMap::~MemoryMap() { if (!data_) { return; diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 9302048f861..d7878347118 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ 
b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -174,8 +174,8 @@ class MemoryMap final { MemoryMap(const MemoryMap&) = delete; MemoryMap& operator=(const MemoryMap&) = delete; - MemoryMap(MemoryMap&&) noexcept; - MemoryMap& operator=(MemoryMap&&) = delete; + MemoryMap(MemoryMap&&); + MemoryMap& operator=(MemoryMap&&); ~MemoryMap(); diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4c2972419d0..b4eba76ca4e 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size(), vkapi::MemoryAccessType::WRITE); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -412,7 +412,7 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data) { api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel()); + api::context(), vten.dtype(), vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3bc12c472db..c087b192d41 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -38,13 +38,13 @@ using namespace vkcompute; #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), 
tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::READ); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..2bb66654205 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); float scale = 3.0f; float offset = 1.5f; @@ -408,7 +408,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { } StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel()); + context(), vkapi::kFloat, a.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len, vkapi::MemoryAccessType::WRITE); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2074,7 @@ void 
round_trip_test( // Create and fill input staging buffer StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel()); + context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { @@ -2085,7 +2085,7 @@ void round_trip_test( // Output staging buffer StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel()); + context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( From 7f5c3e2f4efb98ceb5586c672f825858ab0ca1a9 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 09:13:20 -0700 Subject: [PATCH 06/11] Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. 
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- backends/vulkan/test/utils/test_utils.cpp | 11 +++++-- backends/vulkan/test/utils/test_utils.h | 22 +++++++++----- .../vulkan/test/vulkan_compute_api_test.cpp | 30 ++++++++++++++----- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index b4eba76ca4e..8e4c27d98d2 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -324,7 +324,11 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size(), vkapi::MemoryAccessType::WRITE); + api::StagingBuffer staging_buffer( + api::context(), + vten.dtype(), + data.size(), + vkapi::MemoryAccessType::WRITE); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -412,7 +416,10 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data) { api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + api::context(), + vten.dtype(), + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index c087b192d41..e76a5a7c29b 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -36,16 +36,22 @@ using namespace vkcompute; utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), 
vkapi::MemoryAccessType::WRITE); \ - record_nchw_to_image_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + api::StagingBuffer staging_buffer_##tensor( \ + api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel(), \ + vkapi::MemoryAccessType::WRITE); \ + record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::READ); \ - record_image_to_nchw_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + api::StagingBuffer staging_buffer_##tensor( \ + api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel(), \ + vkapi::MemoryAccessType::READ); \ + record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 2bb66654205..7e71c58f166 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,8 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); + StagingBuffer buffer( + context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); float scale = 3.0f; float offset = 1.5f; @@ -408,7 +409,10 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { } StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); + context(), + vkapi::kFloat, + a.staging_buffer_numel(), + vkapi::MemoryAccessType::WRITE); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -2040,7 +2044,11 @@ void 
run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + StagingBuffer staging_buffer( + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2082,10 @@ void round_trip_test( // Create and fill input staging buffer StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::WRITE); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { @@ -2085,7 +2096,10 @@ void round_trip_test( // Output staging buffer StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2552,8 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); + StagingBuffer staging_buffer_in( + context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2565,8 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); + StagingBuffer staging_buffer_out( + context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( From 
8e578eb0056fce9f5a1145af9c74de66129f3250 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 10:19:38 -0700 Subject: [PATCH 07/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- backends/vulkan/runtime/api/api.h | 2 +- .../{StagingBuffer.h => StorageBuffer.h} | 14 +++++++------- backends/vulkan/runtime/graph/ComputeGraph.cpp | 4 ++-- backends/vulkan/runtime/graph/ComputeGraph.h | 2 +- backends/vulkan/runtime/graph/containers/Value.h | 8 ++++---- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 8 ++++---- backends/vulkan/runtime/graph/ops/PrepackNode.h | 2 +- .../runtime/graph/ops/utils/BindingUtils.cpp | 2 +- .../runtime/graph/ops/utils/BindingUtils.h | 2 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 6 +++--- .../runtime/graph/ops/utils/StagingUtils.h | 6 +++--- .../vulkan/runtime/vk_api/memory/Allocator.cpp | 11 +++++++++++ .../vulkan/runtime/vk_api/memory/Allocator.h | 2 ++ backends/vulkan/test/utils/test_utils.cpp | 6 +++--- backends/vulkan/test/utils/test_utils.h | 10 +++++----- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- .../vulkan/tools/gpuinfo/include/architecture.h | 6 +++--- backends/vulkan/tools/gpuinfo/include/buffers.h | 8 ++++---- backends/vulkan/tools/gpuinfo/include/textures.h | 4 ++-- 19 files changed, 66 insertions(+), 53 deletions(-) rename backends/vulkan/runtime/api/containers/{StagingBuffer.h => StorageBuffer.h} (82%) diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index 0f496a4af8a..de77c57fb0e 100644 --- a/backends/vulkan/runtime/api/api.h +++ 
b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StorageBuffer.h similarity index 82% rename from backends/vulkan/runtime/api/containers/StagingBuffer.h rename to backends/vulkan/runtime/api/containers/StorageBuffer.h index a24728470b0..f6a17c7e909 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StorageBuffer.h @@ -17,7 +17,7 @@ namespace vkcompute { namespace api { -class StagingBuffer final { +class StorageBuffer final { private: Context* context_p_; vkapi::ScalarType dtype_; @@ -26,7 +26,7 @@ class StagingBuffer final { vkapi::VulkanBuffer vulkan_buffer_; public: - StagingBuffer( + StorageBuffer( Context* context_p, const vkapi::ScalarType dtype, const size_t numel) @@ -37,13 +37,13 @@ class StagingBuffer final { vulkan_buffer_( context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} - StagingBuffer(const StagingBuffer&) = delete; - StagingBuffer& operator=(const StagingBuffer&) = delete; + StorageBuffer(const StorageBuffer&) = delete; + StorageBuffer& operator=(const StorageBuffer&) = delete; - StagingBuffer(StagingBuffer&&) = default; - StagingBuffer& operator=(StagingBuffer&&) = default; + StorageBuffer(StorageBuffer&&) = default; + StorageBuffer& operator=(StorageBuffer&&) = default; - ~StagingBuffer() { + ~StorageBuffer() { context_p_->register_buffer_cleanup(vulkan_buffer_); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..9fa0091b298 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,7 +38,7 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, 
api::StagingBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) @@ -236,7 +236,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); + values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); return idx; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..5740d24a448 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -58,7 +58,7 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 6e03bbd4a21..ba82213c6f8 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -53,7 +53,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StagingBuffer as_staging; + api::StorageBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -108,7 +108,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); + TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); 
// Scalar lists @@ -152,7 +152,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StagingBuffer(); + payload.as_staging.~StorageBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -247,7 +247,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StagingBuffer, + api::StorageBuffer, Staging, TypeTag::STAGING, as_staging); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index a9c2f6c9b6a..b77c62920dd 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,14 +45,14 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + api::StorageBuffer staging(graph->context(), packed->dtype(), numel); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +60,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); + api::StorageBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; @@ -70,7 +70,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StagingBuffer staging = create_staging_buffer(graph); + api::StorageBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 3e713303c3d..c3ac8b963fd 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StagingBuffer create_staging_buffer(ComputeGraph* graph); + api::StorageBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index 2cfb34a052e..b0964ace225 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } 
void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, + api::StorageBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index eed39a97979..3a7ec029da7 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, + api::StorageBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 9cb715e202a..294e36b9a86 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -73,7 +73,7 @@ void memcpy_from_mapping( void copy_ptr_to_staging( const void* src, - api::StagingBuffer& staging, + api::StorageBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); mapping.invalidate(); @@ -81,7 +81,7 @@ void copy_ptr_to_staging( } void copy_staging_to_ptr( - api::StagingBuffer& staging, + api::StorageBuffer& staging, void* dst, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); @@ -89,7 +89,7 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); uint8_t* data_ptr = mapping.template data(); memset(data_ptr, 0, staging.nbytes()); diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index f16c52ecf33..cabc17f30ee 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -18,14 +18,14 @@ namespace vkcompute { void copy_ptr_to_staging( const void* src, - api::StagingBuffer& staging, + api::StorageBuffer& staging, const size_t nbytes); void copy_staging_to_ptr( - api::StagingBuffer& staging, + api::StorageBuffer& staging, void* dst, const size_t nbytes); -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); // // Functions to get shaders diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index b990cf6a119..17492589176 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -164,6 +164,17 @@ VulkanBuffer Allocator::create_storage_buffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + + VkBufferUsageFlags buffer_usage = + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); +} + VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 7d02ffe54e3..0317f36eaa7 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ 
b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -68,6 +68,8 @@ class Allocator final { const VkDeviceSize, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + /* * Create a uniform buffer with a specified size */ diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4c2972419d0..4a3a41d6c72 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -112,7 +112,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StagingBuffer& dst_buffer) { + api::StorageBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -411,7 +411,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer( + api::StorageBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3bc12c472db..c8af5470862 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -37,13 +37,13 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ + api::StorageBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, 
tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ + api::StorageBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); @@ -85,7 +85,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StagingBuffer& dst_buffer); + api::StorageBuffer& dst_buffer); void record_conv2d_prepack_weights_op( api::Context* const context, @@ -126,7 +126,7 @@ void record_reference_matmul( // inline void -fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { +fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -164,7 +164,7 @@ inline std::vector extract_vtensor(api::vTensor& vten) { } inline void -check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { +check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..3d172f490cf 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len); + StorageBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -407,7 +407,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StagingBuffer staging_buffer( + StorageBuffer staging_buffer( context(), 
vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); + StorageBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2073,7 +2073,7 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StagingBuffer staging_buffer_in( + StorageBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); @@ -2084,7 +2084,7 @@ void round_trip_test( data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); // Output staging buffer - StagingBuffer staging_buffer_out( + StorageBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StorageBuffer 
staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 20c6254e1a0..0d312ee87c3 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StagingBuffer buffer(context(), vkapi::kFloat, 1); + StorageBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. 
auto bench_sm = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index 31137b11eea..c8cf93c4a12 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StagingBuffer out_buf(context(), vkapi::kFloat, 1); + StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StorageBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StagingBuffer out_buf( + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index c9ff133f1ec..7679f11b0ca 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StagingBuffer out_buf( + StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; From 48b2c0b06fa021a632bcb215dd7ced206adfe809 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 10:36:27 -0700 Subject: [PATCH 08/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. 
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] From e0928ff1ba7561d14d383c11739e1d3f388cb935 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 11:30:59 -0700 Subject: [PATCH 09/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index b291722c3f0..14422e45d7c 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 +e4cd76cf8283c8ddbf95674b020fbfcff467cb4b diff --git a/install_requirements.py b/install_requirements.py index 64243ec6943..1f5982c80e0 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240901" +NIGHTLY_VERSION = "dev20240829" # The pip repository that hosts nightly torch packages. 
TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" From 7029ab6cc4459de0411269097c0c88f562275843 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Wed, 4 Sep 2024 13:52:09 -0700 Subject: [PATCH 10/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] From cc058156475fadb63b2fbb03779ab87cbc7321f2 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Wed, 4 Sep 2024 13:57:53 -0700 Subject: [PATCH 11/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned]