From 5ec55b2ca7b40f026363fca7787bdeea4ff7689d Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:17 -0700 Subject: [PATCH 01/11] [ET-VK] Rename `StorageBuffer` to `StagingBuffer` 1. Tensors accessible to both CPU and GPU use `StorageBuffer`. 2. Tensors accessible to GPU only use `vTensorStorage`. Since `StorageBuffer` is only used for [staging buffers](https://vulkan-tutorial.com/Vertex_buffers/Staging_buffer), i.e., buffers accessible to both CPU and GPU, `StagingBuffer` is a fitting name for (1) that decreases confusion between (1) and (2). Differential Revision: [D62049779](https://our.internmc.facebook.com/intern/diff/D62049779/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 2 +- backends/vulkan/runtime/api/api.h | 2 +- .../{StorageBuffer.h => StagingBuffer.h} | 14 +++++++------- backends/vulkan/runtime/graph/ComputeGraph.cpp | 4 ++-- backends/vulkan/runtime/graph/ComputeGraph.h | 2 +- backends/vulkan/runtime/graph/containers/Value.h | 8 ++++---- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 8 ++++---- backends/vulkan/runtime/graph/ops/PrepackNode.h | 2 +- .../runtime/graph/ops/utils/BindingUtils.cpp | 2 +- .../runtime/graph/ops/utils/BindingUtils.h | 2 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 6 +++--- .../runtime/graph/ops/utils/StagingUtils.h | 6 +++--- backends/vulkan/test/utils/test_utils.cpp | 6 +++--- backends/vulkan/test/utils/test_utils.h | 10 +++++----- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- .../vulkan/tools/gpuinfo/include/architecture.h | 6 +++--- backends/vulkan/tools/gpuinfo/include/buffers.h | 8 ++++---- backends/vulkan/tools/gpuinfo/include/textures.h | 4 ++-- 18 files changed, 54 insertions(+), 54 deletions(-) rename backends/vulkan/runtime/api/containers/{StorageBuffer.h => StagingBuffer.h} (83%) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 4d2a854de3b..e13293c3098 100644 --- 
a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -189,7 +189,7 @@ Context* context() { const vkapi::DescriptorPoolConfig descriptor_pool_config{ VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount + VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStagingBufferCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount 32u, // descriptorPileSizes diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0e..0f496a4af8a 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h similarity index 83% rename from backends/vulkan/runtime/api/containers/StorageBuffer.h rename to backends/vulkan/runtime/api/containers/StagingBuffer.h index 17c34706057..ab650c09a43 100644 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -17,7 +17,7 @@ namespace vkcompute { namespace api { -class StorageBuffer final { +class StagingBuffer final { private: Context* context_p_; vkapi::ScalarType dtype_; @@ -26,7 +26,7 @@ class StorageBuffer final { vkapi::VulkanBuffer vulkan_buffer_; public: - StorageBuffer( + StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, const size_t numel, @@ -39,13 +39,13 @@ class StorageBuffer final { nbytes_, gpuonly)) {} - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; - StorageBuffer(StorageBuffer&&) = default; - 
StorageBuffer& operator=(StorageBuffer&&) = default; + StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; - ~StorageBuffer() { + ~StagingBuffer() { context_p_->register_buffer_cleanup(vulkan_buffer_); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 9fa0091b298..6c3ec88eaa7 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,7 +38,7 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) @@ -236,7 +236,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 5740d24a448..9b04b08a70e 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -58,7 +58,7 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h 
index ba82213c6f8..6e03bbd4a21 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -53,7 +53,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -108,7 +108,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -152,7 +152,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -247,7 +247,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920dd..a9c2f6c9b6a 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,14 +45,14 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +60,7 @@ api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; @@ -70,7 +70,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963fd..3e713303c3d 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace225..2cfb34a052e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } 
void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da7..eed39a97979 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 294e36b9a86..9cb715e202a 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -73,7 +73,7 @@ void memcpy_from_mapping( void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); mapping.invalidate(); @@ -81,7 +81,7 @@ void copy_ptr_to_staging( } void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); @@ -89,7 +89,7 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); uint8_t* data_ptr = mapping.template data(); memset(data_ptr, 0, staging.nbytes()); diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index cabc17f30ee..f16c52ecf33 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -18,14 +18,14 @@ namespace vkcompute { void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes); void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes); -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); // // Functions to get shaders diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4a3a41d6c72..4c2972419d0 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -112,7 +112,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer) { + api::StagingBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -411,7 +411,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( + api::StagingBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { diff 
--git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index c8af5470862..3bc12c472db 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -37,13 +37,13 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); @@ -85,7 +85,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer); + api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( api::Context* const context, @@ -126,7 +126,7 @@ void record_reference_matmul( // inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -164,7 +164,7 @@ inline std::vector extract_vtensor(api::vTensor& vten) { } inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 3d172f490cf..f3c60a21376 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ 
-336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -407,7 +407,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer( + StagingBuffer staging_buffer( context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2073,7 +2073,7 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in( + StagingBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); @@ -2084,7 +2084,7 @@ void round_trip_test( data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out( + StagingBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - 
StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c3..20c6254e1a0 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. 
auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index c8cf93c4a12..31137b11eea 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index 7679f11b0ca..c9ff133f1ec 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer out_buf( + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; From d3987b83dbe25cfd76629247ab1409ccff511256 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:20 -0700 Subject: [PATCH 02/11] [ET-VK] Remove unused Allocator function TSIA Differential Revision: [D62049777](https://our.internmc.facebook.com/intern/diff/D62049777/) [ghstack-poisoned] --- backends/vulkan/runtime/vk_api/memory/Allocator.cpp | 11 ----------- backends/vulkan/runtime/vk_api/memory/Allocator.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0b..f7428f12b67 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -162,17 +162,6 @@ VulkanBuffer Allocator::create_storage_buffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb4..6d8ee09ae5d 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -67,8 +67,6 @@ class 
Allocator final { const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ From 634033368f4d7c36e8137f924815a85163f47e2e Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 15:08:23 -0700 Subject: [PATCH 03/11] [ET-VK] Simplify Allocator's buffer creation methods There are ~4 lines of duplicated code but IMO, the distinct function names make the code much clearer. Differential Revision: [D62049778](https://our.internmc.facebook.com/intern/diff/D62049778/) [ghstack-poisoned] --- .../runtime/api/containers/StagingBuffer.h | 8 ++-- .../vulkan/runtime/api/containers/Tensor.cpp | 2 +- .../runtime/vk_api/memory/Allocator.cpp | 40 +++++++++---------- .../vulkan/runtime/vk_api/memory/Allocator.h | 3 +- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index ab650c09a43..a24728470b0 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -29,15 +29,13 @@ class StagingBuffer final { StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) + const size_t numel) : context_p_(context_p), dtype_(dtype), numel_(numel), nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 578898ad194..7b9d30ef658 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ 
b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -540,7 +540,7 @@ vkapi::VulkanBuffer allocate_buffer( } return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, allocate_memory); + element_size(dtype) * numel, allocate_memory); } vTensorStorage::vTensorStorage( diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index f7428f12b67..b990cf6a119 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -132,9 +132,27 @@ VulkanImage Allocator::create_image( allocate_memory); } +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing + // the data from this buffer. 
+ alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); +} + VulkanBuffer Allocator::create_storage_buffer( const VkDeviceSize size, - const bool gpu_only, const bool allocate_memory) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -142,22 +160,6 @@ VulkanBuffer Allocator::create_storage_buffer( alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing - // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. 
- VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } - return VulkanBuffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } @@ -170,9 +172,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 6d8ee09ae5d..7d02ffe54e3 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -62,9 +62,10 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); /* From 341c5a7c95a7c10827c47d255d33238f6b16ce96 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 16:31:57 -0700 Subject: [PATCH 04/11] Update base for Update on "[ET-VK] Simplify Allocator's buffer creation methods" There are ~4 lines of duplicated code but IMO, the distinct function names make the code much clearer. 
Differential Revision: [D62049778](https://our.internmc.facebook.com/intern/diff/D62049778/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index e13293c3098..4d2a854de3b 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -189,7 +189,7 @@ Context* context() { const vkapi::DescriptorPoolConfig descriptor_pool_config{ VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStagingBufferCount + VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount 32u, // descriptorPileSizes From a17af04a3464259f34c2e293763de98916ba38e2 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Fri, 30 Aug 2024 16:31:58 -0700 Subject: [PATCH 05/11] [ET-VK] Persistently map staging buffers `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped on every inference, which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load.
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- .../runtime/api/containers/StagingBuffer.h | 11 +++++++++-- backends/vulkan/runtime/graph/ComputeGraph.cpp | 11 +++++++---- backends/vulkan/runtime/graph/ComputeGraph.h | 5 ++++- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 9 +++++++-- .../runtime/graph/ops/utils/StagingUtils.cpp | 8 ++------ .../vulkan/runtime/vk_api/memory/Allocator.cpp | 3 ++- backends/vulkan/runtime/vk_api/memory/Buffer.cpp | 15 ++++++++++++++- backends/vulkan/runtime/vk_api/memory/Buffer.h | 4 ++-- backends/vulkan/test/utils/test_utils.cpp | 4 ++-- backends/vulkan/test/utils/test_utils.h | 4 ++-- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- 11 files changed, 59 insertions(+), 31 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index a24728470b0..f457878bc0e 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -24,18 +24,21 @@ class StagingBuffer final { size_t numel_; size_t nbytes_; vkapi::VulkanBuffer vulkan_buffer_; + vkapi::MemoryMap memory_map_; public: StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, - const size_t numel) + const size_t numel, + const vkapi::MemoryAccessType access) : context_p_(context_p), dtype_(dtype), numel_(numel), nbytes_(element_size(dtype_) * numel_), vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + memory_map_(vulkan_buffer_, access) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; @@ -55,6 +58,10 @@ class StagingBuffer final { return vulkan_buffer_; } + inline vkapi::MemoryMap& mapping() { + return memory_map_; + } + inline size_t numel() { return numel_; } diff --git 
a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..f46623c375c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -233,10 +233,11 @@ ValueRef ComputeGraph::add_tensorref( ValueRef ComputeGraph::add_staging( const vkapi::ScalarType dtype, - const size_t numel) { + const size_t numel, + const vkapi::MemoryAccessType access) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel, access)); return idx; } @@ -269,7 +270,8 @@ ValueRef ComputeGraph::set_input_tensor( // For texture storage, the buffer size needs to account for the zero // padding applied by unused texel elements. size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); + ValueRef staging_idx = + add_staging(dtype, buf_numel, vkapi::MemoryAccessType::WRITE); add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back({idx, staging_idx}); return staging_idx; @@ -286,7 +288,8 @@ ValueRef ComputeGraph::set_output_tensor( // For texture storage, the buffer size needs to account for the zero // padding applied by unused texel elements. size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); + ValueRef staging_idx = + add_staging(dtype, buf_numel, vkapi::MemoryAccessType::READ); // We only run this when the tensor is non-empty. When the underlying // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to // tensor, we will not be able to bind the node for execution. 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..3fe7b104199 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -406,7 +406,10 @@ class ComputeGraph final { * use memory that is visible to both the CPU and GPU, and therefore is used * as a intermediary when transferring data between the CPU and GPU. */ - ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel); + ValueRef add_staging( + const vkapi::ScalarType dtype, + const size_t numel, + const vkapi::MemoryAccessType access); ValueRef add_none(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index a9c2f6c9b6a..d1b75a52c9e 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -52,7 +52,11 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + api::StagingBuffer staging( + graph->context(), + packed->dtype(), + numel, + vkapi::MemoryAccessType::WRITE); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +64,8 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging( + graph->context(), tref->dtype, numel, vkapi::MemoryAccessType::WRITE); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 9cb715e202a..ab18f233fbe 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -75,18 +75,14 @@ void copy_ptr_to_staging( const void* src, api::StagingBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); + memcpy_to_mapping(src, staging.mapping(), nbytes, staging.dtype()); } void copy_staging_to_ptr( api::StagingBuffer& staging, void* dst, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); + memcpy_from_mapping(staging.mapping(), dst, nbytes, staging.dtype()); } void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp 
b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index b990cf6a119..e814063fa90 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -142,7 +142,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { // Staging buffers are accessed by both the CPU and GPU, so set the // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; alloc_create_info.preferredFlags = diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 366b45a5e41..8593806788b 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -158,7 +158,7 @@ MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access) } } -MemoryMap::MemoryMap(MemoryMap&& other) noexcept +MemoryMap::MemoryMap(MemoryMap&& other) : access_(other.access_), allocator_(other.allocator_), allocation_(other.allocation_), @@ -168,6 +168,19 @@ MemoryMap::MemoryMap(MemoryMap&& other) noexcept other.data_ = nullptr; } +MemoryMap& MemoryMap::operator=(MemoryMap&& other) { + access_ = other.access_; + allocator_ = other.allocator_; + allocation_ = other.allocation_; + data_ = other.data_; + data_len_ = other.data_len_; + + other.allocation_ = VK_NULL_HANDLE; + other.data_ = nullptr; + + return *this; +} + MemoryMap::~MemoryMap() { if (!data_) { return; diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 9302048f861..d7878347118 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ 
b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -174,8 +174,8 @@ class MemoryMap final { MemoryMap(const MemoryMap&) = delete; MemoryMap& operator=(const MemoryMap&) = delete; - MemoryMap(MemoryMap&&) noexcept; - MemoryMap& operator=(MemoryMap&&) = delete; + MemoryMap(MemoryMap&&); + MemoryMap& operator=(MemoryMap&&); ~MemoryMap(); diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4c2972419d0..b4eba76ca4e 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size(), vkapi::MemoryAccessType::WRITE); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -412,7 +412,7 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data) { api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel()); + api::context(), vten.dtype(), vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3bc12c472db..c087b192d41 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -38,13 +38,13 @@ using namespace vkcompute; #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), 
tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::READ); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..2bb66654205 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); float scale = 3.0f; float offset = 1.5f; @@ -408,7 +408,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { } StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel()); + context(), vkapi::kFloat, a.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len, vkapi::MemoryAccessType::WRITE); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2074,7 @@ void 
round_trip_test( // Create and fill input staging buffer StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel()); + context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { @@ -2085,7 +2085,7 @@ void round_trip_test( // Output staging buffer StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel()); + context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( From 7f5c3e2f4efb98ceb5586c672f825858ab0ca1a9 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 09:13:20 -0700 Subject: [PATCH 06/11] Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. 
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- backends/vulkan/test/utils/test_utils.cpp | 11 +++++-- backends/vulkan/test/utils/test_utils.h | 22 +++++++++----- .../vulkan/test/vulkan_compute_api_test.cpp | 30 ++++++++++++++----- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index b4eba76ca4e..8e4c27d98d2 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -324,7 +324,11 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size(), vkapi::MemoryAccessType::WRITE); + api::StagingBuffer staging_buffer( + api::context(), + vten.dtype(), + data.size(), + vkapi::MemoryAccessType::WRITE); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -412,7 +416,10 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data) { api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + api::context(), + vten.dtype(), + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index c087b192d41..e76a5a7c29b 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -36,16 +36,22 @@ using namespace vkcompute; utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), 
vkapi::MemoryAccessType::WRITE); \ - record_nchw_to_image_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + api::StagingBuffer staging_buffer_##tensor( \ + api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel(), \ + vkapi::MemoryAccessType::WRITE); \ + record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel(), vkapi::MemoryAccessType::READ); \ - record_image_to_nchw_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + api::StagingBuffer staging_buffer_##tensor( \ + api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel(), \ + vkapi::MemoryAccessType::READ); \ + record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 2bb66654205..7e71c58f166 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,8 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); + StagingBuffer buffer( + context(), vkapi::kFloat, len, vkapi::MemoryAccessType::WRITE); float scale = 3.0f; float offset = 1.5f; @@ -408,7 +409,10 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { } StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); + context(), + vkapi::kFloat, + a.staging_buffer_numel(), + vkapi::MemoryAccessType::WRITE); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -2040,7 +2044,11 @@ void 
run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + StagingBuffer staging_buffer( + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2082,10 @@ void round_trip_test( // Create and fill input staging buffer StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::WRITE); + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::WRITE); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { @@ -2085,7 +2096,10 @@ void round_trip_test( // Output staging buffer StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel(), vkapi::MemoryAccessType::READ); + context(), + dtype, + vten.staging_buffer_numel(), + vkapi::MemoryAccessType::READ); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2552,8 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); + StagingBuffer staging_buffer_in( + context(), vkapi::kFloat, in_numel, vkapi::MemoryAccessType::WRITE); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2565,8 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); + StagingBuffer staging_buffer_out( + context(), vkapi::kFloat, out_numel, vkapi::MemoryAccessType::READ); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( From 
8e578eb0056fce9f5a1145af9c74de66129f3250 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 10:19:38 -0700 Subject: [PATCH 07/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- backends/vulkan/runtime/api/api.h | 2 +- .../{StagingBuffer.h => StorageBuffer.h} | 14 +++++++------- backends/vulkan/runtime/graph/ComputeGraph.cpp | 4 ++-- backends/vulkan/runtime/graph/ComputeGraph.h | 2 +- backends/vulkan/runtime/graph/containers/Value.h | 8 ++++---- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 8 ++++---- backends/vulkan/runtime/graph/ops/PrepackNode.h | 2 +- .../runtime/graph/ops/utils/BindingUtils.cpp | 2 +- .../runtime/graph/ops/utils/BindingUtils.h | 2 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 6 +++--- .../runtime/graph/ops/utils/StagingUtils.h | 6 +++--- .../vulkan/runtime/vk_api/memory/Allocator.cpp | 11 +++++++++++ .../vulkan/runtime/vk_api/memory/Allocator.h | 2 ++ backends/vulkan/test/utils/test_utils.cpp | 6 +++--- backends/vulkan/test/utils/test_utils.h | 10 +++++----- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- .../vulkan/tools/gpuinfo/include/architecture.h | 6 +++--- backends/vulkan/tools/gpuinfo/include/buffers.h | 8 ++++---- backends/vulkan/tools/gpuinfo/include/textures.h | 4 ++-- 19 files changed, 66 insertions(+), 53 deletions(-) rename backends/vulkan/runtime/api/containers/{StagingBuffer.h => StorageBuffer.h} (82%) diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index 0f496a4af8a..de77c57fb0e 100644 --- a/backends/vulkan/runtime/api/api.h +++ 
b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StorageBuffer.h similarity index 82% rename from backends/vulkan/runtime/api/containers/StagingBuffer.h rename to backends/vulkan/runtime/api/containers/StorageBuffer.h index a24728470b0..f6a17c7e909 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StorageBuffer.h @@ -17,7 +17,7 @@ namespace vkcompute { namespace api { -class StagingBuffer final { +class StorageBuffer final { private: Context* context_p_; vkapi::ScalarType dtype_; @@ -26,7 +26,7 @@ class StagingBuffer final { vkapi::VulkanBuffer vulkan_buffer_; public: - StagingBuffer( + StorageBuffer( Context* context_p, const vkapi::ScalarType dtype, const size_t numel) @@ -37,13 +37,13 @@ class StagingBuffer final { vulkan_buffer_( context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} - StagingBuffer(const StagingBuffer&) = delete; - StagingBuffer& operator=(const StagingBuffer&) = delete; + StorageBuffer(const StorageBuffer&) = delete; + StorageBuffer& operator=(const StorageBuffer&) = delete; - StagingBuffer(StagingBuffer&&) = default; - StagingBuffer& operator=(StagingBuffer&&) = default; + StorageBuffer(StorageBuffer&&) = default; + StorageBuffer& operator=(StorageBuffer&&) = default; - ~StagingBuffer() { + ~StorageBuffer() { context_p_->register_buffer_cleanup(vulkan_buffer_); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..9fa0091b298 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,7 +38,7 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, 
api::StagingBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) @@ -236,7 +236,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); + values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); return idx; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..5740d24a448 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -58,7 +58,7 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 6e03bbd4a21..ba82213c6f8 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -53,7 +53,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StagingBuffer as_staging; + api::StorageBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -108,7 +108,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); + TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); 
// Scalar lists @@ -152,7 +152,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StagingBuffer(); + payload.as_staging.~StorageBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -247,7 +247,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StagingBuffer, + api::StorageBuffer, Staging, TypeTag::STAGING, as_staging); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index a9c2f6c9b6a..b77c62920dd 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,14 +45,14 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + api::StorageBuffer staging(graph->context(), packed->dtype(), numel); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +60,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); + api::StorageBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; @@ -70,7 +70,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StagingBuffer staging = create_staging_buffer(graph); + api::StorageBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 3e713303c3d..c3ac8b963fd 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StagingBuffer create_staging_buffer(ComputeGraph* graph); + api::StorageBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index 2cfb34a052e..b0964ace225 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } 
void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, + api::StorageBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index eed39a97979..3a7ec029da7 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, + api::StorageBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 9cb715e202a..294e36b9a86 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -73,7 +73,7 @@ void memcpy_from_mapping( void copy_ptr_to_staging( const void* src, - api::StagingBuffer& staging, + api::StorageBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); mapping.invalidate(); @@ -81,7 +81,7 @@ void copy_ptr_to_staging( } void copy_staging_to_ptr( - api::StagingBuffer& staging, + api::StorageBuffer& staging, void* dst, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); @@ -89,7 +89,7 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); uint8_t* data_ptr = mapping.template data(); memset(data_ptr, 0, staging.nbytes()); diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index f16c52ecf33..cabc17f30ee 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -18,14 +18,14 @@ namespace vkcompute { void copy_ptr_to_staging( const void* src, - api::StagingBuffer& staging, + api::StorageBuffer& staging, const size_t nbytes); void copy_staging_to_ptr( - api::StagingBuffer& staging, + api::StorageBuffer& staging, void* dst, const size_t nbytes); -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); // // Functions to get shaders diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index b990cf6a119..17492589176 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -164,6 +164,17 @@ VulkanBuffer Allocator::create_storage_buffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + + VkBufferUsageFlags buffer_usage = + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); +} + VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 7d02ffe54e3..0317f36eaa7 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ 
b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -68,6 +68,8 @@ class Allocator final { const VkDeviceSize, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + /* * Create a uniform buffer with a specified size */ diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4c2972419d0..4a3a41d6c72 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -112,7 +112,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StagingBuffer& dst_buffer) { + api::StorageBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -411,7 +411,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer( + api::StorageBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3bc12c472db..c8af5470862 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -37,13 +37,13 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ + api::StorageBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, 
tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ + api::StorageBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); @@ -85,7 +85,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StagingBuffer& dst_buffer); + api::StorageBuffer& dst_buffer); void record_conv2d_prepack_weights_op( api::Context* const context, @@ -126,7 +126,7 @@ void record_reference_matmul( // inline void -fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { +fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -164,7 +164,7 @@ inline std::vector extract_vtensor(api::vTensor& vten) { } inline void -check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { +check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..3d172f490cf 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StagingBuffer buffer(context(), vkapi::kFloat, len); + StorageBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -407,7 +407,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StagingBuffer staging_buffer( + StorageBuffer staging_buffer( context(), 
vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); + StorageBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2073,7 +2073,7 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StagingBuffer staging_buffer_in( + StorageBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); @@ -2084,7 +2084,7 @@ void round_trip_test( data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); // Output staging buffer - StagingBuffer staging_buffer_out( + StorageBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StorageBuffer 
staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 20c6254e1a0..0d312ee87c3 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StagingBuffer buffer(context(), vkapi::kFloat, 1); + StorageBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. 
auto bench_sm = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index 31137b11eea..c8cf93c4a12 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StagingBuffer out_buf(context(), vkapi::kFloat, 1); + StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StorageBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StagingBuffer out_buf( + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index c9ff133f1ec..7679f11b0ca 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StagingBuffer out_buf( + StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; From 48b2c0b06fa021a632bcb215dd7ced206adfe809 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 10:36:27 -0700 Subject: [PATCH 08/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. 
Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] From e0928ff1ba7561d14d383c11739e1d3f388cb935 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 3 Sep 2024 11:30:59 -0700 Subject: [PATCH 09/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index b291722c3f0..14422e45d7c 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 +e4cd76cf8283c8ddbf95674b020fbfcff467cb4b diff --git a/install_requirements.py b/install_requirements.py index 64243ec6943..1f5982c80e0 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240901" +NIGHTLY_VERSION = "dev20240829" # The pip repository that hosts nightly torch packages. 
TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" From 7029ab6cc4459de0411269097c0c88f562275843 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Wed, 4 Sep 2024 13:52:09 -0700 Subject: [PATCH 10/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned] From cc058156475fadb63b2fbb03779ab87cbc7321f2 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Wed, 4 Sep 2024 13:57:53 -0700 Subject: [PATCH 11/11] Update base for Update on "[ET-VK] Persistently map staging buffers" `StagingBuffer`s are used for input, output, and constant tensors. For input and output, they are mapped every inference which is unnecessary. Instead, [persistently map the memory](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/memory_mapping.html) once on model load. Differential Revision: [D59706627](https://our.internmc.facebook.com/intern/diff/D59706627/) [ghstack-poisoned]