diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h
index a24728470b0..66c607e178c 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.h
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h
@@ -14,6 +14,8 @@
 
 #include <executorch/backends/vulkan/runtime/vk_api/memory/Buffer.h>
 
+#include <cstring>
+
 namespace vkcompute {
 namespace api {
 
@@ -55,6 +57,12 @@ class StagingBuffer final {
     return vulkan_buffer_;
   }
 
+  // Returns the host address of the staging memory, read from the
+  // pMappedData field of the buffer's VmaAllocationInfo.
+  inline void* data() {
+    return vulkan_buffer_.allocation_info().pMappedData;
+  }
+
   inline size_t numel() {
     return numel_;
   }
@@ -62,6 +70,44 @@ class StagingBuffer final {
   inline size_t nbytes() {
     return nbytes_;
   }
+
+  // Copies nbytes bytes from src into the staging memory and then flushes
+  // the allocation. nbytes must not exceed nbytes(); VK_CHECK_COND enforces
+  // this. The flush result is checked so a failed flush is not ignored.
+  inline void copy_from(const void* src, const size_t nbytes) {
+    VK_CHECK_COND(nbytes <= nbytes_);
+    memcpy(data(), src, nbytes);
+    VK_CHECK(vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE));
+  }
+
+  // Invalidates the allocation and then copies nbytes bytes from the staging
+  // memory into dst. nbytes must not exceed nbytes(); VK_CHECK_COND enforces
+  // this. The invalidate result is checked before any data is read.
+  inline void copy_to(void* dst, const size_t nbytes) {
+    VK_CHECK_COND(nbytes <= nbytes_);
+    VK_CHECK(vmaInvalidateAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE));
+    memcpy(dst, data(), nbytes);
+  }
+
+  // Zero-fills the entire staging memory (nbytes() bytes). The allocation is
+  // flushed afterwards, mirroring copy_from(), so this host write is handled
+  // the same way as every other host write to the buffer.
+  inline void set_staging_zeros() {
+    memset(data(), 0, nbytes_);
+    VK_CHECK(vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE));
+  }
 };
 
 } // namespace api
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index a8f57f57d2a..c22241940f8 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -401,7 +401,7 @@ void ComputeGraph::copy_into_staging(
     const size_t numel) {
   StagingPtr staging = get_staging(idx);
   size_t nbytes = numel * vkapi::element_size(staging->dtype());
-  copy_ptr_to_staging(data, *staging, nbytes);
+  staging->copy_from(data, nbytes);
 }
 
 void ComputeGraph::copy_from_staging(
@@ -410,7 +410,7 @@ void ComputeGraph::copy_from_staging(
     const size_t numel) {
   StagingPtr staging = get_staging(idx);
   size_t nbytes = numel * vkapi::element_size(staging->dtype());
-  copy_staging_to_ptr(*staging, data, nbytes);
+  staging->copy_to(data, nbytes);
 }
 
 void ComputeGraph::prepare() {
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index a9c2f6c9b6a..61b24cd409b 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -53,8 +53,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   if (graph->val_is_none(tref_)) {
     size_t numel = utils::multiply_integers(packed->sizes());
     api::StagingBuffer staging(graph->context(), packed->dtype(), numel);
-    size_t nbytes = numel * vkapi::element_size(packed->dtype());
-    set_staging_zeros(staging, nbytes);
+    staging.set_staging_zeros();
     return staging;
   }
 
@@ -62,7 +61,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
-  copy_ptr_to_staging(tref->data, staging, nbytes);
+  staging.copy_from(tref->data, nbytes);
   return staging;
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
index 9cb715e202a..8804bcf2ef6 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -13,88 +13,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 
-#include <cstring>
-
 namespace vkcompute {
 
-template <typename T>
-void memcpy_to_mapping_impl(
-    const void* src,
-    vkapi::MemoryMap& dst_mapping,
-    const size_t nbytes) {
-  T* data_ptr = dst_mapping.template data<T>();
-  memcpy(data_ptr, reinterpret_cast<const T*>(src), nbytes);
-}
-
-template <typename T>
-void memcpy_from_mapping_impl(
-    vkapi::MemoryMap& src_mapping,
-    void* dst,
-    const size_t nbytes) {
-  T* data_ptr = src_mapping.template data<T>();
-  memcpy(reinterpret_cast<T*>(dst), data_ptr, nbytes);
-}
-
-void memcpy_to_mapping(
-    const void* src,
-    vkapi::MemoryMap& dst_mapping,
-    const size_t nbytes,
-    const vkapi::ScalarType dtype) {
-#define DTYPE_CASE(ctype, vkformat, name)                    \
-  case vkapi::ScalarType::name:                              \
-    memcpy_to_mapping_impl<ctype>(src, dst_mapping, nbytes); \
-    break;
-
-  switch (dtype) {
-    VK_FORALL_SCALAR_TYPES(DTYPE_CASE)
-    default:
-      VK_THROW("Unrecognized dtype!");
-  }
-#undef DTYPE_CASE
-}
-
-void memcpy_from_mapping(
-    vkapi::MemoryMap& src_mapping,
-    void* dst,
-    const size_t nbytes,
-    const vkapi::ScalarType dtype) {
-#define DTYPE_CASE(ctype, vkformat, name)                      \
-  case vkapi::ScalarType::name:                                \
-    memcpy_from_mapping_impl<ctype>(src_mapping, dst, nbytes); \
-    break;
-
-  switch (dtype) {
-    VK_FORALL_SCALAR_TYPES(DTYPE_CASE)
-    default:
-      VK_THROW("Unrecognized dtype!");
-  }
-#undef DTYPE_CASE
-}
-
-void copy_ptr_to_staging(
-    const void* src,
-    api::StagingBuffer& staging,
-    const size_t nbytes) {
-  vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE);
-  mapping.invalidate();
-  memcpy_to_mapping(src, mapping, nbytes, staging.dtype());
-}
-
-void copy_staging_to_ptr(
-    api::StagingBuffer& staging,
-    void* dst,
-    const size_t nbytes) {
-  vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ);
-  mapping.invalidate();
-  memcpy_from_mapping(mapping, dst, nbytes, staging.dtype());
-}
-
-void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) {
-  vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE);
-  uint8_t* data_ptr = mapping.template data<uint8_t>();
-  memset(data_ptr, 0, staging.nbytes());
-}
-
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
     const bool int8_buffer_enabled) {
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
index f16c52ecf33..8d63958a738 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
@@ -12,25 +12,6 @@
 
 namespace vkcompute {
 
-//
-// Functions to copy data into and out of a staging buffer
-//
-
-void copy_ptr_to_staging(
-    const void* src,
-    api::StagingBuffer& staging,
-    const size_t nbytes);
-void copy_staging_to_ptr(
-    api::StagingBuffer& staging,
-    void* dst,
-    const size_t nbytes);
-
-void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes);
-
-//
-// Functions to get shaders
-//
-
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
     bool int8_buffer_enabled = true);
diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp
index b07bb2862d3..d4e0fc9702e 100644
--- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp
@@ -30,6 +30,7 @@ Allocation::Allocation()
       create_info{},
       allocator(VK_NULL_HANDLE),
       allocation(VK_NULL_HANDLE),
+      allocation_info({}),
       is_copy_(false) {}
 
 Allocation::Allocation(
@@ -40,6 +41,7 @@ Allocation::Allocation(
       create_info(create_info),
       allocator(vma_allocator),
       allocation(VK_NULL_HANDLE),
+      allocation_info({}),
       is_copy_(false) {
   VK_CHECK(vmaAllocateMemory(
       allocator, &memory_requirements, &create_info, &allocation, nullptr));
@@ -50,6 +52,7 @@ Allocation::Allocation(const Allocation& other) noexcept
       create_info(other.create_info),
       allocator(other.allocator),
       allocation(other.allocation),
+      allocation_info(other.allocation_info),
       is_copy_(true) {}
 
 Allocation::Allocation(Allocation&& other) noexcept
@@ -57,8 +60,10 @@ Allocation::Allocation(Allocation&& other) noexcept
       create_info(other.create_info),
       allocator(other.allocator),
       allocation(other.allocation),
+      allocation_info(other.allocation_info),
       is_copy_(other.is_copy_) {
   other.allocation = VK_NULL_HANDLE;
+  other.allocation_info = {};
 }
 
 Allocation& Allocation::operator=(Allocation&& other) noexcept {
@@ -68,9 +73,11 @@ Allocation& Allocation::operator=(Allocation&& other) noexcept {
   create_info = other.create_info;
   allocator = other.allocator;
   allocation = other.allocation;
+  allocation_info = other.allocation_info;
   is_copy_ = other.is_copy_;
 
   other.allocation = tmp_allocation;
+  other.allocation_info = {};
 
   return *this;
 }
diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h
index cec6f61e766..44e8277a35c 100644
--- a/backends/vulkan/runtime/vk_api/memory/Allocation.h
+++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h
@@ -62,6 +62,8 @@ struct Allocation final {
   VmaAllocator allocator;
   // Handles to the allocated memory
   VmaAllocation allocation;
+  // Information about the allocated memory
+  VmaAllocationInfo allocation_info;
 
  private:
   // Indicates whether this class instance is a copy of another class instance,
diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
index b990cf6a119..e814063fa90 100644
--- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
@@ -142,7 +142,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) {
   // Staging buffers are accessed by both the CPU and GPU, so set the
   // appropriate flags to indicate that the host device will be accessing
   // the data from this buffer.
-  alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+  alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT |
+      VMA_ALLOCATION_CREATE_MAPPED_BIT;
   alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
   alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
   alloc_create_info.preferredFlags =
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
index 366b45a5e41..5a78dab764d 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
@@ -67,7 +67,7 @@ VulkanBuffer::VulkanBuffer(
         &allocation_create_info,
         &handle_,
         &(memory_.allocation),
-        nullptr));
+        &(memory_.allocation_info)));
   } else {
     VmaAllocatorInfo allocator_info{};
     vmaGetAllocatorInfo(allocator_, &allocator_info);
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
index 9302048f861..af32ffffa84 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.h
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -114,6 +114,13 @@ class VulkanBuffer final {
     return memory_.allocation;
   }
 
+  // Returns the VmaAllocationInfo stored alongside this buffer's memory.
+  // Returned by const reference so that reading a single field (for example
+  // pMappedData) does not copy the whole struct on every call.
+  inline const VmaAllocationInfo& allocation_info() const {
+    return memory_.allocation_info;
+  }
+
   inline VmaAllocationCreateInfo allocation_create_info() const {
     return VmaAllocationCreateInfo(memory_.create_info);
   }
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 4c2972419d0..a469a44dc1a 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -326,15 +326,15 @@ void record_reference_matmul(
 void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
   api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());
 
-#define CASE(ctype, name)                                                     \
-  case vkapi::ScalarType::name: {                                             \
-    std::vector<ctype> data_converted;                                        \
-    data_converted.resize(data.size());                                       \
-    for (int i = 0; i < data.size(); ++i) {                                   \
-      data_converted[i] = ctype(data[i]);                                     \
-    }                                                                         \
-    copy_ptr_to_staging(                                                      \
-        data_converted.data(), staging_buffer, vten.staging_buffer_nbytes()); \
+#define CASE(ctype, name)                                     \
+  case vkapi::ScalarType::name: {                             \
+    std::vector<ctype> data_converted;                        \
+    data_converted.resize(data.size());                       \
+    for (int i = 0; i < data.size(); ++i) {                   \
+      data_converted[i] = ctype(data[i]);                     \
+    }                                                         \
+    staging_buffer.copy_from(                                 \
+        data_converted.data(), vten.staging_buffer_nbytes()); \
   } break;
 
   switch (vten.dtype()) {
@@ -424,14 +424,14 @@ void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
   api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
 
-#define CASE(ctype, name)                                                     \
-  case vkapi::ScalarType::name: {                                             \
-    std::vector<ctype> data_converted(data.size());                           \
-    copy_staging_to_ptr(                                                      \
-        staging_buffer, data_converted.data(), vten.staging_buffer_nbytes()); \
-    for (int i = 0; i < data.size(); ++i) {                                   \
-      data[i] = float(data_converted[i]);                                     \
-    }                                                                         \
+#define CASE(ctype, name)                                     \
+  case vkapi::ScalarType::name: {                             \
+    std::vector<ctype> data_converted(data.size());           \
+    staging_buffer.copy_to(                                   \
+        data_converted.data(), vten.staging_buffer_nbytes()); \
+    for (int i = 0; i < data.size(); ++i) {                   \
+      data[i] = float(data_converted[i]);                     \
+    }                                                         \
   } break;
 
   switch (vten.dtype()) {
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index 3bc12c472db..25163e664bf 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -132,7 +132,7 @@ fill_staging(api::StagingBuffer& staging, float val, int numel = -1) {
   }
   std::vector<float> data(numel);
   std::fill(data.begin(), data.end(), val);
-  copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel);
+  staging.copy_from(data.data(), sizeof(float) * numel);
 }
 
 void fill_vtensor(api::vTensor& vten, std::vector<float>& data);
@@ -169,7 +169,7 @@ check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) {
     numel = staging.numel();
   }
   std::vector<float> data(numel);
-  copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel);
+  staging.copy_to(data.data(), sizeof(float) * numel);
 
   for (size_t i = 0; i < data.size(); ++i) {
     CHECK_VALUE(data, i, val);
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index a0bfefafa02..c035d5f8b85 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -360,7 +360,7 @@ TEST_F(VulkanComputeAPITest, spec_var_shader_test) {
   submit_to_gpu();
 
   std::vector<float> data(len);
-  copy_staging_to_ptr(buffer, data.data(), buffer.nbytes());
+  buffer.copy_to(data.data(), buffer.nbytes());
 
   for (size_t i = 0; i < len; ++i) {
     CHECK_VALUE(data, i, scale * i + offset);
@@ -470,7 +470,7 @@ void test_storage_buffer_type(const size_t len) {
   submit_to_gpu();
 
   std::vector<T> data(len);
-  copy_staging_to_ptr(buffer, data.data(), buffer.nbytes());
+  buffer.copy_to(data.data(), buffer.nbytes());
 
   for (size_t i = 0; i < len; ++i) {
     CHECK_VALUE(data, i, T(i));
@@ -2132,7 +2132,7 @@ void run_from_gpu_test(
   submit_to_gpu();
 
   std::vector<T> data_out(staging_buffer.numel());
-  copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes());
+  staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes());
 
   for (int i = 0; i < vten.numel(); i++) {
     CHECK_VALUE(data_out, i, i + offset);
@@ -2160,8 +2160,7 @@ void round_trip_test(
   for (int i = 0; i < staging_buffer_in.numel(); i++) {
     data_in[i] = T(i * -1);
   }
-  copy_ptr_to_staging(
-      data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes());
+  staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes());
 
   // Output staging buffer
   StagingBuffer staging_buffer_out(
@@ -2182,8 +2181,7 @@ void round_trip_test(
 
   // Extract data from output staging buffer
   std::vector<T> data_out(staging_buffer_out.numel());
-  copy_staging_to_ptr(
-      staging_buffer_out, data_out.data(), staging_buffer_out.nbytes());
+  staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes());
 
   // All indices should be equal to the input data
   for (int i = 0; i < vten.numel(); i++) {
@@ -2624,8 +2622,7 @@ void test_conv2d(
   for (int i = 0; i < in_numel; i++) {
     data_in[i] = i + 1;
   }
-  copy_ptr_to_staging(
-      data_in.data(), staging_buffer_in, sizeof(float) * in_numel);
+  staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel);
 
   // Output staging buffer
   const int64_t out_numel =
@@ -2642,8 +2639,7 @@ void test_conv2d(
 
   // Extract data from output staging buffer
   std::vector<float> data_out(out_numel);
-  copy_staging_to_ptr(
-      staging_buffer_out, data_out.data(), sizeof(float) * out_numel);
+  staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel);
 
   // Check data matches results copied from ATen-VK
   for (int i = 0; i < vten.numel(); i++) {