diff --git a/CMakeLists.txt b/CMakeLists.txt
index a388a4549d66..62ea0a64d6c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,10 +205,10 @@ cmake_dependent_option(
     USE_VALGRIND "Use Valgrind. Only available on Linux." ON
     "LINUX" OFF)
 option(USE_VULKAN "Use Vulkan GPU backend" OFF)
-option(USE_VULKAN_API "Use Vulkan GPU backend v2" OFF)
-option(USE_VULKAN_WRAPPER "Use Vulkan wrapper" ON)
-option(USE_VULKAN_SHADERC_RUNTIME "Use Vulkan Shader compilation runtime(Needs shaderc lib)" OFF)
-option(USE_VULKAN_RELAXED_PRECISION "Use Vulkan relaxed precision(mediump)" OFF)
+option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
+option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
+option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
+option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)
 option(USE_XNNPACK "Use XNNPACK" ON)
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
@@ -554,22 +554,23 @@ endif()
 
 if(USE_VULKAN)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN")
-endif()
-
-if(USE_VULKAN_API)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_API")
-endif()
 
-if(USE_VULKAN_WRAPPER)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_WRAPPER")
-endif()
+  if(USE_VULKAN_FP16_INFERENCE)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_FP16_INFERENCE")
+  endif()
 
-if(USE_VULKAN_SHADERC_RUNTIME)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_SHADERC_RUNTIME")
-endif()
+  if(USE_VULKAN_RELAXED_PRECISION)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_RELAXED_PRECISION")
+  endif()
 
-if(USE_VULKAN_RELAXED_PRECISION)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_RELAXED_PRECISION")
+  if(USE_VULKAN_SHADERC_RUNTIME)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_SHADERC_RUNTIME")
+  endif()
+
+  if(USE_VULKAN_WRAPPER)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_WRAPPER")
+  endif()
 endif()
 
 if(USE_PYTORCH_METAL)
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f7b27e5947da..fd3c95f2573b 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -65,8 +65,7 @@ file(GLOB native_cpp "native/*.cpp")
 file(GLOB native_mkl_cpp "native/mkl/*.cpp")
 file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
 file(GLOB vulkan_cpp "vulkan/*.cpp")
-file(GLOB native_vulkan_cpp "native/vulkan/*.cpp")
-file(GLOB native_vulkan_api_cpp "native/vulkan/api/*.cpp" "native/vulkan/ops/*.cpp")
+file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/ops/*.cpp")
 
 # Metal
 file(GLOB metal_h "metal/*.h")
@@ -126,9 +125,6 @@ if(AT_MKLDNN_ENABLED)
 endif()
 if(USE_VULKAN)
   set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp} ${native_vulkan_cpp} ${vulkan_generated_cpp})
-  if(USE_VULKAN_API)
-    set(all_cpu_cpp ${all_cpu_cpp} ${native_vulkan_api_cpp})
-  endif()
 else()
   set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp})
 endif()
diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp
index 90920a8e5a82..3646ae7e9496 100644
--- a/aten/src/ATen/native/vulkan/Vulkan.cpp
+++ b/aten/src/ATen/native/vulkan/Vulkan.cpp
@@ -779,17 +779,17 @@ void ComputeUnit::createComputePipeline(
   {
     uint32_t offset = 0;
     size_t size = sizeof(WorkGroupSize::x);
-    spMapEntries[0].constantID = 1;
+    spMapEntries[0].constantID = 0;
     spMapEntries[0].offset = offset;
     spMapEntries[0].size = size;
     offset += size;
    size = sizeof(WorkGroupSize::y);
-    spMapEntries[1].constantID = 2;
+    spMapEntries[1].constantID = 1;
spMapEntries[1].offset = offset; spMapEntries[1].size = size; offset += size; size = sizeof(WorkGroupSize::z); - spMapEntries[2].constantID = 3; + spMapEntries[2].constantID = 2; spMapEntries[2].offset = offset; spMapEntries[2].size = size; } diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h index 4ba02a5e9926..b4203530f635 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.h +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -33,7 +33,7 @@ struct Adapter final { } inline Shader::WorkGroup local_work_group_size() const { - return { 8u, 8u, 1u, }; + return { 4u, 4u, 4u, }; } }; diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index 9a26fa8f48a7..b720608d844b 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -13,6 +13,8 @@ #ifdef USE_VULKAN_WRAPPER #define VMA_STATIC_VULKAN_FUNCTIONS 0 +#else + #define VMA_DYNAMIC_VULKAN_FUNCTIONS 0 #endif #ifdef DEBUG diff --git a/aten/src/ATen/native/vulkan/api/Cache.h b/aten/src/ATen/native/vulkan/api/Cache.h index 83ea3343aa83..b224adbbeeda 100644 --- a/aten/src/ATen/native/vulkan/api/Cache.h +++ b/aten/src/ATen/native/vulkan/api/Cache.h @@ -72,7 +72,7 @@ template inline auto Cache::retrieve( const Descriptor& descriptor) { auto iterator = cache_.find(descriptor); - if (cache_.cend() == iterator) { + if C10_UNLIKELY(cache_.cend() == iterator) { iterator = cache_.insert({descriptor, factory_(descriptor)}).first; } diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index e01691935d70..5aa3586d4683 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -36,9 +36,11 @@ VkCommandPool create_command_pool( return command_pool; } -VkCommandBuffer allocate_command_buffer( +void allocate_command_buffers( const VkDevice device, - const VkCommandPool command_pool) { + const VkCommandPool command_pool, + VkCommandBuffer* const command_buffers, + const uint32_t count) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); @@ -47,37 +49,28 @@ VkCommandBuffer allocate_command_buffer( command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffers && (count > 0u), + "Invalid usage!"); + const VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, - 1u, + count, }; - VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( device, &command_buffer_allocate_info, - &command_buffer)); - - TORCH_CHECK( - command_buffer, - "Invalid Vulkan command buffer!"); - - return command_buffer; + command_buffers)); } } // namespace -Command::Buffer::Buffer() - : command_buffer_(VK_NULL_HANDLE) { -} - -Command::Buffer::Buffer( - const VkDevice device, - const VkCommandPool command_pool) - : command_buffer_(allocate_command_buffer(device, command_pool)) { +Command::Buffer::Buffer(const VkCommandBuffer command_buffer) + : command_buffer_(command_buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "Invalid Vulkan command buffer!"); @@ -99,6 +92,10 @@ void Command::Buffer::Buffer::begin() { VK_CHECK(vkBeginCommandBuffer( command_buffer_, &command_buffer_begin_info)); + + // Reset + bound_.reset(); + barriers_.reset(); } void Command::Buffer::Buffer::end() { @@ -110,74 +107,90 @@ void Command::Buffer::Buffer::end() { 
VK_CHECK(vkEndCommandBuffer(command_buffer_)); } -void Command::Buffer::barrier( - const Pipeline::Barrier& barrier) { +void Command::Buffer::barrier() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " "Potential reason: This command buffer is moved from."); - c10::SmallVector global_memory_barriers; - c10::SmallVector image_memory_barriers; - - if (!barrier.buffers.empty()) { - // Using global memory barriers instead of buffer memory barriers for - // buffers. The consensus seems to be that there is no advantage in - // using the latter in favor of the former. - - VkMemoryBarrier global_memory_barrier{ - VK_STRUCTURE_TYPE_MEMORY_BARRIER, - nullptr, - 0u, - 0u, - }; - - // Coalesce all buffer memory barriers into one global memory barrier. + if (barriers_.stage) { + c10::SmallVector buffer_memory_barriers; + + for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) { + buffer_memory_barriers.push_back({ + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + nullptr, + barrier.memory.src, + barrier.memory.dst, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + barrier.object.handle, + barrier.object.offset, + barrier.object.range, + }); + } - for (const Resource::Buffer::Barrier& barrier : barrier.buffers) { - global_memory_barrier.srcAccessMask |= barrier.memory.src; - global_memory_barrier.dstAccessMask |= barrier.memory.dst; + c10::SmallVector image_memory_barriers; + + for (const Resource::Image::Barrier& barrier : barriers_.images) { + image_memory_barriers.push_back({ + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + nullptr, + barrier.memory.src, + barrier.memory.dst, + barrier.layout.src, + barrier.layout.dst, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + barrier.object.handle, + { + VK_IMAGE_ASPECT_COLOR_BIT, + 0u, + VK_REMAINING_MIP_LEVELS, + 0u, + VK_REMAINING_ARRAY_LAYERS, + }, + }); } - global_memory_barriers.push_back(global_memory_barrier); + vkCmdPipelineBarrier( + command_buffer_, + barriers_.stage.src, + barriers_.stage.dst, + 0u, + 0u, + nullptr, + buffer_memory_barriers.size(), + buffer_memory_barriers.data(), + image_memory_barriers.size(), + image_memory_barriers.data()); } - for (const Resource::Image::Barrier& barrier : barrier.images) { - image_memory_barriers.push_back({ - VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - nullptr, - barrier.memory.src, - barrier.memory.dst, - barrier.layout.src, - barrier.layout.dst, - VK_QUEUE_FAMILY_IGNORED, - VK_QUEUE_FAMILY_IGNORED, - barrier.object.handle, - VkImageSubresourceRange{ - VK_IMAGE_ASPECT_COLOR_BIT, - 0u, - VK_REMAINING_MIP_LEVELS, - 0u, - VK_REMAINING_ARRAY_LAYERS, - }, - }); - } + // Reset + barriers_.reset(); +} - vkCmdPipelineBarrier( +void Command::Buffer::barrier(const Pipeline::Barrier& barrier) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, - barrier.stage.src, - barrier.stage.dst, - 0u, - global_memory_barriers.size(), - global_memory_barriers.data(), - 0u, - nullptr, - image_memory_barriers.size(), - image_memory_barriers.data()); + "This command buffer is in an invalid state! 
" + "Potential reason: This command buffer is moved from."); + + barriers_.stage.src |= barrier.stage.src; + barriers_.stage.dst |= barrier.stage.dst; + + barriers_.buffers.insert( + barriers_.buffers.end(), + barrier.buffers.begin(), + barrier.buffers.end()); + + barriers_.images.insert( + barriers_.images.end(), + barrier.images.begin(), + barrier.images.end()); } -void Command::Buffer::bind( - const Pipeline::Object& pipeline) { +void Command::Buffer::bind(const Pipeline::Object& pipeline) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " @@ -197,8 +210,7 @@ void Command::Buffer::bind( } } -void Command::Buffer::bind( - const Descriptor::Set& set) { +void Command::Buffer::bind(const Descriptor::Set& set) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " @@ -241,6 +253,8 @@ void Command::Buffer::copy( destination, "Invalid Vulkan destination buffer!"); + barrier(); + const VkBufferCopy buffer_copy{ 0u, 0u, @@ -262,17 +276,19 @@ void Command::Buffer::dispatch( "This command buffer is in an invalid state! " "Potential reason: This command buffer is moved from."); + barrier(); + vkCmdDispatch( command_buffer_, utils::div_up( - global_work_group.width, - bound_.pipeline.local_work_group.width), + global_work_group.data[0u], + bound_.pipeline.local_work_group.data[0u]), utils::div_up( - global_work_group.height, - bound_.pipeline.local_work_group.height), + global_work_group.data[1u], + bound_.pipeline.local_work_group.data[1u]), utils::div_up( - global_work_group.depth, - bound_.pipeline.local_work_group.depth)); + global_work_group.data[2u], + bound_.pipeline.local_work_group.data[2u])); } void Command::Buffer::submit( @@ -306,7 +322,8 @@ Command::Pool::Pool(const GPU& gpu) : device_(gpu.device), command_pool_( create_command_pool(gpu.device, gpu.adapter->compute_queue_family_index), - VK_DELETER(CommandPool)(device_)) { + VK_DELETER(CommandPool)(device_)), + buffer_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -314,11 +331,14 @@ Command::Pool::Pool(const GPU& gpu) TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_pool_, "Invalid Vulkan command pool!"); + + buffer_.pool.reserve(Configuration::kReserve); } Command::Pool::Pool(Pool&& pool) : device_(std::move(pool.device_)), - command_pool_(std::move(pool.command_pool_)) { + command_pool_(std::move(pool.command_pool_)), + buffer_(std::move(pool.buffer_)) { pool.device_ = VK_NULL_HANDLE; } @@ -326,6 +346,7 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) { if (&pool != this) { device_ = std::move(pool.device_); command_pool_ = std::move(pool.command_pool_); + buffer_ = std::move(pool.buffer_); pool.device_ = VK_NULL_HANDLE; }; @@ -333,13 +354,42 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) { return *this; } +Command::Pool::~Pool() { + try { + if (device_ && command_pool_) { + purge(); + } + } + catch (const std::exception& e) { + LOG(WARNING) + << "Vulkan: Command pool destructor raised an exception! Error: " + << e.what(); + } + catch (...) { + LOG(WARNING) + << "Vulkan: Command pool destructor raised an unknown exception!"; + } +} + Command::Buffer Command::Pool::allocate() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_ && command_pool_, "This command pool is in an invalid state! 
" "Potential reason: This command pool is moved from."); - return Buffer(device_, command_pool_.get()); + if (buffer_.pool.size() == buffer_.in_use) { + buffer_.pool.resize( + buffer_.pool.size() + + Configuration::kQuantum); + + allocate_command_buffers( + device_, + command_pool_.get(), + buffer_.pool.data() + buffer_.in_use, + Configuration::kQuantum); + } + + return Buffer(buffer_.pool[buffer_.in_use++]); } void Command::Pool::purge() { @@ -348,6 +398,7 @@ void Command::Pool::purge() { "This command pool is in an invalid state! " "Potential reason: This command pool is moved from."); + buffer_.in_use = 0u; VK_CHECK(vkResetCommandPool(device_, command_pool_.get(), 0u)); } diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 8e2f235cfa27..42f073674be5 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -20,8 +20,7 @@ struct Command final { class Buffer final { public: - Buffer(); - Buffer(VkDevice device, VkCommandPool command_pool); + Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE); Buffer(const Buffer&) = delete; Buffer& operator=(const Buffer&) = delete; Buffer(Buffer&&); @@ -39,12 +38,32 @@ struct Command final { void dispatch(const Shader::WorkGroup& global_work_group); void submit(VkQueue queue, Resource::Fence fence = {}); + private: + void barrier(); + private: VkCommandBuffer command_buffer_; - struct { + + struct Bound final { Pipeline::Object pipeline; VkDescriptorSet descriptor_set; + + void reset(); } bound_; + + struct Barrier final { + struct Stage final { + VkPipelineStageFlags src; + VkPipelineStageFlags dst; + + operator bool() const; + } stage; + + c10::SmallVector buffers; + c10::SmallVector images; + + void reset(); + } barriers_; }; // @@ -58,14 +77,24 @@ struct Command final { Pool& operator=(const Pool&) = delete; Pool(Pool&&); Pool& operator=(Pool&&); - ~Pool() = default; + ~Pool(); Buffer allocate(); void purge(); private: + struct Configuration final { + static constexpr uint32_t kQuantum = 64u; + static constexpr uint32_t kReserve = 1024u; + }; + VkDevice device_; Handle command_pool_; + + struct { + std::vector pool; + size_t in_use; + } buffer_; } pool /* [thread_count] */; explicit Command(const GPU& gpu) @@ -79,7 +108,8 @@ struct Command final { inline Command::Buffer::Buffer(Buffer&& buffer) : command_buffer_(std::move(buffer.command_buffer_)), - bound_(std::move(buffer.bound_)) { + bound_(std::move(buffer.bound_)), + barriers_(std::move(buffer.barriers_)) { buffer.command_buffer_ = VK_NULL_HANDLE; } @@ -87,6 +117,7 @@ inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) { if (&buffer != this) { command_buffer_ = std::move(buffer.command_buffer_); bound_ = std::move(buffer.bound_); + barriers_ = std::move(buffer.barriers_); buffer.command_buffer_ = VK_NULL_HANDLE; }; @@ -98,6 +129,22 @@ inline Command::Buffer::operator bool() const { return VK_NULL_HANDLE != command_buffer_; } +inline void Command::Buffer::Bound::reset() { + pipeline = {}; + descriptor_set = VK_NULL_HANDLE; +} + +inline Command::Buffer::Barrier::Stage::operator bool() const { + return (0u != src) || + (0u != dst); +} + +inline void Command::Buffer::Barrier::reset() { + stage = {}; + buffers.clear(); + images.clear(); +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index be89073e90ba..d606f1d859a9 100644 --- 
a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -19,16 +19,16 @@ #endif /* USE_VULKAN_WRAPPER */ #define VK_CHECK(function) \ - { \ + do { \ const VkResult result = (function); \ TORCH_CHECK(VK_SUCCESS == result, "VkResult:", result); \ - } + } while (false) #define VK_CHECK_RELAXED(function) \ - { \ + do { \ const VkResult result = (function); \ TORCH_CHECK(VK_SUCCESS <= result, "VkResult:", result); \ - } + } while (false) #define VK_DELETER(Handle) \ at::native::vulkan::api::destroy_##Handle diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index ef638f917956..09dfa8fc1d77 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -111,7 +111,7 @@ Context::~Context() { } void Context::flush() { - VK_CHECK(vkDeviceWaitIdle(device())); + VK_CHECK(vkQueueWaitIdle(queue())); resource().pool.purge(); descriptor().pool.purge(); diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index 687ddbbfe931..41adfc5fb272 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -128,7 +128,8 @@ inline void bind( const std::index_sequence, Arguments&&...arguments) { C10_UNUSED const int _[]{ - (descriptor_set.bind(Indices, arguments), 0)..., + 0, + (descriptor_set.bind(Indices, std::forward(arguments)), 0)..., }; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 037f793dfa2a..317536248987 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -6,50 +6,41 @@ namespace vulkan { namespace api { namespace { -VkDescriptorPool create_descriptor_pool( - const VkDevice device) { +VkDescriptorPool create_descriptor_pool(const VkDevice device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); const struct { uint32_t capacity; - c10::SmallVector sizes; + c10::SmallVector sizes; } descriptor { 1024u, { - // Note: It is OK for the sum of descriptors per type, below, to exceed - // the max total figure above, but be concenious of memory consumption. - // Considering how the descriptor pool must be frequently purged anyway - // as a result of the impracticality of having enormous pools that - // persist through the execution of the program, there is diminishing - // return in increasing max counts. - { - /* - Buffers - */ + /* + Buffers + */ - { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - 768u, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - 768u, - }, + { + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + 1024u, + }, + { + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + 1024u, + }, - /* - Images - */ + /* + Images + */ - { - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - 768u, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - 768u, - }, + { + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + 1024u, + }, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + 1024u, }, }, }; @@ -57,7 +48,7 @@ VkDescriptorPool create_descriptor_pool( const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, - 0u, /* Do not use VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT. 
*/ + 0u, descriptor.capacity, static_cast(descriptor.sizes.size()), descriptor.sizes.data(), @@ -77,10 +68,12 @@ VkDescriptorPool create_descriptor_pool( return descriptor_pool; } -VkDescriptorSet allocate_descriptor_set( +void allocate_descriptor_sets( const VkDevice device, const VkDescriptorPool descriptor_pool, - const VkDescriptorSetLayout descriptor_set_layout) { + const VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorSet* const descriptor_sets, + const uint32_t count) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); @@ -93,41 +86,43 @@ VkDescriptorSet allocate_descriptor_set( descriptor_set_layout, "Invalid Vulkan descriptor set layout!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_sets && (count > 0u), + "Invalid usage!"); + + const std::vector descriptor_set_layouts{ + count, + descriptor_set_layout, + }; + const VkDescriptorSetAllocateInfo descriptor_set_allocate_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, nullptr, descriptor_pool, - 1u, - &descriptor_set_layout, + descriptor_set_layouts.size(), + descriptor_set_layouts.data(), }; - VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( device, &descriptor_set_allocate_info, - &descriptor_set)); - - TORCH_CHECK( - descriptor_set, - "Invalid Vulkan descriptor set!"); - - return descriptor_set; + descriptor_sets)); } } // namespace Descriptor::Set::Set( const VkDevice device, - const VkDescriptorPool descriptor_pool, - const Shader::Layout::Object& shader_layout) + VkDescriptorSet descriptor_set, + const Shader::Layout::Signature& shader_layout_signature) : device_(device), - descriptor_set_( - allocate_descriptor_set( - device_, - descriptor_pool, - shader_layout.handle)), - shader_layout_signature_(shader_layout.signature), + descriptor_set_(descriptor_set), + shader_layout_signature_(shader_layout_signature), bindings_{} { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( descriptor_set_, "Invalid Vulkan descriptor set!"); @@ -135,7 +130,7 @@ Descriptor::Set::Set( void Descriptor::Set::update(const Item& item) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -160,7 +155,7 @@ Descriptor::Set& Descriptor::Set::bind( const uint32_t binding, const Resource::Buffer::Object& buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -183,7 +178,7 @@ Descriptor::Set& Descriptor::Set::bind( const uint32_t binding, const Resource::Image::Object& image) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -194,7 +189,10 @@ Descriptor::Set& Descriptor::Set::bind( .image = { image.sampler, image.view, - image.layout + [](const VkDescriptorType type, const VkImageLayout layout) { + return (VK_DESCRIPTOR_TYPE_STORAGE_IMAGE == type) ? + VK_IMAGE_LAYOUT_GENERAL : layout; + }(shader_layout_signature_[binding], image.layout), }, }, }); @@ -204,7 +202,7 @@ Descriptor::Set& Descriptor::Set::bind( VkDescriptorSet Descriptor::Set::handle() const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! 
" "Potential reason: This descriptor set is moved from."); @@ -238,7 +236,7 @@ VkDescriptorSet Descriptor::Set::handle() const { } }; - c10::SmallVector write_descriptor_sets; + c10::SmallVector write_descriptor_sets; for (const Item& item : bindings_.items) { VkWriteDescriptorSet write{ @@ -271,6 +269,7 @@ VkDescriptorSet Descriptor::Set::handle() const { 0u, nullptr); + // Reset bindings_.dirty = false; } @@ -281,7 +280,8 @@ Descriptor::Pool::Pool(const GPU& gpu) : device_(gpu.device), descriptor_pool_( create_descriptor_pool(gpu.device), - VK_DELETER(DescriptorPool)(device_)) { + VK_DELETER(DescriptorPool)(device_)), + set_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -293,7 +293,8 @@ Descriptor::Pool::Pool(const GPU& gpu) Descriptor::Pool::Pool(Pool&& pool) : device_(std::move(pool.device_)), - descriptor_pool_(std::move(pool.descriptor_pool_)) { + descriptor_pool_(std::move(pool.descriptor_pool_)), + set_(std::move(pool.set_)) { pool.device_ = VK_NULL_HANDLE; } @@ -301,6 +302,7 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) { if (&pool != this) { device_ = std::move(pool.device_); descriptor_pool_ = std::move(pool.descriptor_pool_); + set_ = std::move(pool.set_); pool.device_ = VK_NULL_HANDLE; }; @@ -308,9 +310,25 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) { return *this; } +Descriptor::Pool::~Pool() { + try { + if (device_ && descriptor_pool_) { + purge(); + } + } + catch (const std::exception& e) { + LOG(WARNING) + << "Vulkan: Descriptor pool destructor raised an exception! Error: " + << e.what(); + } + catch (...) { + LOG(WARNING) + << "Vulkan: Descriptor pool destructor raised an unknown exception!"; + } +} + Descriptor::Set Descriptor::Pool::allocate( - const Shader::Layout::Object& shader_layout) -{ + const Shader::Layout::Object& shader_layout) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_ && descriptor_pool_, "This descriptor pool is in an invalid state! " @@ -320,10 +338,31 @@ Descriptor::Set Descriptor::Pool::allocate( shader_layout, "Invalid Vulkan shader layout!"); + auto iterator = set_.layouts.find(shader_layout.handle); + if (set_.layouts.cend() == iterator) { + iterator = set_.layouts.insert({shader_layout.handle, {}}).first; + iterator->second.pool.reserve(Configuration::kReserve); + } + + auto& layout = iterator->second; + + if (layout.pool.size() == layout.in_use) { + layout.pool.resize( + layout.pool.size() + + Configuration::kQuantum); + + allocate_descriptor_sets( + device_, + descriptor_pool_.get(), + shader_layout.handle, + layout.pool.data() + layout.in_use, + Configuration::kQuantum); + } + return Set( device_, - descriptor_pool_.get(), - shader_layout); + layout.pool[layout.in_use++], + shader_layout.signature); } void Descriptor::Pool::purge() { @@ -332,6 +371,7 @@ void Descriptor::Pool::purge() { "This descriptor pool is in an invalid state! 
" "Potential reason: This descriptor pool is moved from."); + set_.layouts.clear(); VK_CHECK(vkResetDescriptorPool(device_, descriptor_pool_.get(), 0u)); } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index e268696d781b..440bb9aa4097 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -60,21 +60,16 @@ struct Descriptor final { public: Set( VkDevice device, - VkDescriptorPool descriptor_pool, - const Shader::Layout::Object& shader_layout); + VkDescriptorSet descriptor_set, + const Shader::Layout::Signature& shader_layout_signature); Set(const Set&) = delete; Set& operator=(const Set&) = delete; Set(Set&&); Set& operator=(Set&&); ~Set() = default; - Set& bind( - uint32_t binding, - const Resource::Buffer::Object& buffer); - - Set& bind( - uint32_t binding, - const Resource::Image::Object& image); + Set& bind(uint32_t binding, const Resource::Buffer::Object& buffer); + Set& bind(uint32_t binding, const Resource::Image::Object& image); VkDescriptorSet handle() const; @@ -82,6 +77,7 @@ struct Descriptor final { struct Item final { uint32_t binding; VkDescriptorType type; + union { VkDescriptorBufferInfo buffer; VkDescriptorImageInfo image; @@ -96,7 +92,7 @@ struct Descriptor final { Shader::Layout::Signature shader_layout_signature_; struct { - c10::SmallVector items; + c10::SmallVector items; mutable bool dirty; } bindings_; }; @@ -112,14 +108,28 @@ struct Descriptor final { Pool& operator=(const Pool&) = delete; Pool(Pool&&); Pool& operator=(Pool&&); - ~Pool() = default; + ~Pool(); Set allocate(const Shader::Layout::Object& shader_layout); void purge(); private: + struct Configuration final { + static constexpr uint32_t kQuantum = 16u; + static constexpr uint32_t kReserve = 64u; + }; + VkDevice device_; Handle descriptor_pool_; + + struct { + struct Layout final { + std::vector pool; + size_t in_use; + }; + + ska::flat_hash_map layouts; + } set_; } pool /* [thread_count] */; explicit Descriptor(const GPU& gpu) @@ -132,9 +142,10 @@ struct Descriptor final { // inline Descriptor::Set::Set(Set&& set) - : device_(set.device_), - descriptor_set_(set.descriptor_set_), - bindings_(set.bindings_) { + : device_(std::move(set.device_)), + descriptor_set_(std::move(set.descriptor_set_)), + shader_layout_signature_(std::move(set.shader_layout_signature_)), + bindings_(std::move(set.bindings_)) { set.device_ = VK_NULL_HANDLE; set.descriptor_set_ = VK_NULL_HANDLE; } @@ -143,6 +154,7 @@ inline Descriptor::Set& Descriptor::Set::operator=(Set&& set) { if (&set != this) { device_ = std::move(set.device_); descriptor_set_ = std::move(set.descriptor_set_); + shader_layout_signature_ = std::move(set.shader_layout_signature_); bindings_ = std::move(set.bindings_); set.device_ = VK_NULL_HANDLE; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 9facc3f49e0f..4b15203892ed 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -100,38 +100,31 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( descriptor.shader_module, "Invalid Vulkan shader module!"); - constexpr uint32_t x_offset = 0u; - constexpr uint32_t x_size = sizeof(Shader::WorkGroup::width); - constexpr uint32_t y_offset = x_offset + x_size; - constexpr uint32_t y_size = sizeof(Shader::WorkGroup::height); - constexpr uint32_t z_offset = y_offset + y_size; - constexpr uint32_t z_size = 
sizeof(Shader::WorkGroup::depth); - constexpr VkSpecializationMapEntry specialization_map_entires[3]{ // X { - 1u, - x_offset, - x_size, + 0u, + offsetof(Shader::WorkGroup, data[0u]), + sizeof(Shader::WorkGroup::data[0u]), }, // Y { - 2u, - y_offset, - y_size, + 1u, + offsetof(Shader::WorkGroup, data[1u]), + sizeof(Shader::WorkGroup::data[1u]), }, // Z { - 3u, - z_offset, - z_size, + 2u, + offsetof(Shader::WorkGroup, data[2u]), + sizeof(Shader::WorkGroup::data[2u]), }, }; const VkSpecializationInfo specialization_info{ 3u, specialization_map_entires, - sizeof(Shader::WorkGroup), + sizeof(descriptor.local_work_group), &descriptor.local_work_group, }; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index 50893b709473..1d1966790dbf 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -96,6 +96,21 @@ struct Pipeline final { } } layout; + // + // Stage + // + + struct Stage final { + typedef uint8_t Flags; + + enum Type : Flags { + None = 0u << 0u, + Compute = 1u << 0u, + Host = 1u << 1u, + Transfer = 1u << 2u, + }; + }; + /* Descriptor */ @@ -202,9 +217,9 @@ inline size_t Pipeline::Factory::Hasher::operator()( return c10::get_hash( descriptor.pipeline_layout, descriptor.shader_module, - descriptor.local_work_group.width, - descriptor.local_work_group.height, - descriptor.local_work_group.depth); + descriptor.local_work_group.data[0u], + descriptor.local_work_group.data[1u], + descriptor.local_work_group.data[2u]); } inline Pipeline::Object::operator bool() const { diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index e239ba2f7763..a7177e379058 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -32,7 +32,7 @@ VmaAllocator create_allocator( nullptr, 1u, nullptr, - nullptr, // TODO (Ashkan): VULKAN_WRAPPER + nullptr, nullptr, instance, VK_API_VERSION_1_0, @@ -48,8 +48,9 @@ VmaAllocator create_allocator( VmaAllocationCreateInfo create_allocation_create_info( const Resource::Memory::Descriptor& descriptor) { return VmaAllocationCreateInfo{ - 0u, /* VMA_ALLOCATION_CREATE_MAPPED_BIT - MoltenVK Issue #175 */ - /* VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT */ + VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT | + /* VMA_ALLOCATION_CREATE_MAPPED_BIT - MoltenVK Issue #175 */ + 0, descriptor.usage, descriptor.required, descriptor.preferred, @@ -85,16 +86,20 @@ void release_image(const Resource::Image& image) { } // namespace -void* map(const Resource::Memory& memory) { - // Call will be ignored by implementation if the memory type this allocation - // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior - // we want. - VK_CHECK(vmaInvalidateAllocation( - memory.allocator, memory.allocation, 0u, VK_WHOLE_SIZE)); - +void* map( + const Resource::Memory& memory, + const Resource::Memory::Access::Flags access) { void* data = nullptr; VK_CHECK(vmaMapMemory(memory.allocator, memory.allocation, &data)); + if (access & Resource::Memory::Access::Read) { + // Call will be ignored by implementation if the memory type this allocation + // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior + // we want. 
+ VK_CHECK(vmaInvalidateAllocation( + memory.allocator, memory.allocation, 0u, VK_WHOLE_SIZE)); + } + return data; } @@ -119,14 +124,14 @@ void Resource::Memory::Scope::operator()(const void* const data) const { return; } - vmaUnmapMemory(allocator_, allocation_); - if (access_ & Access::Write) { // Call will be ignored by implementation if the memory type this allocation // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior // we want. VK_CHECK(vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE)); } + + vmaUnmapMemory(allocator_, allocation_); } Resource::Image::Sampler::Factory::Factory(const GPU& gpu) @@ -151,11 +156,11 @@ Resource::Image::Sampler::Factory::operator()( descriptor.address_mode, 0.0f, VK_FALSE, - 0.0f, + 1.0f, VK_FALSE, VK_COMPARE_OP_NEVER, 0.0f, - 0.0f, + VK_LOD_CLAMP_NONE, descriptor.border, VK_FALSE, }; @@ -239,7 +244,9 @@ Resource::Pool::Pool(const GPU& gpu) Resource::Pool::~Pool() { try { - purge(); + if (device_ && allocator_) { + purge(); + } } catch (const std::exception& e) { LOG(WARNING) @@ -387,9 +394,9 @@ Resource::Image Resource::Pool::image( { VK_IMAGE_ASPECT_COLOR_BIT, 0u, - 1u, + VK_REMAINING_MIP_LEVELS, 0u, - 1u, + VK_REMAINING_ARRAY_LAYERS, }, }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index a9428d272782..340d926206ff 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -12,7 +12,6 @@ namespace native { namespace vulkan { namespace api { - struct Resource final { class Pool; @@ -47,6 +46,7 @@ struct Resource final { typedef uint8_t Flags; enum Type : Flags { + None = 0u << 0u, Read = 1u << 0u, Write = 1u << 1u, }; @@ -334,17 +334,17 @@ class Resource::Memory::Scope final { template inline Resource::Memory::Handle Resource::Memory::map() const & { - void* map(const Memory& memory); + void* map(const Memory& memory, Access::Flags); return Handle{ - reinterpret_cast(map(*this)), + reinterpret_cast(map(*this, Access::Read)), Scope(allocator, allocation, Access::Read), }; } template inline Resource::Memory::Handle Resource::Memory::map() & { - void* map(const Memory& memory); + void* map(const Memory& memory, Access::Flags); static_assert( (kAccess == Access::Read) || @@ -353,7 +353,7 @@ inline Resource::Memory::Handle Resource::Memory::map() & { "Invalid memory access!"); return Handle{ - reinterpret_cast(map(*this)), + reinterpret_cast(map(*this, kAccess)), Scope(allocator, allocation, kAccess), }; } diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index 2c090d073bdf..43d1a62ac201 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -18,7 +18,7 @@ Shader::Layout::Factory::Factory(const GPU& gpu) Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( const Descriptor& descriptor) const { - c10::SmallVector bindings; + c10::SmallVector bindings; uint32_t binding = 0u; for (const VkDescriptorType type : descriptor.signature) { diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index b2238a95de50..718504e69bd4 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -4,6 +4,7 @@ #include #include +#include #include namespace at { @@ -44,7 +45,7 @@ struct Shader final { Signature */ - typedef c10::SmallVector Signature; + typedef c10::SmallVector Signature; /* Descriptor @@ -112,7 +113,7 @@ struct Shader 
final { // Work Group // - typedef VkExtent3D WorkGroup; + typedef utils::uvec3 WorkGroup; /* Descriptor @@ -224,9 +225,9 @@ inline void Shader::Layout::Cache::purge() { inline bool operator==( const Shader::WorkGroup& _1, const Shader::WorkGroup& _2) { - return (_1.width == _2.width) && - (_1.height == _2.height) && - (_1.depth == _2.depth); + return (_1.data[0u] == _2.data[0u]) && + (_1.data[1u] == _2.data[1u]) && + (_1.data[2u] == _2.data[2u]); } inline Shader::Descriptor::Descriptor(const char* const glsl) diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index c46a652da856..1d261849a5e7 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -8,41 +8,34 @@ namespace vulkan { namespace api { namespace utils { -inline int64_t align_down( - const int64_t number, - const int64_t multiple) { +// +// Alignment +// + +template +inline constexpr Type align_down( + const Type number, + const Type multiple) { return (number / multiple) * multiple; } -inline int64_t align_up( - const int64_t number, - const int64_t multiple) { +template +inline constexpr Type align_up( + const Type number, + const Type multiple) { return align_down(number + multiple - 1, multiple); } -inline int64_t div_up( - const int64_t numerator, - const int64_t denominator) { +template +inline constexpr Type div_up( + const Type numerator, + const Type denominator) { return (numerator + denominator - 1) / denominator; } -inline VkFormat convert(const caffe2::TypeMeta dtype) { - switch (c10::typeMetaToScalarType(dtype)) { - case kFloat: -#ifdef VULKAN_FP16_INFERENCE - return VK_FORMAT_R16G16B16A16_SFLOAT; -#else - return VK_FORMAT_R32G32B32A32_SFLOAT; -#endif /* VULKAN_FP16_INFERENCE */ - - default: - TORCH_CHECK( - false, - "Vulkan tensor format not supported!"); - } - - return VK_FORMAT_UNDEFINED; -} +// +// Cast +// namespace detail { @@ -79,6 +72,37 @@ inline constexpr To safe_downcast(const From v) { return detail::safe_downcast(v); } +// +// Vector +// + +namespace detail { + +template +struct vec final { + Type data[N]; +}; + +} // namespace detail + +template +using ivec = detail::vec; +using ivec2 = ivec<2u>; +using ivec3 = ivec<3u>; +using ivec4 = ivec<4u>; + +template +using uvec = detail::vec; +using uvec2 = uvec<2u>; +using uvec3 = uvec<3u>; +using uvec4 = uvec<4u>; + +template +using vec = detail::vec; +using vec2 = vec<2u>; +using vec3 = vec<3u>; +using vec4 = vec<4u>; + } // namespace utils } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl index 3c9a50a267f9..58394dca19da 100644 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) readonly buffer kernel { vec4 data[]; } @@ -13,7 +13,7 @@ layout(set = 0, binding = 2) uniform constBlock { } uConstBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID) * ivec3(4, 1, 1); diff --git a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl 
b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl index 99c6f54b919b..d5b9af843dbe 100644 --- a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl @@ -6,25 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + vec2 stride; + vec2 kernel; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const vec3 isize = textureSize(uInput, 0); - const vec2 stride = isize.xy / size.xy; - const vec2 kernel = isize.xy - (size.xy - 1) * stride; - - if (all(lessThan(pos, size))) { - const vec2 ipos = pos.xy * stride; + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec2 ipos = pos.xy * uBlock.stride; const ivec2 start = ivec2(ipos); - const ivec2 end = ivec2(ceil(ipos + kernel)); + const ivec2 end = ivec2(ceil(ipos + uBlock.kernel)); const ivec2 range = end - start; vec4 sum = vec4(0); diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 771a2a1b9349..8dcff0476edf 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -6,22 +6,20 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec3 size; float alpha; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl index f8cdb8ea05e6..ed82d0cbe87b 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float alpha; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - 
if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 1f49c9e9d475..8882ba0d8ff2 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl index fd6ec2953afb..bffd680669fb 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -6,20 +6,18 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/addmm.glsl b/aten/src/ATen/native/vulkan/glsl/addmm.glsl index 7489a74a33f5..61f76fa8cf5d 100644 --- a/aten/src/ATen/native/vulkan/glsl/addmm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/addmm.glsl @@ -6,28 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uM1; layout(set = 0, binding = 2) uniform PRECISION sampler3D uM2; layout(set = 0, binding = 3) uniform PRECISION sampler3D uT; layout(set = 0, binding = 4) uniform PRECISION restrict Block { - float alpha; - float beta; + ivec4 size; + vec2 multiplier; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const int dim = textureSize(uM1, 0).x; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int k = 0; k < dim; ++k) { + for (int k = 0; k < uBlock.size.w; 
++k) { sum = fma( texelFetch(uM1, ivec3(k, pos.y, pos.z), 0), texelFetch(uM2, ivec3(pos.x, k, pos.z), 0), @@ -37,6 +33,6 @@ void main() { imageStore( uOutput, pos, - uBlock.alpha * sum + uBlock.beta * texelFetch(uT, pos, 0)); + uBlock.multiplier.x * sum + uBlock.multiplier.y * texelFetch(uT, pos, 0)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl index 8036be567b65..df2bbcf18014 100644 --- a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl @@ -6,29 +6,26 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { - ivec2 kernel; + ivec4 size; + ivec2 isize; ivec2 stride; ivec2 padding; + ivec2 kernel; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - const float range = uBlock.kernel.x * uBlock.kernel.y; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; const ivec2 start = max(ivec2(0), ipos); - const ivec2 end = min(ipos + uBlock.kernel, isize.xy); + const ivec2 end = min(ipos + uBlock.kernel, uBlock.isize); vec4 sum = vec4(0); @@ -41,6 +38,6 @@ void main() { imageStore( uOutput, pos, - sum / range); + sum / uBlock.size.w); } } diff --git a/aten/src/ATen/native/vulkan/glsl/clamp.glsl b/aten/src/ATen/native/vulkan/glsl/clamp.glsl index 8482e27f48e4..c394dfd26627 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp.glsl @@ -6,25 +6,22 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { - float min; - float max; + ivec4 size; + vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { imageStore( uOutput, pos, - clamp(texelFetch(uInput, pos, 0), uBlock.min, uBlock.max)); + clamp(texelFetch(uInput, pos, 0), uBlock.clamp.x, uBlock.clamp.y)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl index 9bfc77a44053..b16258685114 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl @@ -6,24 +6,21 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D 
uOutput; -layout(set = 0, binding = 1) uniform PRECISION restrict Block { - float min; - float max; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec4 size; + vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { imageStore( uOutput, pos, - clamp(imageLoad(uOutput, pos), uBlock.min, uBlock.max)); + clamp(imageLoad(uOutput, pos), uBlock.clamp.x, uBlock.clamp.y)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index 0a58c5d0a2f6..fd54c2f38721 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index cdff4560bfa8..fe50262f7d46 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl index c9f1b43ad4d5..37a5898b9f10 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform 
PRECISION sampler3D uKernel; layout(set = 0, binding = 3) readonly buffer bias { @@ -23,7 +23,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl index 9a48022b85f2..b73c58e0f54d 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) readonly buffer bias { @@ -23,7 +23,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 gpos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index c2962844e0bc..bbc745ca8efd 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block int W; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl index 5c08399f765c..d19c370ec9bd 100644 --- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl @@ -10,23 +10,21 @@ layout(set = 0, binding = 0) uniform PRECISION sampler3D uIma layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { float data[]; } uBuffer; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec4 offset; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = textureSize(uImage, 0); - const int plane = size.x * size.y; - const int block = 4 * plane; - const ivec4 offset = plane * ivec4(0, 1, 2, 3); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const vec4 texel = texelFetch(uImage, pos, 0); - const int base = 
pos.x + size.x * pos.y + block * pos.z; - const ivec4 index = base + offset; + const int base = pos.x + uBlock.size.x * pos.y + uBlock.size.w * pos.z; + const ivec4 index = base + uBlock.offset; uBuffer.data[index.x] = texel.r; uBuffer.data[index.y] = texel.g; diff --git a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl index 5db8a53e1770..948b797a5207 100644 --- a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform constBlock { ivec4 inputSize; @@ -17,7 +17,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) #define FLT_MAX 3.402823466e+38 -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl index 7c5795a6e96a..130d716ca9e6 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl @@ -6,24 +6,25 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This implementation is suboptimal and should be revisted. 
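The rewritten mean shader above no longer queries imageSize()/textureSize() per invocation; it reads the output extents and the input plane dimensions from the uniform block and divides by size.w, which the host code later in this patch fills with W * H. A minimal host-side restatement of that reduction, using illustrative stand-in types that are not part of the patch:

#include <array>
#include <cstdint>
#include <vector>

// Mirrors the shader's uniform block: size.xyz = output extents,
// size.w = W * H, isize = input width / height.
struct MeanBlock {
  std::array<int32_t, 4> size;
  std::array<int32_t, 2> isize;
};

// Averages one W x H input plane, as each shader invocation does for its
// (x, y, z) output texel.
std::array<float, 4> mean_texel(
    const std::vector<std::array<float, 4>>& plane,  // row-major, W * H texels
    const MeanBlock& block) {
  std::array<float, 4> sum{0.f, 0.f, 0.f, 0.f};
  for (int32_t y = 0; y < block.isize[1]; ++y) {
    for (int32_t x = 0; x < block.isize[0]; ++x) {
      const auto& texel = plane[y * block.isize[0] + x];
      for (int c = 0; c < 4; ++c) {
        sum[c] += texel[c];
      }
    }
  }
  for (int c = 0; c < 4; ++c) {
    sum[c] /= static_cast<float>(block.size[3]);  // size.w carries W * H
  }
  return sum;
}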
void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - const float range = isize.x * isize.y; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int y = 0; y < isize.y; ++y) { - for (int x = 0; x < isize.x; ++x) { + for (int y = 0; y < uBlock.isize.y; ++y) { + for (int x = 0; x < uBlock.isize.x; ++x) { sum += texelFetch(uInput, ivec3(x, y, pos.z), 0); } } @@ -31,6 +32,6 @@ void main() { imageStore( uOutput, pos, - sum / range); + sum / uBlock.size.w); } } diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl index a2b137cef42e..266226aa708b 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl @@ -1,29 +1,41 @@ #version 450 core #define PRECISION $precision + layout(std430) buffer; layout(std430) uniform; -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { - int W; - int H; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This implementation is suboptimal and should be revisted. void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H)); - vec4 acc = vec4(0); - int xi, yi; - int zi = (imageSize(uOutput).x*pos.y + pos.x)/4; - int zo = (imageSize(uOutput).x*pos.y + pos.x)%4; - for (yi = 0; yi < uBlock.H; ++yi) { - for (xi = 0; xi < uBlock.W; ++xi) { - acc += texelFetch(uInput, ivec3(xi, yi, zi), 0); + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + vec4 sum = vec4(0); + + const int z = pos.x + uBlock.size.x * pos.y; + const int zi = z / 4; + const int zo = z % 4; + + for (int y = 0; y < uBlock.isize.y; ++y) { + for (int x = 0; x < uBlock.isize.x; ++x) { + sum += texelFetch(uInput, ivec3(x, y, zi), 0); + } } - } - vec4 outValue = r * acc; - imageStore(uOutput, pos, vec4(outValue[zo], 0,0,0)); + imageStore( + uOutput, + pos, + vec4(sum[zo], 0, 0, 0) / uBlock.size.w); + } } diff --git a/aten/src/ATen/native/vulkan/glsl/mm.glsl b/aten/src/ATen/native/vulkan/glsl/mm.glsl index dfcb155e7515..00ab5f31e6db 100644 --- a/aten/src/ATen/native/vulkan/glsl/mm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mm.glsl @@ -6,23 +6,22 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uM1; layout(set = 0, binding = 2) uniform PRECISION sampler3D uM2; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = 
ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const int dim = textureSize(uM1, 0).x; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int k = 0; k < dim; ++k) { + for (int k = 0; k < uBlock.size.w; ++k) { sum = fma( texelFetch(uM1, ivec3(k, pos.y, pos.z), 0), texelFetch(uM2, ivec3(pos.x, k, pos.z), 0), diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index b73e3180c725..d3a98ba30bea 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl index c259e5aa5a58..b49252e128cc 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -6,20 +6,18 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl index 23407891d649..fb87b5a36918 100644 --- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl @@ -6,25 +6,23 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uImage; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uImage; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { float data[]; } uBuffer; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec4 offset; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = 
imageSize(uImage); - const int plane = size.x * size.y; - const int block = 4 * plane; - const ivec4 offset = plane * ivec4(0, 1, 2, 3); - - if (all(lessThan(pos, size))) { - const int base = pos.x + size.x * pos.y + block * pos.z; - const ivec4 index = base + offset; + if (all(lessThan(pos, uBlock.size.xyz))) { + const int base = pos.x + uBlock.size.x * pos.y + uBlock.size.w * pos.z; + const ivec4 index = base + uBlock.offset; imageStore( uImage, diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl index bd0b6637efae..af8e33588f78 100644 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ b/aten/src/ATen/native/vulkan/glsl/permute.glsl @@ -17,7 +17,7 @@ layout(set = 0, binding = 2) uniform constBlock { } uConst; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl index a979cf275c21..efb1c5c7fc9a 100644 --- a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl @@ -6,26 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; vec2 scale; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const ivec2 ipos = clamp( ivec2(pos.xy * uBlock.scale), ivec2(0), - isize.xy - 1); + uBlock.isize); imageStore( uOutput, diff --git a/aten/src/ATen/native/vulkan/ops/Add.cpp b/aten/src/ATen/native/vulkan/ops/Add.cpp index 24e9cd6dc021..270a1d5f8168 100644 --- a/aten/src/ATen/native/vulkan/ops/Add.cpp +++ b/aten/src/ATen/native/vulkan/ops/Add.cpp @@ -7,6 +7,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor add_scalar( const Tensor& self_arg, const Scalar other, @@ -18,8 +20,8 @@ Tensor add_scalar( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -27,8 +29,10 @@ Tensor add_scalar( { if (v_output.has_image() && v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to() * alpha.to(), }; @@ -43,10 +47,15 @@ Tensor add_scalar( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
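A pattern repeated across these kernels and the ops below: the output extents now ride along in the uniform block, and every invocation early-outs on all(lessThan(pos, uBlock.size.xyz)) instead of calling imageSize(). The guard is needed because the dispatch grid is rounded up to whole work groups (sized through the specialization constants 0/1/2 these shaders now declare). A small standalone sketch, with hypothetical helper names that only mirror the api::utils ones used elsewhere in the patch:

#include <cstdint>

struct extent3 { uint32_t x, y, z; };

constexpr uint32_t div_up(const uint32_t value, const uint32_t divisor) {
  return (value + divisor - 1u) / divisor;
}

// Number of work groups needed to cover `extents` with a given local size.
extent3 group_count(const extent3& extents, const extent3& local) {
  return {
    div_up(extents.x, local.x),
    div_up(extents.y, local.y),
    div_up(extents.z, local.z),
  };
}

// Example: extents {10, 7, 3} with a local size of {4, 4, 4} dispatches a
// 12 x 8 x 4 grid of invocations, so anything with pos.x >= 10, pos.y >= 7,
// or pos.z >= 3 must be masked out by the bounds check.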
- v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -62,24 +71,26 @@ Tensor add_scalar( } Tensor& add_scalar_( - Tensor& self_arg, + Tensor& self, const Scalar other, const Scalar alpha) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place add is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to() * alpha.to(), }; @@ -93,7 +104,10 @@ Tensor& add_scalar_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -105,7 +119,7 @@ Tensor& add_scalar_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } Tensor add_tensor( @@ -122,8 +136,8 @@ Tensor add_tensor( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -131,8 +145,10 @@ Tensor add_tensor( { if (v_self.has_image() && v_other.has_image()) { const struct { + uvec3 extents; float alpha; } block { + v_output.extents(), alpha.to(), }; @@ -148,13 +164,20 @@ Tensor add_tensor( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_other.image(command_buffer), + v_other.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -170,16 +193,16 @@ Tensor add_tensor( } Tensor& add_tensor_( - Tensor& self_arg, + Tensor& self, const Tensor& other_arg, const Scalar alpha) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place add is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); const Tensor other = other_arg.is_vulkan() ? 
other_arg : other_arg.vulkan(); const vTensor& v_other = convert(other); @@ -187,10 +210,12 @@ Tensor& add_tensor_( api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - if (v_self.has_image() && v_other.has_image()) { + if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) { const struct { + uvec3 extents; float alpha; } block { + v_self.extents(), alpha.to(), }; @@ -205,10 +230,15 @@ Tensor& add_tensor_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_other.image(command_buffer), + v_other.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -220,7 +250,7 @@ Tensor& add_tensor_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 6aec84d8b349..369a47fee93a 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -7,12 +7,14 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor clamp( const Tensor& self_arg, - const c10::optional min_value, - const c10::optional max_value) { + const c10::optional min, + const c10::optional max) { TORCH_CHECK( - min_value || max_value, + min || max, "At least one of 'min' or 'max' must not be None"); api::Context* const context = api::context(); @@ -22,8 +24,8 @@ Tensor clamp( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -31,11 +33,16 @@ Tensor clamp( { if (v_output.has_image() && v_self.has_image()) { const struct { - float min_value; - float max_value; + uvec3 extents; + uint32_t _; + vec2 clamp; } block { - min_value ? min_value->to() : -std::numeric_limits::infinity(), - max_value ? max_value->to() : std::numeric_limits::infinity(), + v_output.extents(), + 0u, + { + min ? min->to() : -std::numeric_limits::infinity(), + max ? max->to() : std::numeric_limits::infinity(), + }, }; context->dispatch( @@ -49,10 +56,15 @@ Tensor clamp( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. 
context->resource().pool.uniform(block).object); @@ -68,31 +80,36 @@ Tensor clamp( } Tensor& clamp_( - Tensor& self_arg, - const c10::optional min_value, - const c10::optional max_value) { + Tensor& self, + const c10::optional min, + const c10::optional max) { api::Context* const context = api::context(); TORCH_CHECK( - min_value || max_value, + min || max, "At least one of 'min' or 'max' must not be None"); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place clamp is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { - float min_value; - float max_value; + uvec3 extents; + uint32_t _; + vec2 clamp; } block { - min_value ? min_value->to() : -std::numeric_limits::infinity(), - max_value ? max_value->to() : std::numeric_limits::infinity(), + v_self.extents(), + 0u, + { + min ? min->to() : -std::numeric_limits::infinity(), + max ? max->to() : std::numeric_limits::infinity(), + }, }; context->dispatch( @@ -105,7 +122,10 @@ Tensor& clamp_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -117,7 +137,7 @@ Tensor& clamp_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } Tensor hardtanh( diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 7cf3b4fe5137..c549468aaec8 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -9,6 +9,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + inline bool is_depthwise( const IntArrayRef filter, const int64_t groups) { @@ -64,14 +66,13 @@ vTensor pack_weights( // General // - using namespace api::utils; vTensor v_weight{ api::context(), &pool, { - div_up(src_filter[Layout::Filter::output], 4), - 4 * align_up(src_filter[Layout::Filter::input], 4), + div_up(src_filter[Layout::Filter::output], INT64_C(4)), + 4 * align_up(src_filter[Layout::Filter::input], INT64_C(4)), src_filter[Layout::Filter::height], src_filter[Layout::Filter::width], }, @@ -174,8 +175,8 @@ std::array pack_filter( }; return { - api::utils::align_up(filter[Layout::Filter::output], 4), - api::utils::align_up(filter[Layout::Filter::input], 4), + align_up(filter[Layout::Filter::output], INT64_C(4)), + align_up(filter[Layout::Filter::input], INT64_C(4)), effective( filter[Layout::Filter::height], dilation[Layout::Parameter::height]), @@ -270,8 +271,6 @@ void conv2d_depthwise( const IntArrayRef dilation, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { const struct { int32_t kernel_x, kernel_y; @@ -305,16 +304,25 @@ void conv2d_depthwise( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. 
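For the Clamp.cpp blocks above, the otherwise unused `uint32_t _;` member is there purely for layout: the shader declares `ivec4 size; vec2 clamp;`, so the host struct needs four 32-bit lanes before the two clamp floats. A standalone sketch with stand-in types (not the api::utils ones) that makes the offsets explicit:

#include <cstddef>
#include <cstdint>

struct uvec3_t { uint32_t data[3]; };
struct vec2_t  { float    data[2]; };

struct ClampBlock {
  uvec3_t extents;  // bytes  0..11, read as size.xyz in the shader
  uint32_t _;       // bytes 12..15, the unused fourth lane of ivec4 size
  vec2_t clamp;     // bytes 16..23, read as clamp.x / clamp.y
};

static_assert(offsetof(ClampBlock, clamp) == 16,
              "clamp must start where the shader expects vec2 clamp");
static_assert(sizeof(ClampBlock) == 24,
              "ivec4 size + vec2 clamp is 24 bytes");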
- v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -336,10 +344,8 @@ void conv2d_pointwise( const IntArrayRef padding, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - + vTensor v_weight_reshaped{ context, {1,1, v_weight.sizes()[0], v_weight.sizes()[1]}, @@ -351,8 +357,13 @@ void conv2d_pointwise( temp_command_buffer.begin(); temp_command_buffer.copy( - v_weight.buffer(temp_command_buffer), - v_weight_reshaped.buffer(temp_command_buffer, vTensor::Access::Write) + v_weight.buffer( + temp_command_buffer, + vTensor::Stage::Transfer), + v_weight_reshaped.buffer( + temp_command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write) ); temp_command_buffer.end(); @@ -389,16 +400,26 @@ void conv2d_pointwise( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight_reshaped.image(command_buffer, vTensor::Access::Read), + v_weight_reshaped.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -421,8 +442,6 @@ void conv2d( const IntArrayRef dilation, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; @@ -458,16 +477,25 @@ void conv2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
- v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 2bab7091d4ab..7bd27bb1942b 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -1,5 +1,6 @@ #pragma once -#ifdef USE_VULKAN + +#ifdef USE_VULKAN_API #include #include @@ -96,4 +97,4 @@ c10::intrusive_ptr conv2d_clamp_prepack( } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp index 2f74d1be00ab..bbd326b42ace 100644 --- a/aten/src/ATen/native/vulkan/ops/Copy.cpp +++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp @@ -48,13 +48,18 @@ Tensor& copy_(Tensor& self, const Tensor& src) { // are automatically inserted if a RAW hazard is detected. // - Recording any potential pending sync operations into the same // command buffer prevents an expensive queue submission. - convert(src).buffer(command_buffer), + convert(src).buffer( + command_buffer, + vTensor::Stage::Transfer), // - Write-only access never triggers a sync as the contents will be // overwritten regardless. Having said that, appropriate barriers // are inserted automatically if WAR or WAW hazards are detected. // - Recording pending sync operations into the same command buffer // prevents an expensive queue submission. - v_self.buffer(command_buffer, vTensor::Access::Write)); + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); command_buffer.end(); command_buffer.submit(api::context()->gpu().queue); diff --git a/aten/src/ATen/native/vulkan/ops/Mean.cpp b/aten/src/ATen/native/vulkan/ops/Mean.cpp index 05e96df722d0..f6d63c14f381 100644 --- a/aten/src/ATen/native/vulkan/ops/Mean.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mean.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace at { @@ -7,38 +8,48 @@ namespace vulkan { namespace ops { namespace { -int64_t normalize_dim(int64_t d, int64_t n) { - return (d % n + n) % n; -} +using namespace api::utils; Tensor mean( const at::Tensor& input_arg, const IntArrayRef dim, const bool keepdim, const optional dtype) { - TORCH_INTERNAL_ASSERT( - input_arg.dim() == 4, "vulkan_mean expects 4-dimensional input"); + TORCH_CHECK( + input_arg.dim() == 4, + "Vulkan mean expects 4-dimensional input!"); + static const std::unordered_set expected_dims_set({2, 3}); std::unordered_set dims_set; + for (const auto& d : dim) { - dims_set.insert(normalize_dim(d, 4)); + dims_set.insert(utils::normalize(d, 4)); } - TORCH_INTERNAL_ASSERT( + + TORCH_CHECK( dims_set == expected_dims_set, - "vulkan_mean currently only supported for image-wide reduction"); + "Vulkan mean currently only supports image-wide reduction!"); + + api::Context* const context = api::context(); + + const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); + const vTensor& v_input = convert(input); + const IntArrayRef v_input_sizes = v_input.sizes(); + + c10::SmallVector output_sizes{ + v_input_sizes[Layout::Activation4D::batch], + v_input_sizes[Layout::Activation4D::channels], + }; - std::vector output_dims{input_arg.sizes()[0], input_arg.sizes()[1]}; if (keepdim) { - output_dims.push_back(1); - output_dims.push_back(1); + output_sizes.push_back(1); + output_sizes.push_back(1); } - api::Context* const context = api::context(); - const vTensor& v_input = convert(input_arg); vTensor v_output{ - context, - output_dims, - input_arg.options(), + context, + output_sizes, + v_input.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -46,38 +57,72 @@ Tensor mean( { if (v_input.has_image()) { const struct { - uint32_t input_width, input_height; - } block{ - input_arg.sizes()[3], - input_arg.sizes()[2], + uvec3 extents; + int32_t range; + ivec2 iextents; + } block { + v_output.extents(), + safe_downcast( + v_input_sizes[Layout::Activation4D::width] * + v_input_sizes[Layout::Activation4D::height]), + { + safe_downcast(v_input_sizes[Layout::Activation4D::width]), + safe_downcast(v_input_sizes[Layout::Activation4D::height]), + }, }; if (keepdim) { context->dispatch( command_buffer, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mean), v_output.extents(), - v_output.image(command_buffer, vTensor::Access::Write), - v_input.image(command_buffer)); - } else { + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { context->dispatch( command_buffer, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mean2d), v_output.extents(), - v_output.image(command_buffer, vTensor::Access::Write), - v_input.image(command_buffer), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
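To summarize the shape handling in this rewritten mean(): only an image-wide reduction over dims {2, 3} of a 4-D NCHW input is supported (negative dims are normalized first), the output is {N, C}, or {N, C, 1, 1} when keepdim is set, and the keepdim flag also selects between the mean and mean2d kernels. A compact restatement of the size rule, with an illustrative helper name:

#include <cstdint>
#include <vector>

std::vector<int64_t> mean_hw_output_sizes(
    const std::vector<int64_t>& input_sizes,  // {N, C, H, W}
    const bool keepdim) {
  std::vector<int64_t> output_sizes{input_sizes[0], input_sizes[1]};
  if (keepdim) {
    output_sizes.push_back(1);
    output_sizes.push_back(1);
  }
  return output_sizes;
}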
context->resource().pool.uniform(block).object); } - } else { + } + else { TORCH_CHECK(false, "Not implemented!"); } } diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index ca342e70a7b8..e3335b8fc760 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -7,137 +7,129 @@ namespace vulkan { namespace ops { namespace { -vTensor pack_weights(api::Resource::Pool& pool, const Tensor& weight_arg) { - return convert(weight_arg.vulkan()); +using namespace api::utils; + +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg) { + // Pending Stephen's fix. + const Tensor weight = weight_arg.is_vulkan() ? weight_arg : weight_arg.vulkan(); + return convert(weight); } vTensor pack_biases( api::Resource::Pool& pool, - const c10::optional& bias_arg, - const Tensor& weight_arg) { + const Tensor& weight_arg, + const c10::optional& bias_arg) { if (bias_arg) { - return convert(bias_arg->vulkan()); - } else { - vTensor v_bias{ - api::context(), - &pool, - {weight_arg.size(Layout::Parameter::width)}, - weight_arg.options(), - }; - - using Future = vTensor::Future; - Future v_bias_future = v_bias.host(); - Future::Payload v_bias_payload = v_bias_future.wait(); - - memset( - v_bias_payload.get(), - // 2's complement integers and IEEE-754 floating point numbers both - // have identical bit representations for 0, so can use memset which - // only accepts uint8_t parameter. - 0, - v_bias.nbytes()); - - return v_bias; + return convert( + bias_arg->is_vulkan() ? + *bias_arg : + bias_arg->vulkan()); } -} -bool available(const Tensor& weight, const c10::optional& bias) { - bool valid = true; - if (bias && bias->ndimension() > 1) { - valid = - (bias->sizes()[Layout::Parameter::width] == - weight.sizes()[Layout::Parameter::width]); - } - return api::available() && valid; + vTensor v_bias{ + api::context(), + &pool, + { + weight_arg.size(Layout::Parameter::width), + }, + weight_arg.options(), + }; + + using Future = vTensor::Future; + Future v_bias_future = v_bias.host(); + Future::Payload v_bias_payload = v_bias_future.wait(); + + memset( + v_bias_payload.get(), + // 2's complement integers and IEEE-754 floating point numbers both + // have identical bit representations for 0, so can use memset which + // only accepts uint8_t parameter. + 0, + v_bias.nbytes()); + + return v_bias; } -bool usable( - const Tensor& input, +bool available( const Tensor& weight, const c10::optional& bias) { - return (input.sizes()[Layout::Parameter::width] == - weight.sizes()[Layout::Parameter::height]); + return api::available() && + // Weight + (2 == weight.ndimension()) && + (weight.size(Layout::Parameter::height) > 0) && + (weight.size(Layout::Parameter::width) > 0) && + ((c10::DeviceType::CPU == weight.device().type()) || + (c10::DeviceType::Vulkan == weight.device().type())) && + (kFloat == weight.scalar_type()) && + !weight.requires_grad() && + // Bias + ((bias && bias->defined()) ? ((bias->ndimension() > 0) && + ((c10::DeviceType::CPU == bias->device().type()) || + (c10::DeviceType::Vulkan == bias->device().type())) && + (kFloat == bias->scalar_type()) && + ((bias->ndimension() > 1) ? 
+ (bias->size(Layout::Parameter::width) == + weight.size(Layout::Parameter::width)) + : true) && + !bias->requires_grad()) + : true) && + true; } -void addmm_impl( - api::Context* const context, - api::Command::Buffer& command_buffer, - vTensor& v_output, - const vTensor& v_self, - const vTensor& v_mat1, - const vTensor& v_mat2, - const float beta, - const float alpha) { - if (v_output.has_image() && v_self.has_image() && v_mat1.has_image() && - v_mat2.has_image()) { - const struct { - float alpha, beta; - } block{ - alpha, - beta, - }; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(addmm), - v_output.extents(), - // Write-only access bypasses synchronization but inserts appropriate - // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_mat1.image(command_buffer), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_mat2.image(command_buffer), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_self.image(command_buffer), - context->resource().pool.uniform(block).object); - } else { - TORCH_CHECK(false, "Not implemented!"); - } +bool usable( + const Tensor& input, + const Tensor& weight, + const c10::optional& /* bias */) { + return (2 == input.ndimension()) && + (c10::DeviceType::Vulkan == input.device().type()) && + (kFloat == input.scalar_type()) && + (input.size(Layout::Parameter::width) == + weight.size(Layout::Parameter::height)) && + !input.requires_grad() && + true; } Tensor addmm( - const Tensor& self, - const Tensor& mat1, - const Tensor& mat2, + const Tensor& bias, + const Tensor& input, + const Tensor& weight, const Scalar beta, const Scalar alpha) { - return LinearOpContext::create(api::context()->resource().pool, mat2, self) - .run(mat1, beta.to(), alpha.to()); + return LinearOpContext::create( + api::context()->resource().pool, + weight, + bias).run( + input, + alpha.to(), + beta.to()); } -Tensor mm(const Tensor& self_arg, const Tensor& mat2_arg) { +Tensor mm( + const Tensor& mat1_arg, + const Tensor& mat2_arg) { api::Context* const context = api::context(); - const Tensor mat1 = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); + const Tensor mat1 = mat1_arg.is_vulkan() ? mat1_arg : mat1_arg.vulkan(); const vTensor& v_mat1 = convert(mat1); const Tensor mat2 = mat2_arg.is_vulkan() ? 
mat2_arg : mat2_arg.vulkan(); const vTensor& v_mat2 = convert(mat2); - const auto mat1_sizes = mat1.sizes(); - const auto mat2_sizes = mat2.sizes(); + const auto v_mat1_sizes = v_mat1.sizes(); + const auto v_mat2_sizes = v_mat2.sizes(); TORCH_CHECK( - mat1_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::height], + v_mat1_sizes[Layout::Parameter::width] == + v_mat2_sizes[Layout::Parameter::height], "Incompatible matrix dimensions!"); vTensor v_output{ context, { - mat1_sizes[Layout::Parameter::height], - mat2_sizes[Layout::Parameter::width], + v_mat1_sizes[Layout::Parameter::height], + v_mat2_sizes[Layout::Parameter::width], }, mat1.options(), }; @@ -146,24 +138,43 @@ Tensor mm(const Tensor& self_arg, const Tensor& mat2_arg) { command_buffer.begin(); { if (v_mat1.has_image() && v_mat2.has_image()) { + const struct { + uvec3 size; + int32_t K; + } block { + v_output.extents(), + safe_downcast(v_mat1_sizes[Layout::Parameter::width]), + }; + context->dispatch( command_buffer, { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mm), v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_mat1.image(command_buffer), + v_mat1.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_mat2.image(command_buffer)); + v_mat2.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); } else { TORCH_CHECK(false, "Not implemented!"); } @@ -191,7 +202,7 @@ LinearOpContext::LinearOpContext( const c10::optional& bias) : packed_{ pack_weights(pool, weight), - pack_biases(pool, bias, weight), + pack_biases(pool, weight, bias), }, unpacked_{ weight, @@ -203,7 +214,12 @@ LinearOpContext LinearOpContext::create( api::Resource::Pool& pool, const Tensor& weight, const c10::optional& bias) { - TORCH_CHECK(available(weight, bias)) + TORCH_CHECK( + available(weight, bias), + "Vulkan Linear not available! " + "Reason: The provided (weight, bias) parameters are either invalid " + "individually or their combination is not supported by Vulkan Impl."); + // Pass in the originals return LinearOpContext{ pool, @@ -212,8 +228,10 @@ LinearOpContext LinearOpContext::create( }; } -Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) - const { +Tensor LinearOpContext::run( + const Tensor& input_arg, + const float alpha, + const float beta) const { api::Context* const context = api::context(); const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan(); @@ -222,12 +240,14 @@ Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) TORCH_CHECK( usable(input, unpacked_.weight, unpacked_.bias), "Vulkan Linear not usable! 
" - "Reason: The provided input tensor is either invalid or unsupported by Vulkan impl."); + "Reason: The provided input tensor is either invalid on its own, or its " + "combination with the provided weight and bias tensors are unsupported by " + "Vulkan impl."); vTensor v_output{ context, { - input_arg.sizes()[Layout::Parameter::height], + v_input.sizes()[Layout::Parameter::height], packed_.v_weight.sizes()[Layout::Parameter::width], }, input.options(), @@ -236,19 +256,61 @@ Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - if (input_arg.ndimension() == 2) { - addmm_impl( - context, + if (v_output.has_image() && + v_input.has_image() && + packed_.v_weight.has_image() && + packed_.v_bias.has_image()) { + const struct { + uvec3 size; + int32_t K; + vec2 multiplier; + } block { + v_output.extents(), + safe_downcast(v_input.sizes()[Layout::Parameter::width]), + { + alpha, + beta, + }, + }; + + context->dispatch( command_buffer, - v_output, - packed_.v_bias, - v_input, - packed_.v_weight, - beta, - alpha); - } else { - TORCH_CHECK( - false, "linear_run does not yet support inputs with ndim > 2!") + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(addmm), + v_output.extents(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + packed_.v_weight.image( + command_buffer, + vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + packed_.v_bias.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); } } command_buffer.end(); @@ -264,12 +326,14 @@ LinearOpContext::State LinearOpContext::unpack() const { }; } - c10::intrusive_ptr linear_prepack( Tensor&& weight, c10::optional&& bias) { - return c10::make_intrusive(LinearOpContext::create( - persistent()->pool, std::move(weight), std::move(bias))); + return c10::make_intrusive( + LinearOpContext::create( + persistent()->pool, + std::move(weight), + std::move(bias))); } Tensor linear_run( diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index 08c84967d00f..2c389c555a1a 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -1,5 +1,6 @@ #pragma once -#ifdef USE_VULKAN + +#ifdef USE_VULKAN_API #include #include @@ -52,4 +53,4 @@ Tensor linear_run( } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Mul.cpp b/aten/src/ATen/native/vulkan/ops/Mul.cpp index c22456808b82..84226135929a 100644 --- a/aten/src/ATen/native/vulkan/ops/Mul.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mul.cpp @@ -7,6 +7,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor mul_scalar( const Tensor& self_arg, const Scalar other) { @@ -17,8 +19,8 @@ Tensor mul_scalar( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -26,8 +28,10 @@ Tensor mul_scalar( { if (v_output.has_image() && v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_output.extents(), other.to(), }; @@ -42,10 +46,15 @@ Tensor mul_scalar( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -61,23 +70,25 @@ Tensor mul_scalar( } Tensor& mul_scalar_( - Tensor& self_arg, + Tensor& self, const Scalar other) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place mul_scalar is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to(), }; @@ -91,7 +102,10 @@ Tensor& mul_scalar_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. 
context->resource().pool.uniform(block).object); @@ -103,7 +117,7 @@ Tensor& mul_scalar_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Persistent.cpp b/aten/src/ATen/native/vulkan/ops/Persistent.cpp index 4b92198b1da2..bea5e97e5021 100644 --- a/aten/src/ATen/native/vulkan/ops/Persistent.cpp +++ b/aten/src/ATen/native/vulkan/ops/Persistent.cpp @@ -6,18 +6,19 @@ namespace vulkan { namespace ops { Persistent* persistent() { - static const std::unique_ptr persistent([]() -> Persistent* { - try { - return new Persistent{ - api::Resource::Pool{ - api::context()->gpu(), - }, - }; - } - catch (...) { - return nullptr; - } - }()); + static const std::unique_ptr persistent( + []() -> Persistent* { + try { + return new Persistent{ + api::Resource::Pool{ + api::context()->gpu(), + }, + }; + } + catch (...) { + return nullptr; + } + }()); TORCH_CHECK( persistent, diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index 8e853ee538f9..0bc97d6741bc 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -8,12 +8,14 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor adaptive_avg_pool2d( const at::Tensor& self_arg, const IntArrayRef output_size) { - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( self_arg.dim() == 4, - "vulkan_adaptive_avg_pool2d expects 4-dimensional input!"); + "Vulkan adaptive_avg_pool2d expects 4-dimensional input!"); api::Context* const context = api::context(); @@ -28,27 +30,59 @@ Tensor adaptive_avg_pool2d( output_size[Layout::Activation4D::batch], output_size[Layout::Activation4D::channels], }, - self.options(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { + const uvec3 v_output_size = v_output.extents(); + const uvec3 v_self_size = v_self.extents(); + + const vec2 stride { + static_cast(v_self_size.data[0u]) / v_output_size.data[0u], + static_cast(v_self_size.data[1u]) / v_output_size.data[1u], + }; + + const struct { + uvec3 size; + uint32_t _; + vec2 stride; + vec2 kernel; + } block { + v_output.extents(), + 0u, + stride, + { + v_self_size.data[0u] - (v_output_size.data[0u] - 1u) * stride.data[0u], + v_self_size.data[1u] - (v_output_size.data[1u] - 1u) * stride.data[1u], + }, + }; + context->dispatch( command_buffer, { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(adaptive_avg_pool2d), v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer)); + v_self.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); } else { TORCH_CHECK(false, "Not implemented!"); @@ -134,26 +168,43 @@ Tensor avg_pool2d( output_height, output_width, }, - self.options(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - using namespace api::utils; + using namespace utils; if (v_self.has_image()) { const struct { - int32_t kernel_width, kernel_height; - int32_t stride_x, stride_y; - int32_t padding_x, padding_y; + uvec3 extents; + int32_t range; + ivec2 iextents; + ivec2 stride; + ivec2 padding; + ivec2 kernel; } block { - safe_downcast(kernel[Layout::Parameter::width]), - safe_downcast(kernel[Layout::Parameter::height]), - safe_downcast(stride[Layout::Parameter::width]), - safe_downcast(stride[Layout::Parameter::height]), - safe_downcast(padding[Layout::Parameter::width]), - safe_downcast(padding[Layout::Parameter::height]), + v_output.extents(), + safe_downcast( + kernel[Layout::Parameter::width] * + kernel[Layout::Parameter::height]), + { + safe_downcast(self.size(Layout::Activation4D::width)), + safe_downcast(self.size(Layout::Activation4D::height)), + }, + { + safe_downcast(stride[Layout::Parameter::width]), + safe_downcast(stride[Layout::Parameter::height]), + }, + { + safe_downcast(padding[Layout::Parameter::width]), + safe_downcast(padding[Layout::Parameter::height]), + }, + { + safe_downcast(kernel[Layout::Parameter::width]), + safe_downcast(kernel[Layout::Parameter::height]), + }, }; context->dispatch( @@ -167,10 +218,15 @@ Tensor avg_pool2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp similarity index 98% rename from aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp rename to aten/src/ATen/native/vulkan/ops/Register.cpp index 699944b7c48e..7b226654af01 100644 --- a/aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/vulkan/ops/Register.cpp @@ -1,10 +1,10 @@ -#ifdef USE_VULKAN +#ifdef USE_VULKAN_API -#include -#include #include #include #include +#include +#include namespace at { namespace native { @@ -77,4 +77,4 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp index 10d6ac24198a..8edfda60b76f 100644 --- a/aten/src/ATen/native/vulkan/ops/Shape.cpp +++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp @@ -9,7 +9,7 @@ namespace { Tensor view( const Tensor& self_arg, - IntArrayRef shape) { + const IntArrayRef shape) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -27,10 +27,15 @@ Tensor view( command_buffer.copy( // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
- v_self.buffer(command_buffer), + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.buffer(command_buffer, vTensor::Access::Write)); + v_output.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); } command_buffer.end(); command_buffer.submit(context->gpu().queue); diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp index abd185c1ca07..3570834b8dd4 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -6,29 +6,36 @@ namespace vulkan { namespace ops { namespace { -VkDeviceSize bytes( - const IntArrayRef sizes, - const caffe2::TypeMeta dtype) { - VkDeviceSize size = c10::elementSize(c10::typeMetaToScalarType(dtype)); +using namespace api::utils; - // Forward declaration - bool requires_image(IntArrayRef); +VkFormat vk_format(const caffe2::TypeMeta dtype) { + switch (c10::typeMetaToScalarType(dtype)) { + case kFloat: + #ifdef USE_VULKAN_FP16_INFERENCE + return VK_FORMAT_R16G16B16A16_SFLOAT; + #else + return VK_FORMAT_R32G32B32A32_SFLOAT; + #endif /* USE_VULKAN_FP16_INFERENCE */ - if (requires_image(sizes)) { - // Forward declaration - VkExtent3D image_extents(IntArrayRef); - - const VkExtent3D extents = image_extents(sizes); - size *= extents.width * extents.height * (4u * extents.depth); - } - else { - size *= prod_intlist(sizes); + default: + TORCH_CHECK( + false, + "Vulkan tensor format not supported!"); } - return size; + return VK_FORMAT_UNDEFINED; } -vTensor::Access::Flags convert(const VkAccessFlags vk_access) { +VkExtent3D vk_extent(const uvec3& extent) { + return { + extent.data[0u], + extent.data[1u], + extent.data[2u], + }; +} + +vTensor::Access::Flags access( + const VkAccessFlags vk_access) { vTensor::Access::Flags access = 0u; constexpr VkAccessFlags kRead = @@ -55,6 +62,115 @@ vTensor::Access::Flags convert(const VkAccessFlags vk_access) { return access; } +VkAccessFlags vk_access( + const vTensor::Stage::Flags stage, + const vTensor::Access::Flags access) { + VkAccessFlags vk_access = 0u; + + if (access & vTensor::Access::Read) { + if (stage & vTensor::Stage::Compute) { + vk_access |= VK_ACCESS_SHADER_READ_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_access |= VK_ACCESS_HOST_READ_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_access |= VK_ACCESS_TRANSFER_READ_BIT; + } + } + + if (access & vTensor::Access::Write) { + if (stage & vTensor::Stage::Compute) { + vk_access |= VK_ACCESS_SHADER_WRITE_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_access |= VK_ACCESS_HOST_WRITE_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_access |= VK_ACCESS_TRANSFER_WRITE_BIT; + } + } + + return vk_access; +} + +VkImageLayout vk_layout( + const vTensor::Stage::Flags stage, + const vTensor::Access::Flags access) { + switch (stage) { + case vTensor::Stage::Compute: + switch (access) { + case vTensor::Access::Read: + return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + default: + return VK_IMAGE_LAYOUT_GENERAL; + } break; + + case vTensor::Stage::Transfer: + switch (access) { + case vTensor::Access::Read: + return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + + case vTensor::Access::Write: + return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + + default: + TORCH_INTERNAL_ASSERT(false, "Invalid!"); + } break; + + default: + TORCH_INTERNAL_ASSERT(false, "Invalid!"); + } + + return VK_IMAGE_LAYOUT_UNDEFINED; +} + 
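Taken together, the vk_access and vk_layout helpers above (and the vk_stage helper that follows) provide everything needed to populate a Vulkan image barrier for a stage/access transition. A rough illustration, not part of the patch, assuming only those helpers and the standard Vulkan headers:

// e.g. transitioning an image last written by a transfer so a compute
// shader can read it: (Stage::Transfer, Access::Write) ->
// (Stage::Compute, Access::Read).
VkImageMemoryBarrier image_barrier_sketch(
    const VkImage image,
    const vTensor::Stage::Flags src_stage,
    const vTensor::Access::Flags src_access,
    const vTensor::Stage::Flags dst_stage,
    const vTensor::Access::Flags dst_access) {
  VkImageMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.srcAccessMask = vk_access(src_stage, src_access);
  barrier.dstAccessMask = vk_access(dst_stage, dst_access);
  barrier.oldLayout = vk_layout(src_stage, src_access);
  barrier.newLayout = vk_layout(dst_stage, dst_access);
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = image;
  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u};
  // The matching vkCmdPipelineBarrier call would take vk_stage(src_stage)
  // and vk_stage(dst_stage) as its stage masks.
  return barrier;
}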
+VkPipelineStageFlags vk_stage( + const vTensor::Stage::Flags stage) { + VkPipelineStageFlags vk_stage = 0u; + + if (stage & vTensor::Stage::Compute) { + vk_stage |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_stage |= VK_PIPELINE_STAGE_HOST_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + return vk_stage; +} + +VkDeviceSize buffer_bytes( + const IntArrayRef sizes, + const caffe2::TypeMeta dtype) { + VkDeviceSize size = c10::elementSize(c10::typeMetaToScalarType(dtype)); + + // Forward declaration + bool requires_image(IntArrayRef); + + if (requires_image(sizes)) { + // Forward declaration + uvec3 image_extents(IntArrayRef); + + const uvec3 extents = image_extents(sizes); + size *= extents.data[0u] * extents.data[1u] * (4u * extents.data[2u]); + } + else { + size *= prod_intlist(sizes); + } + + return size; +} + vTensor::Buffer allocate_buffer( const api::Adapter* const adapter, api::Resource::Pool* const pool, @@ -74,35 +190,29 @@ vTensor::Buffer allocate_buffer( // Forward declaration bool requires_staging(const api::Adapter*); - const VkFlags usage = [adapter]() { - VkFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - if (requires_staging(adapter)) { - usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; - } - - return usage; - }(); + const VkFlags usage = + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; const auto memory = [adapter]() -> api::Resource::Memory::Descriptor { if (requires_staging(adapter)) { return { VMA_MEMORY_USAGE_GPU_ONLY, 0u, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + 0u, }; } return { - VMA_MEMORY_USAGE_UNKNOWN, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + 0u, + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, }; }(); return pool->buffer({ - bytes(sizes, options.dtype()), + buffer_bytes(sizes, options.dtype()), // Usage { usage, @@ -115,7 +225,7 @@ bool requires_image(const IntArrayRef sizes) { return (1u <= sizes.size()) && (sizes.size() <= 4u); } -VkExtent3D image_extents(const IntArrayRef sizes) { +uvec3 image_extents(const IntArrayRef sizes) { int64_t width = 1; int64_t height = 1; int64_t depth = 1; @@ -151,7 +261,7 @@ VkExtent3D image_extents(const IntArrayRef sizes) { return { width, height, - api::utils::div_up(depth, 4), + div_up(depth, INT64_C(4)), }; } @@ -167,7 +277,7 @@ vTensor::Image allocate_image( return pool->image({ VK_IMAGE_TYPE_3D, - api::utils::convert(options.dtype()), + vk_format(options.dtype()), extents, // Usage { @@ -176,13 +286,20 @@ vTensor::Image allocate_image( { VMA_MEMORY_USAGE_GPU_ONLY, 0u, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + 0u, }, }, // View { VK_IMAGE_VIEW_TYPE_3D, - api::utils::convert(options.dtype()), + vk_format(options.dtype()), + }, + // Sampler + { + VK_FILTER_NEAREST, + VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, }, }); } @@ -212,13 +329,13 @@ vTensor::Buffer allocate_staging( verify(options); return pool->buffer({ - bytes(sizes, options.dtype()), + buffer_bytes(sizes, options.dtype()), // Usage { VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, { - VMA_MEMORY_USAGE_CPU_ONLY, + VMA_MEMORY_USAGE_CPU_COPY, 0u, 0u, }, @@ -247,11 +364,11 @@ Barrier categorize( return Barrier::None; } - const vTensor::Access::Flags src_access = convert(vk_src_access); - const vTensor::Access::Flags dst_access = 
convert(vk_dst_access); + const vTensor::Access::Flags src_access = access(vk_src_access); + const vTensor::Access::Flags dst_access = access(vk_dst_access); - if (vTensor::Access::Read == (src_access & vTensor::Access::Read)) { - if (vTensor::Access::Read == (dst_access & vTensor::Access::Read)) { + if ((src_access & vTensor::Access::Read) == src_access) { + if ((dst_access & vTensor::Access::Read) == dst_access) { // RAR (Read after Read) return Barrier::None; } @@ -303,53 +420,81 @@ vTensor::vTensor( } const vTensor* vTensor::host() const { - view_->staging(Access::Read); + view_->staging(Stage::Host, Access::Read); return this; } vTensor* vTensor::host(const Access::Flags access) { - view_->staging(access); + view_->staging(Stage::Host, access); return this; } -vTensor::Buffer::Object vTensor::buffer() const & { - return view_->buffer(Access::Read).object; +vTensor::Buffer::Object vTensor::buffer( + const Stage::Flags stage) const & { + return view_->buffer( + stage, + Access::Read).object; } vTensor::Buffer::Object vTensor::buffer( + const Stage::Flags stage, const Access::Flags access) & { - return view_->buffer(access).object; + return view_->buffer( + stage, + access).object; } vTensor::Buffer::Object vTensor::buffer( - api::Command::Buffer& command_buffer) const & { - return view_->buffer(command_buffer, Access::Read).object; + api::Command::Buffer& command_buffer, + const Stage::Flags stage) const & { + return view_->buffer( + command_buffer, + stage, + Access::Read).object; } vTensor::Buffer::Object vTensor::buffer( api::Command::Buffer& command_buffer, + const Stage::Flags stage, const Access::Flags access) & { - return view_->buffer(command_buffer, access).object; + return view_->buffer( + command_buffer, + stage, + access).object; } -vTensor::Image::Object vTensor::image() const & { - return view_->image(Access::Read).object; +vTensor::Image::Object vTensor::image( + const Stage::Flags stage) const & { + return view_->image( + stage, + Access::Read).object; } vTensor::Image::Object vTensor::image( + const Stage::Flags stage, const Access::Flags access) & { - return view_->image(access).object; + return view_->image( + stage, + access).object; } vTensor::Image::Object vTensor::image( - api::Command::Buffer& command_buffer) const & { - return view_->image(command_buffer, Access::Read).object; + api::Command::Buffer& command_buffer, + const Stage::Flags stage) const & { + return view_->image( + command_buffer, + stage, + Access::Read).object; } vTensor::Image::Object vTensor::image( api::Command::Buffer& command_buffer, + const Stage::Flags stage, const Access::Flags access) & { - return view_->image(command_buffer, access).object; + return view_->image( + command_buffer, + stage, + access).object; } vTensor::View::View() @@ -399,7 +544,7 @@ vTensor::View::View( class vTensor::View::CMD final { public: - CMD(const View&); + explicit CMD(const View&); CMD(const View&, api::Command::Buffer&); CMD(const CMD&) = delete; CMD& operator=(const CMD&) = delete; @@ -446,9 +591,10 @@ class vTensor::View::CMD final { External, } type; - union { + union _ final { api::Command::Buffer internal; api::Command::Buffer* external; + ~_() {} } command_buffer_; }; @@ -489,7 +635,7 @@ api::Command::Buffer& vTensor::View::CMD::command_buffer() { } void vTensor::View::CMD::barrier(State::Transition transition) { - // Buffer and Staging are just an alias for the same memory location on UMA. + // Buffer and Staging are just an alias for the same memory region on UMA. 
if (view_.state_.is_uma()) { transition.first.buffer.stage |= transition.first.staging.stage; @@ -615,8 +761,6 @@ void vTensor::View::CMD::barrier(State::Transition transition) { barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; } - // Optimization opportunity: delay and batch. - command_buffer().barrier(barrier); } } @@ -633,13 +777,13 @@ void vTensor::View::CMD::copy_buffer_to_staging( state.transition({ // Staging { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Write), }, // Buffer { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Read), }, // Image {}, @@ -660,13 +804,13 @@ void vTensor::View::CMD::copy_staging_to_buffer( state.transition({ // Staging { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Read), }, // Buffer { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Write), }, // Image {}, @@ -689,27 +833,47 @@ void vTensor::View::CMD::copy_buffer_to_image( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Read), }, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Write), + vk_layout(Stage::Compute, Access::Write), }, })); + const uvec3 extents = view_.extents(); + const uint32_t plane = extents.data[0u] * extents.data[1u]; + + const struct { + uvec3 extents; + uint32_t block; + uvec4 offset; + } block { + extents, + 4u * plane, + { + 0u * plane, + 1u * plane, + 2u * plane, + 3u * plane, + }, + }; + view_.context_->dispatch( command_buffer(), { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(nchw_to_image), - view_.extents(), + extents, image, - buffer); + buffer, + view_.context_->resource().pool.uniform(block).object); } void vTensor::View::CMD::copy_image_to_buffer( @@ -726,27 +890,47 @@ void vTensor::View::CMD::copy_image_to_buffer( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Write), }, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Read), + vk_layout(Stage::Compute, Access::Read), }, })); + const uvec3 extents = view_.extents(); + const uint32_t plane = extents.data[0u] * extents.data[1u]; + + const struct { + uvec3 extents; + uint32_t block; + uvec4 offset; + } block { + extents, + 4u * plane, + { + 0u * plane, + 1u * plane, + 2u * plane, + 3u * plane, + }, + }; + view_.context_->dispatch( command_buffer(), { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(image_to_nchw), view_.extents(), image, - buffer); + buffer, + view_.context_->resource().pool.uniform(block).object); } void vTensor::View::CMD::submit(const api::Resource::Fence fence) { @@ -769,9 +953,10 @@ vTensor::Buffer& vTensor::View::buffer() const { } vTensor::Buffer& vTensor::View::buffer( + const Stage::Flags stage, const Access::Flags access) const { CMD 
command_buffer(*this); - Buffer& buffer = this->buffer(command_buffer, access); + Buffer& buffer = this->buffer(command_buffer, stage, access); command_buffer.submit(); return buffer; @@ -779,25 +964,27 @@ vTensor::Buffer& vTensor::View::buffer( vTensor::Buffer& vTensor::View::buffer( api::Command::Buffer& command_buffer_, + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this, command_buffer_); - return buffer(command_buffer, access); + return buffer(command_buffer, stage, access); } vTensor::Buffer& vTensor::View::buffer( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) { if (state_.is_clean(Component::Staging)) { command_buffer.copy_staging_to_buffer( state_, - staging(command_buffer, Access::Read).object, + staging(command_buffer, Stage::Transfer, Access::Read).object, buffer().object); } else if (state_.is_clean(Component::Image)) { command_buffer.copy_image_to_buffer( state_, - image(command_buffer, Access::Read).object, + image(command_buffer, Stage::Compute, Access::Read).object, buffer().object); } else { @@ -813,20 +1000,8 @@ vTensor::Buffer& vTensor::View::buffer( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - return vk_access; - }(), + vk_stage(stage), + vk_access(stage, access), }, // Image {}, @@ -845,7 +1020,7 @@ vTensor::Image& vTensor::View::image() const { if (!image_ && state_.is_available(Component::Image)) { image_ = allocate_image( pool_, - extents(), + vk_extent(extents()), options()); } @@ -853,9 +1028,10 @@ vTensor::Image& vTensor::View::image() const { } vTensor::Image& vTensor::View::image( + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this); - Image& image = this->image(command_buffer, access); + Image& image = this->image(command_buffer, stage, access); command_buffer.submit(); return image; @@ -863,18 +1039,20 @@ vTensor::Image& vTensor::View::image( vTensor::Image& vTensor::View::image( api::Command::Buffer& command_buffer_, + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this, command_buffer_); - return image(command_buffer, access); + return image(command_buffer, stage, access); } vTensor::Image& vTensor::View::image( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Image)) { command_buffer.copy_buffer_to_image( state_, - buffer(command_buffer, Access::Read).object, + buffer(command_buffer, stage, Access::Read).object, image().object); } @@ -886,27 +1064,9 @@ vTensor::Image& vTensor::View::image( {}, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - return vk_access; - }(), - [access]() { - if (Access::Read == (access & Access::Read)) { - return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - } - - return VK_IMAGE_LAYOUT_GENERAL; - }(), + vk_stage(stage), + vk_access(stage, access), + vk_layout(stage, access), }, })); @@ -935,9 +1095,11 @@ vTensor::Buffer& vTensor::View::staging() const { return staging_; } -vTensor::Buffer& 
vTensor::View::staging(const Access::Flags access) const { +vTensor::Buffer& vTensor::View::staging( + const Stage::Flags stage, + const Access::Flags access) const { CMD command_buffer(*this); - Buffer& staging = this->staging(command_buffer, access); + Buffer& staging = this->staging(command_buffer, stage, access); command_buffer.submit(fence()); return staging; @@ -945,11 +1107,12 @@ vTensor::Buffer& vTensor::View::staging(const Access::Flags access) const { vTensor::Buffer& vTensor::View::staging( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Staging)) { command_buffer.copy_buffer_to_staging( state_, - buffer(command_buffer, Access::Read).object, + buffer(command_buffer, Stage::Transfer, Access::Read).object, staging().object); } @@ -957,20 +1120,8 @@ vTensor::Buffer& vTensor::View::staging( state_.transition({ // Staging { - VK_PIPELINE_STAGE_HOST_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_HOST_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_HOST_WRITE_BIT; - } - - return vk_access; - }(), + vk_stage(stage), + vk_access(stage, access), }, // Buffer {}, diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.h b/aten/src/ATen/native/vulkan/ops/Tensor.h index ca4187d12b7d..48d4cca84dd4 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.h +++ b/aten/src/ATen/native/vulkan/ops/Tensor.h @@ -85,6 +85,7 @@ class vTensor final { Types */ + typedef api::Pipeline::Stage Stage; typedef api::Resource::Memory::Access Access; typedef api::Resource::Buffer Buffer; typedef api::Resource::Fence Fence; @@ -132,6 +133,9 @@ class vTensor final { Payload wait() const &; private: + template + friend class Future; + // Intentionally disabed to enforce a usage pattern wherein the Future's // lifetime exceeds that of the Payload as we use the Future's destructor // to eagerly (as opposed to lazily and upon first use) upload the @@ -139,10 +143,6 @@ class vTensor final { Payload wait() const && = delete; - private: - template - friend class Future; - private: const vTensor* tensor_; }; @@ -178,22 +178,22 @@ class vTensor final { predictability of usage and efficiency. 
*/ - Buffer::Object buffer() const &; - Buffer::Object buffer(Access::Flags access) &; - Buffer::Object buffer(api::Command::Buffer&) const &; - Buffer::Object buffer(api::Command::Buffer&, Access::Flags) &; + Buffer::Object buffer(Stage::Flags) const &; + Buffer::Object buffer(Stage::Flags, Access::Flags) &; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const &; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) &; bool has_image() const; - Image::Object image() const &; - Image::Object image(Access::Flags access) &; - Image::Object image(api::Command::Buffer&) const &; - Image::Object image(api::Command::Buffer&, Access::Flags) &; + Image::Object image(Stage::Flags) const &; + Image::Object image(Stage::Flags, Access::Flags) &; + Image::Object image(api::Command::Buffer&, Stage::Flags) const &; + Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) &; /* Metadata */ - const VkExtent3D& extents() const; + const api::utils::uvec3& extents() const; const TensorOptions& options() const; IntArrayRef sizes() const; IntArrayRef strides() const; @@ -223,15 +223,15 @@ class vTensor final { Device */ - Buffer::Object buffer() const && = delete; - Buffer::Object buffer(Access::Flags) && = delete; - Buffer::Object buffer(api::Command::Buffer&) const && = delete; - Buffer::Object buffer(api::Command::Buffer&, Access::Flags) && = delete; + Buffer::Object buffer(Stage::Flags) const && = delete; + Buffer::Object buffer(Stage::Flags, Access::Flags) && = delete; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const && = delete; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete; - Image::Object image() const && = delete; - Image::Object image(Access::Flags) && = delete; - Image::Object image(api::Command::Buffer&) const && = delete; - Image::Object image(api::Command::Buffer&, Access::Flags) && = delete; + Image::Object image(Stage::Flags) const && = delete; + Image::Object image(Stage::Flags, Access::Flags) && = delete; + Image::Object image(api::Command::Buffer&, Stage::Flags) const && = delete; + Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete; private: class View final { @@ -248,18 +248,30 @@ class vTensor final { View operator=(View&&) = delete; ~View() = default; - Buffer& buffer(Access::Flags) const; - Buffer& buffer(api::Command::Buffer&, Access::Flags) const; + /* + Device + */ + + Buffer& buffer(Stage::Flags, Access::Flags) const; + Buffer& buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) const; bool has_image() const; - Image& image(Access::Flags) const; - Image& image(api::Command::Buffer&, Access::Flags) const; + Image& image(Stage::Flags, Access::Flags) const; + Image& image(api::Command::Buffer&, Stage::Flags, Access::Flags) const; - Buffer& staging(Access::Flags) const; - Buffer& staging(api::Command::Buffer&, Access::Flags) const; + /* + Host + */ + + Buffer& staging(Stage::Flags, Access::Flags) const; + Buffer& staging(api::Command::Buffer&, Stage::Flags, Access::Flags) const; vTensor::Memory& wait() const; - const VkExtent3D& extents() const; + /* + Metadata + */ + + const api::utils::uvec3& extents() const; const TensorOptions& options() const; IntArrayRef sizes() const; IntArrayRef strides() const; @@ -326,11 +338,11 @@ class vTensor final { private: // Accessors / Lazy Allocation Buffer& buffer() const; - Buffer& buffer(CMD&, Access::Flags) const; + Buffer& buffer(CMD&, Stage::Flags, Access::Flags) const; Image& image() const; - 
Image& image(CMD&, Access::Flags) const; + Image& image(CMD&, Stage::Flags, Access::Flags) const; Buffer& staging() const; - Buffer& staging(CMD&, Access::Flags) const; + Buffer& staging(CMD&, Stage::Flags, Access::Flags) const; Fence& fence() const; // Validation @@ -351,7 +363,7 @@ class vTensor final { mutable State state_; // Metadata - VkExtent3D extents_; + api::utils::uvec3 extents_; TensorOptions options_; c10::SmallVector sizes_; c10::SmallVector strides_; @@ -486,7 +498,7 @@ inline bool vTensor::has_image() const { return view_->has_image(); } -inline const VkExtent3D& vTensor::extents() const { +inline const api::utils::uvec3& vTensor::extents() const { return view_->extents(); } @@ -511,7 +523,7 @@ inline bool vTensor::View::has_image() const { return state_.is_available(View::Component::Image); } -inline const VkExtent3D& vTensor::View::extents() const { +inline const api::utils::uvec3& vTensor::View::extents() const { return extents_; } diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index e38e367c7241..32508c01eec1 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -8,6 +8,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, @@ -17,18 +19,17 @@ Tensor upsample_nearest2d( const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan(); const vTensor& v_input = convert(input); - - const auto input_sizes = input.sizes(); + const auto v_input_sizes = v_input.sizes(); TORCH_CHECK( - (4 == input_sizes.size()) && (2 == output_sizes.size()), + (4 == v_input_sizes.size()) && (2 == output_sizes.size()), "Invalid input!"); vTensor v_output{ context, { - input_sizes[Layout::Activation4D::batch], - input_sizes[Layout::Activation4D::channels], + v_input_sizes[Layout::Activation4D::batch], + v_input_sizes[Layout::Activation4D::channels], output_sizes[Layout::Parameter::height], output_sizes[Layout::Parameter::width], }, @@ -40,16 +41,27 @@ Tensor upsample_nearest2d( { if (v_input.has_image()) { const struct { - float scale_x, scale_y; + uvec3 extents; + uint32_t _; + ivec2 iextents; + vec2 scale; } block { - compute_scales_value( - scales_w, - input_sizes[Layout::Activation4D::width], - output_sizes[Layout::Parameter::width]), - compute_scales_value( - scales_h, - input_sizes[Layout::Activation4D::height], - output_sizes[Layout::Parameter::height]), + v_output.extents(), + 0u, + { + safe_downcast(input.size(Layout::Activation4D::width) - 1), + safe_downcast(input.size(Layout::Activation4D::height) - 1), + }, + { + compute_scales_value( + scales_w, + v_input_sizes[Layout::Activation4D::width], + output_sizes[Layout::Parameter::width]), + compute_scales_value( + scales_h, + v_input_sizes[Layout::Activation4D::height], + output_sizes[Layout::Parameter::height]), + }, }; context->dispatch( @@ -63,10 +75,15 @@ Tensor upsample_nearest2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. 
// It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/Utils.h b/aten/src/ATen/native/vulkan/ops/Utils.h new file mode 100644 index 000000000000..ffdc2b6e94eb --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Utils.h @@ -0,0 +1,25 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace utils { + +int64_t normalize( + const int64_t dimension, + const int64_t n) { + return (dimension % n + n) % n; +} + +} // namespace utils +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 70618dc8df84..4bf9bf46a965 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -77,12 +77,8 @@ list(APPEND ATen_HIP_TEST_SRCS # ${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_stream_test.cpp list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) - -if(USE_VULKAN_API) - list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) -endif() + ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 03105eec90ea..73b221c81b9d 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -14,7 +14,13 @@ bool checkRtol(const at::Tensor& diff, const std::vector& inputs) { maxValue = fmax(tensor.abs().max().item(), maxValue); } - return diff.abs().max().item() < (2e-6 * maxValue); +#ifdef USE_VULKAN_FP16_INFERENCE + constexpr float tolerance = 1e-2; +#else + constexpr float tolerance = 1e-5; +#endif + + return diff.abs().max().item() < (tolerance * maxValue); } bool almostEqual(const at::Tensor& a, const at::Tensor& b) { @@ -500,11 +506,11 @@ TEST(VulkanAPITest, empty) { } TEST(VulkanAPITest, mean) { - const auto in_cpu = at::rand({5, 3, 9, 9}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); + const auto in_cpu = at::rand({17, 3, 79, 53}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); const auto in_vulkan = in_cpu.vulkan(); - const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, false); + const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true); const auto check = almostEqual(out_cpu, out_vulkan.cpu()); if (!check) { @@ -515,12 +521,12 @@ TEST(VulkanAPITest, mean) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean_keep_dim) { - const auto in_cpu = at::rand({10, 3, 21, 21}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); +TEST(VulkanAPITest, mean2d) { + const auto in_cpu = at::rand({11, 7, 173, 37}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); const auto in_vulkan = in_cpu.vulkan(); - const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true); + const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, false); const auto check = almostEqual(out_cpu, out_vulkan.cpu()); if (!check) { @@ -730,7 +736,7 @@ class Conv2d final : public BaseOp { stride_(stride), padding_(padding), w_(at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat))), - 
b_(at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ + b_(at::rand(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ } at::Tensor run(at::Tensor& t) const override { @@ -850,7 +856,6 @@ class MobileNetV2 final : public OpsList { ops_.emplace_back(new Hardtanh_()); ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); ops_.emplace_back(new Hardtanh_()); - ops_.emplace_back(new Hardtanh_()); ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); ops_.emplace_back(new Hardtanh_()); diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index d5483a7327b1..7c4e96f7f1a6 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -1,3 +1,5 @@ +#ifndef USE_VULKAN_API + #include #include @@ -938,3 +940,5 @@ TEST(VulkanTest, avg_pool2d) { } ASSERT_TRUE(check); } + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/vulkan/Context.cpp b/aten/src/ATen/vulkan/Context.cpp index 8d2b5281d2ae..793c690a0c14 100644 --- a/aten/src/ATen/vulkan/Context.cpp +++ b/aten/src/ATen/vulkan/Context.cpp @@ -3,6 +3,10 @@ #include #include +#ifdef USE_VULKAN_API +#include +#endif /* USE_VULKAN_API */ + namespace at { namespace vulkan { @@ -23,8 +27,12 @@ at::Tensor& vulkan_copy_(at::Tensor& self, const at::Tensor& src) { namespace native { bool is_vulkan_available() { +#ifdef USE_VULKAN_API + return native::vulkan::api::available(); +#else auto p = at::vulkan::g_vulkan_impl_registry.load(); return p ? p->is_vulkan_available() : false; +#endif } } // namespace native diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 7c068bb1e842..92015c269083 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -137,6 +137,12 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_OPENMP : ${USE_OPENMP}") message(STATUS " USE_TBB : ${USE_TBB}") message(STATUS " USE_VULKAN : ${USE_VULKAN}") + if(${USE_VULKAN}) + message(STATUS " USE_VULKAN_FP16_INFERENCE : ${USE_VULKAN_FP16_INFERENCE}") + message(STATUS " USE_VULKAN_RELAXED_PRECISION : ${USE_VULKAN_RELAXED_PRECISION}") + message(STATUS " USE_VULKAN_SHADERC_RUNTIME : ${USE_VULKAN_SHADERC_RUNTIME}") + message(STATUS " USE_VULKAN_WRAPPER : ${USE_VULKAN_WRAPPER}") + endif() message(STATUS " USE_PROF : ${USE_PROF}") message(STATUS " USE_QNNPACK : ${USE_QNNPACK}") message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
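
Reviewer note (not part of the patch): after this change every device-side vTensor accessor takes an explicit pipeline stage (Stage::Compute, Stage::Transfer, Stage::Host) alongside the access mask, and the new vk_stage / vk_access / vk_layout helpers derive the matching VkPipelineStageFlags, VkAccessFlags and VkImageLayout from that pair instead of hard-coding compute-shader bits. The sketch below shows the recording pattern an op follows under the new API, mirroring the avg_pool2d and upsample_nearest2d call sites above; the kernel name, descriptor-type list, uniform block layout and the function example_op itself are illustrative placeholders, not code from the patch.

    // Sketch only: assumes this lives inside at::native::vulkan::ops, that
    // v_input / v_output are existing vTensors, and that VK_KERNEL(example_op)
    // names a shader whose layout matches the descriptor-type list below.
    void example_op(api::Context* const context,
                    const vTensor& v_input,
                    vTensor& v_output) {
      api::Command::Buffer command_buffer = context->command().pool.allocate();
      command_buffer.begin();
      {
        const struct {
          api::utils::uvec3 extents;  // whatever parameters the shader expects
        } block {
          v_output.extents(),
        };

        context->dispatch(
            command_buffer,
            {
              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,           // v_output
              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,  // v_input
              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,          // block
            },
            VK_KERNEL(example_op),
            v_output.extents(),
            // Write-only access at the compute stage: no wait on prior work,
            // but barriers are recorded for later readers if necessary.
            v_output.image(
                command_buffer,
                vTensor::Stage::Compute,
                vTensor::Access::Write),
            // Read access at the compute stage: synchronizes against prior
            // writers (host, transfer or compute) before the dispatch runs.
            v_input.image(
                command_buffer,
                vTensor::Stage::Compute),
            // The uniform buffer's lifetime is owned by the resource pool.
            context->resource().pool.uniform(block).object);
      }
      command_buffer.end();
      command_buffer.submit(context->gpu().queue);
    }

Passing the stage explicitly is what lets the barrier logic in Tensor.cpp pick the correct source and destination pipeline stages for host reads, transfer copies and compute dispatches, rather than assuming everything happens at the compute-shader stage.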
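
Reviewer note (not part of the patch): the nchw_to_image and image_to_nchw dispatches now receive a third, uniform-buffer argument describing the channel packing. For image extents {W, H, D} (where the last extent is the div_up-by-4 of the plane count computed in image_extents), plane = W * H is the texel count of one feature-map plane, block = 4 * plane is the span covered by one RGBA slice, and offset = {0, 1, 2, 3} * plane are the starting positions of the four planes packed into that slice. The standalone snippet below merely recomputes those host-side values for a made-up extent, as a reading aid.

    #include <array>
    #include <cstdint>
    #include <iostream>

    int main() {
      // Example image extents {W, H, D}; D is the number of 4-channel slices.
      const std::array<uint32_t, 3> extents{3u, 5u, 2u};

      const uint32_t plane = extents[0] * extents[1];  // texels per plane
      const uint32_t block = 4u * plane;               // texels per RGBA slice
      const std::array<uint32_t, 4> offset{
          0u * plane, 1u * plane, 2u * plane, 3u * plane};

      std::cout << "plane=" << plane << " block=" << block << " offsets={"
                << offset[0] << "," << offset[1] << ","
                << offset[2] << "," << offset[3] << "}\n";
      return 0;
    }

This is also why buffer_bytes multiplies width * height * (4 * depth): the backing buffer stores four packed channels per texel of the 3D image.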