From 1a4b217e10143d496d6861f8d7a785d79ca46101 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Fri, 1 Aug 2025 13:19:31 -0700
Subject: [PATCH] Remove support for VkSemaphore

Summary:
## Changes

Revert some changes to the D78360038 / https://github.com/pytorch/executorch/pull/12527  stack which enabled support for submitting command buffers with an associated semaphore.

## Context

The original intent was to allow command buffers to be correctly ordered when submitting multiple command buffers for model inference. Previously it was thought that the Vulkan API would not be aware of the dependency between two separate command buffer submissions, so a semaphore would be needed to ensure correct execution order between them.

However, I noticed the following validation layer error on Mac:

```
Validation 0 vkQueueSubmit(): pSubmits[0].pSignalSemaphores[0] (VkSemaphore 0x10f000000010f) is being signaled by VkQueue 0x1181082a8, but it was previously signaled by VkQueue 0x1181082a8 and has not since been waited on.
The Vulkan spec states: Each binary semaphore element of the pSignalSemaphores member of any element of pSubmits must be unsignaled when the semaphore signal operation it defines is executed on the device (https://vulkan.lunarg.com/doc/view/1.4.313.0/mac/antora/spec/latest/chapters/cmdbuffers.html#VUID-vkQueueSubmit-pSignalSemaphores-00067)
```

The reason this happens is because we store the `VkSemaphore` together with the `VkCommandBuffer`, and use the same `VkSemaphore` in the submit info every time the command buffer is submitted. However, there's no mechanism to reset the VkSemaphore to "unsignaled" once it's been signaled.

Therefore, as-is the `VkSemaphores` do not serve any purpose after the first inference.

The correct approach if we were to use semaphores is to create a new one with every submission, and not have it be attached to a specific command buffer

However, after some deeper research I found that `VkSemaphore` is not actually needed to ensure correct execution order between command buffers; the command pipeline barriers that we already insert should be sufficient. My primary source is this [stackoverflow question](the correct approach if we were to use semaphores is to create a new one with every submission, and not have it be attached to a specific command buffer) which references the Vulkan API spec in the answer.

Therefore, remove the `VkSemaphore` machinery since it's not required.

Differential Revision: D79468286
---
 backends/vulkan/runtime/api/Context.cpp       | 16 +++--------
 backends/vulkan/runtime/api/Context.h         |  2 --
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 18 ++-----------
 backends/vulkan/runtime/graph/ComputeGraph.h  |  6 +----
 backends/vulkan/runtime/vk_api/Command.cpp    | 27 +------------------
 backends/vulkan/runtime/vk_api/Command.h      | 14 +---------
 6 files changed, 8 insertions(+), 75 deletions(-)

diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 44804b1c86e..68db37b866e 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -38,8 +38,7 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config)
       querypool_(config_.query_pool_config, nullptr),
       // Command buffer submission
       cmd_mutex_{},
-      cmd_(VK_NULL_HANDLE, VK_NULL_HANDLE, 0u),
-      prev_semaphore_(VK_NULL_HANDLE),
+      cmd_(VK_NULL_HANDLE, 0u),
       submit_count_{0u},
       // Memory Management
       buffer_clearlist_mutex_{},
@@ -196,21 +195,14 @@ void Context::register_blit(
 }
 
 void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
-  // Wait semaphore would be previous command buffer's signal semaphore
-  VkSemaphore wait_semaphore = prev_semaphore_;
-  // Signal semaphore for the the current command buffer
-  VkSemaphore signal_semaphore = cmd_.get_signal_semaphore();
-  // Next command buffer would wait on this command buffer's signal semaphore
-  prev_semaphore_ = signal_semaphore;
-
   if (cmd_) {
     cmd_.end();
     adapter_p_->submit_cmd(
         queue_,
         cmd_.get_submit_handle(final_use),
         fence_handle,
-        wait_semaphore,
-        signal_semaphore);
+        VK_NULL_HANDLE,
+        VK_NULL_HANDLE);
 
     submit_count_ = 0u;
   }
@@ -226,8 +218,6 @@ void Context::flush() {
   if (cmd_) {
     cmd_.invalidate();
   }
-  // Reset previous command buffer semaphore
-  prev_semaphore_ = VK_NULL_HANDLE;
 
   std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
   std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index 3efa8d0276d..9c7301b9971 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -68,8 +68,6 @@ class Context final {
   // Command buffers submission
   std::mutex cmd_mutex_;
   vkapi::CommandBuffer cmd_;
-  // Semaphore for the previously submitted command buffer, if any
-  VkSemaphore prev_semaphore_;
   uint32_t submit_count_;
   // Memory Management
   std::mutex buffer_clearlist_mutex_;
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index a1dd4a287c1..14328027362 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -776,36 +776,22 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
   context_->fences().return_fence(fence);
 }
 
-void ComputeGraph::submit_cmd(
-    vkapi::CommandBuffer& cmd_buf,
-    VkSemaphore wait_semaphore,
-    VkSemaphore signal_semaphore,
-    VkFence fence) {
+void ComputeGraph::submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence) {
   if (cmd_buf) {
     cmd_buf.end();
     context_->adapter_ptr()->submit_cmd(
-        context_->queue(),
-        cmd_buf.get_submit_handle(false),
-        fence,
-        wait_semaphore,
-        signal_semaphore);
+        context_->queue(), cmd_buf.get_submit_handle(false), fence);
   }
 }
 
 void ComputeGraph::submit_deferred_cmds_and_wait() {
-  VkSemaphore prev_semaphore = VK_NULL_HANDLE;
   vkapi::VulkanFence fence = context_->fences().get_fence();
 
   for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) {
     auto& cmd = deferred_cmd_list_[i];
-    VkSemaphore wait_semaphore = prev_semaphore;
-    VkSemaphore signal_semaphore = cmd.get_signal_semaphore();
-    prev_semaphore = signal_semaphore;
 
     submit_cmd(
         cmd,
-        wait_semaphore,
-        signal_semaphore,
         i == (deferred_cmd_list_.size() - 1) ? fence.get_submit_handle()
                                              : VK_NULL_HANDLE);
   }
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 7bac9bf92db..886e2c5ccea 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -857,11 +857,7 @@ class ComputeGraph final {
   /*
    * Submit one command buffer to the GPU.
    */
-  void submit_cmd(
-      vkapi::CommandBuffer& cmd_buf,
-      VkSemaphore wait_semaphore,
-      VkSemaphore signal_semaphore,
-      VkFence fence);
+  void submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence);
 
   /*
    * Submits all the commands gathered in deferred_cmd_bufs_ to the GPU.
diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp
index 4e0a915fe98..84e1f68dc68 100644
--- a/backends/vulkan/runtime/vk_api/Command.cpp
+++ b/backends/vulkan/runtime/vk_api/Command.cpp
@@ -20,34 +20,28 @@ namespace vkapi {
 
 CommandBuffer::CommandBuffer(
     VkCommandBuffer handle,
-    VkSemaphore semaphore,
     const VkCommandBufferUsageFlags flags)
     : handle_(handle),
-      signal_semaphore_(semaphore),
       flags_(flags),
       state_(CommandBuffer::State::NEW),
       bound_{} {}
 
 CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept
     : handle_(other.handle_),
-      signal_semaphore_(other.signal_semaphore_),
       flags_(other.flags_),
       state_(other.state_),
       bound_(other.bound_) {
   other.handle_ = VK_NULL_HANDLE;
-  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
 }
 
 CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept {
   handle_ = other.handle_;
-  signal_semaphore_ = other.signal_semaphore_;
   flags_ = other.flags_;
   state_ = other.state_;
   bound_ = other.bound_;
 
   other.handle_ = VK_NULL_HANDLE;
-  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
   other.state_ = CommandBuffer::State::INVALID;
 
@@ -310,12 +304,6 @@ CommandPool::~CommandPool() {
   if (pool_ == VK_NULL_HANDLE) {
     return;
   }
-  for (auto& semaphore : semaphores_) {
-    if (semaphore != VK_NULL_HANDLE) {
-      vkDestroySemaphore(device_, semaphore, nullptr);
-    }
-  }
-
   vkDestroyCommandPool(device_, pool_, nullptr);
 }
 
@@ -326,7 +314,6 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   allocate_new_batch(config_.cmd_pool_batch_size);
 
   VkCommandBuffer handle = buffers_[in_use_];
-  VkSemaphore semaphore = semaphores_[in_use_];
 
   VkCommandBufferUsageFlags cmd_flags = 0u;
   if (!reusable) {
@@ -334,7 +321,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   }
 
   in_use_++;
-  return CommandBuffer(handle, semaphore, cmd_flags);
+  return CommandBuffer(handle, cmd_flags);
 }
 
 void CommandPool::flush() {
@@ -350,7 +337,6 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
   }
 
   buffers_.resize(buffers_.size() + count);
-  semaphores_.resize(buffers_.size() + count);
 
   const VkCommandBufferAllocateInfo allocate_info{
       VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType
@@ -362,17 +348,6 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
 
   VK_CHECK(vkAllocateCommandBuffers(
       device_, &allocate_info, buffers_.data() + in_use_));
-
-  const VkSemaphoreCreateInfo semaphoreCreateInfo = {
-      VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0};
-
-  for (uint32_t i = 0; i < count; i++) {
-    VK_CHECK(vkCreateSemaphore(
-        device_,
-        &semaphoreCreateInfo,
-        nullptr,
-        semaphores_.data() + in_use_ + i));
-  }
 }
 
 } // namespace vkapi
diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h
index d6d3fe05a34..ff1e5934a5c 100644
--- a/backends/vulkan/runtime/vk_api/Command.h
+++ b/backends/vulkan/runtime/vk_api/Command.h
@@ -26,10 +26,7 @@ namespace vkapi {
 
 class CommandBuffer final {
  public:
-  explicit CommandBuffer(
-      VkCommandBuffer,
-      VkSemaphore,
-      const VkCommandBufferUsageFlags);
+  explicit CommandBuffer(VkCommandBuffer, const VkCommandBufferUsageFlags);
 
   CommandBuffer(const CommandBuffer&) = delete;
   CommandBuffer& operator=(const CommandBuffer&) = delete;
@@ -73,8 +70,6 @@ class CommandBuffer final {
 
  private:
   VkCommandBuffer handle_;
-  // Semaphore to signal when the command buffer has completed execution
-  VkSemaphore signal_semaphore_;
   VkCommandBufferUsageFlags flags_;
   State state_;
   Bound bound_;
@@ -86,7 +81,6 @@ class CommandBuffer final {
 
   inline void invalidate() {
     handle_ = VK_NULL_HANDLE;
-    signal_semaphore_ = VK_NULL_HANDLE;
     bound_.reset();
   }
 
@@ -106,10 +100,6 @@ class CommandBuffer final {
 
   VkCommandBuffer get_submit_handle(const bool final_use = false);
 
-  VkSemaphore get_signal_semaphore() const {
-    return signal_semaphore_;
-  }
-
   inline operator bool() const {
     return handle_ != VK_NULL_HANDLE;
   }
@@ -140,8 +130,6 @@ class CommandPool final {
   // New Buffers
   std::mutex mutex_;
   std::vector<VkCommandBuffer> buffers_;
-  // Semaphores corresponding to the command buffers
-  std::vector<VkSemaphore> semaphores_;
   size_t in_use_;
 
  public: