From efee9908511bcf803f59d6ac407bda5b0b92da96 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Tue, 3 Sep 2024 13:11:40 -0700 Subject: [PATCH 1/4] [ET-VK] Add test to track sizes of various objects ## Context Add a simple test to track the sizes of various important objects in the Vulkan compute graph API over time. The test uses some loose thresholds to alert when an object has grown unexpectedly large. Differential Revision: [D62144400](https://our.internmc.facebook.com/intern/diff/D62144400/) [ghstack-poisoned] --- .../vulkan/test/vulkan_compute_api_test.cpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..2f9c3d22f57 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -992,6 +992,28 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +// The purpose of this test is simply to track the size of various classes over +// time, in the interest of making sure that they don't grow too large. +TEST_F(VulkanComputeAPITest, print_object_sizes) { +#define PRINT_SIZE(name) \ std::cout << #name << " size: " << sizeof(name) << " B" << std::endl + PRINT_SIZE(vTensor); + PRINT_SIZE(Value); + PRINT_SIZE(StagingBuffer); + PRINT_SIZE(ComputeGraph); + PRINT_SIZE(ExecuteNode); +#undef PRINT_SIZE + + // The actual size of each object is dependent on the platform. However, we + // can alert ourselves to any significant changes in the sizes of these + // objects by checking the `sizeof()` of the class against some loose thresholds. 
+ EXPECT_TRUE(sizeof(vTensor) < 1800); + EXPECT_TRUE(sizeof(Value) < 2400); + EXPECT_TRUE(sizeof(StagingBuffer) < 500); + EXPECT_TRUE(sizeof(ComputeGraph) < 500); + EXPECT_TRUE(sizeof(ExecuteNode) < 500); +} + TEST(VulkanComputeGraphTest, test_values_scalars) { GraphConfig config; ComputeGraph graph(config); From d4e400c0fe83cf6777872b434a6523eb5d8dd325 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Tue, 3 Sep 2024 13:11:42 -0700 Subject: [PATCH 2/4] [ET-VK] Add type for symbolic integers ## Context Introduce the `SymInt` class which allows representation of symbolic integers in a Vulkan graph. Please see the comments documentation of the `SymInt` class for more details regarding why the `Int` type is not sufficient for symbolic integers. Differential Revision: [D62144399](https://our.internmc.facebook.com/intern/diff/D62144399/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ComputeGraph.cpp | 24 ++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 18 +++++- .../runtime/graph/containers/SymInt.cpp | 24 ++++++++ .../vulkan/runtime/graph/containers/SymInt.h | 41 +++++++++++++ .../vulkan/runtime/graph/containers/Types.cpp | 1 + .../vulkan/runtime/graph/containers/Types.h | 1 + .../vulkan/runtime/graph/containers/Value.h | 9 +++ .../vulkan/test/glsl/scalar_add_texture.glsl | 29 ++++++++++ .../vulkan/test/vulkan_compute_api_test.cpp | 58 +++++++++++++++++++ 9 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/runtime/graph/containers/SymInt.cpp create mode 100644 backends/vulkan/runtime/graph/containers/SymInt.h create mode 100644 backends/vulkan/test/glsl/scalar_add_texture.glsl diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..a8f57f57d2a 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -43,6 +43,7 @@ VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) 
VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) +VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL @@ -261,6 +262,13 @@ ValueRef ComputeGraph::add_string(std::string&& str) { return idx; } +ValueRef ComputeGraph::add_symint(const int32_t val) { + ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); + values_.emplace_back(SymInt(context(), val)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { @@ -300,6 +308,22 @@ ValueRef ComputeGraph::set_output_tensor( return idx; } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx) { + if (values_.at(idx).isInt()) { + const int32_t val = extract_scalar(idx); + create_params_buffer(val); + } else if (values_.at(idx).isSymInt()) { + SymIntPtr symint = get_symint(idx); + return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); + } + VK_THROW("Cannot create a int param buffer for the given value"); +} + +void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { + get_symint(idx)->set(val); +} + SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..ac5e0d6c9d1 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -63,6 +63,7 @@ DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) +DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS @@ -154,6 +155,7 @@ class ComputeGraph final { GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, 
double_list, DoubleList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); #undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS @@ -422,15 +424,28 @@ class ComputeGraph final { ValueRef add_string(std::string&& str); + ValueRef add_symint(const int32_t val); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - const vkapi::BufferBindInfo create_params_buffer(const Block& data) { + vkapi::BufferBindInfo create_params_buffer(const Block& data) { param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); return vkapi::BufferBindInfo(param_ubos_.back().buffer()); } + /* + * Given a ValueRef, do the following depending on the type of the Value: + * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object + * backing the SymInt. + * - If it is a regular Int, create a new ParamsBuffer using the integer value + * and return the BufferBindInfo of the created ParamsBuffer. + */ + vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + + void set_symint(const ValueRef idx, const int32_t val); + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -577,6 +592,7 @@ class ComputeGraph final { friend class DoubleListPtr; friend class BoolListPtr; friend class ValueListPtr; + friend class SymIntPtr; }; template diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp new file mode 100644 index 00000000000..c91db84b787 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +SymInt::SymInt(api::Context* context_p, const int32_t val) + : gpu_buffer(context_p, val){}; + +void SymInt::set(const int32_t val) { + gpu_buffer.update(val); +} + +void SymInt::operator=(const int32_t val) { + gpu_buffer.update(val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h new file mode 100644 index 00000000000..0c9fbee5fe2 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/* + * Represents a symbolic integer whose value can be variable. It is implemented + * as a thin wrapper around a `ParamsBuffer` object that holds the value of the + * integer. The `ParamsBuffer` object allows the value of the symbolic integer + * to be changed from the CPU and have those changes be visible to all shaders + * that use the symbolic integer; it also allows the value of the symbolic + * integer to be the result of a compute shader. + * + * Regular scalar types represented by `TypeTag::INT` cannot be used for + * symbolic integers because their value is assumed to be constant; therefore + * the `Value` instance holding the value of the scalar does not contain + * any reference to the GPU buffers used to pass its value into compute shaders. + * Therefore, updating the value of the scalar does not impact the value seen + * by compute shaders. 
+ */ +struct SymInt final { + api::ParamsBuffer gpu_buffer; + + explicit SymInt(api::Context* context_p, const int32_t val); + + void set(const int32_t val); + + void operator=(const int32_t val); +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index c5ffc65add1..e7a8951a552 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -29,6 +29,7 @@ std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { PRINT_CASE(BOOLLIST) PRINT_CASE(VALUELIST) PRINT_CASE(STRING) + PRINT_CASE(SYMINT) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 79edbd50d3a..5840d1695ee 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -36,6 +36,7 @@ enum class TypeTag : uint32_t { // Special Type VALUELIST, STRING, + SYMINT, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 6e03bbd4a21..50a2b5e548c 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -13,6 +13,7 @@ #include #include +#include #include namespace vkcompute { @@ -67,6 +68,8 @@ struct Value final { std::string as_string; + SymInt as_symint; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -123,6 +126,7 @@ struct Value final { TypeTag::VALUELIST, std::vector, as_value_list, vector); CASE_MOVE_MOVEABLE_TYPE( TypeTag::STRING, std::string, as_string, basic_string); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt); case TypeTag::NONE: clearToNone(); @@ -172,6 +176,9 @@ struct Value final { case TypeTag::STRING: payload.as_string.~basic_string(); break; + case TypeTag::SYMINT: + 
payload.as_symint.~SymInt(); + break; // Manually list out the types so that if a type here is added later and // not handled the compiler can catch it. case TypeTag::NONE: @@ -288,6 +295,8 @@ struct Value final { TypeTag::STRING, as_string); + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); + #undef SUPPORT_TRIVIALLY_COPYABLE_TYPE #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl new file mode 100644 index 00000000000..aa2b22c81f9 --- /dev/null +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} +${layout_declare_ubo(1, "uvec3", "extents")} +${layout_declare_ubo(2, "int", "scalar")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, extents))) { + return; + } + + vec4 in_tex = imageLoad(t_in, pos); + imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); +} diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 2f9c3d22f57..a0bfefafa02 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1268,6 +1268,64 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + std::vector sizes = {8, 64, 124}; + + // Build graph + + 
ValueRef scalar = graph.add_symint(1); + IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); + + IOValueRef out = {}; + out.value = a.value; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR("scalar_add_texture"), + graph.create_global_wg_size(a.value), + graph.create_local_wg_size(a.value), + // Inputs and Outputs + {{out.value, vkapi::MemoryAccessType::WRITE}}, + // Shader params buffers + {graph.texture_limits_ubo(a.value), + graph.get_or_create_int_param_buffer(scalar)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + int scalar_val = i - 3.0f; + graph.set_symint(scalar, scalar_val); + + float val_a = i + 2.0f; + float val_out = val_a + scalar_val; + + fill_vtensor(graph, a, val_a); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ std::vector data_##name(utils::multiply_integers(sizes)); \ std::fill(data_##name.begin(), data_##name.end(), val); \ From 35df318391cf8b0af33f05b41ff3a7957c355b3e Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Tue, 3 Sep 2024 13:11:45 -0700 Subject: [PATCH 3/4] [ET-VK] Add `TmpTensorVRef` struct to recycle temporary tensor memory ## Context Normally, tensor memory is planned during the export stage; tensors that do not overlap in lifetimes may share a memory allocation. However, memory planning requires knowledge of the lifetime of the tensors. However, some complex operators may not be able to perform all the necessary computations in one shader, or the implementation of the operator may require that some temporary tensors be created during the execution of the op. 
Since these temporary tensors are not visible to the memory planning algorithm, they will not be memory planned. This diff introduces the `TmpTensorVRef` object which facilitates memory sharing between temporary tensors. The design principle is that the lifetime of temporary tensors is restricted to the execution of the op within which they are created; thus, that knowledge can be used to implement memory planning. Please see the comments documentation of `TmpTensorVRef` for more details. Differential Revision: [D62144398](https://our.internmc.facebook.com/intern/diff/D62144398/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ComputeGraph.cpp | 66 +++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 80 +++++++++++++++ .../vulkan/runtime/graph/containers/Value.h | 5 + .../vulkan/test/vulkan_compute_api_test.cpp | 99 +++++++++++++++++++ 4 files changed, 250 insertions(+) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index a8f57f57d2a..92729ffb8ab 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -15,6 +15,8 @@ #include +#include + namespace vkcompute { // @@ -47,6 +49,70 @@ VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL +// +// TmpTensorVRef +// + +TmpTensorVRef::TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), sobj_idx(-1), vref(kDummyValueRef) { + set_sobj_idx(); + vref = + graph_p->add_tensor(sizes, dtype, storage_type, memory_layout, sobj_idx); +} + +TmpTensorVRef::TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type) + : graph_p(graph_ptr), sobj_idx(-1), vref(kDummyValueRef) { + set_sobj_idx(); + vref = graph_p->add_tensor(sizes, dtype, 
storage_type, sobj_idx); +} + +TmpTensorVRef::TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), sobj_idx(-1), vref(kDummyValueRef) { + set_sobj_idx(); + vref = graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx); +} + +TmpTensorVRef::TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype) + : graph_p(graph_ptr), sobj_idx(-1), vref(kDummyValueRef) { + set_sobj_idx(); + vref = graph_p->add_tensor(sizes, dtype, sobj_idx); +} + +TmpTensorVRef::~TmpTensorVRef() { + // Lifetime of this temporary tensor is expired; return the shared object to + // the pool, as long as the sobj index is valid + if (sobj_idx >= 0) { + graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); + } +} + +void TmpTensorVRef::set_sobj_idx() { + // If no available temporary shared objects, request a new one to be created + if (graph_p->tmp_shared_object_idxs_.empty()) { + sobj_idx = graph_p->shared_objects_.size(); + } else { + // Get the first available shared object idx + sobj_idx = graph_p->tmp_shared_object_idxs_.top(); + graph_p->tmp_shared_object_idxs_.pop(); + } +} + // // ComputeGraph // diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index ac5e0d6c9d1..218e66f8e42 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include @@ -67,6 +68,78 @@ DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS +// +// TmpTensorVRef +// + +/* + * This struct is used to recycle the memory of temporary tensors that are + * created during the execution of a node. 
Upon construction, this struct will + * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance + * if any shared objects are available; if not, then a new one is created. A + * tensor value is then added to the `ComputeGraph` instance with the requested + * specifications. Upon destruction, the shared object index of the temporary + * tensor is returned to `tmp_shared_object_idxs_`. + * + * Note that instances of this struct can be used as if they were `ValueRef` due + * to implementation of a custom casting operator. + * + * This class should only be used to create tensors whose lifetimes exist only + * in a well defined scope (i.e. within a function). + */ +struct TmpTensorVRef { + ComputeGraph* graph_p; + int64_t sobj_idx; + ValueRef vref; + + // + // Match all available overloads of `add_tensor` and `add_tensor_like` + // + + TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + + TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type); + + TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout); + + TmpTensorVRef( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype); + + // No copy construction or assignment + TmpTensorVRef(TmpTensorVRef& other) = delete; + TmpTensorVRef& operator=(TmpTensorVRef& other) = delete; + + // No move construction or assignment + TmpTensorVRef(TmpTensorVRef&& other) = delete; + TmpTensorVRef& operator=(TmpTensorVRef&& other) = delete; + + // Custom cast to ValueRef + operator ValueRef() const { + return vref; + }; + + ~TmpTensorVRef(); + + private: + // Helper function to get new shared obj index + void set_sobj_idx(); +}; + // // 
ComputeGraph // @@ -94,7 +167,12 @@ class ComputeGraph final { vkapi::DescriptorPoolConfig execute_descriptor_counts_; std::unique_ptr context_; + std::vector shared_objects_; + // This stack is used by `TmpTensorVRef` instances to recycle shared objects + // for temporary tensors. See the comments of `TmpTensorVRef` for more details + std::stack tmp_shared_object_idxs_; + std::vector values_; std::vector param_ubos_; @@ -593,6 +671,8 @@ class ComputeGraph final { friend class BoolListPtr; friend class ValueListPtr; friend class SymIntPtr; + + friend class TmpTensorVRef; }; template diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 50a2b5e548c..8773f0c0b04 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -29,6 +29,11 @@ inline bool is_valid(ValueRef value_ref) { struct IOValueRef { ValueRef value; ValueRef staging; + + // Custom cast to ValueRef + operator ValueRef() const { + return value; + }; }; /* diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index a0bfefafa02..a87e7c3d30f 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1518,6 +1518,105 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { + GraphConfig config; + ComputeGraph graph(config); + + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; + + // Build graph + + IOValueRef a = graph.add_input_tensor( + size_big, vkapi::kFloat, /*shared_object_idx = */ 0); + IOValueRef b = graph.add_input_tensor( + size_small, vkapi::kFloat, /*shared_object_idx = */ 1); + + IOValueRef out = {}; + + out.value = + graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); + + // Perform the following compute + // + // a, b, 
out; + // { + // inter; + // { + // tmp = a + b + // tmp2 = tmp + a + // inter = tmp2 + b + // } + // { + // tmp = inter + b; + // tmp2 = tmp + a + // out = tmp2 + b; + // } + // } + { + TmpTensorVRef inter(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(inter.sobj_idx == 3); + { + TmpTensorVRef tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {a, b, kDummyValueRef, tmp}); + + TmpTensorVRef tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, inter}); + } + { + TmpTensorVRef tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {inter, b, kDummyValueRef, tmp}); + + TmpTensorVRef tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, out}); + } + } + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float val_a = i + 2.0f; + float val_b = i + 1.5f; + float val_tmp = val_a + val_b; + float val_tmp2 = val_tmp + val_a; + float val_inter = val_tmp2 + val_b; + float val_tmp_2 = val_inter + val_b; + float val_tmp2_2 = val_tmp_2 + val_a; + float val_out = val_tmp2_2 + val_b; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + TEST(VulkanComputeGraphTest, test_large_graph) { auto build_start_time = std::chrono::system_clock::now(); GraphConfig config; From 
1de43d2a23f1d61a0916e7bde1ef0a5b90ece8be Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Wed, 4 Sep 2024 15:04:02 -0700 Subject: [PATCH 4/4] [ET-VK][BE][ez] Enable automatic layout slot index incrementing ## Context Currently, in shaders we have to declare the binding slot that layout bindings will bind to explicitly, i.e. ``` ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} ${layout_declare_ubo(2, "ivec4", "sizes")} ``` However, this can get a little tedious when making many layout declarations. This diff improves the situation by adding the `B` variable which will automatically increment the binding slot whenever a layout binding is declared. Now we can write ``` ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} ${layout_declare_ubo(B, "ivec4", "sizes")} ``` I may make a follow up diff to change all layout declarations to use `B` across all shaders in the codebase later on. Differential Revision: [D62210119](https://our.internmc.facebook.com/intern/diff/D62210119/) [ghstack-poisoned] --- backends/vulkan/runtime/gen_vulkan_spv.py | 45 ++++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index f4ba98b31fd..6ee29d45f18 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -38,6 +38,10 @@ # Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", + # B is shorthand for "binding". This is used to automatically increment the + # layout binding index when declaring layout bindings. Note that a container + # type is used because integers are immutable in Python. 
+ "B": [0], } # Establishes relationships between different tensor types and different GLSL types @@ -179,8 +183,14 @@ def get_access_qualifier(access_type: Optional[str]) -> str: raise AssertionError(f"Invalid access type: {access_type}") +def get_slot_val(slot: Union[int, List[int]]) -> int: + if isinstance(slot, list): + return slot[0] + return slot + + def layout_declare_buffer( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -192,15 +202,18 @@ def layout_declare_buffer( array_type = buffer_scalar_type(dtype) out_str = f""" -layout(set = 0, binding = {slot}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ +layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ {array_type} {var_name}[]; }}; """ + + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str def layout_declare_image( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -209,11 +222,16 @@ def layout_declare_image( ) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_sampler( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -222,11 +240,16 @@ def layout_declare_sampler( image_ndim: int = 3, ) -> str: sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}) uniform {precision} {sampler_type} {var_name};" + + ret_str = f"layout(set = 0, 
binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_tensor( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -262,7 +285,9 @@ def layout_declare_tensor( ) -def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: +def layout_declare_ubo( + slot: Union[int, List[int]], *args, precision: str = "PRECISION" +) -> str: assert len(args) % 2 == 0 var_list = list(zip(args[::2], args[1::2])) @@ -272,12 +297,14 @@ def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: ubo_name += var_name + "_" out_str = f""" -layout(set = 0, binding = {slot}) uniform {precision} restrict readonly {ubo_name}UBO {{ +layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ """ for type_name, var_name in var_list: out_str += f"{type_name} {var_name};\n" out_str += "};" + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str