From 0a6cc2a35fb37e745d9875d52ba88841913f50bd Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:52 -0700 Subject: [PATCH 1/8] [ET-VK][ez] Introduce `check_close` function in `compute_api_test` to account for small numerical differences ## Context Introduce a `check_close` function in `vulkan_compute_api_test` to account for small numerical differences when executing on the GPU. Otherwise some tests will be reported as failures on some platforms (i.e. Mac). Differential Revision: [D61666459](https://our.internmc.facebook.com/intern/diff/D61666459/) [ghstack-poisoned] --- backends/vulkan/test/utils/test_utils.cpp | 6 ++++++ backends/vulkan/test/utils/test_utils.h | 6 ++++++ backends/vulkan/test/vulkan_compute_api_test.cpp | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index ad496873695..6c056cc9d90 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -482,3 +482,9 @@ void execute_graph_and_check_output( } } } + +bool check_close(float a, float b, float atol, float rtol) { + float max = std::max(std::abs(a), std::abs(b)); + float diff = std::abs(a - b); + return diff <= (atol + rtol * max); +} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index f9969eddbf4..bf549446170 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -242,3 +242,9 @@ void print_vector( } std::cout << std::endl; } + +// +// Misc. Utilities +// + +bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 307593d8fdb..ee2d119b6be 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -601,7 +601,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { EXPECT_TRUE(data_out.size() == ref_out.size()); for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(data_out[i] == ref_out[i]); + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); } } From f13eedeba46994faf74bce06d52143389a57a976 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:55 -0700 Subject: [PATCH 2/8] [ET-VK][ez] Empty initialize ShaderInfo and add `bool()` operator ## Context This diff lays the foundation for the implementation of "view operators". These operators will not dispatch any shaders; they will be solely responsible for updating the sizes and strides metadata of the output tensor, which will use the same storage resource as the input tensor. These ops will be implemented by adding a "no-op" `ExecuteNode` instance to the `ComputeGraph`, which does not contain a shader but does contain a resize function to update sizes and strides upon a resize. This diff allows `ShaderInfo` to empty initialize, and add a bool operator to check if the ShaderInfo actually points to valid shader code. This will be used to construct "no-op" `ExecuteNode` instances which can be used to implement view operators. 
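To illustrate the pattern in isolation (a standalone sketch with invented names, not the actual `vkapi::ShaderInfo`), empty initialization combined with a `bool()` operator lets dispatch code guard on whether any shader code is attached:

```cpp
#include <cstdint>
#include <iostream>

// Standalone analogue of the ShaderInfo change: validity is signalled by
// whether the SPIR-V pointer was ever populated.
struct ShaderInfoSketch {
  struct {
    const uint32_t* bin = nullptr; // empty-initialized: no shader code
    uint32_t size = 0u;
  } src_code;

  operator bool() const {
    return src_code.bin != nullptr;
  }
};

int main() {
  ShaderInfoSketch shader; // default constructed -> invalid

  // Dispatch logic can skip encoding when no shader is attached, which is
  // exactly what a "no-op" ExecuteNode relies on.
  if (!shader) {
    std::cout << "no shader attached; skipping dispatch\n";
  }

  static const uint32_t spirv_words[] = {0x07230203u}; // SPIR-V magic word
  shader.src_code.bin = spirv_words;
  shader.src_code.size = 1u;
  std::cout << std::boolalpha << static_cast<bool>(shader) << std::endl;
  return 0;
}
```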
Differential Revision: [D61666460](https://our.internmc.facebook.com/intern/diff/D61666460/) [ghstack-poisoned] --- backends/vulkan/runtime/vk_api/Shader.h | 8 ++++++-- backends/vulkan/test/vulkan_compute_api_test.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index 34c2d95c932..1e3b2a799f2 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -53,8 +53,8 @@ class ShaderLayout final { struct ShaderInfo final { struct { - const uint32_t* bin; - uint32_t size; + const uint32_t* bin = nullptr; + uint32_t size = 0u; } src_code; std::string kernel_name{""}; @@ -71,6 +71,10 @@ struct ShaderInfo final { const uint32_t, std::vector, const utils::uvec3 tile_size); + + operator bool() const { + return src_code.bin != nullptr; + }; }; bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index ee2d119b6be..cbd409112ff 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -168,6 +168,13 @@ std::vector get_reference_strides( return {}; } +TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { + vkapi::ShaderInfo empty_shader_info; + EXPECT_FALSE(empty_shader_info); + EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); + EXPECT_TRUE(empty_shader_info.src_code.size == 0u); +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { From 1e930def96a08f27a577ef4910d5085ab0db30ea Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:58 -0700 Subject: [PATCH 3/8] [ET-VK][ez] Enable no-op ExecuteNodes for view ops ## Context This diff lays the foundation for the implementation of "view operators". These operators will not dispatch any shaders; they will be solely responsible for updating the sizes and strides metadata of the output tensor, which will use the same storage resource as the input tensor. These ops will be implemented by adding a "no-op" ExecuteNode instance to the ComputeGraph, which does not contain a shader but does contain a resize function to update sizes and strides upon a resize. This diff allows `ExecuteNode` to empty initialize. If the `ShaderInfo` of the `ExecuteNode` instance is not valid, then the `execute()` function of the `ExecuteNode` instance will be a no-op. 
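As a concrete sketch of how this will be used (the function names here are illustrative; the transpose view op later in this stack follows the same shape), a view op registers a shader-less node that carries only a resize function:

```cpp
// Sketch: register a shader-less ExecuteNode for a view op. encode() becomes
// a no-op, and only the resize function runs when the graph is resized.
void resize_my_view_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  (void)args;
  vTensorPtr out = graph->get_tensor(extra_args[0]);
  vTensorPtr in = graph->get_tensor(extra_args[1]);
  // Keep the view's size metadata in sync with its (possibly resized) base.
  out->virtual_resize(in->sizes());
}

void add_my_view_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
  graph.execute_nodes().emplace_back(
      new ExecuteNode(resize_my_view_node, {out, in}));
}
```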
Differential Revision: [D61666465](https://our.internmc.facebook.com/intern/diff/D61666465/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 15 +++++++++++++++ backends/vulkan/runtime/graph/ops/ExecuteNode.h | 16 +++++++++++++++- backends/vulkan/test/vulkan_compute_api_test.cpp | 13 +++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 3b2a826f87f..2cb00ba65af 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -35,7 +35,22 @@ ExecuteNode::ExecuteNode( graph.update_descriptor_counts(shader, /*execute = */ true); } +ExecuteNode::ExecuteNode( + const ResizeFunction& resize_fn, + const std::vector& resize_args) + : shader_(), + global_workgroup_size_({0u, 0u, 0u}), + local_workgroup_size_({0u, 0u, 0u}), + args_(), + params_(), + spec_vars_(), + resize_fn_(resize_fn), + resize_args_(resize_args) {} + void ExecuteNode::encode(ComputeGraph* graph) { + if (!shader_) { + return; + } api::Context* const context = graph->context(); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1fff14e020e..dece9ddb50d 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -48,7 +48,7 @@ class ExecuteNode final { const std::vector&, const std::vector&)>; - ExecuteNode( + explicit ExecuteNode( ComputeGraph& graph, const vkapi::ShaderInfo& shader, const utils::uvec3& global_workgroup_size, @@ -59,6 +59,15 @@ class ExecuteNode final { const ResizeFunction& resize_fn = nullptr, const std::vector& resize_args = {}); + /* + * This overload of the ExecuteNode constructor is used to register ops which + * update a tensor view. No shader is dispatched, but the node still needs to + * update the view's sizes and strides after a resize. + */ + explicit ExecuteNode( + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); + ~ExecuteNode() = default; void encode(ComputeGraph* graph); @@ -83,6 +92,11 @@ class ExecuteNode final { const vkapi::SpecVarList spec_vars_; const ResizeFunction resize_fn_; const std::vector resize_args_; + + public: + operator bool() const { + return shader_; + } }; } // namespace vkcompute diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index cbd409112ff..af92728cb0c 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -982,6 +982,19 @@ TEST(VulkanComputeGraphTest, test_values_string) { EXPECT_TRUE(stored == "hello, world"); } +TEST(VulkanComputeGraphTest, empty_init_executenode_test) { + ExecuteNode node(nullptr, {}); + EXPECT_FALSE(node); + + GraphConfig config; + ComputeGraph graph(config); + + // Encode an empty ExecuteNode and check that command buffer encoding does not + // crash. 
+ graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); + EXPECT_NO_FATAL_FAILURE(graph.encode_execute()); +} + TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { GraphConfig config; ComputeGraph graph(config); From 846cd776190fecfad13b351b3936112f6b1318bb Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:01 -0700 Subject: [PATCH 4/8] [ET-VK] Use dim order as the source of truth for tensor strides ## Context This diff introduces dim order as a core metadata member of `vTensor`. The dim order is a better source of truth for the data layout of a buffer backed tensor because it is less ambiguous than the strides; when multiple dims have a stride of 1, it is ambiguous which dim comes "before" the other. This amibiguity makes it impossible to update the strides correctly when the tensor is resized. ## Changes * Introduce `dim_order_` as a core metadata member of `vTensor` class * Tensor view construction now accepts `new_dim_order` as an argument instead of `new_strides` * Introduce `virtual_reconfigure` member function of `vTensor` which is an extension of `virtual_resize` that allows updating of sizes as well as strides (via updating the dim order) Differential Revision: [D61666464](https://our.internmc.facebook.com/intern/diff/D61666464/) [ghstack-poisoned] --- .../vulkan/runtime/api/containers/Tensor.cpp | 231 +++++++++++++----- .../vulkan/runtime/api/containers/Tensor.h | 107 ++++++-- .../vulkan/runtime/graph/ComputeGraph.cpp | 7 + backends/vulkan/runtime/graph/ComputeGraph.h | 9 +- .../vulkan/test/vulkan_compute_api_test.cpp | 107 +++++++- 5 files changed, 372 insertions(+), 89 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 78aa4796aa5..1a250650511 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,36 +13,15 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, find the index of the "fastest - * moving" dimension in WHCN dimension order. If multiple dims have the lowest - * stride, then the "earlier" dim is assumed to be the fastest moving (width is - * "earlier" than height). - */ -int32_t find_fastest_whcn_dim(const std::vector& strides) { - if (strides.size() == 0) { - return 0; - } - int32_t fastest_dim = 0; - int64_t min_stride = strides.at(0); - for (int d = strides.size() - 1; d >= 0; --d) { - if (strides.at(d) < min_stride) { - fastest_dim = d; - min_stride = strides.at(d); - } - } - return (strides.size() - 1 - fastest_dim); -} - /* * Given the strides of a buffer-backed tensor, estimate the equivalent memory * layout enum value by identifying the fastest moving dimension. */ utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& strides) { - int32_t fastest_dim = find_fastest_whcn_dim(strides); - if (fastest_dim <= 3) { - return utils::GPUMemoryLayout(fastest_dim); + const std::vector& dim_order) { + int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); + if (fastest_dim_whcn >= 0 && fastest_dim_whcn <= 3) { + return utils::GPUMemoryLayout(fastest_dim_whcn); } // TODO(ssjia) find a way to gracefully recover from this case by i.e. 
adding @@ -51,41 +30,108 @@ utils::GPUMemoryLayout estimate_memory_layout( VK_THROW("No compatible GPUMemoryLayout value"); } +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + int64_t last_dim = + ndim - utils::to_packed_dim_nchw_offset(memory_layout); + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim += 1; + } + dim_order[d] = cur_dim; + cur_dim += 1; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + +namespace { + +struct StrideDimIndexPair { + int64_t stride; + int64_t dim_i; + + StrideDimIndexPair() : stride(0), dim_i(0) {} + + explicit StrideDimIndexPair(int64_t stride, int64_t dim_i) + : stride(stride), dim_i(dim_i) {} + + bool operator>(const StrideDimIndexPair& other) const { + // Descending order + return stride < other.stride; + } + + bool operator<(const StrideDimIndexPair& other) const { + // Descending order + return stride > other.stride; + } +}; + +} // namespace + +std::vector strides_to_dim_order(const std::vector& strides) { + std::vector stride_dim_pairs(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) { + stride_dim_pairs[i] = StrideDimIndexPair(strides[i], i); + } + std::stable_sort(stride_dim_pairs.begin(), stride_dim_pairs.end()); + + std::vector dim_order(strides.size()); + for (int i = 0; i < strides.size(); ++i) { + dim_order.at(i) = stride_dim_pairs.at(i).dim_i; + } + return dim_order; +} + std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& dim_order) { // For zero dim tensors if (sizes.size() == 0) { return {1}; } - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); - int64_t last_dim = sizes.size() - dim_offset; - if (last_dim < 0) { - last_dim = sizes.size() - 1; - } - size_t ndim = sizes.size(); std::vector strides(ndim); - const int64_t last_dim_size = sizes.at(last_dim); - - for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) { - strides.at(stride_d) = 1; - if (stride_d == last_dim) { - continue; - } - strides.at(stride_d) = last_dim_size; - for (int size_d = ndim - 1; size_d > stride_d; size_d--) { - if (size_d != last_dim) { - strides.at(stride_d) *= sizes.at(size_d); - } + strides[dim_order[ndim - 1]] = 1; + for (int32_t i = ndim - 2; i >= 0; --i) { + if (sizes[dim_order[i + 1]] == 0) { + strides[dim_order[i]] = strides[dim_order[i + 1]]; + } else { + strides[dim_order[i]] = + strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; } } + return strides; } +bool dim_order_is_valid(const std::vector& dim_order) { + int64_t sum = 0; + for (size_t i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { + return false; + } + sum += dim_order[i]; + } + int64_t n = static_cast(dim_order.size() - 1); + // Sanity check that the sum of the indices in the vector is equal to the sum + // of 0 + 1 + 2 + ... 
+ (ndim - 1) + return sum == n * (n + 1) / 2; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -170,7 +216,8 @@ vTensor::vTensor( memory_layout_(memory_layout), // Calculate tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(calculate_strides(sizes, memory_layout_)), + dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -189,6 +236,9 @@ vTensor::vTensor( padded_sizes_, dtype_, allocate_memory) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "computed dim order is invalid"); + if (storage_type != utils::kBuffer) { texture_limits_.limits = utils::ivec3{ utils::safe_downcast(storage_.image_extents_[0]), @@ -204,16 +254,39 @@ vTensor::vTensor( } } +vTensor::vTensor(const vTensor& other) + : dtype_(other.dtype_), + memory_layout_(other.memory_layout_), + // Copy tensor size metadata + sizes_(other.sizes_.begin(), other.sizes_.end()), + dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + strides_(other.strides_.begin(), other.strides_.end()), + numel_(other.numel_), + padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, + unsqueezed_strides_{ + other.unsqueezed_strides_.begin(), + other.unsqueezed_strides_.end()}, + padded_numel_(other.padded_numel_), + texture_limits_{other.texture_limits_}, + // Empty initialize Utility Uniform Buffers + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + texture_limits_uniform_(), + // Copy Tensor storage + storage_(other.storage_) {} + vTensor::vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel) + const std::vector& dim_order, + const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(strides)), + memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(strides.begin(), strides.end()), + dim_order_(dim_order.begin(), dim_order.end()), + strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -226,6 +299,8 @@ vTensor::vTensor( texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "new dim order provided is invalid"); VK_CHECK_COND( offset_numel + numel_ <= other.numel(), "Tensor alias cannot access more elements than available in the original" @@ -339,9 +414,17 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_size_metadata(const std::vector& new_sizes) { +void vTensor::update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order) { sizes_ = new_sizes; - strides_ = calculate_strides(new_sizes, memory_layout_); + dim_order_ = new_dim_order; + strides_ = calculate_strides(sizes_, dim_order_); + // Only update the memory layout for buffer-backed tensors. Strides are + // meaningless for texture-backed tensors and do not impact the memory layout. 
+ if (storage_type() == utils::kBuffer) { + memory_layout_ = estimate_memory_layout(dim_order_); + } numel_ = utils::multiply_integers(sizes_); padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); @@ -373,15 +456,12 @@ void vTensor::update_size_metadata(const std::vector& new_sizes) { } } -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); +void vTensor::update_size_metadata(const std::vector& new_sizes) { + // Dim order does not change on resize + update_metadata(new_sizes, dim_order_); } -void vTensor::virtual_resize(const std::vector& new_sizes) { +void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. @@ -394,10 +474,37 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { VK_CHECK_COND( valid_resize, - "Cannot use virtual resize if new sizes requires a larger texture."); + "tensor sizes requires a larger texture than the current one."); + } else { + int64_t numel = utils::multiply_integers(sizes); + bool valid_resize = + numel + storage_.buffer_offset_ <= storage_.buffer_length_; + VK_CHECK_COND( + valid_resize, + "tensor sizes requires a larger buffer than the current one."); } +} + +void vTensor::virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order) { + VK_CHECK_COND( + dim_order_is_valid(new_dim_order), "new dim order provided is invalid"); + check_sizes(new_sizes); + update_metadata(new_sizes, new_dim_order); +} + +void vTensor::virtual_resize(const std::vector& new_sizes) { + check_sizes(new_sizes); + update_size_metadata(new_sizes); +} +void vTensor::reallocate(const std::vector& new_sizes) { update_size_metadata(new_sizes); + storage_.discard_and_reallocate( + calculate_padded_sizes(new_sizes, memory_layout_), + memory_layout_, + dtype_); } // @@ -480,6 +587,7 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, image_(allocate_image( context_, image_extents_, @@ -496,11 +604,12 @@ vTensorStorage::vTensorStorage( vTensorStorage::vTensorStorage( const vTensorStorage& other, - const size_t buffer_offset) + const int64_t buffer_offset) : context_(other.context_), storage_type_{other.storage_type_}, image_extents_(other.image_extents_), buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, image_(), buffer_(other.buffer_, buffer_offset), last_access_{other.last_access_} { diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 5a4598291c0..48dcdaf2f7a 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -20,14 +20,30 @@ namespace vkcompute { namespace api { /* - * Given the sizes of a tensor and the GPU memory layout, calculate the strides - * of the tensor in NCHW dimension order. The GPU memory layout will be used to - * determine which dimension is packed along a texel; that dimension will be - * used as the "fasted moving" dimension with a stride of 1. + * Given the strides of a tensor in NCHW dimension order, calculate the dim + * order of the tensor by computing an index sort of the strides. 
Note that + * there is some ambiguity when multiple dimensions have the same stride; + * stable_sort is used to preserve the ordering of "outer" dimensions with + * respect to "inner" dimensions. + */ +std::vector strides_to_dim_order(const std::vector& strides); + +/* + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout); + +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) + * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const std::vector& dim_order); std::vector unsqueeze_strides( const std::vector& strides, @@ -96,7 +112,7 @@ class vTensorStorage final { * because this behaviour is unsafe, since the original tensor may be * destroyed before the copy is destroyed. */ - vTensorStorage(const vTensorStorage& other, const size_t buffer_offset = 0); + vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0); public: // To discourage creating copies, the assignment operator is still deleted. @@ -118,6 +134,7 @@ class vTensorStorage final { // Resource sizings utils::uvec3 image_extents_{}; int64_t buffer_length_{}; + int64_t buffer_offset_{}; // GPU Storage mutable vkapi::VulkanImage image_; @@ -167,8 +184,16 @@ class vTensor final { const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, const bool allocate_memory = true); - vTensor(const vTensor& other) = delete; - vTensor& operator=(const vTensor& other) = delete; + /* + * This constructor allows for the creation of a vTensor that references the + * same buffer resource of another vTensor, with the same sizes and strides + * metadata. The created vTensor will not own the underlying resource. This is + * only applicable for buffer backed tensors at the moment. + * + * Once created, the sizes and strides of the aliased vTensor can be changed + * using the `virtual_reconfigure` member function. + */ + vTensor(const vTensor& other); /* * This constructor allows for the creation of a vTensor that references the @@ -176,6 +201,10 @@ class vTensor final { * strides metatdata. The created vTensor will not own the underlying * resource. This is only applicable for buffer backed tensors at the moment. * + * Note that dim order is used as the source of truth regarding the strides, + * and the new strides are computed from the new sizes and new dim order. + * Thus only the dim order is provided as an argument to this function. + * * The offset_numel argument allows the aliased tensor's memory region to * begin at an offset of N elements from the start of the original tensor's * buffer. 
@@ -183,8 +212,11 @@ class vTensor final { vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel = 0); + const std::vector& dim_order, + const int64_t offset_numel = 0); + + // To discourage making copies, the copy assignment operator is still deleted + vTensor& operator=(const vTensor& other) = delete; vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; @@ -195,6 +227,8 @@ class vTensor final { // sizes of the tensor in NCHW dimension order std::vector sizes_; + // dim order of the tensor in NCHW dimension order + std::vector dim_order_; // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical @@ -305,6 +339,10 @@ class vTensor final { return sizes_.size(); } + inline const std::vector& dim_order() const { + return dim_order_; + } + inline const std::vector& strides() const { return strides_; } @@ -386,24 +424,61 @@ class vTensor final { private: /* - * Update the size metadata of the vTensor to be new sizes. Should not be used - * directly, reallocate() or virtual_resize() should be used instead. + * Update the sizes, dim order, and strides metadata of the vTensor. + * + * The dim order is used as the "source of truth" for the strides and the + * strides are calculated from the dim order, therefore only the dim order is + * accepted as an argument to this function. Within the function, the new + * strides are computed from the new sizes and new dim order. + * + * Should not be used directly, reallocate() or virtual_resize() should be + * used instead. + */ + void update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order); + + /* + * Convenience overload of update_metadata. Given the new sizes, the new + * strides will be re-calculated based on the current memory layout of the + * tensor. Update_metadata will be called with the new sizes and strides. */ void update_size_metadata(const std::vector& new_sizes); + /* + * Check that tensor sizes are valid given the current storage resource's + * limits. + */ + void check_sizes(const std::vector& sizes) const; + public: /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Virtually resize and "re-stride" the tensor by modifying the size and + * stride metadata that gets used in compute shaders. This allows the shader + * to interpret the underlying resource with the updated metadata. + * + * Note that the dim order is used as the source of truth for the strides; the + * strides are computed using the new sizes and new dim order, thus only the + * dim order is accepted as an argument to this function. */ - void reallocate(const std::vector& new_sizes); + void virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order); /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. + * underlying resource as if it were a different size. This function is a + * convenience overload of virtual_reconfigure; new strides will be computed + * based on the new sizes that preserves the memory layout of the tensor. 
*/ void virtual_resize(const std::vector& new_sizes); + + /* + * Discard the underlying VkImage or VkBuffer and re-allocate based on new + * tensor sizes + */ + void reallocate(const std::vector& new_sizes); }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 50d927a913f..48e1ebf0a83 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -203,6 +203,13 @@ ValueRef ComputeGraph::add_tensor( sizes, dtype, suggested_memory_layout(sizes), shared_object_idx); } +ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { + const vTensorPtr t = get_tensor(vref); + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(api::vTensor(*t)); + return idx; +} + ValueRef ComputeGraph::add_tensor_view( const ValueRef vref, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index b432be83881..faa2f4107ec 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -356,10 +356,17 @@ class ComputeGraph final { * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for * more details. */ + ValueRef add_tensor_view(const ValueRef vref); + + /* + * Use the copy constructor of `api::vTensor` to create a "view" of the + * `vTensor` value at `vref` with different sizes and dim order. See the copy + * constructor of `api::vTensor` for more details. + */ ValueRef add_tensor_view( const ValueRef vref, const std::vector& sizes, - const std::vector& strides, + const std::vector& dim_order, const size_t offset_numel = 0); /* diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index af92728cb0c..a12a7974cbf 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -175,7 +175,67 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { EXPECT_TRUE(empty_shader_info.src_code.size == 0u); } +TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { + // ndim, GPUMemoryLayout, expected dim order pairs + std::vector>> + test_cases = { + {1, utils::kWidthPacked, {0}}, + {1, utils::kHeightPacked, {0}}, + {1, utils::kChannelsPacked, {0}}, + {2, utils::kWidthPacked, {0, 1}}, + {2, utils::kHeightPacked, {1, 0}}, + {2, utils::kChannelsPacked, {0, 1}}, + {3, utils::kWidthPacked, {0, 1, 2}}, + {3, utils::kHeightPacked, {0, 2, 1}}, + {3, utils::kChannelsPacked, {1, 2, 0}}, + {4, utils::kWidthPacked, {0, 1, 2, 3}}, + {4, utils::kHeightPacked, {0, 1, 3, 2}}, + {4, utils::kChannelsPacked, {0, 2, 3, 1}}, + }; + + for (const auto& test_case : test_cases) { + const size_t& ndim = std::get<0>(test_case); + const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const auto& expected_dim_order = std::get<2>(test_case); + std::vector dim_order = calculate_dim_order(ndim, layout); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + +TEST_F(VulkanComputeAPITest, calculate_tensor_dim_order_test) { + // Stride, expected dim order pairs. Note that strides don't have to "make + // sense" because only they are sorted; the actual stride values don't matter. 
+ std::vector, std::vector>> + test_cases = { + {{8, 1}, {0, 1}}, + {{2, 10}, {1, 0}}, + {{66, 12, 1}, {0, 1, 2}}, + {{32, 128, 4}, {1, 0, 2}}, + {{3, 8, 11, 212}, {3, 2, 1, 0}}, + {{100, 12, 9, 1}, {0, 1, 2, 3}}, + {{10, 12, 101, 6}, {2, 1, 0, 3}}, + }; + + for (const auto& test_case : test_cases) { + const auto& strides = std::get<0>(test_case); + const auto& expected_dim_order = std::get<1>(test_case); + std::vector dim_order = strides_to_dim_order(strides); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { + // vtensor to be resized + vTensor v_tensor_to_resize( + context(), + {25, 25, 25, 25}, + vkapi::kFloat, + utils::kBuffer, + utils::kWidthPacked, + /*allocate_memory = */ false); + for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { continue; @@ -183,7 +243,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { - std::vector strides = calculate_strides(sizes, layout); + std::vector dim_order = + calculate_dim_order(sizes.size(), layout); + std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -194,6 +256,25 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { get_reference_strides(sizes, layout, true); ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + + // Create new vTensor and check that the strides are correct + vTensor new_v_tensor( + context(), + sizes, + vkapi::kFloat, + utils::kBuffer, + layout, + /*allocate_memory = */ false); + + ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + ASSERT_TRUE( + new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); + + // Resize vtensor and check that updated metadata is correct + v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); + ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + ASSERT_TRUE( + v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -549,9 +630,10 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { std::vector sizes = {9, 9}; std::vector strides = get_reference_strides(sizes, utils::kWidthPacked); + std::vector dim_order = {0, 1}; vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy = vTensor(original, sizes, strides); + vTensor copy = vTensor(original, sizes, dim_order); EXPECT_TRUE(get_vma_allocation_count() == 1); // Fill original tensor with some data @@ -564,7 +646,6 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { for (size_t i = 0; i < data_out.size(); ++i) { CHECK_VALUE(data_out, i, 2.5f + i); } - std::cout << std::endl; } TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { @@ -576,7 +657,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_strides = {1, K}; + std::vector transposed_dim_order = {1, 0}; vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); @@ -588,8 +669,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_data = create_random_float_buffer(mat2.staging_buffer_numel()); - vTensor mat2_t = vTensor(mat2, mat2_t_sizes, transposed_strides); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + // Create direct view and modify sizes and 
strides later + vTensor mat2_t = vTensor(mat2); std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); std::vector ref_out = @@ -601,6 +682,10 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { record_reference_matmul(api::context(), out, mat1, mat2_t); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); + EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + std::vector data_out(out.staging_buffer_numel()); // Extract the copy tensor; should contain the data of the original tensor extract_vtensor(out, data_out); @@ -622,7 +707,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { constexpr int L_S2 = 7; constexpr int O_S2 = 3; - std::vector strides = {1}; + std::vector dim_order = {0}; std::vector t_sizes = {L}; std::vector s1_sizes = {L_S1}; @@ -632,8 +717,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { fill_vtensor(orig, 0); - vTensor s1 = vTensor(orig, s1_sizes, strides, O_S1); - vTensor s2 = vTensor(s1, s2_sizes, strides, O_S2); + vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1); + vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2); record_scalar_add_buffer(api::context(), s1, 4.5f); record_scalar_add_buffer(api::context(), s2, 7.5f); @@ -1093,7 +1178,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { config.set_storage_type_override(utils::kBuffer); ComputeGraph graph(config); - std::vector strides = {W, 1}; + std::vector dim_order = {0, 1}; std::vector orig_sizes = {H, W}; std::vector slice_sizes = {S_H, W}; @@ -1103,7 +1188,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat); ValueRef slice = - graph.add_tensor_view(orig.value, slice_sizes, strides, offset); + graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); IOValueRef out = {}; From bb9d0b4d2944055acbb46ed819a6d05487c08967 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:04 -0700 Subject: [PATCH 5/8] [ET-VK] Add buffer implementation for matrix multiplication ## Context TSIA. Add an implementation for matrix multiplication for buffer-backed tensors. This will be used forn the SDPA + KV Cache update custom op. 
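For reference, the inner loop of the new `matmul_naive_buffer` shader corresponds to the following CPU sketch for a single 2D `(M x K) * (K x N)` case (plain row-major buffers with illustrative stride values, not tied to `vTensor`):

```cpp
#include <cstdint>
#include <vector>

// CPU sketch mirroring matmul_naive_buffer.glsl: walk mat1 along its width
// stride and mat2 along its height stride while accumulating the dot product.
std::vector<float> naive_buffer_matmul(
    const std::vector<float>& mat1, // M x K, row-major
    const std::vector<float>& mat2, // K x N, row-major
    int64_t M, int64_t K, int64_t N) {
  std::vector<float> out(M * N, 0.0f);
  const int64_t mat1_strides[2] = {1, K}; // {width stride, height stride}
  const int64_t mat2_strides[2] = {1, N};
  for (int64_t y = 0; y < M; ++y) {
    for (int64_t x = 0; x < N; ++x) {
      int64_t mat1_id = y * mat1_strides[1]; // element (0, y) of mat1
      int64_t mat2_id = x * mat2_strides[0]; // element (x, 0) of mat2
      float sum = 0.0f;
      for (int64_t i = 0; i < K; ++i) {
        sum += mat1[mat1_id] * mat2[mat2_id];
        mat1_id += mat1_strides[0]; // step along mat1's width
        mat2_id += mat2_strides[1]; // step along mat2's height
      }
      out[y * N + x] = sum;
    }
  }
  return out;
}
```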
Differential Revision: [D61666461](https://our.internmc.facebook.com/intern/diff/D61666461/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ComputeGraph.cpp | 2 +- backends/vulkan/runtime/graph/ComputeGraph.h | 15 +++ .../graph/ops/glsl/matmul_naive_buffer.glsl | 66 +++++++++++ .../graph/ops/glsl/matmul_naive_buffer.yaml | 16 +++ ...naive.glsl => matmul_naive_texture3d.glsl} | 16 +-- ...naive.yaml => matmul_naive_texture3d.yaml} | 10 +- .../vulkan/runtime/graph/ops/impl/MatMul.cpp | 52 ++++++++- backends/vulkan/test/op_tests/cases.py | 1 + .../vulkan/test/vulkan_compute_api_test.cpp | 107 ++++++++++-------- 9 files changed, 216 insertions(+), 69 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml rename backends/vulkan/runtime/graph/ops/glsl/{matmul_naive.glsl => matmul_naive_texture3d.glsl} (72%) rename backends/vulkan/runtime/graph/ops/glsl/{matmul_naive.yaml => matmul_naive_texture3d.yaml} (71%) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 48e1ebf0a83..9fa0091b298 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -368,7 +368,7 @@ utils::uvec3 ComputeGraph::create_local_wg_size( } utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { - return create_local_wg_size(image_extents_of(idx)); + return create_local_wg_size(create_global_wg_size(idx)); } void ComputeGraph::copy_into_staging( diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index faa2f4107ec..58a97c9e255 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -186,6 +186,21 @@ class ComputeGraph final { std::vector sizes_of(const ValueRef idx) const; + /* + * Returns the size of the tensor at `idx` along the specified dimension. + * Negative indexing is allowed. + */ + template + T size_at(const int64_t dim, const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return static_cast(utils::val_at(dim, val.toConstTensor().sizes())); + } else if (val.isTensorRef()) { + return static_cast(utils::val_at(dim, val.toConstTensorRef().sizes)); + } + VK_THROW("Could not get sizes of value with type ", val.type()); + } + vkapi::ScalarType dtype_of(const ValueRef idx) const; inline utils::uvec3 image_extents_of(const ValueRef idx) const { diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl new file mode 100644 index 00000000000..81f0a815cb9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, "buffer")} +${layout_declare_tensor(2, "r", "t_mat2", DTYPE, "buffer")} +${layout_declare_ubo(3, "ivec4", "out_sizes")} +${layout_declare_ubo(4, "ivec4", "out_strides")} +${layout_declare_ubo(5, "ivec4", "mat1_sizes")} +${layout_declare_ubo(6, "ivec4", "mat1_strides")} +${layout_declare_ubo(7, "ivec4", "mat2_sizes")} +${layout_declare_ubo(8, "ivec4", "mat2_strides")} +${layout_declare_ubo(9, "int", "out_numel")} + +#include "indexing_utils.h" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec4 out_idx = ivec4( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y, + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_idx, out_sizes))) { + return; + } + + int mat1_id = to_buffer_id( + ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); + int mat2_id = to_buffer_id( + ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides); + + int orig_mat1_id = to_buffer_id( + ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); + + int orig_mat2_id = to_buffer_id( + ivec4(out_idx.x, 0, 0, 0), mat2_strides); + + T sum = T(0.0); + for (int i = 0; i < mat1_sizes.x; ++i) { + sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + + mat1_id += mat1_strides.x; + mat2_id += mat2_strides.y; + } + + const int out_id = to_buffer_id(out_idx, out_strides); + t_out[out_id] = T(sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml new file mode 100644 index 00000000000..54eb444f73d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +matmul_naive_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: matmul_naive_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl similarity index 72% rename from backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl rename to backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl index 37a9b60f3c5..7225f2c64a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl @@ -16,17 +16,11 @@ $if MAT2_IS_TRANSPOSED: #include "indexing_utils.h" #include "matmul.h" -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; +${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")} +${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")} +${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")} +${layout_declare_ubo(3, "ivec3", "out_limits")} +${layout_declare_ubo(4, "ivec4", "in_sizes")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml similarity index 71% rename from backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml rename to backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml index 1c4db3f0ce9..bb1eed494a5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-matmul_naive: +matmul_naive_texture3d: parameter_names_with_default_values: DTYPE: float - NDIM: 3 + STORAGE: texture3d MAT1_PACKING: W_packed MAT2_PACKING: H_packed MAT2_IS_TRANSPOSED: false @@ -16,9 +16,9 @@ matmul_naive: - VALUE: float - VALUE: half shader_variants: - - NAME: matmul_naive_W_packed_H_packed - - NAME: matmul_naive_W_packed_W_packed + - NAME: matmul_naive_texture3d_W_packed_H_packed + - NAME: matmul_naive_texture3d_W_packed_W_packed MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_W_packed_W_packed + - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed MAT2_PACKING: W_packed MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index d1d3ad47d76..2d9346e1340 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -62,7 +62,48 @@ void resize_matmul_node( out->virtual_resize(new_out_sizes); } -void add_matmul_naive_node( +void add_matmul_naive_buffer_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef out, + const ValueRef mat2_is_transposed) { + ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked); + + std::string kernel_name = "matmul_naive_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + utils::uvec3 global_size = { + graph.size_at(-1, out), + graph.size_at(-2, out), + graph.size_at(-3, out) * graph.size_at(-4, out)}; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + graph.create_local_wg_size(global_size), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + { + graph.sizes_ubo(out), + graph.strides_ubo(out), + graph.sizes_ubo(mat1), + graph.strides_ubo(mat1), + graph.sizes_ubo(mat2), + graph.strides_ubo(mat2), + graph.numel_ubo(out), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_matmul_node, + {mat2_is_transposed})); +} + +void add_matmul_naive_texture3d_node( ComputeGraph& graph, const ValueRef mat1, const ValueRef mat2_data, @@ -74,6 +115,7 @@ void add_matmul_naive_node( ? 
"matmul_transposed_naive" : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); @@ -174,10 +216,14 @@ void add_matmul_node( const ValueRef mat2_data, const ValueRef out, const ValueRef mat2_is_transposed) { - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.is_buffer_storage(out)) { + add_matmul_naive_buffer_node( + graph, mat1, mat2_data, out, mat2_is_transposed); + } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { - add_matmul_naive_node(graph, mat1, mat2_data, out, mat2_is_transposed); + add_matmul_naive_texture3d_node( + graph, mat1, mat2_data, out, mat2_is_transposed); } else { VK_THROW("Input should be channel packed or width packed."); } diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index ff5c7a60e0f..7f9f1842adf 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -70,6 +70,7 @@ def get_mm_inputs(): test_suite.prepacked_args = ["mat2"] # ATen matmul doesn't support half test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] test_suite.layouts = [ "utils::kWidthPacked", "utils::kChannelsPacked", diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index a12a7974cbf..c8d50a406f8 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2306,24 +2306,28 @@ void test_binary_op( } } -#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) - -#define CALL_TEST_FN_FOR_W_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) - -#define CALL_TEST_FN_FOR_C_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) +#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, true) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) + +#define CALL_TEST_FN_FOR_W_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true) + +#define 
CALL_TEST_FN_FOR_C_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true) TEST(VulkanComputeGraphOpsTest, add_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ +#define RUN_TESTS(dtype, storage, layout, prepack) \ test_binary_op("add", {17, 21}, {17, 21}, dtype, layout, prepack); \ test_binary_op("add", {17, 21}, {1, 1}, dtype, layout, prepack); \ test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout, prepack); \ @@ -2344,9 +2348,11 @@ void test_mm( int K, int N, vkapi::ScalarType dtype, + utils::StorageType storage_type, utils::GPUMemoryLayout memory_layout, bool prepack = true) { GraphConfig config; + config.set_storage_type_override(storage_type); ComputeGraph graph(config); std::vector mat1_size = {M, K}; @@ -2403,42 +2409,45 @@ void test_mm( } TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ +#define RUN_TESTS(dtype, storage_type, layout, prepack) \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); - CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); #undef RUN_TESTS } From 880db18fa8dbbea3390b81bf45cd33e7ba50a4ad Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:07 -0700 Subject: [PATCH 6/8] [ET-VK][Ez] Add utilities to check if one vTensor is a view of another ## Context In the implementation of view operators, one of the pre-conditions is that the output tensor is a view of the input. This diff adds some utilities to check that one tensor is a view of another. 
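As a usage sketch (the function name is hypothetical; the transpose view op later in this stack performs an equivalent check), an operator that produces a view can assert this precondition directly from the graph:

```cpp
// Hypothetical precondition check for a view operator: the output value must
// alias the input's storage before its metadata may be reconfigured.
void check_my_view_args(
    ComputeGraph& graph,
    const ValueRef in_ref,
    const ValueRef out_ref) {
  VK_CHECK_COND(
      graph.val_is_view_of(out_ref, in_ref),
      "output tensor must be a view of the input tensor");
}
```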
Differential Revision: [D61666458](https://our.internmc.facebook.com/intern/diff/D61666458/) [ghstack-poisoned] --- backends/vulkan/runtime/api/containers/Tensor.cpp | 10 ++++++++++ backends/vulkan/runtime/api/containers/Tensor.h | 12 ++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 7 +++++++ backends/vulkan/runtime/vk_api/memory/Buffer.h | 4 ++++ backends/vulkan/test/vulkan_compute_api_test.cpp | 3 +++ 5 files changed, 36 insertions(+) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 1a250650511..5e67b689735 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -685,6 +685,16 @@ void vTensorStorage::transition( last_access_.access = cur_access; } +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ != other.storage_type_) { + return false; + } + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy_of(other.buffer_); + } + return false; +} + void vTensorStorage::discard_and_reallocate( const std::vector& padded_sizes, const utils::GPUMemoryLayout gpu_memory_layout, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 48dcdaf2f7a..11747c262d8 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -161,6 +161,11 @@ class vTensorStorage final { return image_.format(); } + /* + * Used for checking if this vTensorStorage is a copy of another instance + */ + bool is_copy_of(const vTensorStorage& other) const; + void discard_and_reallocate( const std::vector& padded_sizes, const utils::GPUMemoryLayout gpu_memory_layout, @@ -479,6 +484,13 @@ class vTensor final { * tensor sizes */ void reallocate(const std::vector& new_sizes); + + /* + * Check if this vTensor instance is a view of another vTensor instance + */ + inline bool is_view_of(const vTensor& other) const { + return storage_.is_copy_of(other.storage_); + } }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 58a97c9e255..5740d24a448 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -219,6 +219,13 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().has_buffer_storage(); } + inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base) + const { + return values_.at(maybe_view) + .toConstTensor() + .is_view_of(values_.at(base).toConstTensor()); + } + inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().gpu_memory_layout(); } diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 3f69d1f2237..9302048f861 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -150,6 +150,10 @@ class VulkanBuffer final { return (handle_ != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanBuffer& other) const { + return (handle_ == other.handle_) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
c8d50a406f8..7f1e70b6b02 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -635,6 +635,7 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); vTensor copy = vTensor(original, sizes, dim_order); EXPECT_TRUE(get_vma_allocation_count() == 1); + EXPECT_TRUE(copy.is_view_of(original)); // Fill original tensor with some data fill_vtensor(original, 2.5f, true); @@ -1190,6 +1191,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { ValueRef slice = graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); + EXPECT_TRUE(graph.val_is_view_of(slice, orig.value)); + IOValueRef out = {}; out.value = graph.add_tensor(slice_sizes, vkapi::kFloat); From da0d0fafd1ab539b7f07f6ead6fdc51b73af36ce Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:10 -0700 Subject: [PATCH 7/8] [ET-VK] Add transpose op as view operator ## Context As title. Implement `aten.transpose.int` as a view operator, which creates an alias of the input tensor with different sizes and strides. To effectively test the op, the codegen script is also updated to support view ops. Differential Revision: [D61666463](https://our.internmc.facebook.com/intern/diff/D61666463/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ComputeGraph.cpp | 26 ++++ backends/vulkan/runtime/graph/ComputeGraph.h | 6 + .../runtime/graph/ops/impl/Transpose.cpp | 124 ++++++++++++++++++ backends/vulkan/test/op_tests/cases.py | 26 ++++ .../vulkan/test/op_tests/utils/codegen.py | 12 +- .../test/op_tests/utils/codegen_base.py | 2 + .../vulkan/test/vulkan_compute_api_test.cpp | 87 ++++++++++++ 7 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/runtime/graph/ops/impl/Transpose.cpp diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 9fa0091b298..e014c52a3a4 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -131,6 +131,32 @@ std::vector ComputeGraph::sizes_of(const ValueRef idx) const { VK_THROW("Could not get sizes of value with type ", val.type()); } +int64_t ComputeGraph::dim_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().dim(); + } else if (val.isTensorRef()) { + return val.toConstTensorRef().sizes.size(); + } + VK_THROW("Could not get dim of value with type ", val.type()); +} + +std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().dim_order(); + } + VK_THROW("Could not get strides of value with type ", val.type()); +} + +std::vector ComputeGraph::strides_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().strides(); + } + VK_THROW("Could not get strides of value with type ", val.type()); +} + vkapi::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const { const Value& val = values_.at(idx); if (val.isTensor()) { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 5740d24a448..b73b552067c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -201,6 +201,12 @@ class ComputeGraph final { VK_THROW("Could not get sizes of value with type ", val.type()); } + int64_t 
dim_of(const ValueRef idx) const;
+
+  std::vector<int64_t> dim_order_of(const ValueRef idx) const;
+
+  std::vector<int64_t> strides_of(const ValueRef idx) const;
+
   vkapi::ScalarType dtype_of(const ValueRef idx) const;
 
   inline utils::uvec3 image_extents_of(const ValueRef idx) const {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp
new file mode 100644
index 00000000000..faa99ec1a18
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+namespace vkcompute {
+
+/*
+ * Transposing for sizes and strides is as simple as swapping the values at
+ * dim0 and dim1 in the sizes/strides vector.
+ */
+void swap_vector_inplace(
+    std::vector<int64_t>& vec,
+    const int64_t dim0,
+    const int64_t dim1) {
+  std::iter_swap(vec.begin() + dim0, vec.begin() + dim1);
+}
+
+/*
+ * Transposing the dim order is a bit more unintuitive. dim0 and dim1 have
+ * swapped their "identities", so we need to swap the values of dim0 and dim1
+ * wherever they appear in the dim order vector. Compare this to just swapping
+ * the elements at dim0 and dim1 in the strides or sizes vectors.
+ */
+void transpose_dim_order_inplace(
+    std::vector<int64_t>& dim_order,
+    const int64_t dim0,
+    const int64_t dim1) {
+  for (int i = 0; i < dim_order.size(); ++i) {
+    if (dim_order[i] == dim0) {
+      dim_order[i] = dim1;
+    } else if (dim_order[i] == dim1) {
+      dim_order[i] = dim0;
+    }
+  }
+}
+
+void resize_transpose_view_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)args;
+  vTensorPtr out = graph->get_tensor(extra_args[0]);
+  vTensorPtr in = graph->get_tensor(extra_args[1]);
+
+  const int64_t dim0 = graph->extract_scalar<int64_t>(extra_args[2]);
+  const int64_t dim1 = graph->extract_scalar<int64_t>(extra_args[3]);
+
+  std::vector<int64_t> new_sizes = in->sizes();
+  std::vector<int64_t> new_dim_order = in->dim_order();
+
+  swap_vector_inplace(new_sizes, dim0, dim1);
+  transpose_dim_order_inplace(new_dim_order, dim0, dim1);
+
+  out->virtual_reconfigure(new_sizes, new_dim_order);
+}
+
+void check_transpose_view_args(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    const int64_t dim0,
+    const int64_t dim1,
+    ValueRef out_ref) {
+  VK_CHECK_COND(
+      graph.val_is_view_of(out_ref, in_ref),
+      "output tensor must be a view of the input tensor");
+
+  const int64_t in_ndim = graph.dim_of(in_ref);
+  VK_CHECK_COND(
+      dim0 >= 0 && dim0 < in_ndim, "dim0 is not in the range of [0, in_ndim)");
+  VK_CHECK_COND(
+      dim1 >= 0 && dim1 < in_ndim, "dim1 is not in the range of [0, in_ndim)");
+}
+
+void add_transpose_view_node(
+    ComputeGraph& graph,
+    ValueRef input_ref,
+    ValueRef dim0_ref,
+    ValueRef dim1_ref,
+    ValueRef out_ref) {
+  const int64_t dim0 = graph.extract_scalar<int64_t>(dim0_ref);
+  const int64_t dim1 = graph.extract_scalar<int64_t>(dim1_ref);
+
+  std::vector<int64_t> new_sizes = graph.sizes_of(input_ref);
+  std::vector<int64_t> new_dim_order = graph.dim_order_of(input_ref);
+
+  swap_vector_inplace(new_sizes, dim0, dim1);
+  transpose_dim_order_inplace(new_dim_order, dim0, dim1);
+
+  graph.get_tensor(out_ref)->virtual_reconfigure(new_sizes, new_dim_order);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref}));
+}
+
+void
transpose(ComputeGraph& graph, const std::vector& args) { + const ValueRef out = args[3]; + return add_transpose_view_node( + graph, + args[0], // input + args[1], // dim0 + args[2], // dim1 + out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.transpose.int, transpose); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 7f9f1842adf..c5088ffdb32 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -548,6 +548,32 @@ def get_slice_inputs(): return test_suite +@register_test_suite(["aten.transpose.int"]) +def get_transpose_inputs(): + Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) + Test.__new__.__defaults__ = (None, 0, 1) + + test_cases = [ + Test(self=[M1, M2], dim0=0, dim1=1), + Test(self=[M1, S2, M], dim0=0, dim1=1), + Test(self=[M1, S2, M], dim0=0, dim1=2), + Test(self=[M1, S2, M], dim0=2, dim1=1), + Test(self=[S, M, S2, M2], dim0=0, dim1=2), + Test(self=[S, M, S2, M2], dim0=3, dim1=2), + Test(self=[S, M, S2, M2], dim0=1, dim1=2), + Test(self=[S, M, S2, M2], dim0=3, dim1=1), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer"] + test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + return test_suite + + @register_test_suite("aten.index_select.default") def get_index_select_inputs(): Test = namedtuple("VkIndexSelectTest", ["self", "dim", "index"]) diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py index 0bccf64458c..b39801e7660 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/codegen.py @@ -266,6 +266,7 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 return ret_str prepack = self.prepack_ref(ref) + ref_is_view = self.suite_def.is_view_op and ref.is_out cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" @@ -339,7 +340,16 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 return ret_str ret_str = f"{cpp_type} {ref.name} = {self.graph}{self.dot}" - if ref.src_cpp_type == AT_TENSOR and not prepack: + if ref.src_cpp_type == AT_TENSOR and ref_is_view: + input_name = None + for _name, ref in self.refs.items(): + if ref.is_in and ref.src_cpp_type == AT_TENSOR: + input_name = ref.name + + assert input_name is not None + ret_str += "add_tensor_view(" + input_name + ".value);" + pass + elif ref.src_cpp_type == AT_TENSOR and not prepack: ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" ret_str += f"{ref.src_cpp_name}.sizes().vec(), " ret_str += f"from_at_scalartype({ref.src_cpp_name}.scalar_type())); \n" diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py index 5b3ca0908cf..1ebebe699a0 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/codegen_base.py @@ -57,6 +57,8 @@ def __init__(self, input_cases: List[Any]): self.atol: str = "1e-5" self.rtol: str = "1e-5" + self.is_view_op: bool = False + def supports_prepack(self): return len(self.prepacked_args) > 0 diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 7f1e70b6b02..157f995ab4c 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ 
b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2682,3 +2682,90 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
       /*offset = */ 0.5,
       /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12});
 }
+
+void test_transpose_view_mm(
+    const int B,
+    const int M,
+    const int K,
+    const int N) {
+  GraphConfig config;
+  config.set_storage_type_override(utils::kBuffer);
+  ComputeGraph graph(config);
+
+  std::vector<int64_t> mat1_size = {M, K};
+  std::vector<int64_t> mat2_t_size = {N, K};
+  std::vector<int64_t> out_size = {M, N};
+
+  std::vector<int64_t> mat1_small_size = {M - 4, K - 3};
+  std::vector<int64_t> mat2_t_small_size = {N - 1, K - 3};
+
+  if (B > 1) {
+    mat1_size.resize(3);
+    mat1_size = {B, M, K};
+    mat2_t_size.resize(3);
+    mat2_t_size = {B, N, K};
+    out_size.resize(3);
+    out_size = {B, M, N};
+
+    mat1_small_size.resize(3);
+    mat1_small_size = {B, M - 4, K - 3};
+    mat2_t_small_size.resize(3);
+    mat2_t_small_size = {B, N - 1, K - 3};
+  }
+
+  // Build graph
+
+  IOValueRef mat1 =
+      graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked);
+  IOValueRef mat2_t =
+      graph.add_input_tensor(mat2_t_size, vkapi::kFloat, utils::kWidthPacked);
+
+  ValueRef mat2 = graph.add_tensor_view(mat2_t.value);
+
+  ValueRef dim0;
+  ValueRef dim1;
+
+  if (B > 1) {
+    dim0 = graph.add_scalar<int64_t>(1);
+    dim1 = graph.add_scalar<int64_t>(2);
+  } else {
+    dim0 = graph.add_scalar<int64_t>(0);
+    dim1 = graph.add_scalar<int64_t>(1);
+  }
+
+  IOValueRef out;
+  out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked);
+
+  VK_GET_OP_FN("aten.transpose.int")(graph, {mat2_t.value, dim0, dim1, mat2});
+  VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  for (int i = 1; i < 4; i++) {
+    float val_mat1 = i;
+    float val_mat2 = i + 1;
+    float val_out = K * (val_mat1 * val_mat2);
+
+    // Try at full size
+    graph.resize_input(0, mat1_size);
+    graph.resize_input(1, mat2_t_size);
+    graph.propagate_resize();
+    execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
+
+    // Try at reduced sizes
+    val_out = (K - 3) * (val_mat1 * val_mat2);
+    graph.resize_input(0, mat1_small_size);
+    graph.resize_input(1, mat2_t_small_size);
+    graph.propagate_resize();
+    execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
+  }
+}
+
+TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
+  test_transpose_view_mm(2, 7, 17, 5);
+}

From a330c4e2c42db2ad7700777cbcc646cd23b60611 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Tue, 24 Sep 2024 09:30:33 -0700
Subject: [PATCH 8/8] Update base for Update on "[ET-VK] Add transpose op as view operator"

## Context

As title. Implement `aten.transpose.int` as a view operator, which creates an alias of the input tensor with different sizes and strides.

To effectively test the op, the codegen script is also updated to support view ops.

Differential Revision: [D61666463](https://our.internmc.facebook.com/intern/diff/D61666463/)

[ghstack-poisoned]
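For reference, the subtle part of the transpose view is the metadata update done in `Transpose.cpp`: sizes (and strides) swap the entries at `dim0` and `dim1`, while `dim_order` swaps the values `dim0` and `dim1` wherever they appear. A self-contained sketch of that bookkeeping (standard library only, no ET-VK dependencies; the permutation `{2, 0, 1}` is an arbitrary example, not a specific memory layout):

```cpp
// Standalone illustration of the sizes / dim_order updates performed by the
// transpose view node, for a transpose of dims 0 and 1.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Tensor with sizes {A, B, C} whose memory order (outermost -> innermost)
  // is dim 2, then dim 0, then dim 1.
  std::vector<int64_t> sizes = {4, 5, 6};
  std::vector<int64_t> dim_order = {2, 0, 1};

  const int64_t dim0 = 0;
  const int64_t dim1 = 1;

  // Sizes (and strides) transpose by swapping the entries at dim0 and dim1.
  std::iter_swap(sizes.begin() + dim0, sizes.begin() + dim1);

  // dim_order transposes by remapping the *values* dim0 and dim1 wherever
  // they appear: the two dims exchange identities, while the physical memory
  // order stays untouched.
  for (auto& d : dim_order) {
    if (d == dim0) {
      d = dim1;
    } else if (d == dim1) {
      d = dim0;
    }
  }

  assert((sizes == std::vector<int64_t>{5, 4, 6}));
  assert((dim_order == std::vector<int64_t>{2, 1, 0}));
  // Naively swapping dim_order at positions 0 and 1 would instead give
  // {0, 2, 1}, which no longer describes the same underlying memory layout.
  return 0;
}
```

The same remapping is re-applied whenever the inputs are resized, since the node registered by `add_transpose_view_node` carries only the `resize_transpose_view_node` function.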