From 0a6cc2a35fb37e745d9875d52ba88841913f50bd Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:52 -0700 Subject: [PATCH 1/4] [ET-VK][ez] Introduce `check_close` function in `compute_api_test` to account for small numerical differences ## Context Introduce a `check_close` function in `vulkan_compute_api_test` to account for small numerical differences when executing on the GPU. Otherwise some tests will be reported as failures on some platforms (i.e. Mac). Differential Revision: [D61666459](https://our.internmc.facebook.com/intern/diff/D61666459/) [ghstack-poisoned] --- backends/vulkan/test/utils/test_utils.cpp | 6 ++++++ backends/vulkan/test/utils/test_utils.h | 6 ++++++ backends/vulkan/test/vulkan_compute_api_test.cpp | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index ad496873695..6c056cc9d90 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -482,3 +482,9 @@ void execute_graph_and_check_output( } } } + +bool check_close(float a, float b, float atol, float rtol) { + float max = std::max(std::abs(a), std::abs(b)); + float diff = std::abs(a - b); + return diff <= (atol + rtol * max); +} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index f9969eddbf4..bf549446170 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -242,3 +242,9 @@ void print_vector( } std::cout << std::endl; } + +// +// Misc. Utilities +// + +bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 307593d8fdb..ee2d119b6be 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -601,7 +601,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { EXPECT_TRUE(data_out.size() == ref_out.size()); for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(data_out[i] == ref_out[i]); + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); } } From f13eedeba46994faf74bce06d52143389a57a976 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:55 -0700 Subject: [PATCH 2/4] [ET-VK][ez] Empty initialize ShaderInfo and add `bool()` operator ## Context This diff lays the foundation for the implementation of "view operators". These operators will not dispatch any shaders; they will be solely responsible for updating the sizes and strides metadata of the output tensor, which will use the same storage resource as the input tensor. These ops will be implemented by adding a "no-op" `ExecuteNode` instance to the `ComputeGraph`, which does not contain a shader but does contain a resize function to update sizes and strides upon a resize. This diff allows `ShaderInfo` to empty initialize, and add a bool operator to check if the ShaderInfo actually points to valid shader code. This will be used to construct "no-op" `ExecuteNode` instances which can be used to implement view operators. 
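For illustration, a minimal standalone sketch of the pattern this enables, using a simplified stand-in struct rather than the real `vkapi::ShaderInfo` (which lives in `backends/vulkan/runtime/vk_api/Shader.h`): default construction leaves `src_code.bin` null, and the `bool()` operator lets dispatch code skip an empty shader.

```cpp
// Toy stand-in for vkapi::ShaderInfo, showing empty init + bool() check.
#include <cstdint>
#include <iostream>

struct ShaderInfoSketch {
  struct {
    const uint32_t* bin = nullptr;  // empty-initialized: no SPIR-V attached
    uint32_t size = 0u;
  } src_code;

  // True only when the struct points at actual shader code.
  operator bool() const {
    return src_code.bin != nullptr;
  }
};

// A dispatch helper can skip encoding when handed an empty ShaderInfo,
// which is how "no-op" nodes (e.g. view operators) can be expressed.
void maybe_dispatch(const ShaderInfoSketch& shader) {
  if (!shader) {
    std::cout << "no shader; skipping dispatch\n";
    return;
  }
  std::cout << "dispatching " << shader.src_code.size << " words of SPIR-V\n";
}

int main() {
  ShaderInfoSketch empty;  // default construction is now valid
  maybe_dispatch(empty);   // takes the "skipping" branch

  // Placeholder word (the SPIR-V magic number), not a complete module.
  const uint32_t spirv[] = {0x07230203u};
  ShaderInfoSketch real;
  real.src_code.bin = spirv;
  real.src_code.size = 1u;
  maybe_dispatch(real);    // takes the "dispatching" branch
  return 0;
}
```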
Differential Revision: [D61666460](https://our.internmc.facebook.com/intern/diff/D61666460/) [ghstack-poisoned] --- backends/vulkan/runtime/vk_api/Shader.h | 8 ++++++-- backends/vulkan/test/vulkan_compute_api_test.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index 34c2d95c932..1e3b2a799f2 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -53,8 +53,8 @@ class ShaderLayout final { struct ShaderInfo final { struct { - const uint32_t* bin; - uint32_t size; + const uint32_t* bin = nullptr; + uint32_t size = 0u; } src_code; std::string kernel_name{""}; @@ -71,6 +71,10 @@ struct ShaderInfo final { const uint32_t, std::vector, const utils::uvec3 tile_size); + + operator bool() const { + return src_code.bin != nullptr; + }; }; bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index ee2d119b6be..cbd409112ff 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -168,6 +168,13 @@ std::vector get_reference_strides( return {}; } +TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { + vkapi::ShaderInfo empty_shader_info; + EXPECT_FALSE(empty_shader_info); + EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); + EXPECT_TRUE(empty_shader_info.src_code.size == 0u); +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { From 1e930def96a08f27a577ef4910d5085ab0db30ea Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:50:58 -0700 Subject: [PATCH 3/4] [ET-VK][ez] Enable no-op ExecuteNodes for view ops ## Context This diff lays the foundation for the implementation of "view operators". These operators will not dispatch any shaders; they will be solely responsible for updating the sizes and strides metadata of the output tensor, which will use the same storage resource as the input tensor. These ops will be implemented by adding a "no-op" ExecuteNode instance to the ComputeGraph, which does not contain a shader but does contain a resize function to update sizes and strides upon a resize. This diff allows `ExecuteNode` to empty initialize. If the `ShaderInfo` of the `ExecuteNode` instance is not valid, then the `execute()` function of the `ExecuteNode` instance will be a no-op. 
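As a rough standalone sketch of this behavior, using toy stand-ins rather than the real `ComputeGraph`/`ExecuteNode`/`ShaderInfo` types: a node constructed with only a resize callback encodes nothing into the command buffer, but still participates in resize propagation.

```cpp
// Toy model of the "no-op ExecuteNode" described above; the real class lives
// in backends/vulkan/runtime/graph/ops/ExecuteNode.h.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct ToyShader {
  const char* name;
  operator bool() const { return name != nullptr; }
};

class ToyExecuteNode {
 public:
  using ResizeFn = std::function<void(const std::vector<int64_t>&)>;

  // Regular node: dispatches a shader and may also resize.
  ToyExecuteNode(ToyShader shader, ResizeFn resize_fn)
      : shader_(shader), resize_fn_(std::move(resize_fn)) {}

  // "View op" node: no shader, only metadata updates on resize.
  explicit ToyExecuteNode(ResizeFn resize_fn)
      : shader_{}, resize_fn_(std::move(resize_fn)) {}

  void encode() const {
    if (!shader_) {
      return;  // nothing to record into the command buffer
    }
    std::cout << "encoding dispatch of " << shader_.name << "\n";
  }

  void trigger_resize(const std::vector<int64_t>& new_sizes) const {
    if (resize_fn_) {
      resize_fn_(new_sizes);  // e.g. update the view's sizes/strides metadata
    }
  }

 private:
  ToyShader shader_;
  ResizeFn resize_fn_;
};

int main() {
  auto print_resize = [](const std::vector<int64_t>& sizes) {
    std::cout << "updating view metadata for " << sizes.size() << " dims\n";
  };

  ToyExecuteNode regular({"binary_add"}, print_resize);
  ToyExecuteNode view_only(print_resize);

  regular.encode();                  // records a dispatch
  view_only.encode();                // no-op: no shader attached
  view_only.trigger_resize({4, 3});  // still updates metadata on resize
  return 0;
}
```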
Differential Revision: [D61666465](https://our.internmc.facebook.com/intern/diff/D61666465/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 15 +++++++++++++++ backends/vulkan/runtime/graph/ops/ExecuteNode.h | 16 +++++++++++++++- backends/vulkan/test/vulkan_compute_api_test.cpp | 13 +++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 3b2a826f87f..2cb00ba65af 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -35,7 +35,22 @@ ExecuteNode::ExecuteNode( graph.update_descriptor_counts(shader, /*execute = */ true); } +ExecuteNode::ExecuteNode( + const ResizeFunction& resize_fn, + const std::vector& resize_args) + : shader_(), + global_workgroup_size_({0u, 0u, 0u}), + local_workgroup_size_({0u, 0u, 0u}), + args_(), + params_(), + spec_vars_(), + resize_fn_(resize_fn), + resize_args_(resize_args) {} + void ExecuteNode::encode(ComputeGraph* graph) { + if (!shader_) { + return; + } api::Context* const context = graph->context(); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1fff14e020e..dece9ddb50d 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -48,7 +48,7 @@ class ExecuteNode final { const std::vector&, const std::vector&)>; - ExecuteNode( + explicit ExecuteNode( ComputeGraph& graph, const vkapi::ShaderInfo& shader, const utils::uvec3& global_workgroup_size, @@ -59,6 +59,15 @@ class ExecuteNode final { const ResizeFunction& resize_fn = nullptr, const std::vector& resize_args = {}); + /* + * This overload of the ExecuteNode constructor is used to register ops which + * update a tensor view. No shader is dispatched, but the node still needs to + * update the view's sizes and strides after a resize. + */ + explicit ExecuteNode( + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); + ~ExecuteNode() = default; void encode(ComputeGraph* graph); @@ -83,6 +92,11 @@ class ExecuteNode final { const vkapi::SpecVarList spec_vars_; const ResizeFunction resize_fn_; const std::vector resize_args_; + + public: + operator bool() const { + return shader_; + } }; } // namespace vkcompute diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index cbd409112ff..af92728cb0c 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -982,6 +982,19 @@ TEST(VulkanComputeGraphTest, test_values_string) { EXPECT_TRUE(stored == "hello, world"); } +TEST(VulkanComputeGraphTest, empty_init_executenode_test) { + ExecuteNode node(nullptr, {}); + EXPECT_FALSE(node); + + GraphConfig config; + ComputeGraph graph(config); + + // Encode an empty ExecuteNode and check that command buffer encoding does not + // crash. 
+ graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); + EXPECT_NO_FATAL_FAILURE(graph.encode_execute()); +} + TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { GraphConfig config; ComputeGraph graph(config); From 846cd776190fecfad13b351b3936112f6b1318bb Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:01 -0700 Subject: [PATCH 4/4] [ET-VK] Use dim order as the source of truth for tensor strides ## Context This diff introduces dim order as a core metadata member of `vTensor`. The dim order is a better source of truth for the data layout of a buffer backed tensor because it is less ambiguous than the strides; when multiple dims have a stride of 1, it is ambiguous which dim comes "before" the other. This amibiguity makes it impossible to update the strides correctly when the tensor is resized. ## Changes * Introduce `dim_order_` as a core metadata member of `vTensor` class * Tensor view construction now accepts `new_dim_order` as an argument instead of `new_strides` * Introduce `virtual_reconfigure` member function of `vTensor` which is an extension of `virtual_resize` that allows updating of sizes as well as strides (via updating the dim order) Differential Revision: [D61666464](https://our.internmc.facebook.com/intern/diff/D61666464/) [ghstack-poisoned] --- .../vulkan/runtime/api/containers/Tensor.cpp | 231 +++++++++++++----- .../vulkan/runtime/api/containers/Tensor.h | 107 ++++++-- .../vulkan/runtime/graph/ComputeGraph.cpp | 7 + backends/vulkan/runtime/graph/ComputeGraph.h | 9 +- .../vulkan/test/vulkan_compute_api_test.cpp | 107 +++++++- 5 files changed, 372 insertions(+), 89 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 78aa4796aa5..1a250650511 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,36 +13,15 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, find the index of the "fastest - * moving" dimension in WHCN dimension order. If multiple dims have the lowest - * stride, then the "earlier" dim is assumed to be the fastest moving (width is - * "earlier" than height). - */ -int32_t find_fastest_whcn_dim(const std::vector& strides) { - if (strides.size() == 0) { - return 0; - } - int32_t fastest_dim = 0; - int64_t min_stride = strides.at(0); - for (int d = strides.size() - 1; d >= 0; --d) { - if (strides.at(d) < min_stride) { - fastest_dim = d; - min_stride = strides.at(d); - } - } - return (strides.size() - 1 - fastest_dim); -} - /* * Given the strides of a buffer-backed tensor, estimate the equivalent memory * layout enum value by identifying the fastest moving dimension. */ utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& strides) { - int32_t fastest_dim = find_fastest_whcn_dim(strides); - if (fastest_dim <= 3) { - return utils::GPUMemoryLayout(fastest_dim); + const std::vector& dim_order) { + int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); + if (fastest_dim_whcn >= 0 && fastest_dim_whcn <= 3) { + return utils::GPUMemoryLayout(fastest_dim_whcn); } // TODO(ssjia) find a way to gracefully recover from this case by i.e. 
adding @@ -51,41 +30,108 @@ utils::GPUMemoryLayout estimate_memory_layout( VK_THROW("No compatible GPUMemoryLayout value"); } +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + int64_t last_dim = + ndim - utils::to_packed_dim_nchw_offset(memory_layout); + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim += 1; + } + dim_order[d] = cur_dim; + cur_dim += 1; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + +namespace { + +struct StrideDimIndexPair { + int64_t stride; + int64_t dim_i; + + StrideDimIndexPair() : stride(0), dim_i(0) {} + + explicit StrideDimIndexPair(int64_t stride, int64_t dim_i) + : stride(stride), dim_i(dim_i) {} + + bool operator>(const StrideDimIndexPair& other) const { + // Descending order + return stride < other.stride; + } + + bool operator<(const StrideDimIndexPair& other) const { + // Descending order + return stride > other.stride; + } +}; + +} // namespace + +std::vector strides_to_dim_order(const std::vector& strides) { + std::vector stride_dim_pairs(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) { + stride_dim_pairs[i] = StrideDimIndexPair(strides[i], i); + } + std::stable_sort(stride_dim_pairs.begin(), stride_dim_pairs.end()); + + std::vector dim_order(strides.size()); + for (int i = 0; i < strides.size(); ++i) { + dim_order.at(i) = stride_dim_pairs.at(i).dim_i; + } + return dim_order; +} + std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& dim_order) { // For zero dim tensors if (sizes.size() == 0) { return {1}; } - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); - int64_t last_dim = sizes.size() - dim_offset; - if (last_dim < 0) { - last_dim = sizes.size() - 1; - } - size_t ndim = sizes.size(); std::vector strides(ndim); - const int64_t last_dim_size = sizes.at(last_dim); - - for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) { - strides.at(stride_d) = 1; - if (stride_d == last_dim) { - continue; - } - strides.at(stride_d) = last_dim_size; - for (int size_d = ndim - 1; size_d > stride_d; size_d--) { - if (size_d != last_dim) { - strides.at(stride_d) *= sizes.at(size_d); - } + strides[dim_order[ndim - 1]] = 1; + for (int32_t i = ndim - 2; i >= 0; --i) { + if (sizes[dim_order[i + 1]] == 0) { + strides[dim_order[i]] = strides[dim_order[i + 1]]; + } else { + strides[dim_order[i]] = + strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; } } + return strides; } +bool dim_order_is_valid(const std::vector& dim_order) { + int64_t sum = 0; + for (size_t i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { + return false; + } + sum += dim_order[i]; + } + int64_t n = static_cast(dim_order.size() - 1); + // Sanity check that the sum of the indices in the vector is equal to the sum + // of 0 + 1 + 2 + ... 
+ (ndim - 1) + return sum == n * (n + 1) / 2; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -170,7 +216,8 @@ vTensor::vTensor( memory_layout_(memory_layout), // Calculate tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(calculate_strides(sizes, memory_layout_)), + dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -189,6 +236,9 @@ vTensor::vTensor( padded_sizes_, dtype_, allocate_memory) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "computed dim order is invalid"); + if (storage_type != utils::kBuffer) { texture_limits_.limits = utils::ivec3{ utils::safe_downcast(storage_.image_extents_[0]), @@ -204,16 +254,39 @@ vTensor::vTensor( } } +vTensor::vTensor(const vTensor& other) + : dtype_(other.dtype_), + memory_layout_(other.memory_layout_), + // Copy tensor size metadata + sizes_(other.sizes_.begin(), other.sizes_.end()), + dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + strides_(other.strides_.begin(), other.strides_.end()), + numel_(other.numel_), + padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, + unsqueezed_strides_{ + other.unsqueezed_strides_.begin(), + other.unsqueezed_strides_.end()}, + padded_numel_(other.padded_numel_), + texture_limits_{other.texture_limits_}, + // Empty initialize Utility Uniform Buffers + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + texture_limits_uniform_(), + // Copy Tensor storage + storage_(other.storage_) {} + vTensor::vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel) + const std::vector& dim_order, + const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(strides)), + memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(strides.begin(), strides.end()), + dim_order_(dim_order.begin(), dim_order.end()), + strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -226,6 +299,8 @@ vTensor::vTensor( texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "new dim order provided is invalid"); VK_CHECK_COND( offset_numel + numel_ <= other.numel(), "Tensor alias cannot access more elements than available in the original" @@ -339,9 +414,17 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_size_metadata(const std::vector& new_sizes) { +void vTensor::update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order) { sizes_ = new_sizes; - strides_ = calculate_strides(new_sizes, memory_layout_); + dim_order_ = new_dim_order; + strides_ = calculate_strides(sizes_, dim_order_); + // Only update the memory layout for buffer-backed tensors. Strides are + // meaningless for texture-backed tensors and do not impact the memory layout. 
+ if (storage_type() == utils::kBuffer) { + memory_layout_ = estimate_memory_layout(dim_order_); + } numel_ = utils::multiply_integers(sizes_); padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); @@ -373,15 +456,12 @@ void vTensor::update_size_metadata(const std::vector& new_sizes) { } } -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); +void vTensor::update_size_metadata(const std::vector& new_sizes) { + // Dim order does not change on resize + update_metadata(new_sizes, dim_order_); } -void vTensor::virtual_resize(const std::vector& new_sizes) { +void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. @@ -394,10 +474,37 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { VK_CHECK_COND( valid_resize, - "Cannot use virtual resize if new sizes requires a larger texture."); + "tensor sizes requires a larger texture than the current one."); + } else { + int64_t numel = utils::multiply_integers(sizes); + bool valid_resize = + numel + storage_.buffer_offset_ <= storage_.buffer_length_; + VK_CHECK_COND( + valid_resize, + "tensor sizes requires a larger buffer than the current one."); } +} + +void vTensor::virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order) { + VK_CHECK_COND( + dim_order_is_valid(new_dim_order), "new dim order provided is invalid"); + check_sizes(new_sizes); + update_metadata(new_sizes, new_dim_order); +} + +void vTensor::virtual_resize(const std::vector& new_sizes) { + check_sizes(new_sizes); + update_size_metadata(new_sizes); +} +void vTensor::reallocate(const std::vector& new_sizes) { update_size_metadata(new_sizes); + storage_.discard_and_reallocate( + calculate_padded_sizes(new_sizes, memory_layout_), + memory_layout_, + dtype_); } // @@ -480,6 +587,7 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, image_(allocate_image( context_, image_extents_, @@ -496,11 +604,12 @@ vTensorStorage::vTensorStorage( vTensorStorage::vTensorStorage( const vTensorStorage& other, - const size_t buffer_offset) + const int64_t buffer_offset) : context_(other.context_), storage_type_{other.storage_type_}, image_extents_(other.image_extents_), buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, image_(), buffer_(other.buffer_, buffer_offset), last_access_{other.last_access_} { diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 5a4598291c0..48dcdaf2f7a 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -20,14 +20,30 @@ namespace vkcompute { namespace api { /* - * Given the sizes of a tensor and the GPU memory layout, calculate the strides - * of the tensor in NCHW dimension order. The GPU memory layout will be used to - * determine which dimension is packed along a texel; that dimension will be - * used as the "fasted moving" dimension with a stride of 1. + * Given the strides of a tensor in NCHW dimension order, calculate the dim + * order of the tensor by computing an index sort of the strides. 
Note that + * there is some ambiguity when multiple dimensions have the same stride; + * stable_sort is used to preserve the ordering of "outer" dimensions with + * respect to "inner" dimensions. + */ +std::vector strides_to_dim_order(const std::vector& strides); + +/* + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout); + +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) + * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const std::vector& dim_order); std::vector unsqueeze_strides( const std::vector& strides, @@ -96,7 +112,7 @@ class vTensorStorage final { * because this behaviour is unsafe, since the original tensor may be * destroyed before the copy is destroyed. */ - vTensorStorage(const vTensorStorage& other, const size_t buffer_offset = 0); + vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0); public: // To discourage creating copies, the assignment operator is still deleted. @@ -118,6 +134,7 @@ class vTensorStorage final { // Resource sizings utils::uvec3 image_extents_{}; int64_t buffer_length_{}; + int64_t buffer_offset_{}; // GPU Storage mutable vkapi::VulkanImage image_; @@ -167,8 +184,16 @@ class vTensor final { const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, const bool allocate_memory = true); - vTensor(const vTensor& other) = delete; - vTensor& operator=(const vTensor& other) = delete; + /* + * This constructor allows for the creation of a vTensor that references the + * same buffer resource of another vTensor, with the same sizes and strides + * metadata. The created vTensor will not own the underlying resource. This is + * only applicable for buffer backed tensors at the moment. + * + * Once created, the sizes and strides of the aliased vTensor can be changed + * using the `virtual_reconfigure` member function. + */ + vTensor(const vTensor& other); /* * This constructor allows for the creation of a vTensor that references the @@ -176,6 +201,10 @@ class vTensor final { * strides metatdata. The created vTensor will not own the underlying * resource. This is only applicable for buffer backed tensors at the moment. * + * Note that dim order is used as the source of truth regarding the strides, + * and the new strides are computed from the new sizes and new dim order. + * Thus only the dim order is provided as an argument to this function. + * * The offset_numel argument allows the aliased tensor's memory region to * begin at an offset of N elements from the start of the original tensor's * buffer. 
@@ -183,8 +212,11 @@ class vTensor final { vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel = 0); + const std::vector& dim_order, + const int64_t offset_numel = 0); + + // To discourage making copies, the copy assignment operator is still deleted + vTensor& operator=(const vTensor& other) = delete; vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; @@ -195,6 +227,8 @@ class vTensor final { // sizes of the tensor in NCHW dimension order std::vector sizes_; + // dim order of the tensor in NCHW dimension order + std::vector dim_order_; // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical @@ -305,6 +339,10 @@ class vTensor final { return sizes_.size(); } + inline const std::vector& dim_order() const { + return dim_order_; + } + inline const std::vector& strides() const { return strides_; } @@ -386,24 +424,61 @@ class vTensor final { private: /* - * Update the size metadata of the vTensor to be new sizes. Should not be used - * directly, reallocate() or virtual_resize() should be used instead. + * Update the sizes, dim order, and strides metadata of the vTensor. + * + * The dim order is used as the "source of truth" for the strides and the + * strides are calculated from the dim order, therefore only the dim order is + * accepted as an argument to this function. Within the function, the new + * strides are computed from the new sizes and new dim order. + * + * Should not be used directly, reallocate() or virtual_resize() should be + * used instead. + */ + void update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order); + + /* + * Convenience overload of update_metadata. Given the new sizes, the new + * strides will be re-calculated based on the current memory layout of the + * tensor. Update_metadata will be called with the new sizes and strides. */ void update_size_metadata(const std::vector& new_sizes); + /* + * Check that tensor sizes are valid given the current storage resource's + * limits. + */ + void check_sizes(const std::vector& sizes) const; + public: /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Virtually resize and "re-stride" the tensor by modifying the size and + * stride metadata that gets used in compute shaders. This allows the shader + * to interpret the underlying resource with the updated metadata. + * + * Note that the dim order is used as the source of truth for the strides; the + * strides are computed using the new sizes and new dim order, thus only the + * dim order is accepted as an argument to this function. */ - void reallocate(const std::vector& new_sizes); + void virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order); /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. + * underlying resource as if it were a different size. This function is a + * convenience overload of virtual_reconfigure; new strides will be computed + * based on the new sizes that preserves the memory layout of the tensor. 
*/ void virtual_resize(const std::vector& new_sizes); + + /* + * Discard the underlying VkImage or VkBuffer and re-allocate based on new + * tensor sizes + */ + void reallocate(const std::vector& new_sizes); }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 50d927a913f..48e1ebf0a83 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -203,6 +203,13 @@ ValueRef ComputeGraph::add_tensor( sizes, dtype, suggested_memory_layout(sizes), shared_object_idx); } +ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { + const vTensorPtr t = get_tensor(vref); + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(api::vTensor(*t)); + return idx; +} + ValueRef ComputeGraph::add_tensor_view( const ValueRef vref, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index b432be83881..faa2f4107ec 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -356,10 +356,17 @@ class ComputeGraph final { * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for * more details. */ + ValueRef add_tensor_view(const ValueRef vref); + + /* + * Use the copy constructor of `api::vTensor` to create a "view" of the + * `vTensor` value at `vref` with different sizes and dim order. See the copy + * constructor of `api::vTensor` for more details. + */ ValueRef add_tensor_view( const ValueRef vref, const std::vector& sizes, - const std::vector& strides, + const std::vector& dim_order, const size_t offset_numel = 0); /* diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index af92728cb0c..a12a7974cbf 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -175,7 +175,67 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { EXPECT_TRUE(empty_shader_info.src_code.size == 0u); } +TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { + // ndim, GPUMemoryLayout, expected dim order pairs + std::vector>> + test_cases = { + {1, utils::kWidthPacked, {0}}, + {1, utils::kHeightPacked, {0}}, + {1, utils::kChannelsPacked, {0}}, + {2, utils::kWidthPacked, {0, 1}}, + {2, utils::kHeightPacked, {1, 0}}, + {2, utils::kChannelsPacked, {0, 1}}, + {3, utils::kWidthPacked, {0, 1, 2}}, + {3, utils::kHeightPacked, {0, 2, 1}}, + {3, utils::kChannelsPacked, {1, 2, 0}}, + {4, utils::kWidthPacked, {0, 1, 2, 3}}, + {4, utils::kHeightPacked, {0, 1, 3, 2}}, + {4, utils::kChannelsPacked, {0, 2, 3, 1}}, + }; + + for (const auto& test_case : test_cases) { + const size_t& ndim = std::get<0>(test_case); + const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const auto& expected_dim_order = std::get<2>(test_case); + std::vector dim_order = calculate_dim_order(ndim, layout); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + +TEST_F(VulkanComputeAPITest, calculate_tensor_dim_order_test) { + // Stride, expected dim order pairs. Note that strides don't have to "make + // sense" because only they are sorted; the actual stride values don't matter. 
+ std::vector, std::vector>> + test_cases = { + {{8, 1}, {0, 1}}, + {{2, 10}, {1, 0}}, + {{66, 12, 1}, {0, 1, 2}}, + {{32, 128, 4}, {1, 0, 2}}, + {{3, 8, 11, 212}, {3, 2, 1, 0}}, + {{100, 12, 9, 1}, {0, 1, 2, 3}}, + {{10, 12, 101, 6}, {2, 1, 0, 3}}, + }; + + for (const auto& test_case : test_cases) { + const auto& strides = std::get<0>(test_case); + const auto& expected_dim_order = std::get<1>(test_case); + std::vector dim_order = strides_to_dim_order(strides); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { + // vtensor to be resized + vTensor v_tensor_to_resize( + context(), + {25, 25, 25, 25}, + vkapi::kFloat, + utils::kBuffer, + utils::kWidthPacked, + /*allocate_memory = */ false); + for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { continue; @@ -183,7 +243,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { - std::vector strides = calculate_strides(sizes, layout); + std::vector dim_order = + calculate_dim_order(sizes.size(), layout); + std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -194,6 +256,25 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { get_reference_strides(sizes, layout, true); ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + + // Create new vTensor and check that the strides are correct + vTensor new_v_tensor( + context(), + sizes, + vkapi::kFloat, + utils::kBuffer, + layout, + /*allocate_memory = */ false); + + ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + ASSERT_TRUE( + new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); + + // Resize vtensor and check that updated metadata is correct + v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); + ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + ASSERT_TRUE( + v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -549,9 +630,10 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { std::vector sizes = {9, 9}; std::vector strides = get_reference_strides(sizes, utils::kWidthPacked); + std::vector dim_order = {0, 1}; vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy = vTensor(original, sizes, strides); + vTensor copy = vTensor(original, sizes, dim_order); EXPECT_TRUE(get_vma_allocation_count() == 1); // Fill original tensor with some data @@ -564,7 +646,6 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { for (size_t i = 0; i < data_out.size(); ++i) { CHECK_VALUE(data_out, i, 2.5f + i); } - std::cout << std::endl; } TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { @@ -576,7 +657,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_strides = {1, K}; + std::vector transposed_dim_order = {1, 0}; vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); @@ -588,8 +669,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_data = create_random_float_buffer(mat2.staging_buffer_numel()); - vTensor mat2_t = vTensor(mat2, mat2_t_sizes, transposed_strides); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + // Create direct view and modify sizes and 
strides later + vTensor mat2_t = vTensor(mat2); std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); std::vector ref_out = @@ -601,6 +682,10 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { record_reference_matmul(api::context(), out, mat1, mat2_t); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); + EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + std::vector data_out(out.staging_buffer_numel()); // Extract the copy tensor; should contain the data of the original tensor extract_vtensor(out, data_out); @@ -622,7 +707,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { constexpr int L_S2 = 7; constexpr int O_S2 = 3; - std::vector strides = {1}; + std::vector dim_order = {0}; std::vector t_sizes = {L}; std::vector s1_sizes = {L_S1}; @@ -632,8 +717,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { fill_vtensor(orig, 0); - vTensor s1 = vTensor(orig, s1_sizes, strides, O_S1); - vTensor s2 = vTensor(s1, s2_sizes, strides, O_S2); + vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1); + vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2); record_scalar_add_buffer(api::context(), s1, 4.5f); record_scalar_add_buffer(api::context(), s2, 7.5f); @@ -1093,7 +1178,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { config.set_storage_type_override(utils::kBuffer); ComputeGraph graph(config); - std::vector strides = {W, 1}; + std::vector dim_order = {0, 1}; std::vector orig_sizes = {H, W}; std::vector slice_sizes = {S_H, W}; @@ -1103,7 +1188,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat); ValueRef slice = - graph.add_tensor_view(orig.value, slice_sizes, strides, offset); + graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); IOValueRef out = {};