From d1e23b7ba75176c21c2540fd46f0b321cfc6a4ff Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Thu, 31 Oct 2024 14:47:20 -0700
Subject: [PATCH] [ET-VK] Allow clone op to transfer between memory layouts and
 storage types

Pull Request resolved: https://github.com/pytorch/executorch/pull/6596

## Changes

As title. Extend the functionality of the `aten.clone` operator so that the output tensor may use a different storage type and memory layout than the input tensor.

## Context

This functionality will be used to transition input tensors to the optimal storage type and memory layout before entering the execution of an op. The transition nodes will be added by a memory metadata tagging pass that will be introduced in a subsequent diff.
ghstack-source-id: 251229412
@exported-using-ghexport

Differential Revision: [D65277710](https://our.internmc.facebook.com/intern/diff/D65277710/)
---
 backends/vulkan/runtime/graph/ComputeGraph.h  | 16 ++++
 .../bitw8_image_to_nchw_nobitw8buffer.yaml    |  6 +-
 .../runtime/graph/ops/glsl/image_to_nchw.glsl | 30 +++---
 .../runtime/graph/ops/glsl/image_to_nchw.yaml | 10 +-
 .../runtime/graph/ops/glsl/indexing_utils.h   | 15 +++
 .../nchw_to_bitw8_image_nobitw8buffer.yaml    |  6 +-
 .../runtime/graph/ops/glsl/nchw_to_image.glsl | 10 +-
 .../runtime/graph/ops/glsl/nchw_to_image.yaml | 10 +-
 .../vulkan/runtime/graph/ops/impl/Clone.cpp   | 96 ++++++++++++++++++-
 .../vulkan/runtime/graph/ops/impl/View.cpp    |  2 +
 backends/vulkan/runtime/graph/ops/impl/View.h | 21 ++++
 .../runtime/graph/ops/utils/StagingUtils.cpp  |  8 +-
 backends/vulkan/test/utils/test_utils.cpp     |  2 +-
 .../vulkan/test/vulkan_compute_api_test.cpp   | 55 +++++++++++
 14 files changed, 245 insertions(+), 42 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/View.h

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index cabf4e7a882..cb958cefea3 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -612,6 +612,22 @@ class ComputeGraph final {
     return {t, staging};
   }
 
+  /*
+   * Add an input tensor with the specified properties along with its staging
+   * buffer.
+   */
+  inline IOValueRef add_input_tensor(
+      const std::vector<int64_t>& sizes,
+      const vkapi::ScalarType dtype,
+      const utils::StorageType storage_type,
+      const utils::GPUMemoryLayout memory_layout,
+      const int64_t shared_object_idx = -1) {
+    ValueRef t = add_tensor(
+        sizes, dtype, storage_type, memory_layout, shared_object_idx);
+    ValueRef staging = set_input_tensor(t);
+    return {t, staging};
+  }
+
   SharedObject& get_shared_object(const int64_t idx);
 
   //
diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml
index e15e27addad..e1574d7fc0f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml
@@ -9,11 +9,11 @@ bitw8_image_to_nchw_nobitw8buffer:
     STORAGE: texture3d
     DTYPE: int8
   generate_variant_forall:
-    DTYPE:
-      - VALUE: int8
-      - VALUE: uint8
     STORAGE:
       - VALUE: texture2d
       - VALUE: texture3d
+    DTYPE:
+      - VALUE: int8
+      - VALUE: uint8
   shader_variants:
     - NAME: bitw8_image_to_nchw_nobitw8buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
index f7d2770faf0..afdc35a8861 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
@@ -19,9 +19,11 @@ ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
+${layout_declare_buffer(B, "w", "buf_out", DTYPE)}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
+$if not TO_STAGING:
+  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
@@ -31,23 +33,23 @@ ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
-void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
-  const ivec4 buf_indices = tidx_to_nchwi(
-      tensor_idx,
-      sizes,
-      packed_dim);
+void write_out_texel(VEC4_T texel, ivec4 tidx) {
+  $if TO_STAGING:
+    const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
+  $else:
+    const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);
 
-  if (tensor_idx[packed_dim] < sizes[packed_dim]) {
-    nchw_out[buf_indices.x] = BUF_T(texel.x);
+  if (tidx[packed_dim] < sizes[packed_dim]) {
+    buf_out[buf_indices.x] = BUF_T(texel.x);
   }
-  if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) {
-    nchw_out[buf_indices.y] = BUF_T(texel.y);
+  if (tidx[packed_dim] + 1 < sizes[packed_dim]) {
+    buf_out[buf_indices.y] = BUF_T(texel.y);
   }
-  if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) {
-    nchw_out[buf_indices.z] = BUF_T(texel.z);
+  if (tidx[packed_dim] + 2 < sizes[packed_dim]) {
+    buf_out[buf_indices.z] = BUF_T(texel.z);
   }
-  if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) {
-    nchw_out[buf_indices.w] = BUF_T(texel.w);
+  if (tidx[packed_dim] + 3 < sizes[packed_dim]) {
+    buf_out[buf_indices.w] = BUF_T(texel.w);
   }
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml
index 0898e75110d..8fc9340d9d0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml
@@ -8,14 +8,16 @@ image_to_nchw:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: texture3d
+    TO_STAGING: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
       - VALUE: int
       - VALUE: int8
-    STORAGE:
-      - VALUE: texture3d
-      - VALUE: texture2d
   shader_variants:
-    - NAME: image_to_nchw
+    - NAME: image_to_nchw_texture3d
+    - NAME: image_to_nchw_texture2d
+      STORAGE: texture2d
+    - NAME: clone_image_to_buffer
+      TO_STAGING: False
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
index 09f53fe779a..0b372ab70a4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -88,6 +88,21 @@ ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
   return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
 }
 
+/*
+ * Get the buffer indices that contain the data of the texel that corresponds
+ * to the provided tensor index. Since a texel has 4 elements, 4 buffer
+ * indices will be retrieved.
+ */
+ivec4 tidx_to_4bufi(
+    const ivec4 tidx,
+    const ivec4 strides,
+    const int packed_dim) {
+  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
+      tidx.w * strides.w;
+
+  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
+}
+
 ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
   return ivec4(
       nchwi % sizes.x,
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
index 7fe3849fd5c..506a66c0d27 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
@@ -9,11 +9,11 @@ nchw_to_bitw8_image_nobitw8buffer:
     STORAGE: texture3d
     DTYPE: int8
   generate_variant_forall:
-    DTYPE:
-      - VALUE: int8
-      - VALUE: uint8
     STORAGE:
       - VALUE: texture2d
       - VALUE: texture3d
+    DTYPE:
+      - VALUE: int8
+      - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_bitw8_image_nobitw8buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
index bde846289ef..3d2a102dac7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -22,6 +22,8 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
+$if not FROM_STAGING:
+  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
@@ -32,10 +34,10 @@ const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
 VEC4_T read_texel(ivec4 tidx) {
-  const ivec4 buf_indices = tidx_to_nchwi(
-      tidx,
-      sizes,
-      packed_dim);
+  $if FROM_STAGING:
+    const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
+  $else:
+    const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);
 
   VEC4_T texel = VEC4_T(0);
   if (tidx[packed_dim] < sizes[packed_dim]) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
index 2bf85a74920..f44e1f74bfe 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
@@ -8,14 +8,16 @@ nchw_to_image:
   parameter_names_with_default_values:
     STORAGE: texture3d
     DTYPE: float
+    FROM_STAGING: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
       - VALUE: int
       - VALUE: int8
-    STORAGE:
-      - VALUE: texture3d
-      - VALUE: texture2d
   shader_variants:
-    - NAME: nchw_to_image
+    - NAME: nchw_to_image_texture3d
+    - NAME: nchw_to_image_texture2d
+      STORAGE: texture2d
+    - NAME: clone_buffer_to_image
+      FROM_STAGING: False
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
index 751413a5ff5..c763588043f 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -10,12 +10,28 @@
 
 #include <executorch/backends/vulkan/runtime/graph/Logging.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+void resize_clone_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
+  // TODO: support for when dimensionality doesn't match, i.e. clone is used to
+  // implement squeeze.
+  if (out->dim() == in->dim()) {
+    out->virtual_resize(in->sizes());
+  }
+}
+
 void add_clone_node(
     ComputeGraph& graph,
     const ValueRef in,
@@ -30,14 +46,84 @@ void add_clone_node(
       VK_KERNEL_FROM_STR(kernel_name),
       graph.create_global_wg_size(out),
       graph.create_local_wg_size(out),
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      {t_out->logical_limits_ubo()}));
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // Parameter Buffers
+      {t_out->logical_limits_ubo()},
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_clone_node));
+}
+
+void add_image_to_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef image,
+    const ValueRef buffer) {
+  std::string kernel_name = "clone_image_to_buffer";
+  add_dtype_suffix(kernel_name, graph.dtype_of(image));
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      // Input and Outputs
+      {{buffer, vkapi::kWrite}, {image, vkapi::kRead}},
+      // Parameter Buffers
+      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
+      // Specialization Constants
+      {graph.hashed_layout_of(image)},
+      // Resizing Logic
+      resize_clone_node));
+}
+
+void add_buffer_to_image_node(
+    ComputeGraph& graph,
+    const ValueRef buffer,
+    const ValueRef image) {
+  std::string kernel_name = "clone_buffer_to_image";
+  add_dtype_suffix(kernel_name, graph.dtype_of(image));
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      // Input and Outputs
+      {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
+      // Parameter Buffers
+      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
+      // Specialization Constants
+      {graph.hashed_layout_of(image)},
+      // Resizing Logic
+      resize_clone_node));
 }
 
 void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // The vulkan delegate does not support changing memory format.
-  return add_clone_node(graph, args[0], args[2]);
+  const ValueRef src = args[0];
+  const ValueRef dst = args[2];
+
+  const utils::StorageType src_storage = graph.storage_type_of(src);
+  const utils::StorageType dst_storage = graph.storage_type_of(dst);
+  if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) {
+    if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) {
+      return add_clone_node(graph, src, dst);
+    } else {
+      return add_view_node(graph, src, kDummyValueRef, dst);
+    }
+  }
+  if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) {
+    return add_image_to_buffer_node(graph, src, dst);
+  }
+  if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) {
+    return add_buffer_to_image_node(graph, src, dst);
+  }
+  VK_THROW("Buffer to buffer memory layout transition not supported yet!");
 }
 
 // Clone node is not the most efficient implementation for the aten.clone
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp
index 46d986e03ce..060696a4fa6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/View.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h
new file mode 100644
index 00000000000..a2038d184c3
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/View.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_view_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef sizes,
+    ValueRef out);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
index 934fd03ab7f..fd7e6b78c22 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -29,8 +29,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader(
   if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "nchw_to_bitw8_image_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_dst);
     add_storage_type_suffix(kernel_name, v_dst);
+    add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 
@@ -41,8 +41,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader(
   }
 
   kernel_name = "nchw_to_image";
-  add_dtype_suffix(kernel_name, v_dst);
   add_storage_type_suffix(kernel_name, v_dst);
+  add_dtype_suffix(kernel_name, v_dst);
 
   return VK_KERNEL_FROM_STR(kernel_name);
 }
@@ -56,8 +56,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_src);
     add_storage_type_suffix(kernel_name, v_src);
+    add_dtype_suffix(kernel_name, v_src);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 
@@ -68,8 +68,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   }
 
   kernel_name = "image_to_nchw";
-  add_dtype_suffix(kernel_name, v_src);
   add_storage_type_suffix(kernel_name, v_src);
+  add_dtype_suffix(kernel_name, v_src);
 
   return VK_KERNEL_FROM_STR(kernel_name);
 }
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 6124f0b71e0..3b6195a5c26 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -118,8 +118,8 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op(
   utils::uvec3 global_wg_size = {buffer_len, 1, 1};
 
   std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-  add_dtype_suffix(kernel_name, v_src);
   add_storage_type_suffix(kernel_name, v_src);
+  add_dtype_suffix(kernel_name, v_src);
 
   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 1d40fe1bb59..261b10359d2 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1901,6 +1901,61 @@ TEST(VulkanComputeGraphTest, test_large_graph) {
   std::cout << ss.str();
 }
 
+void test_clone(
+    std::vector<int64_t> sizes,
+    utils::StorageType src_storage,
+    utils::GPUMemoryLayout src_layout,
+    utils::StorageType dst_storage,
+    utils::GPUMemoryLayout dst_layout) {
+  GraphConfig config;
+  ComputeGraph graph(config);
+
+  IOValueRef a =
+      graph.add_input_tensor(sizes, vkapi::kFloat, src_storage, src_layout);
+
+  IOValueRef out = {};
+  out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout);
+
+  auto copyFn = VK_GET_OP_FN("aten.clone.default");
+  copyFn(graph, {a.value, kDummyValueRef, out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_execute();
+
+  fill_vtensor(graph, a, 0.0f, /*iota = */ true);
+
+  graph.propagate_resize();
+  graph.execute();
+
+  EXTRACT_TENSOR(out);
+  EXTRACT_TENSOR(a);
+
+  for (int i = 0; i < graph.numel_of(a.value); ++i) {
+    EXPECT_TRUE(data_out[i] == data_a[i]);
+  }
+}
+
+TEST(VulkanComputeGraphTest, test_clone) {
+  std::vector<std::pair<utils::GPUMemoryLayout, utils::GPUMemoryLayout>> cases{
+      {utils::kWidthPacked, utils::kWidthPacked},
+      {utils::kWidthPacked, utils::kChannelsPacked},
+      {utils::kChannelsPacked, utils::kChannelsPacked},
+  };
+
+  for (std::vector<int64_t> sizes : standard_sizes_to_test) {
+    for (auto& [src_layout, dst_layout] : cases) {
+      test_clone(
+          sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout);
+      test_clone(
+          sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout);
+      test_clone(
+          sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout);
+    }
+  }
+}
+
 TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) {
   GraphConfig config;
   ComputeGraph graph(config);