Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,22 @@ class ComputeGraph final {
return {t, staging};
}

/*
 * Convenience overload: create a tensor with the given properties, register
 * it as a graph input, and return a reference to it along with a reference
 * to its associated staging buffer.
 */
inline IOValueRef add_input_tensor(
    const std::vector<int64_t>& sizes,
    const vkapi::ScalarType dtype,
    const utils::StorageType storage_type,
    const utils::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx = -1) {
  const ValueRef tensor_ref = add_tensor(
      sizes, dtype, storage_type, memory_layout, shared_object_idx);
  // Registering the tensor as an input creates its staging buffer.
  const ValueRef staging_ref = set_input_tensor(tensor_ref);
  return {tensor_ref, staging_ref};
}

SharedObject& get_shared_object(const int64_t idx);

//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ bitw8_image_to_nchw_nobitw8buffer:
STORAGE: texture3d
DTYPE: int8
generate_variant_forall:
DTYPE:
- VALUE: int8
- VALUE: uint8
STORAGE:
- VALUE: texture2d
- VALUE: texture3d
DTYPE:
- VALUE: int8
- VALUE: uint8
shader_variants:
- NAME: bitw8_image_to_nchw_nobitw8buffer
30 changes: 16 additions & 14 deletions backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
${layout_declare_buffer(B, "w", "buf_out", DTYPE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
${layout_declare_ubo(B, "ivec4", "sizes")}
$if not TO_STAGING:
${layout_declare_ubo(B, "ivec4", "buf_strides")}

#include "indexing_utils.h"

Expand All @@ -31,23 +33,23 @@ ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 axis_map = unhash_axis_map(t_layout);
const lowp int packed_dim = unhash_packed_dim(t_layout);

void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
const ivec4 buf_indices = tidx_to_nchwi(
tensor_idx,
sizes,
packed_dim);
void write_out_texel(VEC4_T texel, ivec4 tidx) {
$if TO_STAGING:
const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
$else:
const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);

if (tensor_idx[packed_dim] < sizes[packed_dim]) {
nchw_out[buf_indices.x] = BUF_T(texel.x);
if (tidx[packed_dim] < sizes[packed_dim]) {
buf_out[buf_indices.x] = BUF_T(texel.x);
}
if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) {
nchw_out[buf_indices.y] = BUF_T(texel.y);
if (tidx[packed_dim] + 1 < sizes[packed_dim]) {
buf_out[buf_indices.y] = BUF_T(texel.y);
}
if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) {
nchw_out[buf_indices.z] = BUF_T(texel.z);
if (tidx[packed_dim] + 2 < sizes[packed_dim]) {
buf_out[buf_indices.z] = BUF_T(texel.z);
}
if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) {
nchw_out[buf_indices.w] = BUF_T(texel.w);
if (tidx[packed_dim] + 3 < sizes[packed_dim]) {
buf_out[buf_indices.w] = BUF_T(texel.w);
}
}

Expand Down
10 changes: 6 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ image_to_nchw:
parameter_names_with_default_values:
DTYPE: float
STORAGE: texture3d
TO_STAGING: True
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
- VALUE: int
- VALUE: int8
STORAGE:
- VALUE: texture3d
- VALUE: texture2d
shader_variants:
- NAME: image_to_nchw
- NAME: image_to_nchw_texture3d
- NAME: image_to_nchw_texture2d
STORAGE: texture2d
- NAME: clone_image_to_buffer
TO_STAGING: False
15 changes: 15 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,21 @@ ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

/*
 * Get the buffer indices that contain the data of the texel that corresponds
 * to the provided tensor index. Since a texel has 4 elements, 4 buffer
 * indices will be retrieved. The 4 indices step along the packed dimension's
 * stride, i.e. they address the 4 consecutive elements the texel packs.
 */
ivec4 tidx_to_4bufi(
    const ivec4 tidx,
    const ivec4 strides,
    const int packed_dim) {
  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
return ivec4(
nchwi % sizes.x,
Expand Down
41 changes: 28 additions & 13 deletions backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,23 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(0, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(1, "r", "t_mat1", DTYPE, "buffer")}
${layout_declare_tensor(2, "r", "t_mat2", DTYPE, "buffer")}
${layout_declare_ubo(3, "ivec4", "out_sizes")}
${layout_declare_ubo(4, "ivec4", "out_strides")}
${layout_declare_ubo(5, "ivec4", "mat1_sizes")}
${layout_declare_ubo(6, "ivec4", "mat1_strides")}
${layout_declare_ubo(7, "ivec4", "mat2_sizes")}
${layout_declare_ubo(8, "ivec4", "mat2_strides")}
${layout_declare_ubo(9, "int", "out_numel")}
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_mat2", DTYPE, "buffer")}
${layout_declare_ubo(B, "ivec4", "out_sizes")}
${layout_declare_ubo(B, "ivec4", "out_strides")}
${layout_declare_ubo(B, "ivec4", "mat1_sizes")}
${layout_declare_ubo(B, "ivec4", "mat1_strides")}
${layout_declare_ubo(B, "ivec4", "mat2_sizes")}
${layout_declare_ubo(B, "ivec4", "mat2_strides")}
${layout_declare_ubo(B, "int", "out_numel")}

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "mat2_is_transposed", "0")}

void main() {
const ivec4 out_bufix = ivec4(
gl_GlobalInvocationID.x,
Expand All @@ -44,15 +46,28 @@ void main() {

int mat1_bufi = tidx_to_bufi(
ivec4(0, out_bufix.y, out_bufix.z, out_bufix.w), mat1_strides);
int mat2_bufi = tidx_to_bufi(
ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides);
int mat2_bufi;
if (mat2_is_transposed > 0) {
mat2_bufi = tidx_to_bufi(
ivec4(0, out_bufix.x, 0, 0), mat2_strides);
} else {
mat2_bufi = tidx_to_bufi(
ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides);
}

int mat2_stride;
if (mat2_is_transposed > 0) {
mat2_stride = mat2_strides.x;
} else {
mat2_stride = mat2_strides.y;
}

T sum = T(0.0);
for (int i = 0; i < mat1_sizes.x; ++i) {
sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi];

mat1_bufi += mat1_strides.x;
mat2_bufi += mat2_strides.y;
mat2_bufi += mat2_stride;
}

const int out_bufi = tidx_to_bufi(out_bufix, out_strides);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ nchw_to_bitw8_image_nobitw8buffer:
STORAGE: texture3d
DTYPE: int8
generate_variant_forall:
DTYPE:
- VALUE: int8
- VALUE: uint8
STORAGE:
- VALUE: texture2d
- VALUE: texture3d
DTYPE:
- VALUE: int8
- VALUE: uint8
shader_variants:
- NAME: nchw_to_bitw8_image_nobitw8buffer
10 changes: 6 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ layout(std430) buffer;
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
${layout_declare_ubo(B, "ivec4", "sizes")}
$if not FROM_STAGING:
${layout_declare_ubo(B, "ivec4", "buf_strides")}

#include "indexing_utils.h"

Expand All @@ -32,10 +34,10 @@ const lowp ivec4 axis_map = unhash_axis_map(t_layout);
const lowp int packed_dim = unhash_packed_dim(t_layout);

VEC4_T read_texel(ivec4 tidx) {
const ivec4 buf_indices = tidx_to_nchwi(
tidx,
sizes,
packed_dim);
$if FROM_STAGING:
const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
$else:
const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);

VEC4_T texel = VEC4_T(0);
if (tidx[packed_dim] < sizes[packed_dim]) {
Expand Down
10 changes: 6 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ nchw_to_image:
parameter_names_with_default_values:
STORAGE: texture3d
DTYPE: float
FROM_STAGING: True
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
- VALUE: int
- VALUE: int8
STORAGE:
- VALUE: texture3d
- VALUE: texture2d
shader_variants:
- NAME: nchw_to_image
- NAME: nchw_to_image_texture3d
- NAME: nchw_to_image_texture2d
STORAGE: texture2d
- NAME: clone_buffer_to_image
FROM_STAGING: False
96 changes: 91 additions & 5 deletions backends/vulkan/runtime/graph/ops/impl/Clone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,28 @@

#include <executorch/backends/vulkan/runtime/graph/Logging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

// Resize callback for clone nodes: propagate the input tensor's sizes to the
// output tensor when both tensors have the same dimensionality.
void resize_clone_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  (void)extra_args;
  vTensorPtr dst = graph->get_tensor(args[0].refs[0]);
  vTensorPtr src = graph->get_tensor(args[1].refs[0]);
  // TODO: support for when dimensionality doesn't match, i.e. clone is used to
  // implement squeeze.
  if (dst->dim() != src->dim()) {
    return;
  }
  dst->virtual_resize(src->sizes());
}

void add_clone_node(
ComputeGraph& graph,
const ValueRef in,
Expand All @@ -30,14 +46,84 @@ void add_clone_node(
VK_KERNEL_FROM_STR(kernel_name),
graph.create_global_wg_size(out),
graph.create_local_wg_size(out),
{{out, vkapi::MemoryAccessType::WRITE},
{in, vkapi::MemoryAccessType::READ}},
{t_out->logical_limits_ubo()}));
// Inputs and Outputs
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
// Parameter Buffers
{t_out->logical_limits_ubo()},
// Specialization Constants
{},
// Resizing Logic
resize_clone_node));
}

// Record a dispatch that copies a texture-backed tensor into a buffer-backed
// tensor, using the clone_image_to_buffer shader variant.
void add_image_to_buffer_node(
    ComputeGraph& graph,
    const ValueRef image,
    const ValueRef buffer) {
  std::string shader_name = "clone_image_to_buffer";
  add_dtype_suffix(shader_name, graph.dtype_of(image));

  // Workgroup extents are derived from the image, which both sides share.
  const utils::uvec3 global_wg = graph.create_global_wg_size(image);
  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR(shader_name),
      global_wg,
      graph.create_local_wg_size(global_wg),
      // Input and Outputs
      {{buffer, vkapi::kWrite}, {image, vkapi::kRead}},
      // Parameter Buffers
      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
      // Specialization Constants
      {graph.hashed_layout_of(image)},
      // Resizing Logic
      resize_clone_node));
}

// Record a dispatch that copies a buffer-backed tensor into a texture-backed
// tensor, using the clone_buffer_to_image shader variant.
void add_buffer_to_image_node(
    ComputeGraph& graph,
    const ValueRef buffer,
    const ValueRef image) {
  std::string shader_name = "clone_buffer_to_image";
  add_dtype_suffix(shader_name, graph.dtype_of(image));

  // Workgroup extents are derived from the image, which both sides share.
  const utils::uvec3 global_wg = graph.create_global_wg_size(image);
  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR(shader_name),
      global_wg,
      graph.create_local_wg_size(global_wg),
      // Input and Outputs
      {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
      // Parameter Buffers
      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
      // Specialization Constants
      {graph.hashed_layout_of(image)},
      // Resizing Logic
      resize_clone_node));
}

void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
// The vulkan delegate does not support changing memory format.
return add_clone_node(graph, args[0], args[2]);
const ValueRef src = args[0];
const ValueRef dst = args[2];

const utils::StorageType src_storage = graph.storage_type_of(src);
const utils::StorageType dst_storage = graph.storage_type_of(dst);
if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) {
if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) {
return add_clone_node(graph, src, dst);
} else {
return add_view_node(graph, src, kDummyValueRef, dst);
}
}
if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) {
return add_image_to_buffer_node(graph, src, dst);
}
if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) {
return add_buffer_to_image_node(graph, src, dst);
}
VK_THROW("Buffer to buffer memory layout transition not supported yet!");
}

// Clone node is not the most efficient implementation for the aten.clone
Expand Down
3 changes: 3 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Linear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,12 @@ void linear(ComputeGraph& graph, const std::vector<ValueRef>& args) {
ValueRef weight = prepack_standard(
graph, weight_data, graph.storage_type_of(out), utils::kWidthPacked);
ValueRef mat2_is_transposed = graph.add_scalar(true);

if (graph.val_is_none(bias)) {
return add_matmul_node(graph, input, weight, out, mat2_is_transposed);
} else {
// Buffer implementation does not yet support biases
VK_CHECK_COND(!graph.is_buffer_storage(out));
return add_addmm_node(
graph,
bias,
Expand Down
Loading
Loading