From d1e23b7ba75176c21c2540fd46f0b321cfc6a4ff Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 31 Oct 2024 14:47:20 -0700 Subject: [PATCH] [ET-VK] Allow clone op to transfer between memory layouts and storage types Pull Request resolved: https://github.com/pytorch/executorch/pull/6596 ## Changes As title. Extend the functionality of the `aten.clone` operator to allow transitioning the storage type and memory layout between the input to the output tensor. ## Context This functionality will be used to transition input tensors to the optimal storage type and memory layout before entering the execution of an op. The transition nodes will be added by a memory metadata tagging pass that will be introduced in a subsequent diff. ghstack-source-id: 251229412 @exported-using-ghexport Differential Revision: [D65277710](https://our.internmc.facebook.com/intern/diff/D65277710/) --- backends/vulkan/runtime/graph/ComputeGraph.h | 16 ++++ .../bitw8_image_to_nchw_nobitw8buffer.yaml | 6 +- .../runtime/graph/ops/glsl/image_to_nchw.glsl | 30 +++--- .../runtime/graph/ops/glsl/image_to_nchw.yaml | 10 +- .../runtime/graph/ops/glsl/indexing_utils.h | 15 +++ .../nchw_to_bitw8_image_nobitw8buffer.yaml | 6 +- .../runtime/graph/ops/glsl/nchw_to_image.glsl | 10 +- .../runtime/graph/ops/glsl/nchw_to_image.yaml | 10 +- .../vulkan/runtime/graph/ops/impl/Clone.cpp | 96 ++++++++++++++++++- .../vulkan/runtime/graph/ops/impl/View.cpp | 2 + backends/vulkan/runtime/graph/ops/impl/View.h | 21 ++++ .../runtime/graph/ops/utils/StagingUtils.cpp | 8 +- backends/vulkan/test/utils/test_utils.cpp | 2 +- .../vulkan/test/vulkan_compute_api_test.cpp | 55 +++++++++++ 14 files changed, 245 insertions(+), 42 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/impl/View.h diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index cabf4e7a882..cb958cefea3 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ 
b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -612,6 +612,22 @@ class ComputeGraph final { return {t, staging}; } + /* + * Add an input tensor with the specified properties along with its staging + * buffer. + */ + inline IOValueRef add_input_tensor( + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, + const int64_t shared_object_idx = -1) { + ValueRef t = add_tensor( + sizes, dtype, storage_type, memory_layout, shared_object_idx); + ValueRef staging = set_input_tensor(t); + return {t, staging}; + } + SharedObject& get_shared_object(const int64_t idx); // diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml index e15e27addad..e1574d7fc0f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml @@ -9,11 +9,11 @@ bitw8_image_to_nchw_nobitw8buffer: STORAGE: texture3d DTYPE: int8 generate_variant_forall: - DTYPE: - - VALUE: int8 - - VALUE: uint8 STORAGE: - VALUE: texture2d - VALUE: texture3d + DTYPE: + - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: bitw8_image_to_nchw_nobitw8buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index f7d2770faf0..afdc35a8861 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -19,9 +19,11 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_buffer(B, "w", "buf_out", DTYPE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "sizes")} +$if not TO_STAGING: + ${layout_declare_ubo(B, "ivec4", "buf_strides")} 
#include "indexing_utils.h" @@ -31,23 +33,23 @@ ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} const lowp ivec4 axis_map = unhash_axis_map(t_layout); const lowp int packed_dim = unhash_packed_dim(t_layout); -void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = tidx_to_nchwi( - tensor_idx, - sizes, - packed_dim); +void write_out_texel(VEC4_T texel, ivec4 tidx) { + $if TO_STAGING: + const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); + $else: + const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { - nchw_out[buf_indices.x] = BUF_T(texel.x); + if (tidx[packed_dim] < sizes[packed_dim]) { + buf_out[buf_indices.x] = BUF_T(texel.x); } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { - nchw_out[buf_indices.y] = BUF_T(texel.y); + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { + buf_out[buf_indices.y] = BUF_T(texel.y); } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { - nchw_out[buf_indices.z] = BUF_T(texel.z); + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { + buf_out[buf_indices.z] = BUF_T(texel.z); } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { - nchw_out[buf_indices.w] = BUF_T(texel.w); + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { + buf_out[buf_indices.w] = BUF_T(texel.w); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 0898e75110d..8fc9340d9d0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -8,14 +8,16 @@ image_to_nchw: parameter_names_with_default_values: DTYPE: float STORAGE: texture3d + TO_STAGING: True generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d shader_variants: - - NAME: image_to_nchw + - NAME: image_to_nchw_texture3d + - 
NAME: image_to_nchw_texture2d + STORAGE: texture2d + - NAME: clone_image_to_buffer + TO_STAGING: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 09f53fe779a..0b372ab70a4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -88,6 +88,21 @@ ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } +/* + * Get the buffer indices that contain the data of the texel that corresponds to + * the provided tensor index. Since a texel has 4 elements, 4 buffer + * indices will be retrieved. + */ +ivec4 tidx_to_4bufi( + const ivec4 tidx, + const ivec4 strides, + const int packed_dim) { + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; + + return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; +} + ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( nchwi % sizes.x, diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml index 7fe3849fd5c..506a66c0d27 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml @@ -9,11 +9,11 @@ nchw_to_bitw8_image_nobitw8buffer: STORAGE: texture3d DTYPE: int8 generate_variant_forall: - DTYPE: - - VALUE: int8 - - VALUE: uint8 STORAGE: - VALUE: texture2d - VALUE: texture3d + DTYPE: + - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: nchw_to_bitw8_image_nobitw8buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index bde846289ef..3d2a102dac7 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -22,6 +22,8 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "buf_in", DTYPE)} ${layout_declare_ubo(B, "ivec4", "sizes")} +$if not FROM_STAGING: + ${layout_declare_ubo(B, "ivec4", "buf_strides")} #include "indexing_utils.h" @@ -32,10 +34,10 @@ const lowp ivec4 axis_map = unhash_axis_map(t_layout); const lowp int packed_dim = unhash_packed_dim(t_layout); VEC4_T read_texel(ivec4 tidx) { - const ivec4 buf_indices = tidx_to_nchwi( - tidx, - sizes, - packed_dim); + $if FROM_STAGING: + const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); + $else: + const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); VEC4_T texel = VEC4_T(0); if (tidx[packed_dim] < sizes[packed_dim]) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index 2bf85a74920..f44e1f74bfe 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -8,14 +8,16 @@ nchw_to_image: parameter_names_with_default_values: STORAGE: texture3d DTYPE: float + FROM_STAGING: True generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d shader_variants: - - NAME: nchw_to_image + - NAME: nchw_to_image_texture3d + - NAME: nchw_to_image_texture2d + STORAGE: texture2d + - NAME: clone_buffer_to_image + FROM_STAGING: False diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index 751413a5ff5..c763588043f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -10,12 +10,28 @@ #include +#include + #include #include #include namespace vkcompute { +void 
resize_clone_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + // TODO: support for when dimensionality doesn't match, i.e. clone is used to + // implement squeeze. + if (out->dim() == in->dim()) { + out->virtual_resize(in->sizes()); + } +} + void add_clone_node( ComputeGraph& graph, const ValueRef in, @@ -30,14 +46,84 @@ void add_clone_node( VK_KERNEL_FROM_STR(kernel_name), graph.create_global_wg_size(out), graph.create_local_wg_size(out), - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - {t_out->logical_limits_ubo()})); + // Inputs and Outputs + {{out, vkapi::kWrite}, {in, vkapi::kRead}}, + // Parameter Buffers + {t_out->logical_limits_ubo()}, + // Specialization Constants + {}, + // Resizing Logic + resize_clone_node)); +} + +void add_image_to_buffer_node( + ComputeGraph& graph, + const ValueRef image, + const ValueRef buffer) { + std::string kernel_name = "clone_image_to_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + utils::uvec3 global_wg_size = graph.create_global_wg_size(image); + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Input and Outputs + {{buffer, vkapi::kWrite}, {image, vkapi::kRead}}, + // Parameter Buffers + {graph.sizes_ubo(image), graph.strides_ubo(buffer)}, + // Specialization Constants + {graph.hashed_layout_of(image)}, + // Resizing Logic + resize_clone_node)); +} + +void add_buffer_to_image_node( + ComputeGraph& graph, + const ValueRef buffer, + const ValueRef image) { + std::string kernel_name = "clone_buffer_to_image"; + add_dtype_suffix(kernel_name, graph.dtype_of(image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + utils::uvec3 
global_wg_size = graph.create_global_wg_size(image); + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Input and Outputs + {{image, vkapi::kWrite}, {buffer, vkapi::kRead}}, + // Parameter Buffers + {graph.sizes_ubo(image), graph.strides_ubo(buffer)}, + // Specialization Constants + {graph.hashed_layout_of(image)}, + // Resizing Logic + resize_clone_node)); } void clone(ComputeGraph& graph, const std::vector& args) { - // The vulkan delegate does not support changing memory format. - return add_clone_node(graph, args[0], args[2]); + const ValueRef src = args[0]; + const ValueRef dst = args[2]; + + const utils::StorageType src_storage = graph.storage_type_of(src); + const utils::StorageType dst_storage = graph.storage_type_of(dst); + if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) { + if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) { + return add_clone_node(graph, src, dst); + } else { + return add_view_node(graph, src, kDummyValueRef, dst); + } + } + if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) { + return add_image_to_buffer_node(graph, src, dst); + } + if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) { + return add_buffer_to_image_node(graph, src, dst); + } + VK_THROW("Buffer to buffer memory layout transition not supported yet!"); } // Clone node is not the most efficient implementation for the aten.clone diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp index 46d986e03ce..060696a4fa6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -8,6 +8,8 @@ #include +#include + #include #include #include diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h new file mode 100644 index 00000000000..a2038d184c3 --- 
/dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/View.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +void add_view_node( + ComputeGraph& graph, + ValueRef in, + ValueRef sizes, + ValueRef out); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 934fd03ab7f..fd7e6b78c22 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -29,8 +29,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer && !int8_buffer_enabled) { kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); + add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } @@ -41,8 +41,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( } kernel_name = "nchw_to_image"; - add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); + add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } @@ -56,8 +56,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer && !int8_buffer_enabled) { kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); return VK_KERNEL_FROM_STR(kernel_name); } @@ -68,8 +68,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( } kernel_name = "image_to_nchw"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + 
add_dtype_suffix(kernel_name, v_src); return VK_KERNEL_FROM_STR(kernel_name); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 6124f0b71e0..3b6195a5c26 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -118,8 +118,8 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op( utils::uvec3 global_wg_size = {buffer_len, 1, 1}; std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1d40fe1bb59..261b10359d2 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1901,6 +1901,61 @@ TEST(VulkanComputeGraphTest, test_large_graph) { std::cout << ss.str(); } +void test_clone( + std::vector sizes, + utils::StorageType src_storage, + utils::GPUMemoryLayout src_layout, + utils::StorageType dst_storage, + utils::GPUMemoryLayout dst_layout) { + GraphConfig config; + ComputeGraph graph(config); + + IOValueRef a = + graph.add_input_tensor(sizes, vkapi::kFloat, src_storage, src_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout); + + auto copyFn = VK_GET_OP_FN("aten.clone.default"); + copyFn(graph, {a.value, kDummyValueRef, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, /*iota = */ true); + + graph.propagate_resize(); + graph.execute(); + + EXTRACT_TENSOR(out); + EXTRACT_TENSOR(a); + + for (int i = 0; i < graph.numel_of(a.value); ++i) { + EXPECT_TRUE(data_out[i] == data_a[i]); + } +} + +TEST(VulkanComputeGraphTest, test_clone) { + std::vector> 
cases{ + {utils::kWidthPacked, utils::kWidthPacked}, + {utils::kWidthPacked, utils::kChannelsPacked}, + {utils::kChannelsPacked, utils::kChannelsPacked}, + }; + + for (std::vector sizes : standard_sizes_to_test) { + for (auto& [src_layout, dst_layout] : cases) { + test_clone( + sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout); + test_clone( + sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout); + test_clone( + sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout); + } + } +} + TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { GraphConfig config; ComputeGraph graph(config);