diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index cabf4e7a882..cb958cefea3 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -612,6 +612,22 @@ class ComputeGraph final { return {t, staging}; } + /* + * Add an input tensor with the specified properties along with its staging + * buffer. + */ + inline IOValueRef add_input_tensor( + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, + const int64_t shared_object_idx = -1) { + ValueRef t = add_tensor( + sizes, dtype, storage_type, memory_layout, shared_object_idx); + ValueRef staging = set_input_tensor(t); + return {t, staging}; + } + SharedObject& get_shared_object(const int64_t idx); // diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml index e15e27addad..e1574d7fc0f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml @@ -9,11 +9,11 @@ bitw8_image_to_nchw_nobitw8buffer: STORAGE: texture3d DTYPE: int8 generate_variant_forall: - DTYPE: - - VALUE: int8 - - VALUE: uint8 STORAGE: - VALUE: texture2d - VALUE: texture3d + DTYPE: + - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: bitw8_image_to_nchw_nobitw8buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index f7d2770faf0..afdc35a8861 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -19,9 +19,11 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_buffer(B, "w", 
"buf_out", DTYPE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "sizes")} +$if not TO_STAGING: + ${layout_declare_ubo(B, "ivec4", "buf_strides")} #include "indexing_utils.h" @@ -31,23 +33,23 @@ ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} const lowp ivec4 axis_map = unhash_axis_map(t_layout); const lowp int packed_dim = unhash_packed_dim(t_layout); -void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = tidx_to_nchwi( - tensor_idx, - sizes, - packed_dim); +void write_out_texel(VEC4_T texel, ivec4 tidx) { + $if TO_STAGING: + const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); + $else: + const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { - nchw_out[buf_indices.x] = BUF_T(texel.x); + if (tidx[packed_dim] < sizes[packed_dim]) { + buf_out[buf_indices.x] = BUF_T(texel.x); } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { - nchw_out[buf_indices.y] = BUF_T(texel.y); + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { + buf_out[buf_indices.y] = BUF_T(texel.y); } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { - nchw_out[buf_indices.z] = BUF_T(texel.z); + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { + buf_out[buf_indices.z] = BUF_T(texel.z); } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { - nchw_out[buf_indices.w] = BUF_T(texel.w); + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { + buf_out[buf_indices.w] = BUF_T(texel.w); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 0898e75110d..8fc9340d9d0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -8,14 +8,16 @@ image_to_nchw: parameter_names_with_default_values: DTYPE: float STORAGE: texture3d + TO_STAGING: True generate_variant_forall: DTYPE: 
- VALUE: half - VALUE: float - VALUE: int - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d shader_variants: - - NAME: image_to_nchw + - NAME: image_to_nchw_texture3d + - NAME: image_to_nchw_texture2d + STORAGE: texture2d + - NAME: clone_image_to_buffer + TO_STAGING: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 09f53fe779a..0b372ab70a4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -88,6 +88,21 @@ ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } +/* + * Get the buffer indices that contain the data of the texel that corresponds to + * the provided tensor index. Since a texel has 4 elements, 4 buffer + * indices will be retrieved. + */ +ivec4 tidx_to_4bufi( + const ivec4 tidx, + const ivec4 strides, + const int packed_dim) { + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; + + return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; +} + ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( nchwi % sizes.x, diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml index 7fe3849fd5c..506a66c0d27 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml @@ -9,11 +9,11 @@ nchw_to_bitw8_image_nobitw8buffer: STORAGE: texture3d DTYPE: int8 generate_variant_forall: - DTYPE: - - VALUE: int8 - - VALUE: uint8 STORAGE: - VALUE: texture2d - VALUE: texture3d + DTYPE: + - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: nchw_to_bitw8_image_nobitw8buffer diff --git
a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index bde846289ef..3d2a102dac7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -22,6 +22,8 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "buf_in", DTYPE)} ${layout_declare_ubo(B, "ivec4", "sizes")} +$if not FROM_STAGING: + ${layout_declare_ubo(B, "ivec4", "buf_strides")} #include "indexing_utils.h" @@ -32,10 +34,10 @@ const lowp ivec4 axis_map = unhash_axis_map(t_layout); const lowp int packed_dim = unhash_packed_dim(t_layout); VEC4_T read_texel(ivec4 tidx) { - const ivec4 buf_indices = tidx_to_nchwi( - tidx, - sizes, - packed_dim); + $if FROM_STAGING: + const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); + $else: + const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); VEC4_T texel = VEC4_T(0); if (tidx[packed_dim] < sizes[packed_dim]) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index 2bf85a74920..f44e1f74bfe 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -8,14 +8,16 @@ nchw_to_image: parameter_names_with_default_values: STORAGE: texture3d DTYPE: float + FROM_STAGING: True generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d shader_variants: - - NAME: nchw_to_image + - NAME: nchw_to_image_texture3d + - NAME: nchw_to_image_texture2d + STORAGE: texture2d + - NAME: clone_buffer_to_image + FROM_STAGING: False diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index 751413a5ff5..c763588043f 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -10,12 +10,28 @@ #include +#include + #include #include #include namespace vkcompute { +void resize_clone_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + // TODO: support for when dimensionality doesn't match, i.e. clone is used to + // implement squeeze. + if (out->dim() == in->dim()) { + out->virtual_resize(in->sizes()); + } +} + void add_clone_node( ComputeGraph& graph, const ValueRef in, @@ -30,14 +46,84 @@ void add_clone_node( VK_KERNEL_FROM_STR(kernel_name), graph.create_global_wg_size(out), graph.create_local_wg_size(out), - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - {t_out->logical_limits_ubo()})); + // Inputs and Outputs + {{out, vkapi::kWrite}, {in, vkapi::kRead}}, + // Parameter Buffers + {t_out->logical_limits_ubo()}, + // Specialization Constants + {}, + // Resizing Logic + resize_clone_node)); +} + +void add_image_to_buffer_node( + ComputeGraph& graph, + const ValueRef image, + const ValueRef buffer) { + std::string kernel_name = "clone_image_to_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + utils::uvec3 global_wg_size = graph.create_global_wg_size(image); + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Input and Outputs + {{buffer, vkapi::kWrite}, {image, vkapi::kRead}}, + // Parameter Buffers + {graph.sizes_ubo(image), graph.strides_ubo(buffer)}, + // Specialization Constants + {graph.hashed_layout_of(image)}, + // Resizing Logic + resize_clone_node)); +} + +void add_buffer_to_image_node( + ComputeGraph& graph, + const ValueRef buffer, + const ValueRef 
image) { + std::string kernel_name = "clone_buffer_to_image"; + add_dtype_suffix(kernel_name, graph.dtype_of(image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + utils::uvec3 global_wg_size = graph.create_global_wg_size(image); + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Input and Outputs + {{image, vkapi::kWrite}, {buffer, vkapi::kRead}}, + // Parameter Buffers + {graph.sizes_ubo(image), graph.strides_ubo(buffer)}, + // Specialization Constants + {graph.hashed_layout_of(image)}, + // Resizing Logic + resize_clone_node)); } void clone(ComputeGraph& graph, const std::vector& args) { - // The vulkan delegate does not support changing memory format. - return add_clone_node(graph, args[0], args[2]); + const ValueRef src = args[0]; + const ValueRef dst = args[2]; + + const utils::StorageType src_storage = graph.storage_type_of(src); + const utils::StorageType dst_storage = graph.storage_type_of(dst); + if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) { + if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) { + return add_clone_node(graph, src, dst); + } else { + return add_view_node(graph, src, kDummyValueRef, dst); + } + } + if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) { + return add_image_to_buffer_node(graph, src, dst); + } + if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) { + return add_buffer_to_image_node(graph, src, dst); + } + VK_THROW("Buffer to buffer memory layout transition not supported yet!"); } // Clone node is not the most efficient implementation for the aten.clone diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp index 46d986e03ce..060696a4fa6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -8,6 +8,8 @@ #include +#include 
+ #include #include #include diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h new file mode 100644 index 00000000000..a2038d184c3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/View.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +void add_view_node( + ComputeGraph& graph, + ValueRef in, + ValueRef sizes, + ValueRef out); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 934fd03ab7f..fd7e6b78c22 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -29,8 +29,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer && !int8_buffer_enabled) { kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); + add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } @@ -41,8 +41,8 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( } kernel_name = "nchw_to_image"; - add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); + add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } @@ -56,8 +56,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer && !int8_buffer_enabled) { kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); return VK_KERNEL_FROM_STR(kernel_name); } @@ -68,8 
+68,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( } kernel_name = "image_to_nchw"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); return VK_KERNEL_FROM_STR(kernel_name); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 6124f0b71e0..3b6195a5c26 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -118,8 +118,8 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op( utils::uvec3 global_wg_size = {buffer_len, 1, 1}; std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1d40fe1bb59..261b10359d2 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1901,6 +1901,61 @@ TEST(VulkanComputeGraphTest, test_large_graph) { std::cout << ss.str(); } +void test_clone( + std::vector sizes, + utils::StorageType src_storage, + utils::GPUMemoryLayout src_layout, + utils::StorageType dst_storage, + utils::GPUMemoryLayout dst_layout) { + GraphConfig config; + ComputeGraph graph(config); + + IOValueRef a = + graph.add_input_tensor(sizes, vkapi::kFloat, src_storage, src_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout); + + auto copyFn = VK_GET_OP_FN("aten.clone.default"); + copyFn(graph, {a.value, kDummyValueRef, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, /*iota = */ true); + + graph.propagate_resize(); + graph.execute(); + + EXTRACT_TENSOR(out); + 
EXTRACT_TENSOR(a); + + for (int i = 0; i < graph.numel_of(a.value); ++i) { + EXPECT_TRUE(data_out[i] == data_a[i]); + } +} + +TEST(VulkanComputeGraphTest, test_clone) { + std::vector> cases{ + {utils::kWidthPacked, utils::kWidthPacked}, + {utils::kWidthPacked, utils::kChannelsPacked}, + {utils::kChannelsPacked, utils::kChannelsPacked}, + }; + + for (std::vector sizes : standard_sizes_to_test) { + for (auto& [src_layout, dst_layout] : cases) { + test_clone( + sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout); + test_clone( + sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout); + test_clone( + sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout); + } + } +} + TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { GraphConfig config; ComputeGraph graph(config);