diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index 43796c043c8..00b6c62d5d2 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -226,9 +226,10 @@ def get_arg_tensor_source_repset( """ arg_node = op_node.args[arg_i] - # For non-tensor arguments, return ANY_STORAGE + # For non-tensor arguments, return ALL_STORAGES_REPSET so that the respset does + # not appear to be empty. if not utils.is_tensor_arg_node(arg_node): - return utils.ANY_STORAGE + return utils.ALL_STORAGES_REPSET # Special case for cat - use the first tensor in the list as representative if isinstance(arg_node, list): diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 461278500a6..feba4f6f072 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -740,6 +740,7 @@ def register_cat_op(): [ exir_ops.edge.aten.select_copy.int, exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.split_with_sizes_copy.default, ] ) def register_transfer_ops(): @@ -782,10 +783,7 @@ def register_ported_op(): # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions @update_features( [ - # Tensor combination exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, ] ) def register_ported_op_all_packed_dims(): diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 8340a8b9b2f..9ade64910f2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -86,4 +86,15 @@ int quantize_and_pack(const vec4 vals, const float inv_scale, const int zp) { return pack_into_int32(quantized); } +#ifdef DEBUG_MODE + +#define printf debugPrintfEXT + +void printVec4(vec4 texel) { + debugPrintfEXT( + "texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w); +} + +#endif // DEBUG_MODE + #endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl deleted file mode 100644 index 39aa9b11a0d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - // Operates on (x, y, z) logical extents. - // channel_range is stored in range.w - ivec4 range; - // Analogus to range variable in copy. It defines the # of channel being - // copied. 
- // dst channel offset is stored in dst_offset.w - ivec4 dst_offset; - int src_channel_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - // Note: Unlike other shaders, the range is often not equal to the destination - // texture extent. - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range.xyz))) { - return; - } - - const ivec3 out_lpos = lpos + dst_offset.xyz; - - const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); - - // First read the existing values to make sure the boundary values stay. - VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); - - ivec4 in_tidx = out_tidx; - for (int i=0; i<4; i++) { - - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; - - // Handle the partial update for begining of channel in an existing tensor. - // If the source channel index is below zero or exceeds the range, we skip - // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { - continue; - } - - // Readjust for the source offset. - in_tidx[packed_dim] += src_channel_offset; - - ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); - v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; - } - - write_texel_lpos(t_out, out_lpos, v, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml deleted file mode 100644 index 984d9a09d43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_channel_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl deleted file mode 100644 index 178814a90c3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec3 range; - // xyz is source offset w is channel size - ivec4 src_offset; - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "batch_index_function", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range))) { - return; - } - - ivec3 in_pos = pos + src_offset.xyz; - ivec3 out_pos = pos + dst_offset.xyz; - if (src_offset.w > 0) { - if (batch_index_function == 1) { - // batch index is calculated using source channel size - const int channel_index = pos.z % src_offset.w; - const int batch_index = pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; - } else if (batch_index_function == 2) { - // batch index is calculated using destination channel size - const int channel_index = pos.z % dst_offset.w; - const int batch_index = pos.z / dst_offset.w; - in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; - } - } - - write_texel_lpos( - t_out, - out_pos, - load_texel_lpos(t_in, in_pos, in_axis_map), - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml deleted file mode 100644 index 09f5ca36ea4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml +++ /dev/null @@ -1,17 +0,0 @@ -copy_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - shader_variants: - - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl deleted file mode 100644 index 3100565d08a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - - // xyz is source offset w is channel size - ivec4 src_offset; - - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // Position in input tensor - ivec3 in_pos = pos + src_offset.xyz; - in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); - - // Read input value mapping to this output texel - VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); - - // Starting offset to read from a texel - const int src_lane_offset = src_offset[packed_dim] & 0x3; - const bool has_src_lane_offset = src_lane_offset != 0; - - // If input lane offset is non zero i.e packed texel is composed from multiple sources - if (has_src_lane_offset) { - // Boundary values will come from next input texel in the packed dim. - ivec3 next_in_pos = in_pos; - next_in_pos[packed_dim] = in_pos[packed_dim] + 1; - VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); - - // Keep input values from the end of current input pixel based on src_lane_offset - // offset 1 means the first lane of current input texel is not a part of the output texel - // offset 2 means first 2 lanes are not and so on - // Copy next texel's values towards the end of input texel, based on lane offset - // offset 1 means the first lane from next texel is part of the input texel - // offset 2 means first 2 lanes from next texel is part of the input texel and so on - if (src_lane_offset == 1) { - in_value = ivec4(in_value.yzw, next_value.x); - } else if (src_lane_offset == 2) { - in_value = ivec4(in_value.zw, next_value.xy); - } else { - in_value = ivec4(in_value.w, next_value.xyz); - } - } - - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_dst_lane_offset = out_lane_offset != 0; - - ivec3 out_pos = pos + dst_offset.xyz; - out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); - - VEC4_T out_value; - - // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_dst_lane_offset) { - // When position in packed dim is > 0 - if (pos[packed_dim] > 0) { - // Boundary values will come from previous input texel in the packed dim. 
- ivec3 prev_in_pos = in_pos; - prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; - VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); - - // Shift values toward the beginning based on out_lane_offset - // offset 1 means the last lane from the previous texel is a part of the output texel - // offset 2 means last 2 lanes and so on - if (out_lane_offset == 1) { - out_value.x = prev_value.w; - } else if (out_lane_offset == 2) { - out_value.xy = prev_value.zw; - } else { - out_value.xyz = prev_value.yzw; - } - } else { - // When position in packed dim is == 0 - // Boundary values will be the previous texel values. - out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); - } - - // Copy input values towards the end of output array, based on lane offset - // offset 1 means the first lane from previous texel is part of the output texel starting at offset - // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on - if (out_lane_offset == 1) { - out_value.yzw = in_value.xyz; - } else if (out_lane_offset == 2) { - out_value.zw = in_value.xy; - } else { - out_value.w = in_value.x; - } - } else { - out_value = in_value; - } - - write_texel_lpos( - t_out, - out_pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml deleted file mode 100644 index 6e55876cb28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_packed_dim_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 38016547d19..b9ac0e5dace 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -259,17 +259,28 @@ void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) { tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1); } -// Does not account for axis mapping or batches +// Does not account for axis mapping TensorIndex4D texture_pos_to_tensor4d_idx_simple( const TextureMetadata meta, const ivec3 pos) { TensorIndex4D tidx; tidx.data.xyz = pos; tidx.data.w = 0; tidx.data[meta.packed_dim] *= 4; + + // Compute batch idx accounting for batch concatenation, assuming channels as + // the concatenation dim. 
+ if (meta.sizes.w > 1) { + int channels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels = align_up_4(channels); + } + tidx.data.w = tidx.data.z / channels; + tidx.data.z = tidx.data.z % channels; + } return tidx; } -// Does not account for axis mapping or batches +// Does not account for axis mapping ivec3 tensor4d_idx_to_texel_pos_simple( const TextureMetadata meta, const TensorIndex4D tidx) { ivec3 texel_pos; @@ -278,10 +289,20 @@ ivec3 tensor4d_idx_to_texel_pos_simple( texel_pos = tidx.data.xyz; texel_pos[meta.packed_dim] = div_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + texel_pos.z += tidx.data.w * channels_ntexels; + } + return texel_pos; } -// Does not account for axis mapping or batches +// Does not account for axis mapping TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( const TextureMetadata meta, const TensorIndex4D tidx) { const int packed_dim_idx = tidx.data[meta.packed_dim]; @@ -289,6 +310,16 @@ TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( tex_idx.pos = tidx.data.xyz; tex_idx.pos[meta.packed_dim] = div_4(packed_dim_idx); tex_idx.comp = mod_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + tex_idx.pos.z += tidx.data.w * channels_ntexels; + } + return tex_idx; } @@ -316,13 +347,21 @@ void printTensorIndex(const TensorIndex tidx) { ); } -void printTensorIndex4D(const TensorIndex tidx) { +void printTensorIndex4D(const TensorIndex4D tidx) { debugPrintfEXT( "TensorIndex4D: [%u, %u, %u, %u]\\n", - tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3] + tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3] + ); +} + +void printTextureElementIndex(const TextureElementIndex tex_idx) { + debugPrintfEXT( + "TextureElementIndex: pos=[%d %d %d] comp=%d\\n", + tex_idx.pos.x, tex_idx.pos.y, tex_idx.pos.z, tex_idx.comp ); } + void printBufferMetadata(const BufferMetadata meta) { debugPrintfEXT( "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n", diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 6509015b4b6..5390e2a4bb2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,70 +9,87 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH -#ifndef USING_BUFFER +#ifdef USING_BUFFER /* - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: + * Converts output tensor indices to input tensor indices for the select operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The selected_dim must not be the packed dimension of the input. - * 3. The packed dimension of the input must "map" to the packed dimension of - * the output. This occurs if selected_dim is greater than the packed dimension - * of the input. + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. 
+ * + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - index */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx; + initialize(in_tidx); + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_index = index; + if (index < 0) { + adjusted_index = index + in_size; } - if (selected_dim <= in_packed_dim) { - return false; + + // Copy indices before selected_dim + for (int d = 0; d < selected_dim; d++) { + in_tidx.data[div_4(d)][mod_4(d)] = idx_at(out_tidx, d); } - return true; + + // Insert the selected index + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = adjusted_index; + + // Copy indices after selected_dim (shifted by 1) + for (int d = selected_dim; d < int_ndim(inp) - 1; d++) { + in_tidx.data[div_4(d + 1)][mod_4(d + 1)] = idx_at(out_tidx, d); + } + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Given an output tensor index, return the corresponding input tensor index for - * the select operator. This is done by "inserting" the select index at the - * selected_dim in the input tensor index. + * Converts output tensor indices to input tensor indices for the select operation + * on texture storage. * - * A simple example is (note all tensor index are in WHCN order): - * out_tidx = [7, 5, 9] - * selected_dim = 2 - * index = 3 - * in_tidx = [7, 3, 5, 9] + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. * - * This function assumes that the following variables are defined in the layout: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - index */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = ivec4(0); +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx; + in_tidx.data = ivec4(0); int adjusted_index = index; if (index < 0) { - adjusted_index = index + in_sizes[selected_dim]; + adjusted_index = index + inp.sizes[selected_dim]; } // Handle different dimensions for selection if (selected_dim == 0) { // Select from width dimension - in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(adjusted_index, out_tidx.data.x, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 1) { // Select from height dimension - in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, adjusted_index, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 2) { // Select from channel dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, adjusted_index, out_tidx.data.z); } else if (selected_dim == 3) { // Select from batch dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, out_tidx.data.z, adjusted_index); } return in_tidx; } +#endif // USING_BUFFER + #endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 87325754f4d..0a815c85d66 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,49 +9,61 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH -#ifndef USING_BUFFER +#include "indexing.glslh" -/** - * Enable the fast path if a texel 
loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: +#ifdef USING_BUFFER + +/* + * Converts output tensor indices to input tensor indices for the slice operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The select_dim must not be the packed dimension of the input. + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - start + * - step */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (in_packed_dim == selected_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx = out_tidx; + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_start = start; + if (start < 0) { + adjusted_start = start + in_size; } - return true; + + uint out_idx = idx_at(out_tidx, selected_dim); + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = + adjusted_start + int(out_idx) * step; + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Converts output tensor indices to input tensor indices for the slice operation. - * This function maps the output indices to the corresponding input indices based on - * the slice parameters (start, step, selected_dim). + * Converts output tensor indices to input tensor indices for the slice operation + * on texture storage. * - * Parameters assumed to be defined in the layout specifier: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - start * - step */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = out_tidx; +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx = out_tidx; int adjusted_start = start; if (start < 0) { - adjusted_start = start + in_sizes[selected_dim]; + adjusted_start = start + inp.sizes[selected_dim]; } - in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; + in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; return in_tidx; } +#endif // USING_BUFFER + #endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl new file mode 100644 index 00000000000..0505c9e7bcd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "buffer")} + +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { + return; + } + + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + + TensorIndex input_tidx = out_tidx; + input_tidx.data[div_4(split_dim)][mod_4(split_dim)] += split_offset; + + const uint input_bufi = tensor_idx_to_linear_idx(inp, input_tidx); + + t_out[out_bufi] = t_input[input_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml new file mode 100644 index 00000000000..fd52c0ac721 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +split_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl new file mode 100644 index 00000000000..92d7ce548e2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "common.glslh" +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_output", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "texture3d")} + +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + if (out_of_bounds(out_pos, outp)) { + return; + } + + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + + VEC4_T out_texel = VEC4_T(0); + + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + + TensorIndex4D input_tidx = out_tidx; + input_tidx.data[split_dim] += split_offset; + + for (int comp = 0; comp < limit; comp++) { + TextureElementIndex input_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, input_tidx); + + VEC4_T input_texel = texelFetch(t_input, input_elem_pos.pos, 0); + out_texel[comp] = input_texel[input_elem_pos.comp]; + + input_tidx.data[outp.packed_dim]++; + } + + imageStore(t_output, out_pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml new file mode 100644 index 00000000000..89446df831b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +split_texture: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 7605c59c72f..73b753ccc0b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -11,18 +11,23 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} #define T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type("buffer")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "indexing.glslh" + ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", "start")} @@ -32,10 +37,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 out_strides; - ivec4 in_strides; - int out_numel; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -46,24 +47,19 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + TensorIndex in_tidx = out_tidx_to_in_tidx(out_tidx); - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + const uint in_bufi = tensor_idx_to_linear_idx(inp, in_tidx); t_out[out_bufi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 0f34713cb43..d2c9c025242 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -11,19 +11,25 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", 
"start")} @@ -33,8 +39,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -45,48 +49,33 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + VEC4_T out_texel = VEC4_T(0); - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; - - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + for (int comp = 0; comp < limit; comp++) { + TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); - out_texel[texel_i] = selected_value; + TextureElementIndex in_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, in_tidx); - out_tidx[out_packed_dim]++; - } + VEC4_T in_texel = texelFetch(t_in, in_elem_pos.pos, 0); + out_texel[comp] = in_texel[in_elem_pos.comp]; - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + out_tidx.data[outp.packed_dim]++; } + + imageStore(t_out, out_pos, out_texel); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index bd648dbae2d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec3; -using utils::ivec4; -using utils::uvec3; - -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl) { - std::string kernel_name = "copy_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - (calc_out_pos_using_src_chnl ? 1 - : calc_in_pos_using_dst_chnl ? 2 - : 0)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out) { - // Check the packed dimension is same for both tensors, also check if the - // packed dimension is Width or Height. Since the function does not support - // channel packing. - VK_CHECK_COND( - graph.packed_dim_of(in) == graph.packed_dim_of(out) && - (graph.packed_dim_of(in) == WHCN::kWidthDim || - graph.packed_dim_of(in) == WHCN::kHeightDim)); - - std::string kernel_name = "copy_packed_dim_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - // A copy of range with the last element set to batch size of the input tensor - ivec4 final_range = { - range[0], range[1], range[2], dim_at(in_sizes, kBatch4D)}; - ivec3 global_wg_size = graph.logical_limits_of(out); - - const auto packed_dim = graph.packed_dim_of(in); - // The starting offset in a texel where this tensor will start copying from - const auto src_lane_offset = src_offset[packed_dim] & 0x3; - // The starting offset in a texel where this tensor will start copying to - const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; - - // The total packed texels this tensor will be copied from - // The first texel of tensor data in packed dimension will be copied from - // remaining lanes from current source Hence (4 - src_lane_offset) is added - // to tensor size in packed dimension - const auto src_packed_size = utils::div_up_4( - (4 - src_lane_offset) + utils::val_at(-packed_dim, out_sizes)); - - // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to - // remaining lanes from previous write Hence (4 - dst_lane_offset) is added - // to tensor size in packed dimension - const auto dst_packed_size = utils::div_up_4( - (4 - dst_lane_offset) + utils::val_at(-packed_dim, in_sizes)); - - // If the starting src offset is not 0, and the total packed 
texels is - // greater than the source texel range - const bool has_additional_src_work = - src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; - // If the starting dst offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_dst_work = - dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; - - if (has_additional_src_work || has_additional_dst_work) { - global_wg_size[packed_dim]++; // Increase the global work group size in - // packed dimension - final_range[packed_dim]++; // Increase the range in packed dimension - } - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo( - &final_range, sizeof(final_range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out) { - // Likely need to prepad these numbers. - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); - - // NOTE: This function should be able to support 1d and 2d tensors when - // range=1, src_offset=dst_offset=1. - VK_CHECK_COND(graph.dim_of(in) >= 3, "Src dim should be at least 3"); - VK_CHECK_COND(graph.dim_of(out) >= 3, "Dst dim should be at least 3"); - - VK_CHECK_COND( - dim_at(in_sizes) >= src_channel_offset + channel_range, - "Src channel (", - src_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(in_sizes), - ")"); - - VK_CHECK_COND( - dim_at(out_sizes) >= dst_channel_offset + channel_range, - "Dst channel (", - dst_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(out_sizes), - ")"); - - VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); - VK_CHECK_COND( - src_channel_offset >= 0, "Src channel offset must be non-negative"); - VK_CHECK_COND( - dst_channel_offset >= 0, "Dst channel offset must be non-negative"); - - std::string kernel_name = "copy_channel_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int32_t out_channels = dim_at(out_sizes); - - // Copy one batch at a time. - for (int batch_idx = 0; batch_idx < dim_at(in_sizes); batch_idx++) { - // Mapping the tensor NCHW coordinates into texture XYZ coordinates - int32_t dst_first_z = dst_channel_offset / 4; - int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; - - // We copy the entire width and height dimension. For the channel dimension, - // we use the z-dimension of the global_size to specify the texture range. 
- // The shader combines the global invocation id and the dst_offset to get - // the actual coordinate. - - const ivec3 dst_offset{ - 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - - const uvec3 global_size{ - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - const uvec3 local_size = graph.create_local_wg_size(global_size); - - const utils::ivec4 range_params = { - static_cast(global_size[0]), - static_cast(global_size[1]), - static_cast(global_size[2]), - channel_range}; - - const ivec4 offset_params = { - dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&range_params, sizeof(range_params)), - PushConstantDataInfo(&offset_params, sizeof(offset_params)), - PushConstantDataInfo(&src_channel_offset, sizeof(src_channel_offset))}, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } -} - -void add_copy_offset_node( - ComputeGraph& graph, - ValueRef in, - ValueRef range_ref, - ValueRef src_offset_ref, - ValueRef dst_offset_ref, - ValueRef out) { - ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); - - ivec4 src_offset = {src[0], src[1], src[2], 0}; - ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out, false, false); -} - -void copy_offset(ComputeGraph& graph, const std::vector& args) { - add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); -} - -void copy_channel_offset( - ComputeGraph& graph, - const std::vector& args) { - ValueRef in = args[0]; - ValueRef channel_range_ref = args[1]; - ValueRef src_channel_offset_ref = args[2]; - ValueRef dst_channel_offset_ref = args[3]; - ValueRef out = args[4]; - - auto channel_range = graph.extract_scalar(channel_range_ref); - auto src_channel_offset = - graph.extract_scalar(src_channel_offset_ref); - auto dst_channel_offset = - graph.extract_scalar(dst_channel_offset_ref); - - add_copy_channel_offset_node( - graph, in, channel_range, src_channel_offset, dst_channel_offset, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.copy_offset, copy_offset); - VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index 41956d482d9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// add_copy_offset_node resumes the vkCmdCopyImage command. 
It copies the -// texture extents specified by the range, src_offset, and dst_offset (all are -// in texture coordinate (x, y, z) from the input image to the output image. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It is possible to have input and output to point to the same image -// object. But when the source range and destination range overlap, the behavior -// is undefined. -// -// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl -// can be used to specify an indexing function in the shader -// If calc_out_pos_using_src_chnl is set to true channel and batch index will be -// calculated based on source channel size and will be used to determine -// destination texel position. -// -// If calc_in_pos_using_dst_chnl is set to truechannel and batch index will be -// calculated based on destination channel size and will be used to determine -// source texel position. -// -// If both are true calc_out_pos_using_src_chnl is picked. If both are false no -// index calculation happens. -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl); - -// add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that -// its used when copying packed dimension, if tensor is width or height packed. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It copies the texture extents specified by the range, src_offset, and -// dst_offset (all are in texture coordinate (x, y, z) from the input image to -// the output image. -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out); - -// add_copy_channel_offset_node behaves similar to add_copy_node, except that it -// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW). -// The range and offset arguments are in the tensor coordinate. It assumes the -// underlying texture is channel-packed. -// -// This function is specialized implementation for copying -// channel packed values. The complication comes from when reading / writing the -// channel dimension on indices that are not aligned to packing, we will need -// be careful about the boundaries. 
-// -// It achieves the following: -// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] = -// in [:, src_channel_offset:src_channel_offset + channel_range, :, :] -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 72c1637a2c9..2b42c0bd150 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -14,8 +14,6 @@ #include #include -#include - namespace vkcompute { namespace { diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index f87af08ee69..4e62ae8806d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -8,134 +8,131 @@ #include -#include +#include +#include #include -#include #include + #include -namespace vkcompute { +#include -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - const std::vector& split_sizes, - int64_t dim, - ValueRef out_list_ref) { - const ValueListPtr out_list = graph.get_value_list(out_list_ref); +namespace vkcompute { - const int64_t input_ndim = graph.dim_of(in); +using utils::GPUMemoryLayout; +using utils::StorageType; + +void resize_split_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef input = args.at(0).refs.at(0); + const ValueRef split_sizes_ref = args.at(1).refs.at(0); + const ValueRef dim_ref = args.at(2).refs.at(0); + const ValueRef out_list_ref = args.at(3).refs.at(0); + + const ValueListPtr out_list = graph->get_value_list(out_list_ref); + const std::vector split_sizes = + *(graph->get_int_list(split_sizes_ref)); + const int64_t dim = graph->extract_scalar(dim_ref); + + const int64_t input_ndim = graph->dim_of(input); const DimIndex dim_index = dim < 0 ? static_cast(dim) : static_cast(dim - input_ndim); - VK_CHECK_COND(out_list->size() == split_sizes.size()); + std::vector input_sizes = graph->sizes_of(input); for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { const int64_t split_size = split_sizes.at(split_idx); const ValueRef out_ref = out_list->at(split_idx); - VK_CHECK_COND(dim_at(graph.sizes_of(out_ref), dim_index) == split_size); - } - - const auto packed_dim = graph.packed_dim_of(in); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); + std::vector out_sizes = input_sizes; + out_sizes.at(dim_index) = split_size; - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_splitting_channel = (dim_index == kChannel4D); - - // if splitting channels - if (is_splitting_channel) { - // set source offset w as channel size of the input tensor - src_offset[3] = dim_at(graph.sizes_of(in), kChannel4D); + graph->virtual_resize(out_ref, out_sizes); } +} - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. 
- const auto out_channel_size = dim_at(graph.sizes_of(out_ref), kChannel4D); - const utils::ivec3 range = graph.logical_limits_of(out_ref); - - if (dim_index == packed_dim_index) { - // if splitting channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_splitting_channel) { - add_copy_channel_offset_node( - graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); - src_offset[dim_xyz_index] += out_channel_size; - } else { - // dst_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set destination offset w as channel size of the output tensor if - // splitting channel - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset[dim_xyz_index] += - dim_at(graph.sizes_of(out_ref), packed_dim_index); - } - } else { - // set destination offset w as channel size of the output tensor if - // splitting channels - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); - src_offset[dim_xyz_index] += - is_splitting_channel ? out_channel_size : range[dim_xyz_index]; - } +void add_split_node( + ComputeGraph& graph, + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out, + const int split_idx) { + std::string kernel_name = "split"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = { + graph.meta_ubo(out), graph.meta_ubo(input)}; + + int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, graph.dim_of(input)); + + // Calculate the offset for this split by summing previous split sizes + int64_t split_offset = 0; + for (int i = 0; i < split_idx; i++) { + split_offset += split_sizes[i]; } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader params buffers + param_ubos, + // Push Constants + {}, + // Specialization Constants + {utils::safe_downcast(dim_whcn), + static_cast(split_idx), + static_cast(split_offset)}, + // Resize Args + {}, + // Resizing Logic + nullptr)); } -void add_split_with_sizes_default_node( +void add_split_with_sizes_node( ComputeGraph& graph, - ValueRef in, - ValueRef split_sizes_ref, - ValueRef dim_ref, - ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out_list_ref) { + const ValueListPtr out_list = graph.get_value_list(out_list_ref); + + VK_CHECK_COND(out_list->size() == split_sizes.size()); - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); + // Dispatch a shader for each output tensor + for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { + const ValueRef out_ref = out_list->at(split_idx); + add_split_node(graph, input, split_sizes, dim, out_ref, split_idx); + } } void split_with_sizes_copy_default( ComputeGraph& graph, const std::vector& args) { - add_split_with_sizes_default_node(graph, args[0], args[1], args[2], 
args[3]); -} - -void add_split_tensor_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_size_ref, - ValueRef dim_ref, - ValueRef out) { - const int64_t split_size = graph.extract_scalar(split_size_ref); - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? static_cast(dim) - : static_cast(dim - input_ndim); - const int64_t size = dim_at(graph.sizes_of(in), dim_index); - const std::vector split_sizes(size / split_size, split_size); + ValueRef input = args[0]; + ValueRef split_sizes_ref = args[1]; + ValueRef dim_ref = args[2]; + ValueRef out_list_ref = args[3]; - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} + int64_t dim = graph.extract_scalar(dim_ref); + std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); -void split_tensor(ComputeGraph& graph, const std::vector& args) { - add_split_tensor_node(graph, args[0], args[1], args[2], args[3]); + add_split_with_sizes_node(graph, input, split_sizes, dim, out_list_ref); } REGISTER_OPERATORS { VK_REGISTER_OP( aten.split_with_sizes_copy.default, split_with_sizes_copy_default); - VK_REGISTER_OP(aten.split.Tensor, split_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 60127ecf9bd..1823271824a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -50,15 +50,16 @@ void add_transfer_copy_node( (transfer_type == TransferType::SELECT || graph.is_scalar_or_none(step_ref)); - vkapi::ParamsBindList param_buffers; + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + if (!param_is_scalar) { if (transfer_type == TransferType::SELECT) { - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); } else { // TransferType::SLICE - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0), - graph.get_or_create_int_param_buffer(step_ref, 1)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); + param_ubos.append(graph.get_or_create_int_param_buffer(step_ref, 1)); } } else { transfer_params.index_or_start_ref = @@ -69,18 +70,6 @@ void add_transfer_copy_node( } std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(out) ? 
5 : 3); - - if (graph.is_buffer_storage(out)) { - push_constants.emplace_back(graph.sizes_pc_of(in)); - push_constants.emplace_back(graph.strides_pc_of(out)); - push_constants.emplace_back(graph.strides_pc_of(in)); - push_constants.emplace_back(graph.numel_pc_of(out)); - } else { - push_constants.emplace_back(graph.sizes_pc_of(out)); - push_constants.emplace_back(graph.sizes_pc_of(in)); - } - if (param_is_scalar) { push_constants.emplace_back(&transfer_params, sizeof(transfer_params)); } else { @@ -88,11 +77,6 @@ void add_transfer_copy_node( &transfer_params.dim, sizeof(transfer_params.dim)); } - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; - // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { @@ -115,11 +99,11 @@ void add_transfer_copy_node( // Inputs and Outputs {{out, vkapi::kWrite}, {in, vkapi::kRead}}, // Parameter buffers - param_buffers, + param_ubos, // Push Constants push_constants, // Specialization Constants - spec_vars, + {}, // Resize Args resize_args, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index b62bf661995..05234c7790f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -69,7 +69,7 @@ template < std::is_integral::value && std::is_signed::value, int>::type = 0> T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { - return ndim - 1 - nchw_dim; + return ndim - 1 - normalize(nchw_dim, ndim); } } // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index f59c3e30aeb..b21a8458a89 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -6,7 +6,6 @@ import itertools - from collections import namedtuple from typing import Callable @@ -1519,64 +1518,11 @@ def get_split_with_sizes_inputs(): test_suite.layouts = [ "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] return test_suite diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index 
90edc094ec7..6d3fff452f8 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -8,18 +8,14 @@ import logging from collections import OrderedDict from copy import deepcopy - from enum import auto, Enum from typing import Any, List, Optional, Tuple import executorch.backends.vulkan.utils as utils - import torch - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -36,7 +32,6 @@ ) from executorch.extension.pytree import tree_flatten from torch.export import export - from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -275,16 +270,25 @@ def check_outputs_equal( ) return result else: + result = True for i in range(len(ref_output)): - if not torch.allclose( - model_output[i], ref_output[i], atol=atol, rtol=rtol - ): - print(f"\n=== Output {i} comparison failed ===") - print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - return False - return True + if isinstance(ref_output[i], torch.Tensor): + if not torch.allclose( + model_output[i], ref_output[i], atol=atol, rtol=rtol + ): + print(f"\n=== Output {i} comparison failed ===") + print_tensor_comparison_errors( + model_output[i], ref_output[i], atol, rtol + ) + result = False + elif isinstance(ref_output[i], int): + if not model_output[i] == ref_output[i]: + print(f"\n=== Output {i} comparison failed ===") + print(f"{model_output[i]} vs {ref_output[i]}") + result = False + else: + print(f"WARNING: Output {i} has type {type(ref_output[i])}") + return result else: # If one output, eager returns tensor while executor tuple of size 1 result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol) @@ -326,7 +330,7 @@ def run_and_check_output( model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) # Generate reference outputs using the reference model - ref_output = reference_model(*sample_inputs) + ref_output, _ = tree_flatten(reference_model(*sample_inputs)) # Check if outputs are equal return check_outputs_equal( @@ -805,3 +809,26 @@ def find_bad_operators( "all_operators": all_operators, "test_count": test_count, } + + +def make_indent(indent_level): + indent_str = "" + for _ in range(indent_level): + indent_str += " " + return indent_str + + +def print_output(outputs, n: int = 0, indent_level: int = 0): + if isinstance(outputs, (list, tuple)): + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}") + new_indent_level = indent_level + 2 + for n, test_out in enumerate(outputs): + print_output(test_out, n, new_indent_level) + elif isinstance(outputs, torch.Tensor): + print( + f"{make_indent(indent_level)}output_{n} = test_utils.random_uniform_tensor({outputs.shape}, low={outputs.min().item()}, high={outputs.max().item()}, dtype={outputs.dtype})" + ) + elif isinstance(outputs, int): + print(f"{make_indent(indent_level)}output_{n} = {outputs}") + else: + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}")
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 7e3d957afdb..7dd3bb84588 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1911,413 +1911,6 @@
TEST(VulkanComputeGraphTest, test_clone) { } } -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. +2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. 
- int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST( - VulkanComputeGraphTest, - DISABLED_test_etvk_copy_channel_offset_node_clean_boundary) { - // Tricky part for channel copy is handling the boundary across multiple copy. - // For example, when we concat two [3, 1, 1] nchw-tensors along the channel - // dimension, due to channel packing, elements from different source texel - // will be packed into same destination texel at the boundaries. - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - - // Make sure entire out tensor is zeroed. The zero tensor will be filled with - // zero later. 
- copyFn( - graph, - {zero.value, - graph.add_scalar(c), - graph.add_scalar(0), - graph.add_scalar(0), - out.value}); - - int64_t a_src_offset = 0; - int64_t a_dst_offset = 2; - int64_t a_range = 5; - // a will write to channge [2, 7) - copyFn( - graph, - {a.value, - graph.add_scalar(a_range), - graph.add_scalar(a_src_offset), - graph.add_scalar(a_dst_offset), - out.value}); - - // b will write to channel [6, 11) - // Intentional for b to override channel=6 - int64_t b_src_offset = 0; - int64_t b_dst_offset = 6; - int64_t b_range = 5; - - copyFn( - graph, - {b.value, - graph.add_scalar(b_range), - graph.add_scalar(b_src_offset), - graph.add_scalar(b_dst_offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - float a_value = 1.0f; - float b_value = 2.0f; - float zero_value = 0.0f; - fill_vtensor(graph, a, a_value); - fill_vtensor(graph, b, b_value); - fill_vtensor(graph, zero, zero_value); - - graph.execute(); - - EXTRACT_TENSOR(out); - - for (int n_idx = 0; n_idx < n; n_idx++) { - // c_idx only up to a_range-1 because the expected overwrite by b - for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; - c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == a_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == b_value); - } - } - } - } - - // Also verify that data before a_dst_offset and after b_dst_offset + b_range - // are untouched. - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kInt, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. 
+2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. - int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - TEST(VulkanComputeGraphTest, test_view_change_packing) { std::vector> layout_pairs = { diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index fca8173ffb7..2ca2ddf19b7 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -8,26 +8,18 
@@ from typing import Any, List, Optional, Set, Tuple, Union import torch - from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( VkMemoryLayout, VkStorageType, ) - from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) - from executorch.exir.dialects.edge._ops import EdgeOpOverload - from executorch.exir.tensor import TensorSpec - from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param - from torch._subclasses.fake_tensor import FakeTensor, FakeTensorConverter - from torch.export import ExportedProgram - from torch.export.exported_program import InputKind from torch.export.graph_signature import TensorArgument @@ -399,10 +391,23 @@ def node_has_target(node: Any, target: str): VkStorageType.TEXTURE_3D, } +# Memory layouts available to non-quantized tensors all_memory_layouts: Set[VkMemoryLayout] = { VkMemoryLayout.TENSOR_WIDTH_PACKED, VkMemoryLayout.TENSOR_HEIGHT_PACKED, VkMemoryLayout.TENSOR_CHANNELS_PACKED, +} + +# Memory layouts available to quantized tensors +all_quantized_memory_layouts: Set[VkMemoryLayout] = { + VkMemoryLayout.PACKED_INT8_4W4C, + VkMemoryLayout.PACKED_INT8_4H4W, +} + +universal_memory_layout_set: Set[VkMemoryLayout] = { + VkMemoryLayout.TENSOR_WIDTH_PACKED, + VkMemoryLayout.TENSOR_HEIGHT_PACKED, + VkMemoryLayout.TENSOR_CHANNELS_PACKED, VkMemoryLayout.PACKED_INT8_4W4C, VkMemoryLayout.PACKED_INT8_4H4W, } @@ -761,7 +766,7 @@ def make_filtered_tensor_repset( ## Convenience TensorRepSet definitions -PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) +# Only includes memory layouts that can be used by non-quantized tensors CONTIGUOUS_ANY = TensorRepSet( {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} @@ -782,9 +787,18 @@ def make_filtered_tensor_repset( ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts) ANY_BUFFER = TensorRepSet(all_memory_layouts, set()) - ANY_STORAGE = TensorRepSet(all_memory_layouts, all_memory_layouts) + +# Only includes memory layouts that can be used by quantized tensors + +PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) + +# Special use RepSets + NO_STORAGE = TensorRepSet(set(), set()) +ALL_STORAGES_REPSET = TensorRepSet( + universal_memory_layout_set, universal_memory_layout_set +) class TensorRepSetList: @@ -908,19 +922,19 @@ def __init__( # noqa: C901 # Now, go through the arguments of the operator and create a filtered repset # for each based on the actual tensor value. args_repset_list = TensorRepSetList([]) - common_arg_repset = ANY_STORAGE + common_arg_repset = ALL_STORAGES_REPSET for i, arg_node in enumerate(op_node.args): arg_repset = inputs_repsets[i] - # Use ANY_STORAGE for non-tensor nodes so they don't cause the op repsets to - # appear empty + # Use ALL_STORAGES_REPSET for non-tensor nodes so they don't cause the op + # repsets to appear empty if not is_tensor_arg_node(arg_node): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) # NO_STORAGE is used to denote that an input is either a non tensor arg or # a weight tensor that is not prepacked. Similar to the above, use - # ANY_STORAGE in this case. + # ALL_STORAGES_REPSET in this case. elif arg_repset.is_empty(): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) else: assert not arg_repset.is_empty() @@ -933,7 +947,7 @@ def __init__( # noqa: C901 # Repeat for output tensors. 
outs_repset_list = TensorRepSetList([]) - common_out_repset = ANY_STORAGE + common_out_repset = ALL_STORAGES_REPSET if num_tensors_in_node(op_node) == 1: common_out_repset = make_filtered_tensor_repset( op_node.meta["val"], outputs_repsets[0], texture_limits
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 3a3f6cdf4fe..3ccbdc8ab85 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -6,6 +6,7 @@ # pyre-strict +import copy from functools import partial from typing import Any, Callable, Dict, final, List @@ -127,15 +128,21 @@ def preprocess( # noqa: C901 module_compile_spec: List[CompileSpec], ) -> PreprocessResult: compile_options = parse_compile_spec(module_compile_spec) - limits_x = compile_options.get( - "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0] - ) - limits_y = compile_options.get( - "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1] - ) - limits_z = compile_options.get( - "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2] - ) + + default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS) + # 2048 is the typical limit value for 3D textures, but mobile GPUs often support + # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment, + # 16384 is the default texture limit used. This option is provided as a + # convenient way to switch to using a limit of 2048 for image textures, which + # is compatible with most GPUs. + if compile_options.get("small_texture_limits", False): + default_texture_limits[0] = 2048 + default_texture_limits[1] = 2048 + default_texture_limits[2] = 2048 + + limits_x = compile_options.get("texture_limits_x", default_texture_limits[0]) + limits_y = compile_options.get("texture_limits_y", default_texture_limits[1]) + limits_z = compile_options.get("texture_limits_z", default_texture_limits[2]) texture_limits = (limits_x, limits_y, limits_z) default_storage_type = compile_options.get( @@ -204,22 +211,26 @@ def preprocess( # noqa: C901 # Finally, apply dynamic shape passes and memory planning pass. These passes # must be applied only when the graph structure is finalized. - greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = MemoryPlanningAlgorithmSuite( - algo_list=[greedy_memory_planning] - ) - # This is a workaround to allow the memory planning pass to work without having - # to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - program.graph_module.encounter_to_out_var_failure = True - program = apply_passes( - program, - [ - ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass(memory_planning_algo=mem_planning_suite), - ], - ) + final_passes = [ + ConstraintBasedSymShapeEvalPass(), + ] + if not compile_options.get("skip_memory_planning", False): + greedy_memory_planning = partial( + greedy, allow_overlapping_allocations=False + ) + mem_planning_suite = MemoryPlanningAlgorithmSuite( + algo_list=[greedy_memory_planning] + ) + # This is a workaround to allow the memory planning pass to work without having + # to first apply ToOutVarPass(). See the `greedy()` function in + # `exir.memory_planning`; if this attribute isn't set, assertions in + # `collect_spec_from_nodes()` will fail.
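+            # Note that both the attribute set below and the MemoryPlanningPass
+            # appended afterwards live inside this branch, so the
+            # "skip_memory_planning" compile option skips this workaround along
+            # with the pass itself.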
+ program.graph_module.encounter_to_out_var_failure = True + final_passes.append( + MemoryPlanningPass(memory_planning_algo=mem_planning_suite) + ) + + program = apply_passes(program, final_passes) graph_builder = VkGraphBuilder( program, diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py index dace37e5473..58f2ccf1001 100644 --- a/examples/vulkan/export.py +++ b/examples/vulkan/export.py @@ -10,29 +10,29 @@ import argparse import logging +import os -import backends.vulkan.test.utils as test_utils - +import executorch.backends.vulkan.test.utils as test_utils import torch import torchvision - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.devtools import BundledProgram from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory from executorch.exir import to_edge_transform_and_lower from executorch.extension.export_util.utils import save_pte_program from executorch.extension.pytree import tree_flatten from torch.export import Dim, export -from ..models import MODEL_NAME_TO_MODEL -from ..models.model_factory import EagerModelFactory - FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) +import urllib + def is_vision_model(model_name): if model_name in [ @@ -70,6 +70,38 @@ def get_vision_model_dynamic_shapes(): ) +def get_dog_image_tensor(image_size=224, normalization="imagenet"): + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + try: + urllib.URLopener().retrieve(url, filename) + except: + urllib.request.urlretrieve(url, filename) + + from PIL import Image + from torchvision import transforms + + input_image = Image.open(filename).convert("RGB") + + transforms_list = [ + transforms.Resize((image_size, image_size)), + transforms.ToTensor(), + ] + if normalization == "imagenet": + transforms_list.append( + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ) + + preprocess = transforms.Compose(transforms_list) + + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + input_batch = (input_batch,) + return input_batch + + def init_model(model_name): if model_name == "convnext_small": return torchvision.models.convnext_small() @@ -77,13 +109,29 @@ def init_model(model_name): return torchvision.models.densenet161() if model_name == "shufflenet_v2_x1_0": return torchvision.models.shufflenet_v2_x1_0() + if model_name == "YOLO_NAS_S": + try: + from super_gradients.common.object_names import Models + from super_gradients.training import models + except ImportError: + raise ImportError( + "Please install super-gradients to use the YOLO_NAS_S model." 
+ ) + + return models.get(Models.YOLO_NAS_S, pretrained_weights="coco") return None def get_sample_inputs(model_name): + # Lock the random seed for reproducibility + torch.manual_seed(42) + if is_vision_model(model_name): return get_vision_model_sample_input() + if model_name == "YOLO_NAS_S": + input_batch = get_dog_image_tensor(640) + return input_batch return None @@ -95,7 +143,7 @@ def get_dynamic_shapes(model_name): return None -def main() -> None: +def main() -> None: # noqa: C901 logger = logging.getLogger("") logger.setLevel(logging.INFO) @@ -117,6 +165,24 @@ def main() -> None: "False", ) + parser.add_argument( + "--small_texture_limits", + action=argparse.BooleanOptionalAction, + default=False, + help="Sets the default texture limit to be (2048, 2048, 2048) which is " + "compatible with more devices (i.e. desktop/laptop GPUs) compared to the " + "default (16384, 16384, 2048) which is more targeted for mobile GPUs. Default " + "is False.", + ) + + parser.add_argument( + "--skip_memory_planning", + action=argparse.BooleanOptionalAction, + default=False, + help="Skips memory planning pass while lowering, which can be used for " + "debugging. Default is False.", + ) + parser.add_argument( "-s", "--strict", @@ -159,6 +225,13 @@ def main() -> None: help="Execute lower_module_and_test_output to validate the model. Default is False", ) + parser.add_argument( + "--save_inputs", + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to save the inputs to the model. Default is False", + ) + args = parser.parse_args() if args.model_name in MODEL_NAME_TO_MODEL: @@ -189,6 +262,10 @@ def main() -> None: if args.force_fp16: compile_options["force_fp16"] = True + if args.skip_memory_planning: + compile_options["skip_memory_planning"] = True + if args.small_texture_limits: + compile_options["small_texture_limits"] = True logging.info(f"Exporting model {args.model_name} with Vulkan delegate") @@ -230,25 +307,18 @@ def main() -> None: atol = 2e-2 rtol = 1e-1 - # Test the model if --test flag is provided - if args.test: - test_result = test_utils.run_and_check_output( - reference_model=model, - executorch_program=exec_prog, - sample_inputs=example_inputs, - atol=atol, - rtol=rtol, - ) + # Save regular program + save_pte_program(exec_prog, output_filename, args.output_dir) + logging.info( + f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + ) - if test_result: - logging.info( - "✓ Model test PASSED - outputs match reference within tolerance" - ) - else: - logging.error("✗ Model test FAILED - outputs do not match reference") - raise RuntimeError( - "Model validation failed: ExecuTorch outputs do not match reference model outputs" - ) + if args.save_inputs: + inputs_flattened, _ = tree_flatten(example_inputs) + for i, input_tensor in enumerate(inputs_flattened): + input_filename = os.path.join(args.output_dir, f"input{i}.bin") + input_tensor.numpy().tofile(input_filename) + logging.info( + f"Model input saved as {input_filename} in {args.output_dir}" + ) if args.bundled: # Create bundled program @@ -287,13 +357,27 @@ def main() -> None: logging.info( f"Bundled program exported and saved as {output_filename}.bpte in {args.output_dir}" ) - else: - # Save regular program - save_pte_program(exec_prog, output_filename, args.output_dir) - logging.info( - f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + + # Test the model if --test flag is provided + if args.test: + test_result = test_utils.run_and_check_output( + reference_model=model,
executorch_program=exec_prog, + sample_inputs=example_inputs, + atol=atol, + rtol=rtol, ) + if test_result: + logging.info( + "✓ Model test PASSED - outputs match reference within tolerance" + ) + else: + logging.error("✗ Model test FAILED - outputs do not match reference") + raise RuntimeError( + "Model validation failed: ExecuTorch outputs do not match reference model outputs" + ) + if __name__ == "__main__": with torch.no_grad():
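
As a quick reference, the snippet below is a small, self-contained Python sketch of three behaviors introduced by this patch: how vulkan_preprocess.py resolves texture limits when the small_texture_limits compile option is set, how add_split_node derives each split's offset from the preceding split sizes, and how the corrected nchw_dim_to_whcn_dim handles negative dims. The helpers are illustrative stand-ins, not the actual ExecuTorch APIs; in particular, the (16384, 16384, 2048) default is taken from the --small_texture_limits help text above, and normalize() in TensorUtils.h is assumed to map negative dims into the [0, ndim) range.

# Illustrative sketches only; these mirror the patch rather than call into it.

def resolve_texture_limits(compile_options, default_limits=(16384, 16384, 2048)):
    # Mirrors the updated logic in vulkan_preprocess.py: "small_texture_limits"
    # lowers every default to 2048, while explicit "texture_limits_{x,y,z}"
    # entries still take precedence over the defaults.
    defaults = list(default_limits)
    if compile_options.get("small_texture_limits", False):
        defaults = [2048, 2048, 2048]
    return (
        compile_options.get("texture_limits_x", defaults[0]),
        compile_options.get("texture_limits_y", defaults[1]),
        compile_options.get("texture_limits_z", defaults[2]),
    )

def split_offsets(split_sizes):
    # Mirrors the loop in add_split_node: the offset of split i along the
    # split dimension is the sum of the sizes of splits 0..i-1.
    offsets, acc = [], 0
    for size in split_sizes:
        offsets.append(acc)
        acc += size
    return offsets

def nchw_dim_to_whcn_dim(nchw_dim, ndim):
    # Mirrors the updated C++ helper, assuming normalize() wraps negative dims
    # into [0, ndim). Previously dim=-1 with ndim=4 produced 4 (out of range);
    # it now maps to 0, the width dim in WHCN order.
    normalized = nchw_dim + ndim if nchw_dim < 0 else nchw_dim
    return ndim - 1 - normalized

assert resolve_texture_limits({"small_texture_limits": True}) == (2048, 2048, 2048)
assert split_offsets([3, 5, 4]) == [0, 3, 8]
assert nchw_dim_to_whcn_dim(-1, 4) == 0 and nchw_dim_to_whcn_dim(1, 4) == 2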