diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
index b51d5a3f6ed..8f113bd2cc2 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_buffer(0, "w", "nchw_out", DTYPE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "sizes")}
+${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
+  const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
 
   if (any(greaterThanEqual(tensor_idx, sizes))) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
index 21eadff0b36..9dc06bd8552 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
   return tensor_idx;
 }
 
+/*
+ * Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis
+ * mapping.
+ */
+ivec4 to_tensor_idx(
+    ivec3 pos,
+    ivec4 sizes,
+    const ivec4 axis_mapping,
+    const int packed_dim) {
+  // Align packed dim to next multiple of 4 to account for texel padding
+  sizes[packed_dim] = alignup4(sizes[packed_dim]);
+
+  // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4
+  // elements in the tensor.
+  pos[axis_mapping[packed_dim]] *= 4;
+
+  ivec4 tensor_idx;
+  for (int dim = 0; dim < 3; ++dim) {
+    tensor_idx[dim] = pos[axis_mapping[dim]];
+  }
+
+  // Early return if batch is 1. Batch index will be 0.
+  if (sizes.w == 1) {
+    tensor_idx.w = 0;
+    return tensor_idx;
+  }
+
+  // Else, adjust the dim that's concatenated with batch. Note that the axis
+  // mapping for the batch dim indicates WHCN dim index of the dim that it is
+  // concatenated with, not a texture axis.
+  tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]];
+  tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]];
+
+  return tensor_idx;
+}
+
 /*
  * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
  * is packed along a texel
@@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
   return pos;
 }
 
+/*
+ * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis
+ * mapping.
+ */
+ivec3 to_texture_pos(
+    const ivec4 idx,
+    ivec4 sizes,
+    const ivec4 axis_mapping,
+    const int packed_dim) {
+  // Align packed dim to next multiple of 4 to account for texel padding
+  sizes[packed_dim] = alignup4(sizes[packed_dim]);
+
+  ivec3 pos;
+  for (int dim = 0; dim < 3; ++dim) {
+    pos[axis_mapping[dim]] = idx[dim];
+  }
+
+  // Adjust batch dim if needed
+  if (sizes.w > 1) {
+    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
+  }
+
+  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
+  // tensor elements in that dim.
+  pos[axis_mapping[packed_dim]] /= 4;
+  return pos;
+}
+
 /*
  * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
  * is packed along a texel
@@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
   return pos;
 }
 
+/*
+ * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using
+ * the axis mapping.
+ */
+ivec4 to_texture_elem_pos(
+    const ivec4 idx,
+    ivec4 sizes,
+    const ivec4 axis_mapping,
+    const int packed_dim) {
+  // Align packed dim to next multiple of 4 to account for texel padding
+  sizes[packed_dim] = alignup4(sizes[packed_dim]);
+
+  ivec4 pos;
+  for (int dim = 0; dim < 3; ++dim) {
+    pos[axis_mapping[dim]] = idx[dim];
+  }
+
+  // Adjust batch dim if needed
+  if (sizes.w > 1) {
+    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
+  }
+
+  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
+  // tensor elements in that dim.
+  pos[axis_mapping[packed_dim]] /= 4;
+  pos.w = idx[packed_dim] % 4;
+  return pos;
+}
+
 //
 // Texel Access and Storage
 //
diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl
index b1e3a0abdfe..3ef984bfc95 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl
@@ -16,10 +16,11 @@ layout(std430) buffer;
 
 #extension GL_EXT_control_flow_attributes : require
 
-${layout_declare_buffer(0, "w", "nchw_out", "int")}
-${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")}
-${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
-${layout_declare_ubo(3, "int", "out_numel")}
+${layout_declare_buffer(B, "w", "nchw_out", "int")}
+${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")}
+${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}
+${layout_declare_ubo(B, "int", "out_numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
index abe93904805..04b6a26cc44 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_buffer(1, "r", "nchw_in", DTYPE)}
-${layout_declare_ubo(2, "ivec4", "sizes")}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_buffer(B, "r", "nchw_in", DTYPE)}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) {
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
+  const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
   if (any(greaterThanEqual(tensor_idx, sizes))) {
     return;
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl
index 378cf09d129..813a174d2a5 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl
@@ -16,9 +16,10 @@ layout(std430) buffer;
 
 #extension GL_EXT_control_flow_attributes : require
 
-${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
-${layout_declare_buffer(1, "r", "nchw_in", "int")}
-${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
+${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")}
+${layout_declare_buffer(B, "r", "nchw_in", "int")}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -36,7 +37,7 @@ int extend_sign(int x) {
 
 ivec4 read_texel(ivec4 tensor_idx) {
   const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
-      tensor_idx, tensor_sizes, packed_dim);
+      tensor_idx, sizes, packed_dim);
 
   int shift = (1 << 8) - 1;
   ivec4 masks;
@@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) {
 
   ivec4 out_tex = ivec4(0);
   [[unroll]] for (int i = 0; i < 4; ++i) {
-    if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
+    if (tensor_idx[packed_dim] + i < sizes[packed_dim]) {
       int in_texel = nchw_in[buf_indices[i] / 4];
       int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
       extracted_val = extend_sign(extracted_val);
@@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) {
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);
+  const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
 
-  if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
+  if (any(greaterThanEqual(tensor_idx, sizes))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 74113197d46..dcdd2dccfa0 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -106,7 +106,7 @@ ValueRef prepack_biases(
       graph.create_local_wg_size(v),
       vref,
       v,
-      {t->sizes_ubo()},
+      {t->sizes_ubo(), t->axis_mapping_ubo()},
       // Specialization constants
       {SV(t->packed_dim_whcn_idx())}));
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 9df5b73c1a1..6a759e0fd2e 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -31,7 +31,8 @@ void add_staging_to_tensor_node(
          graph.strides_ubo(out_tensor),
          graph.numel_ubo(out_tensor)});
   } else {
-    ubos.append(graph.sizes_ubo(out_tensor));
+    ubos.append(
+        {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)});
   }
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -69,7 +70,8 @@ void add_tensor_to_staging_node(
          graph.strides_ubo(in_tensor),
          graph.numel_ubo(in_tensor)});
   } else {
-    ubos.append(graph.sizes_ubo(in_tensor));
+    ubos.append(
+        {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)});
   }
 
   // Normally, the image_to_nchw shader is structured so that each thread reads
@@ -113,7 +115,7 @@ ValueRef prepack(
   if (graph.is_buffer_storage(v)) {
     ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
   } else {
-    ubos.append(graph.sizes_ubo(v));
+    ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)});
   }
 
   graph.prepack_nodes().emplace_back(new PrepackNode(
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index a469a44dc1a..4feaecced53 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -85,7 +85,8 @@
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
-      v_dst.sizes_ubo());
+      v_dst.sizes_ubo(),
+      v_dst.axis_mapping_ubo());
 }
 
 void record_image_to_nchw_op(
@@ -106,7 +107,8 @@
       0,
       dst_buffer,
       v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
-      v_src.sizes_ubo());
+      v_src.sizes_ubo(),
+      v_src.axis_mapping_ubo());
 }
 
 void record_int8_image_to_nchw_noint8_op(
@@ -127,6 +129,7 @@
       dst_buffer.buffer(),
       v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_src.sizes_ubo(),
+      v_src.axis_mapping_ubo(),
       v_src.numel_ubo());
 }
 
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index c7d20c38675..53d0c820f41 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1233,8 +1233,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) {
   GraphConfig config;
   ComputeGraph graph(config);
 
-  std::vector<int64_t> size_big = {8, 64, 124};
-  std::vector<int64_t> size_small = {8, 1, 124};
+  std::vector<int64_t> size_big = {1, 8, 8};
+  std::vector<int64_t> size_small = {1, 1, 8};
 
   // Build graph
 
@@ -1415,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 4);
 
   // +2: t.sizes_ubo() for each staging shader
+  // +2: t.axis_mapping_ubo() for each staging shader
   // +2: staging buffer for each input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 4);
+  EXPECT_TRUE(get_vma_allocation_count() == 6);
 
   ValueRef c = graph.add_tensor(
       size_big,
@@ -1433,8 +1434,9 @@
 
   // +2: alpha UBO, broadcast UBO for arithmetic shader
   // +1: t.sizes_ubo() uniform buffer for staging shader
+  // +1: t.axis_mapping_ubo() uniform buffer for staging shader
   // +1: staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 9);
+  EXPECT_TRUE(get_vma_allocation_count() == 12);
 
   ValueRef e = graph.add_tensor(
       size_big,
@@ -1450,14 +1452,15 @@
 
   // +2: alpha UBO, broadcast UBO for arithmetic shader
   // +1: t.sizes_ubo() for staging shader
+  // +1: t.axis_mapping_ubo() for staging shader
   // +1 staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 13);
+  EXPECT_TRUE(get_vma_allocation_count() == 17);
 
   graph.prepare();
   graph.encode_execute();
 
   // +3: shared memory allocations for tensors
-  EXPECT_TRUE(get_vma_allocation_count() == 16);
+  EXPECT_TRUE(get_vma_allocation_count() == 20);
 
   // Run graph
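
Illustrative example (not part of the patch): the axis-mapped index math added to indexing_utils.h is easiest to sanity-check on the host. The sketch below is an assumed C++ translation of the new to_tensor_idx()/to_texture_pos() helpers for the default axis mapping (W maps to x, H to y, C to z, and the batch dim is concatenated with channels); the ivec3/ivec4 aliases and the main() driver are hypothetical scaffolding for the example, not code from this change.

#include <array>
#include <cassert>

using ivec3 = std::array<int, 3>;
using ivec4 = std::array<int, 4>;

// Round the packed dim up to a multiple of 4 to account for texel padding.
constexpr int alignup4(int x) {
  return (x + 3) & ~3;
}

// Host-side mirror of the axis-mapped to_tensor_idx() above.
ivec4 to_tensor_idx(
    ivec3 pos, ivec4 sizes, const ivec4& axis_mapping, int packed_dim) {
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  // One texel along the packed dim covers 4 tensor elements.
  pos[axis_mapping[packed_dim]] *= 4;

  ivec4 idx{};
  for (int dim = 0; dim < 3; ++dim) {
    idx[dim] = pos[axis_mapping[dim]];
  }
  if (sizes[3] == 1) {
    idx[3] = 0;
    return idx;
  }
  // axis_mapping[3] names the WHCN dim that batch is concatenated with.
  const int concat_dim = axis_mapping[3];
  idx[3] = idx[concat_dim] / sizes[concat_dim];
  idx[concat_dim] %= sizes[concat_dim];
  return idx;
}

// Host-side mirror of the axis-mapped to_texture_pos() above.
ivec3 to_texture_pos(
    const ivec4& idx, ivec4 sizes, const ivec4& axis_mapping, int packed_dim) {
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos{};
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }
  if (sizes[3] > 1) {
    // Batch strides over the dim it is concatenated with.
    const int concat_dim = axis_mapping[3];
    pos[axis_mapping[concat_dim]] += idx[3] * sizes[concat_dim];
  }
  pos[axis_mapping[packed_dim]] /= 4;
  return pos;
}

int main() {
  const ivec4 axis_mapping = {0, 1, 2, 2}; // W->x, H->y, C->z, batch with C
  const ivec4 sizes = {8, 6, 5, 2};        // W, H, C, N
  const int packed_dim = 2;                // channels-packed

  const ivec4 idx = {3, 1, 5, 1};
  const ivec3 pos = to_texture_pos(idx, sizes, axis_mapping, packed_dim);
  const ivec4 back = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);

  // W, H, and batch round-trip exactly; the packed (channels) index snaps to
  // the base element of its texel.
  assert(back[0] == idx[0] && back[1] == idx[1] && back[3] == idx[3]);
  assert(back[2] == (idx[2] / 4) * 4);
  return 0;
}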