From e1d7c538e5b9a8791f1f501424cd5b75967de38e Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 9 Jun 2025 09:20:42 -0700 Subject: [PATCH 1/2] [ET-VK] Adding more test cases for conv 2d dw op. Pull Request resolved: https://github.com/pytorch/executorch/pull/11476 The diff [ET-VK] adds more test cases for the conv 2d dw op in the vulkan backend. ghstack-source-id: 289137820 Differential Revision: [D76241304](https://our.internmc.facebook.com/intern/diff/D76241304/) --- backends/vulkan/test/op_tests/cases.py | 176 ++++++++++++++++++------- 1 file changed, 131 insertions(+), 45 deletions(-) diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 277daa60451..bd67933dc93 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -279,17 +279,6 @@ def get_conv_inputs(): output_padding=[0, 1], groups=1, ), - Test( - self=(1, 8, 72, 96), - weight=(8, 1, 3, 3), - bias=(8,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=8, - ), Test( self=(1, 6, 40, 50), weight=(8, 6, 3, 3), @@ -345,39 +334,6 @@ def get_conv_inputs(): output_padding=[0], groups=5, ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[1, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), Test( self=(1, 8, 90, 77), weight=(1, 8, 3, 3), @@ -526,6 +482,130 @@ def get_conv_inputs(): ), ] + test_cases_dw = [ + Test( + self=(1, XS, S, S1), + weight=(XS, 1, 3, 3), + bias=(XS,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=XS, + ), + Test( + self=(1, XS, S, S1), + weight=(XS, 1, 5, 5), + bias=(XS,), + stride=[1, 1], + padding=[2, 2], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=XS, + ), + Test( + self=(1, XS, S, S1), + weight=(XS, 1, 3, 3), + bias=(XS,), + stride=[2, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=XS, + ), + Test( + self=(1, XS, S, S1), + weight=(XS, 1, 5, 5), + bias=(XS,), + stride=[1, 2], + padding=[2, 2], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=XS, + ), + Test( + self=(1, S2, S, S1), + weight=(S2, 1, 3, 3), + bias=(S2,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=S2, + ), + Test( + self=(1, S2, S, S1), + weight=(S2, 1, 5, 5), + bias=(S2,), + stride=[1, 1], + padding=[2, 2], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=S2, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 1, 3, 3), + bias=(8,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=8, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 1, 5, 5), + bias=(8,), + stride=[1, 1], + padding=[2, 2], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=8, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[1, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + ] + test_suite = VkTestSuite(test_cases) test_suite.layouts = [ "utils::kChannelsPacked", @@ -536,7 +616,13 @@ def get_conv_inputs(): "utils::kChannelsPacked", ] test_suite_pw.test_name_suffix = "pw" - return [test_suite, test_suite_pw] + + test_suite_dw = VkTestSuite(test_cases_dw) + test_suite_dw.layouts = [ + "utils::kChannelsPacked", + ] + test_suite_dw.test_name_suffix = "dw" + return [test_suite, test_suite_pw, test_suite_dw] @register_test_suite("aten.native_layer_norm.default") From 92f5d93ece956aaf989218e68e52333e4976a5eb Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 9 Jun 2025 09:20:45 -0700 Subject: [PATCH 2/2] [ET-VK] Minor dispatch improvement to conv2d dw op to improve performance. Pull Request resolved: https://github.com/pytorch/executorch/pull/11477 This diff provides a minor dispatch improvement to the Conv2d depthwise (DW) op to enhance performance. ghstack-source-id: 289137821 Differential Revision: [D76242234](https://our.internmc.facebook.com/intern/diff/D76242234/) --- .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 15 +++++++-------- .../ops/glsl/conv2d_dw_sned_output_tile.glsl | 8 ++++---- .../vulkan/runtime/graph/ops/impl/Convolution.cpp | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 0ee19206f59..19250419baf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -60,18 +60,18 @@ void main() { const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x; ivec3 pos = ivec3( gl_GlobalInvocationID.x % out_limits_xy_scaled.x, - div_by_x % out_limits_xy_scaled.y, - div_by_x / out_limits_xy_scaled.y); - - // scale pos.xy by batch sizes, because that's the top pixel to be processed - pos.x *= BATCH_SIZE_X; - pos.y *= BATCH_SIZE_Y; + div_by_x, + gl_GlobalInvocationID.y); // do not process if top pixel does not fit within the output range - if (pos.z >= out_limits.z) { + if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) { return; } + // scale pos.xy by batch sizes, because that's the top pixel to be processed + pos.x *= BATCH_SIZE_X; + pos.y *= BATCH_SIZE_Y; + // Compute the index of the top-left element of the overlay region. Negative // indices indicate that the top-left element is in a region added by padding. const ivec2 ipos = pos.xy * stride - padding; @@ -79,7 +79,6 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so any reads from the padding region is skipped. const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; // sum outputs VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl index ceadc35779e..f161c1ba460 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl @@ -50,10 +50,11 @@ void main() { const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; const ivec3 pos = ivec3( gl_GlobalInvocationID.x % out_limits.x, - div_by_x % out_limits.y, - div_by_x / out_limits.y); + div_by_x, + gl_GlobalInvocationID.y); - if (pos.z >= out_limits.z) { + // do not process if top pixel does not fit within the output range + if (pos.y >= out_limits.y || pos.z >= out_limits.z) { return; } @@ -64,7 +65,6 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so any reads from the padding region is skipped. const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); int kx = 0; diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index ff375fba89c..d85bd9d841e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -407,13 +407,11 @@ void add_conv2d_node( utils::uvec3 wg_size = create_conv2d_global_wg_size( graph, method, out, weight_data, stride_equals_dilation); - if (method == Conv2dMethod::Depthwise) { - wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; - } else if (method == Conv2dMethod::Pointwise) { + utils::uvec3 local_wg_size; + if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; } - utils::uvec3 local_wg_size; if (method == Conv2dMethod::Pointwise) { uint32_t local_wg_size_y = 1; if (wg_size[1] % 8 == 0) { @@ -424,6 +422,8 @@ void add_conv2d_node( local_wg_size_y = 2; } local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; + } else if (method == Conv2dMethod::Depthwise) { + local_wg_size = {64, 1, 1}; } else { local_wg_size = graph.create_local_wg_size(wg_size); }