From b9e179bbae0962700dc34ed948807031a1891f71 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 4 Dec 2025 07:37:46 -0800 Subject: [PATCH] [ET-VK][ez] Address regressed conv2d perf numbers on main Address the benchmark binaries reporting worse performance than one month ago. The regression was not a "real" regression but was due to some changes in the benchmark binaries that were made during debugging but were not reverted during landing: 1. Only running 1 benchmark iteration without any warmup iterations 2. The quantize/dequantize shaders would normally be excluded from the overall execution time / FLOPS calculation, but the names of these shaders were recently changed and the logic that filtered these shaders when reporting time was not updated accordingly. Also includes a small fix to the input data loading logic. Differential Revision: [D88381899](https://our.internmc.facebook.com/intern/diff/D88381899/) ghstack-source-id: 327094958 Pull Request resolved: https://github.com/pytorch/executorch/pull/16079 --- .../runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh | 2 ++ backends/vulkan/test/custom_ops/q4gsw_linear.cpp | 2 +- backends/vulkan/test/custom_ops/q8csw_linear.cpp | 2 +- backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp | 4 ++-- .../vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp | 4 ++-- backends/vulkan/test/custom_ops/utils.cpp | 7 +++---- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh index a3934422e27..4456043bb9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh @@ -39,6 +39,8 @@ void load_fp_input_tile( [[unroll]] for (int w = 0; w < TILE_M; w++) { if (load_tidx.data.x < input_sizes.x) { tile.data[w][0] = load_fp_input_texel(load_tidx); + } else { + tile.data[w][0] = 
VEC4_T(0); } load_tidx.data.x++; } diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp index 59d9d694c2c..2af1488541d 100644 --- a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp +++ b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp @@ -552,7 +552,7 @@ int main(int argc, char* argv[]) { generate_quantized_linear_test_cases, quantized_linear_flop_calculator, "QuantizedLinearQ4GSW", - 10, + 3, 10, ref_fn); diff --git a/backends/vulkan/test/custom_ops/q8csw_linear.cpp b/backends/vulkan/test/custom_ops/q8csw_linear.cpp index 23973426fcc..4aa6f00d3f5 100644 --- a/backends/vulkan/test/custom_ops/q8csw_linear.cpp +++ b/backends/vulkan/test/custom_ops/q8csw_linear.cpp @@ -471,7 +471,7 @@ int main(int argc, char* argv[]) { generate_quantized_linear_test_cases, quantized_linear_flop_calculator, "QuantizedLinear", - 0, + 3, 10, ref_fn); diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp index bbd4af7579c..450817c9d90 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp @@ -626,8 +626,8 @@ int main(int argc, char* argv[]) { generate_quantized_conv2d_test_cases, quantized_conv2d_flop_calculator, "QuantizedConv2dQ8ToQ8To", - 0, - 1, + 3, + 10, ref_fn); return 0; diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp index c259b45de06..c0fd65de0ed 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp @@ -584,8 +584,8 @@ int main(int argc, char* argv[]) { generate_quantized_conv2d_dw_test_cases, quantized_conv2d_dw_flop_calculator, "QuantizedDepthwiseInt8Conv2d", - 0, - 1, + 3, + 10, ref_fn); return 0; diff --git a/backends/vulkan/test/custom_ops/utils.cpp 
b/backends/vulkan/test/custom_ops/utils.cpp index 4de6c32ac25..7845c24c68e 100644 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -662,10 +662,9 @@ float collect_gpu_timing_us(ComputeGraph& graph) { for (const auto& shader_result : results) { if (shader_result.kernel_name.find("nchw_to") == std::string::npos && shader_result.kernel_name.find("to_nchw") == std::string::npos && - shader_result.kernel_name.find( - "quantize_and_pack_q8ta_conv2d_input") == std::string::npos && - shader_result.kernel_name.find( - "unpack_and_dequantize_q8ta_conv2d_output") == + shader_result.kernel_name.find("quantize_and_pack_4w4c") == + std::string::npos && + shader_result.kernel_name.find("unpack_4w4c_and_dequantize") == std::string::npos) { // Calculate duration from start and end times, convert from ns to μs uint64_t duration_ns =