From b9e179bbae0962700dc34ed948807031a1891f71 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 4 Dec 2025 07:37:46 -0800
Subject: [PATCH] [ET-VK][ez] Address regressed conv2d perf numbers on main

Address the benchmark binaries reporting worse performance than one month ago. The regression was not a "real" regression; it was caused by changes made to the benchmark binaries during debugging that were not reverted before landing:

1. Only running 1 benchmark iteration without any warmup iterations
2. The quantize/dequantize shaders would normally be excluded from the overall execution time / FLOPS calculation, but the names of these shaders were recently changed and the logic that filters them out when reporting time was not updated accordingly.

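Also includes a small fix to the input data loading logic: `load_fp_input_tile` now writes `VEC4_T(0)` to a tile entry when `load_tidx.data.x` is not less than `input_sizes.x`, instead of leaving that entry unwritten.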

Differential Revision: [D88381899](https://our.internmc.facebook.com/intern/diff/D88381899/)

ghstack-source-id: 327094958
Pull Request resolved: https://github.com/pytorch/executorch/pull/16079
---
 .../runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh | 2 ++
 backends/vulkan/test/custom_ops/q4gsw_linear.cpp           | 2 +-
 backends/vulkan/test/custom_ops/q8csw_linear.cpp           | 2 +-
 backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp | 4 ++--
 .../vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp   | 4 ++--
 backends/vulkan/test/custom_ops/utils.cpp                  | 7 +++----
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh
index a3934422e27..4456043bb9f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh
@@ -39,6 +39,8 @@ void load_fp_input_tile(
   [[unroll]] for (int w = 0; w < TILE_M; w++) {
     if (load_tidx.data.x < input_sizes.x) {
       tile.data[w][0] = load_fp_input_texel(load_tidx);
+    } else {
+      tile.data[w][0] = VEC4_T(0);
     }
     load_tidx.data.x++;
   }
diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
index 59d9d694c2c..2af1488541d 100644
--- a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
+++ b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
@@ -552,7 +552,7 @@ int main(int argc, char* argv[]) {
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,
       "QuantizedLinearQ4GSW",
-      10,
+      3,
       10,
       ref_fn);
 
diff --git a/backends/vulkan/test/custom_ops/q8csw_linear.cpp b/backends/vulkan/test/custom_ops/q8csw_linear.cpp
index 23973426fcc..4aa6f00d3f5 100644
--- a/backends/vulkan/test/custom_ops/q8csw_linear.cpp
+++ b/backends/vulkan/test/custom_ops/q8csw_linear.cpp
@@ -471,7 +471,7 @@ int main(int argc, char* argv[]) {
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,
       "QuantizedLinear",
-      0,
+      3,
       10,
       ref_fn);
 
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
index bbd4af7579c..450817c9d90 100644
--- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
+++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
@@ -626,8 +626,8 @@ int main(int argc, char* argv[]) {
       generate_quantized_conv2d_test_cases,
       quantized_conv2d_flop_calculator,
       "QuantizedConv2dQ8ToQ8To",
-      0,
-      1,
+      3,
+      10,
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp
index c259b45de06..c0fd65de0ed 100644
--- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp
+++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp
@@ -584,8 +584,8 @@ int main(int argc, char* argv[]) {
       generate_quantized_conv2d_dw_test_cases,
       quantized_conv2d_dw_flop_calculator,
       "QuantizedDepthwiseInt8Conv2d",
-      0,
-      1,
+      3,
+      10,
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp
index 4de6c32ac25..7845c24c68e 100644
--- a/backends/vulkan/test/custom_ops/utils.cpp
+++ b/backends/vulkan/test/custom_ops/utils.cpp
@@ -662,10 +662,9 @@ float collect_gpu_timing_us(ComputeGraph& graph) {
     for (const auto& shader_result : results) {
       if (shader_result.kernel_name.find("nchw_to") == std::string::npos &&
           shader_result.kernel_name.find("to_nchw") == std::string::npos &&
-          shader_result.kernel_name.find(
-              "quantize_and_pack_q8ta_conv2d_input") == std::string::npos &&
-          shader_result.kernel_name.find(
-              "unpack_and_dequantize_q8ta_conv2d_output") ==
+          shader_result.kernel_name.find("quantize_and_pack_4w4c") ==
+              std::string::npos &&
+          shader_result.kernel_name.find("unpack_4w4c_and_dequantize") ==
               std::string::npos) {
         // Calculate duration from start and end times, convert from ns to μs
         uint64_t duration_ns =