From b19c6beca8895cc5e91d265aa95f7fb30abfb933 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <vivektrivedi@meta.com>
Date: Wed, 15 Oct 2025 10:34:12 -0700
Subject: [PATCH] Removing manual unroll in linear shader to improve overall
 performance. (#15110)

Summary:

### Summary

This diff improves the overall performance of the linear shader by removing the forced-unroll `[[unroll]]` loop attributes in the `linear_qcsnw_tiled.glsl` file and leaving the unrolling decision to the shader compiler.

The changes include:
- Removing the `[[unroll]]` attribute from the `for` loops in `main()` so that the compiler decides whether to unroll each loop, which can lead to better performance.
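- Changing the type of `mat1` from `VEC4_T[TILE_ROWS]` to `T[TILE_ROWS][4]` to better match the access pattern in the loop: each input texel is unpacked into four scalars when it is loaded, and the accumulation indexes `mat1[r][0]`..`mat1[r][3]` instead of swizzling `.x`/`.y`/`.z`/`.w`.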

Reviewed By: SS-JIA

Differential Revision: D84571616
---
 .../graph/ops/glsl/linear_qcsnw_tiled.glsl    | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
index f6f05aab7ca..e63e267a4d7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
@@ -66,7 +66,7 @@ void main() {
     return;
   }
 
-  VEC4_T mat1[TILE_ROWS];
+  T mat1[TILE_ROWS][4];
   VEC4_T qmat2[4][TILE_TXCOLS];
   VEC4_T sums[TILE_ROWS][TILE_TXCOLS];
 
@@ -78,7 +78,7 @@ void main() {
       scales[${c}] = VEC4_T(
         texelFetch(t_scales, u16vec2(out_txcol + ${c}, 0), 0));
 
-  [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) {
+  for (int r = 0; r < TILE_ROWS; ++r) {
     $for c in range(TILE_TXCOLS):
       sums[r][${c}] = VEC4_T(0.0);
   }
@@ -91,7 +91,7 @@ void main() {
       uint weight_row_txstride = div4(weight_sizes.x);
 
     // Preload weight tensor
-    [[unroll]] for (int r = 0; r < 4; r++) {
+    for (int r = 0; r < 4; r++) {
       $if QUANT_NBITS == 4:
         $for c in range(0, TILE_TXCOLS, 2):
           $if WEIGHT_STORAGE == "buffer":
@@ -117,21 +117,28 @@ void main() {
       uint in_row_txstride = div4(in_sizes.x);
 
     // Preload input tensor
-    [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
+    for (int i = 0; i < TILE_ROWS; i++) {
       $if IN_STORAGE == "buffer":
-        mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos];
+        VEC4_T tmp = t_in[(out_row + i) * in_row_txstride + txpos];
+        mat1[i][0] = tmp.x;
+        mat1[i][1] = tmp.y;
+        mat1[i][2] = tmp.z;
+        mat1[i][3] = tmp.w;
       $else:
-        mat1[i] = VEC4_T(
-          texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0));
+        VEC4_T tmp = VEC4_T(texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0));
+        mat1[i][0] = tmp.x;
+        mat1[i][1] = tmp.y;
+        mat1[i][2] = tmp.z;
+        mat1[i][3] = tmp.w;
     }
 
     // Accumulate output
-    [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) {
+    for (int r = 0; r < TILE_ROWS; ++r) {
       $for c in range(TILE_TXCOLS):
-        sums[r][${c}] += mat1[r].x * qmat2[0][${c}] +
-                         mat1[r].y * qmat2[1][${c}] +
-                         mat1[r].z * qmat2[2][${c}] +
-                         mat1[r].w * qmat2[3][${c}];
+        sums[r][${c}] += mat1[r][0] * qmat2[0][${c}] +
+                         mat1[r][1] * qmat2[1][${c}] +
+                         mat1[r][2] * qmat2[2][${c}] +
+                         mat1[r][3] * qmat2[3][${c}];
     }
   }
 
@@ -140,7 +147,7 @@ void main() {
     uint out_bufi;
     uint out_row_txstride = div4(out_sizes.x);
 
-  [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) {
+  for (int r = 0; r < TILE_ROWS; ++r) {
     $for c in range(TILE_TXCOLS):
       $if OUT_STORAGE == "buffer":
         if (out_row + r < out_sizes.y) {