From 5ba68454f851a92db7e56d29854b63c4762e1b3a Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <vivektrivedi@meta.com>
Date: Wed, 13 Nov 2024 16:58:35 -0800
Subject: [PATCH] Use Linear tiling by default for executorch vulkan tensor
 images (#6838)

Summary:

This diff changes the default image tiling for tensor images from VK_IMAGE_TILING_OPTIMAL to VK_IMAGE_TILING_LINEAR.
Linear tiling helps improve memory utilization by minimizing texture padding and gives better control over texture caching.

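The q_8w_linear op's shader and dispatch settings are modified to take advantage of linear tiling: the op is now dispatched with a 1-dimensional global work group size, and the shader derives the output position from gl_GlobalInvocationID.x.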

Reviewed By: nathanaelsee

Differential Revision: D65912644
---
 backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl   | 5 ++++-
 .../vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp     | 8 ++++++--
 backends/vulkan/runtime/vk_api/memory/Image.cpp           | 2 +-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
index f679732ddb3..f18adf1e889 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -118,7 +118,10 @@ VEC4_T q_8w_linear(const u16vec3 out_pos, const uint16_t K) {
 }
 
 void main() {
-  const u16vec3 out_pos = u16vec3(gl_GlobalInvocationID);
+  const u16vec3 out_pos = u16vec3(
+    gl_GlobalInvocationID.x / (out_limits.y * out_limits.z),
+    (gl_GlobalInvocationID.x / out_limits.z) % out_limits.y,
+    gl_GlobalInvocationID.x % out_limits.z);
   if (any(greaterThanEqual(out_pos, out_limits))) {
     return;
   }
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
index cb3bafbb81b..a78ac0519c4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -109,11 +109,15 @@ void add_q_8w_linear_node(
          graph.sizes_ubo(mat1_W_packed)});
   }
 
+  // set global work group size to be 1 dimensional
+  const utils::uvec3 wg_size = {
+      static_cast<uint32_t>(graph.numel_of(out_W_packed)), 1, 1};
+
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out_W_packed),
-      graph.create_local_wg_size(out_W_packed),
+      wg_size,
+      graph.create_local_wg_size(wg_size),
       // Inputs and Outputs
       {{out_W_packed, vkapi::MemoryAccessType::WRITE},
        {{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp
index 503938c4067..108befaeb5c 100644
--- a/backends/vulkan/runtime/vk_api/memory/Image.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Image.cpp
@@ -156,7 +156,7 @@ VulkanImage::VulkanImage(
       1u, // mipLevels
       1u, // arrayLayers
       VK_SAMPLE_COUNT_1_BIT, // samples
-      VK_IMAGE_TILING_OPTIMAL, // tiling
+      VK_IMAGE_TILING_LINEAR, // tiling
       image_properties_.image_usage, // usage
       VK_SHARING_MODE_EXCLUSIVE, // sharingMode
       0u, // queueFamilyIndexCount