From 0c0451977a59fd3132b2acbadf4aeccf81d8b2fc Mon Sep 17 00:00:00 2001
From: ssjia
Date: Thu, 25 Sep 2025 08:52:20 -0700
Subject: [PATCH] [ET-VK] Add `kInt8x4` dtype and `GPUMemoryLayout`s for packed quantized tensors

Pull Request resolved: https://github.com/pytorch/executorch/pull/14329

## Motivation

Lay the foundations for being able to execute statically quantized CNNs with ET-VK. Unlike with dynamic quantization, static quantization allows the output of quantized operators to stay in integer representation and be fed directly to the next quantized operator.

## Context

Typically, int8 quantized tensors can be represented by simply having the tensor use the int8 data type. While this is possible in ET-VK, in practice quantized operators expect int8 quantized tensors to be packed so that 16 8-bit values are stored in each `ivec4`, meaning that quantized int8 tensors are loaded/stored with a granularity of 16 elements. The reason for this is twofold:

* Support for the shader int8 / int8 storage buffer extensions is not guaranteed, meaning some devices do not allow using int8 types in shaders.
* We have found that loads/stores from storage buffers/textures that use int8 data types sometimes result in worse memory load performance, since vectorized load/store instructions are not used.

Therefore, in ET-VK we need a way to mark that a quantized tensor should:

1. Use int32 as the underlying data type for the storage buffer/texture
2. Account for the block-packing that may be used

## Changes

First, introduce the `Int8x4` dtype that can be used for packed int8 tensors. This dtype is functionally the same as `Int`, but denotes that each int32 actually contains 4 packed 8-bit values.

Second, introduce new memory layouts: `kPackedInt8_4W4C` and `kPackedInt8_4H4W`. The former will be used for convolution, while the latter will be used for matrix multiplication. See the inline comments for more details about these memory layouts.

Then, update `QuantizedConvolution.cpp` and `QuantizedLinear.cpp` to use the new data type and memory layouts for the packed int8 input tensor.
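As an illustration of the packing scheme described in the Context section above, the sketch below shows how 4 int8 values fit into one int32 and how 4 packed int32 values form one `ivec4`, so that every load/store moves a 16-element block. This is a minimal, self-contained example and not part of this diff; the helper names (`pack_4xint8`, `unpack_int8`, `IVec4`) and the little-endian lane order are assumptions made only for illustration.

```cpp
#include <array>
#include <cstdint>

// Pack 4 int8 values into a single int32. Element 0 occupies the lowest byte.
int32_t pack_4xint8(int8_t a, int8_t b, int8_t c, int8_t d) {
  const uint32_t packed =
      static_cast<uint32_t>(static_cast<uint8_t>(a)) |
      (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
      (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
      (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24);
  return static_cast<int32_t>(packed);
}

// Recover lane `lane` (0-3) from a packed int32.
int8_t unpack_int8(int32_t packed, int lane) {
  return static_cast<int8_t>(
      (static_cast<uint32_t>(packed) >> (lane * 8)) & 0xFF);
}

// CPU-side stand-in for an ivec4: 4 packed int32 values, i.e. 16 int8
// elements, matching the 16-element load/store granularity described above.
using IVec4 = std::array<int32_t, 4>;
```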
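Similarly, the buffer sizing arithmetic introduced in `calculate_gpu_buffer_numel` for the `kPackedInt8_4H4W` layout can be sketched as below. This is a simplified 2D version written only for illustration (the actual implementation works on WHCN-ordered sizes via `flip_and_unsqueeze` and also covers `kPackedInt8_4W4C`); `packed_4h4w_buffer_numel` and the local `div_up_4` are hypothetical stand-ins, where `div_up_4` is assumed to be a ceiling division by 4 like `utils::div_up_4`.

```cpp
#include <cstdint>

// Ceiling division by 4 (assumed to mirror utils::div_up_4).
int64_t div_up_4(int64_t n) {
  return (n + 3) / 4;
}

// Number of int32 elements needed by a buffer-backed tensor under the
// kPackedInt8_4H4W layout, for a 2D (height x width) int8 tensor such as the
// packed im2col / linear input. Each 4Hx4W block of int8 values is stored as
// one ivec4, i.e. 4 int32 elements.
int64_t packed_4h4w_buffer_numel(int64_t height, int64_t width) {
  const int64_t num_blocks = div_up_4(height) * div_up_4(width);
  return num_blocks * 4;  // 4 int32 per block (one ivec4)
}

// Example: a 10x20 int8 matrix -> ceil(10/4) * ceil(20/4) = 3 * 5 = 15 blocks,
// stored as 15 ivec4s = 60 int32 elements (holding 240 packed int8 slots).
```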
ghstack-source-id: 312106548 Differential Revision: [D82542336](https://our.internmc.facebook.com/intern/diff/D82542336/) --- backends/vulkan/CMakeLists.txt | 5 + .../vulkan/runtime/api/containers/Tensor.cpp | 141 +++++++++++++++--- .../vulkan/runtime/api/containers/Tensor.h | 1 + .../graph/ops/impl/QuantizedConvolution.cpp | 10 +- .../graph/ops/impl/QuantizedLinear.cpp | 14 +- .../vulkan/runtime/utils/StorageUtils.cpp | 25 ++++ backends/vulkan/runtime/utils/StorageUtils.h | 33 ++++ backends/vulkan/runtime/vk_api/Types.h | 3 +- .../vulkan/test/vulkan_compute_api_test.cpp | 2 + 9 files changed, 193 insertions(+), 41 deletions(-) create mode 100644 backends/vulkan/runtime/utils/StorageUtils.cpp diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 29ff90e7293..17b2be4e73c 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -105,11 +105,16 @@ target_include_directories( $ ) +# vulkan runtime utils files + +file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp) + # vulkan_backend file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) +list(APPEND vulkan_backend_cpp ${vulkan_runtime_utils_cpp}) add_library(vulkan_backend ${vulkan_backend_cpp}) target_include_directories( diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 433ae15db4e..d798b203673 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 +14,21 @@ namespace vkcompute { namespace api { +/* + * For PackedInt8 memory layouts, ensure that the scalar type used for the + * tensor is kInt8x4. Otherwise, return the original scalar type. + */ +vkapi::ScalarType get_effective_scalar_type( + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) { + vkapi::ScalarType effective_dtype = dtype; + if (utils::is_packed_int8_layout(memory_layout)) { + VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar); + effective_dtype = vkapi::kInt8x4; + } + return effective_dtype; +} + /* * Used to infer the sizes of a tensor that would correspond to a given * VulkanImage. @@ -187,6 +202,7 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { utils::uvec3 extents({1, 1, 1}); @@ -205,6 +221,28 @@ utils::uvec3 calculate_image_extents( extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); } + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for packed int8 + // memory layouts, an additional level of packing is employed where 4 int8 + // elements are packed into one int32, and then 4 int32 are packed into each + // ivec4 texel. + if (utils::is_packed_int8_layout(memory_layout)) { + // Each int in the ivec4 contains 4 channels. The overall ivec4 contains + // data for a 1Hx4Wx4C block of the input tensor. + if (memory_layout == utils::kPackedInt8_4W4C) { + VK_CHECK_COND(packed_dim == 2); + extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + } + // Each int in the ivec4 contains 4 elements along the width dim. The + // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
+ else if (memory_layout == utils::kPackedInt8_4H4W) { + VK_CHECK_COND(packed_dim == 0); + extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + } + // axis_map[3] indicates the WHCN index of the dimension used for batch // concatenation. Thus a double lookup is required to determine the image axis // used for batch concatenation. @@ -215,6 +253,7 @@ utils::uvec3 calculate_image_extents( VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); extents[axis_map.at(packed_dim)] /= 4; + return extents; } @@ -247,35 +286,72 @@ utils::uvec3 calculate_logical_limits( */ utils::uvec3 calculate_logical_limits( const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { return calculate_logical_limits( calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + calculate_padded_sizes(sizes, packed_dim), + memory_layout, + axis_map, + packed_dim), axis_map); } int64_t calculate_gpu_buffer_numel( + const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, + const vkapi::ScalarType dtype) { + size_t numel; + + // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + if (dtype == vkapi::kInt8x4) { + VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); + std::vector blocks_in_dim = + flip_and_unsqueeze(sizes, kTensorSizes, 0); + // Each ivec4 contains data for a 1Hx4Wx4C block of the input + if (memory_layout == utils::kPackedInt8_4W4C) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); + } + // Each ivec4 contains data for a 4Hx4W block of the input + else if (memory_layout == utils::kPackedInt8_4H4W) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + // Each block is represented as an ivec4, and the base dtype of the buffer + // is int. Therefore, need to multiply the number of blocks by 4 to obtain + // the number of int elements in the data buffer. + numel = utils::multiply_integers(blocks_in_dim) * 4; + } + // Case for "regular" dtypes/memory layouts + else { + numel = utils::multiply_integers(sizes); + + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + } + return numel; +} + +int64_t calculate_staging_or_gpu_buffer_numel( Context* const context, const std::vector& sizes, const utils::uvec3 image_extents, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const vkapi::ScalarType dtype) { // For texture backed tensors, simply multiply the total number of texels by 4 if (storage_type != utils::kBuffer) { return image_extents[0] * image_extents[1] * image_extents[2] * 4; } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; + return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); } template ::value>> @@ -332,10 +408,12 @@ vkapi::VulkanImage allocate_image( Context* const context_ptr, utils::uvec3& image_extents, const utils::StorageType storage_type, - const VkFormat image_format, + const vkapi::ScalarType dtype, const bool allocate_memory) { vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype); + vkapi::ImageSampler::Properties sampler_props{ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST, @@ -420,6 +498,7 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, @@ -429,20 +508,22 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents( calculate_padded_sizes(sizes, packed_dim), + memory_layout, axis_map, packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( + buffer_length_{calculate_staging_or_gpu_buffer_numel( context_, sizes, image_extents_, storage_type, + memory_layout, dtype)}, buffer_offset_{0}, image_(allocate_image( context_, image_extents_, storage_type_, - to_vkformat(dtype), + dtype, allocate_memory)), buffer_(allocate_buffer( context_, @@ -553,7 +634,7 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), + : dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), packed_dim_(utils::to_packed_dim(memory_layout)), @@ -576,6 +657,7 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, + memory_layout, axis_map_, packed_dim_, sizes, @@ -785,6 +867,16 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + if (dtype_ == vkapi::kInt8x4) { + switch (packed_dim_) { + case WHCN::kChannelsDim: + return utils::kPackedInt8_4W4C; + case WHCN::kWidthDim: + return utils::kPackedInt8_4H4W; + default: + VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); + } + } switch (packed_dim_) { case WHCN::kWidthDim: return utils::kWidthPacked; @@ -914,8 +1006,8 @@ void vTensor::update_metadata() { flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); + uniform_data_->logical_limits.limits = calculate_logical_limits( + sizes_, estimate_memory_layout(), axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -942,11 +1034,15 @@ void vTensor::update_metadata() { } void vTensor::check_sizes(const std::vector& sizes) const { + utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); + calculate_padded_sizes(sizes_, packed_dim_), + est_memory_layout, + axis_map_, + packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -958,9 +1054,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { valid_resize, "tensor sizes requires a larger texture than the current one."); } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. - int64_t numel = utils::multiply_integers(sizes); + // For buffer storage check that the current buffer is large enough for + // the new sizes of the tensor. + int64_t numel = + calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 66c1fd1e4da..d9fc7784cbc 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -99,6 +99,7 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 51f8138485e..9fc9fd52ad6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -564,16 +564,12 @@ void quantized_conv2d_impl( ValueRef packed_weight_sums = prepack_standard( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - TmpTensor input_int_im2col( &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, + input_im2col_sizes, + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); add_quantize_and_pack_im2col_node( graph, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 7fbfcee5cb1..6c841732d9c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -802,20 +802,12 @@ void quantized_linear_impl( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - TmpTensor packed_int_input( &graph, - {int_input_height, int_input_width}, - vkapi::kInt, + graph.sizes_of(fp_input), + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); // Non dynamically quantized input case if (!input_quant_config.is_dynamic) { diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp new file mode 100644 index 00000000000..cfe3d9e159a --- /dev/null +++ 
b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { +namespace utils { + +bool is_packed_int8_layout(const GPUMemoryLayout layout) { + switch (layout) { + case kPackedInt8_4W4C: + case kPackedInt8_4H4W: + return true; + default: + return false; + } +} + +} // namespace utils +} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 20addf88c53..76edec897c7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -84,9 +84,24 @@ enum class GPUMemoryLayout : uint8_t { * 2. For texture backed tensors, the packed dim will be the specified dim. * The axis map will be `{0, 1, 2, 2}`. */ + TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, + + /* + * The following memory layouts are used for quantized int8 tensors. For the + * above "standard" memory layouts, 4 elements along the packed dim are stored + * in each texel (4-component vectorized type). However, for packed int8 + * memory layouts, an additional level of packing is used where 4 int8 values + * are packed into each int32, and 4 int32 values are packed into each ivec4. + * Conceptually, this allows an additional packed dimension to be used. + * When loading an ivec4 from the GPU storage buffer / texture, data for a + * 16-element block is loaded, rather than 4 elements along one dimension. + */ + + TENSOR_PACKED_INT8_4W4C = 3u, + TENSOR_PACKED_INT8_4H4W = 4u, }; static constexpr GPUMemoryLayout kWidthPacked = @@ -98,6 +113,12 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H4W; + template T to_packed_dim(const GPUMemoryLayout layout) { switch (layout) { @@ -107,11 +128,17 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W4C: + return 2; + case kPackedInt8_4H4W: + return 0; }; // Should be unreachable return 0; } +bool is_packed_int8_layout(const GPUMemoryLayout layout); + inline std::ostream& operator<<( std::ostream& os, const StorageType storage_type) { @@ -142,6 +169,12 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W4C: + os << "TENSOR_PACKED_INT8_4W4C"; + break; + case kPackedInt8_4H4W: + os << "TENSOR_PACKED_INT8_4H4W"; + break; } return os; } diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h index b3309aa6c69..f4415b5c08f 100644 --- a/backends/vulkan/runtime/vk_api/Types.h +++ b/backends/vulkan/runtime/vk_api/Types.h @@ -43,7 +43,8 @@ _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) \ + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int8x4) namespace vkcompute { namespace vkapi { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp 
b/backends/vulkan/test/vulkan_compute_api_test.cpp index a193d02da88..189562178a7 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -187,6 +187,8 @@ std::vector get_reference_strides( default: return {}; } + default: + VK_THROW("Unsupported memory layout: ", layout); } return {}; }