From 0c0451977a59fd3132b2acbadf4aeccf81d8b2fc Mon Sep 17 00:00:00 2001
From: ssjia
Date: Thu, 25 Sep 2025 08:52:20 -0700
Subject: [PATCH] [ET-VK] Add `kInt8x4` dtype and `GPUMemoryLayout`s for packed quantized tensors

Pull Request resolved: https://github.com/pytorch/executorch/pull/14329

## Motivation

Lay the foundations for being able to execute statically quantized CNNs with ET-VK. Unlike with dynamic quantization, static quantization allows the output of quantized operators to stay in integer representation and be fed directly to the next quantized operator.

## Context

Typically, int8 quantized tensors can be represented by simply having the tensor use the int8 data type. While this is possible in ET-VK, in practice quantized operators expect int8 quantized tensors to be packed so that 16 8-bit values are stored in each `ivec4`, meaning that quantized int8 tensors are loaded/stored with a granularity of 16 elements. The reason for this is twofold:

* Support for the shader int8 / int8 storage buffer extensions is not guaranteed, meaning some devices do not allow using int8 types in shaders.
* We have found that loads/stores from storage buffers/textures that use int8 data types sometimes result in worse memory load performance, since vectorized load/store instructions are not used.

Therefore, in ET-VK we need a way to mark that a quantized tensor should:

1. Use int32 as the underlying data type for the storage buffer/texture
2. Account for the block-packing that may be used

## Changes

First, introduce the `Int8x4` dtype that can be used for packed int8 tensors. This dtype is functionally the same as `Int`, but denotes that each int32 actually contains 4 packed 8-bit values.

Second, introduce new memory layouts: `kPackedInt8_4W4C` and `kPackedInt8_4H4W`. The former will be used for convolution, while the latter will be used for matrix multiplication. See the inline comments for more details about these memory layouts.

Then, update `QuantizedConvolution.cpp` and `QuantizedLinear.cpp` to use the new data type and memory layouts for the packed int8 input tensor.
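As an illustration of the packing scheme described in the Context section above, the sketch below shows how 4 int8 values fit into one int32 and how 4 packed int32 values form one `ivec4`, so that every load/store moves a 16-element block. This is a minimal, self-contained example and not part of this diff; the helper names (`pack_4xint8`, `unpack_int8`, `IVec4`) and the little-endian lane order are assumptions made only for illustration.

```cpp
#include <array>
#include <cstdint>

// Pack 4 int8 values into a single int32. Element 0 occupies the lowest byte.
int32_t pack_4xint8(int8_t a, int8_t b, int8_t c, int8_t d) {
  const uint32_t packed =
      static_cast<uint32_t>(static_cast<uint8_t>(a)) |
      (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
      (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
      (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24);
  return static_cast<int32_t>(packed);
}

// Recover lane `lane` (0-3) from a packed int32.
int8_t unpack_int8(int32_t packed, int lane) {
  return static_cast<int8_t>(
      (static_cast<uint32_t>(packed) >> (lane * 8)) & 0xFF);
}

// CPU-side stand-in for an ivec4: 4 packed int32 values, i.e. 16 int8
// elements, matching the 16-element load/store granularity described above.
using IVec4 = std::array<int32_t, 4>;
```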
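Similarly, the buffer sizing arithmetic introduced in `calculate_gpu_buffer_numel` for the `kPackedInt8_4H4W` layout can be sketched as below. This is a simplified 2D version written only for illustration (the actual implementation works on WHCN-ordered sizes via `flip_and_unsqueeze` and also covers `kPackedInt8_4W4C`); `packed_4h4w_buffer_numel` and the local `div_up_4` are hypothetical stand-ins, where `div_up_4` is assumed to be a ceiling division by 4 like `utils::div_up_4`.

```cpp
#include <cstdint>

// Ceiling division by 4 (assumed to mirror utils::div_up_4).
int64_t div_up_4(int64_t n) {
  return (n + 3) / 4;
}

// Number of int32 elements needed by a buffer-backed tensor under the
// kPackedInt8_4H4W layout, for a 2D (height x width) int8 tensor such as the
// packed im2col / linear input. Each 4Hx4W block of int8 values is stored as
// one ivec4, i.e. 4 int32 elements.
int64_t packed_4h4w_buffer_numel(int64_t height, int64_t width) {
  const int64_t num_blocks = div_up_4(height) * div_up_4(width);
  return num_blocks * 4;  // 4 int32 per block (one ivec4)
}

// Example: a 10x20 int8 matrix -> ceil(10/4) * ceil(20/4) = 3 * 5 = 15 blocks,
// stored as 15 ivec4s = 60 int32 elements (holding 240 packed int8 slots).
```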
ghstack-source-id: 312106548 Differential Revision: [D82542336](https://our.internmc.facebook.com/intern/diff/D82542336/) --- backends/vulkan/CMakeLists.txt | 5 + .../vulkan/runtime/api/containers/Tensor.cpp | 141 +++++++++++++++--- .../vulkan/runtime/api/containers/Tensor.h | 1 + .../graph/ops/impl/QuantizedConvolution.cpp | 10 +- .../graph/ops/impl/QuantizedLinear.cpp | 14 +- .../vulkan/runtime/utils/StorageUtils.cpp | 25 ++++ backends/vulkan/runtime/utils/StorageUtils.h | 33 ++++ backends/vulkan/runtime/vk_api/Types.h | 3 +- .../vulkan/test/vulkan_compute_api_test.cpp | 2 + 9 files changed, 193 insertions(+), 41 deletions(-) create mode 100644 backends/vulkan/runtime/utils/StorageUtils.cpp diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 29ff90e7293..17b2be4e73c 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -105,11 +105,16 @@ target_include_directories( $ ) +# vulkan runtime utils files + +file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp) + # vulkan_backend file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) +list(APPEND vulkan_backend_cpp ${vulkan_runtime_utils_cpp}) add_library(vulkan_backend ${vulkan_backend_cpp}) target_include_directories( diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 433ae15db4e..d798b203673 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 +14,21 @@ namespace vkcompute { namespace api { +/* + * For PackedInt8 memory layouts, ensure that the scalar type used for the + * tensor is kInt8x4. Otherwise, return the original scalar type. + */ +vkapi::ScalarType get_effective_scalar_type( + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) { + vkapi::ScalarType effective_dtype = dtype; + if (utils::is_packed_int8_layout(memory_layout)) { + VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar); + effective_dtype = vkapi::kInt8x4; + } + return effective_dtype; +} + /* * Used to infer the sizes of a tensor that would correspond to a given * VulkanImage. @@ -187,6 +202,7 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { utils::uvec3 extents({1, 1, 1}); @@ -205,6 +221,28 @@ utils::uvec3 calculate_image_extents( extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); } + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for packed int8 + // memory layouts, an additional level of packing is employed where 4 int8 + // elements are packed into one int32, and then 4 int32 are packed into each + // ivec4 texel. + if (utils::is_packed_int8_layout(memory_layout)) { + // Each int in the ivec4 contains 4 channels. The overall ivec4 contains + // data for a 1Hx4Wx4C block of the input tensor. + if (memory_layout == utils::kPackedInt8_4W4C) { + VK_CHECK_COND(packed_dim == 2); + extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + } + // Each int in the ivec4 contains 4 elements along the width dim. The + // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
+ else if (memory_layout == utils::kPackedInt8_4H4W) { + VK_CHECK_COND(packed_dim == 0); + extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + } + // axis_map[3] indicates the WHCN index of the dimension used for batch // concatenation. Thus a double lookup is required to determine the image axis // used for batch concatenation. @@ -215,6 +253,7 @@ utils::uvec3 calculate_image_extents( VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); extents[axis_map.at(packed_dim)] /= 4; + return extents; } @@ -247,35 +286,72 @@ utils::uvec3 calculate_logical_limits( */ utils::uvec3 calculate_logical_limits( const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { return calculate_logical_limits( calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + calculate_padded_sizes(sizes, packed_dim), + memory_layout, + axis_map, + packed_dim), axis_map); } int64_t calculate_gpu_buffer_numel( + const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, + const vkapi::ScalarType dtype) { + size_t numel; + + // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + if (dtype == vkapi::kInt8x4) { + VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); + std::vector blocks_in_dim = + flip_and_unsqueeze(sizes, kTensorSizes, 0); + // Each ivec4 contains data for a 1Hx4Wx4C block of the input + if (memory_layout == utils::kPackedInt8_4W4C) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); + } + // Each ivec4 contains data for a 4Hx4W block of the input + else if (memory_layout == utils::kPackedInt8_4H4W) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + // Each block is represented as an ivec4, and the base dtype of the buffer + // is int. Therefore, need to multiply the number of blocks by 4 to obtain + // the number of int elements in the data buffer. + numel = utils::multiply_integers(blocks_in_dim) * 4; + } + // Case for "regular" dtypes/memory layouts + else { + numel = utils::multiply_integers(sizes); + + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + } + return numel; +} + +int64_t calculate_staging_or_gpu_buffer_numel( Context* const context, const std::vector& sizes, const utils::uvec3 image_extents, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const vkapi::ScalarType dtype) { // For texture backed tensors, simply multiply the total number of texels by 4 if (storage_type != utils::kBuffer) { return image_extents[0] * image_extents[1] * image_extents[2] * 4; } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; + return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); } template ::value>> @@ -332,10 +408,12 @@ vkapi::VulkanImage allocate_image( Context* const context_ptr, utils::uvec3& image_extents, const utils::StorageType storage_type, - const VkFormat image_format, + const vkapi::ScalarType dtype, const bool allocate_memory) { vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype); + vkapi::ImageSampler::Properties sampler_props{ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST, @@ -420,6 +498,7 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, @@ -429,20 +508,22 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents( calculate_padded_sizes(sizes, packed_dim), + memory_layout, axis_map, packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( + buffer_length_{calculate_staging_or_gpu_buffer_numel( context_, sizes, image_extents_, storage_type, + memory_layout, dtype)}, buffer_offset_{0}, image_(allocate_image( context_, image_extents_, storage_type_, - to_vkformat(dtype), + dtype, allocate_memory)), buffer_(allocate_buffer( context_, @@ -553,7 +634,7 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), + : dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), packed_dim_(utils::to_packed_dim(memory_layout)), @@ -576,6 +657,7 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, + memory_layout, axis_map_, packed_dim_, sizes, @@ -785,6 +867,16 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + if (dtype_ == vkapi::kInt8x4) { + switch (packed_dim_) { + case WHCN::kChannelsDim: + return utils::kPackedInt8_4W4C; + case WHCN::kWidthDim: + return utils::kPackedInt8_4H4W; + default: + VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); + } + } switch (packed_dim_) { case WHCN::kWidthDim: return utils::kWidthPacked; @@ -914,8 +1006,8 @@ void vTensor::update_metadata() { flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); + uniform_data_->logical_limits.limits = calculate_logical_limits( + sizes_, estimate_memory_layout(), axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -942,11 +1034,15 @@ void vTensor::update_metadata() { } void vTensor::check_sizes(const std::vector& sizes) const { + utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); + calculate_padded_sizes(sizes_, packed_dim_), + est_memory_layout, + axis_map_, + packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -958,9 +1054,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { valid_resize, "tensor sizes requires a larger texture than the current one."); } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. - int64_t numel = utils::multiply_integers(sizes); + // For buffer storage check that the current buffer is large enough for + // the new sizes of the tensor. + int64_t numel = + calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 66c1fd1e4da..d9fc7784cbc 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -99,6 +99,7 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 51f8138485e..9fc9fd52ad6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -564,16 +564,12 @@ void quantized_conv2d_impl( ValueRef packed_weight_sums = prepack_standard( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - TmpTensor input_int_im2col( &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, + input_im2col_sizes, + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); add_quantize_and_pack_im2col_node( graph, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 7fbfcee5cb1..6c841732d9c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -802,20 +802,12 @@ void quantized_linear_impl( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - TmpTensor packed_int_input( &graph, - {int_input_height, int_input_width}, - vkapi::kInt, + graph.sizes_of(fp_input), + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); // Non dynamically quantized input case if (!input_quant_config.is_dynamic) { diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp new file mode 100644 index 00000000000..cfe3d9e159a --- /dev/null +++ 
b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { +namespace utils { + +bool is_packed_int8_layout(const GPUMemoryLayout layout) { + switch (layout) { + case kPackedInt8_4W4C: + case kPackedInt8_4H4W: + return true; + default: + return false; + } +} + +} // namespace utils +} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 20addf88c53..76edec897c7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -84,9 +84,24 @@ enum class GPUMemoryLayout : uint8_t { * 2. For texture backed tensors, the packed dim will be the specified dim. * The axis map will be `{0, 1, 2, 2}`. */ + TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, + + /* + * The following memory layouts are used for quantized int8 tensors. For the + * above "standard" memory layouts, 4 elements along the packed dim are stored + * in each texel (4-component vectorized type). However, for packed int8 + * memory layouts, an additional level of packing is used where 4 int8 values + * are packed into each int32, and 4 int32 values are packed into each ivec4. + * Conceptually, this allows an additional packed dimension to be used. + * When loading an ivec4 from the GPU storage buffer / texture, data for a + * 16-element block is loaded, rather than 4 elements along one dimension. + */ + + TENSOR_PACKED_INT8_4W4C = 3u, + TENSOR_PACKED_INT8_4H4W = 4u, }; static constexpr GPUMemoryLayout kWidthPacked = @@ -98,6 +113,12 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H4W; + template T to_packed_dim(const GPUMemoryLayout layout) { switch (layout) { @@ -107,11 +128,17 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W4C: + return 2; + case kPackedInt8_4H4W: + return 0; }; // Should be unreachable return 0; } +bool is_packed_int8_layout(const GPUMemoryLayout layout); + inline std::ostream& operator<<( std::ostream& os, const StorageType storage_type) { @@ -142,6 +169,12 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W4C: + os << "TENSOR_PACKED_INT8_4W4C"; + break; + case kPackedInt8_4H4W: + os << "TENSOR_PACKED_INT8_4H4W"; + break; } return os; } diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h index b3309aa6c69..f4415b5c08f 100644 --- a/backends/vulkan/runtime/vk_api/Types.h +++ b/backends/vulkan/runtime/vk_api/Types.h @@ -43,7 +43,8 @@ _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) \ + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int8x4) namespace vkcompute { namespace vkapi { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp 
b/backends/vulkan/test/vulkan_compute_api_test.cpp index a193d02da88..189562178a7 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -187,6 +187,8 @@ std::vector get_reference_strides( default: return {}; } + default: + VK_THROW("Unsupported memory layout: ", layout); } return {}; }