5 changes: 5 additions & 0 deletions backends/vulkan/CMakeLists.txt
@@ -105,11 +105,16 @@ target_include_directories(
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/third-party/flatbuffers/include>
)

# vulkan runtime utils files

file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp)

# vulkan_backend

file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp)
list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp})
list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp})
list(APPEND vulkan_backend_cpp ${vulkan_runtime_utils_cpp})

add_library(vulkan_backend ${vulkan_backend_cpp})
target_include_directories(
141 changes: 119 additions & 22 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -14,6 +14,21 @@
namespace vkcompute {
namespace api {

/*
* For PackedInt8 memory layouts, ensure that the scalar type used for the
* tensor is kInt8x4. Otherwise, return the original scalar type.
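 * For example, a tensor created with kChar and a kPackedInt8_4W4C layout will
 * be stored internally using the kInt8x4 scalar type.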
*/
vkapi::ScalarType get_effective_scalar_type(
const vkapi::ScalarType dtype,
const utils::GPUMemoryLayout memory_layout) {
vkapi::ScalarType effective_dtype = dtype;
if (utils::is_packed_int8_layout(memory_layout)) {
VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar);
effective_dtype = vkapi::kInt8x4;
}
return effective_dtype;
}

/*
* Used to infer the sizes of a tensor that would correspond to a given
* VulkanImage.
@@ -187,6 +202,7 @@ std::vector<int64_t> calculate_padded_sizes(

utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim) {
utils::uvec3 extents({1, 1, 1});
@@ -205,6 +221,28 @@ utils::uvec3 calculate_image_extents(
extents[axis] = utils::safe_downcast<uint32_t>(padded_sizes.at(dim));
}

// For "regular" tensor dtypes, 4 elements along the packed dim are packed
// into one texel (4-component vectorized type). However, for packed int8
// memory layouts, an additional level of packing is employed where 4 int8
// elements are packed into one int32, and then 4 int32 are packed into each
// ivec4 texel.
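// For example, with kPackedInt8_4W4C a tensor with sizes (N=1, C=8, H=6,
// W=10) yields div_up(10, 4) = 3 texels along the width axis, 6 along the
// height axis, and 8 / 4 = 2 along the channels axis (after the packed dim
// division below); each ivec4 texel then covers a 1Hx4Wx4C block.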
if (utils::is_packed_int8_layout(memory_layout)) {
// Each int in the ivec4 contains 4 channels. The overall ivec4 contains
// data for a 1Hx4Wx4C block of the input tensor.
if (memory_layout == utils::kPackedInt8_4W4C) {
VK_CHECK_COND(packed_dim == 2);
extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u);
}
// Each int in the ivec4 contains 4 elements along the width dim. The
// overall ivec4 contains data for a 4Hx4W block of the input tensor.
else if (memory_layout == utils::kPackedInt8_4H4W) {
VK_CHECK_COND(packed_dim == 0);
extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u);
} else {
VK_THROW("Unhandled packed int8 memory layout!");
}
}

// axis_map[3] indicates the WHCN index of the dimension used for batch
// concatenation. Thus a double lookup is required to determine the image axis
// used for batch concatenation.
@@ -215,6 +253,7 @@

VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0);
extents[axis_map.at(packed_dim)] /= 4;

return extents;
}

Expand Down Expand Up @@ -247,35 +286,72 @@ utils::uvec3 calculate_logical_limits(
*/
utils::uvec3 calculate_logical_limits(
const std::vector<int64_t>& sizes,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim) {
return calculate_logical_limits(
calculate_image_extents(
calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim),
calculate_padded_sizes(sizes, packed_dim),
memory_layout,
axis_map,
packed_dim),
axis_map);
}

int64_t calculate_gpu_buffer_numel(
const std::vector<int64_t>& sizes,
const utils::GPUMemoryLayout memory_layout,
const vkapi::ScalarType dtype) {
size_t numel;

// Mirrors the logic in calculate_image_extents for packed int8 memory layouts
if (dtype == vkapi::kInt8x4) {
VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout));
std::vector<int64_t> blocks_in_dim =
flip_and_unsqueeze<int64_t>(sizes, kTensorSizes, 0);
// Each ivec4 contains data for a 1Hx4Wx4C block of the input
if (memory_layout == utils::kPackedInt8_4W4C) {
blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]);
}
// Each ivec4 contains data for a 4Hx4W block of the input
else if (memory_layout == utils::kPackedInt8_4H4W) {
blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]);
blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]);
} else {
VK_THROW("Unhandled packed int8 memory layout!");
}
// Each block is represented as an ivec4, and the base dtype of the buffer
// is int. Therefore, the number of blocks must be multiplied by 4 to obtain
// the number of int elements in the data buffer.
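// For example, a (C=8, H=6, W=10) tensor with kPackedInt8_4W4C has a block
// grid of {div_up_4(10), 6, div_up_4(8), 1} = {3, 6, 2, 1}, so the buffer
// holds 3 * 6 * 2 * 1 * 4 = 144 int elements.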
numel = utils::multiply_integers(blocks_in_dim) * 4;
}
// Case for "regular" dtypes/memory layouts
else {
numel = utils::multiply_integers(sizes);

// For 8-bit types, align to the next multiple of 4. For devices that do not
// support 8-bit storage buffers, the tensor data will be interpreted as an
// array of int32 instead.
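// For example, a 10-element int8 tensor is allocated as if it had 12
// elements.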
if (vkapi::element_size(dtype) == 1) {
numel = utils::align_up_4(numel);
}
}
return numel;
}

int64_t calculate_staging_or_gpu_buffer_numel(
Context* const context,
const std::vector<int64_t>& sizes,
const utils::uvec3 image_extents,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const vkapi::ScalarType dtype) {
// For texture backed tensors, simply multiply the total number of texels by 4
if (storage_type != utils::kBuffer) {
return image_extents[0] * image_extents[1] * image_extents[2] * 4;
}
const bool is_int8 = dtype == vkapi::kChar;
const bool int8_supported =
context->adapter_ptr()->has_full_int8_buffers_support();
const size_t numel = utils::multiply_integers(sizes);
// For int8 tensors, if the device does not support int8 buffers, then int32
// is used instead to represent the buffer data. Therefore the number of
// elements in the buffer is aligned to the next multiple of 4.
if (is_int8 && int8_supported) {
return utils::align_up_4(numel);
}
return numel;
return calculate_gpu_buffer_numel(sizes, memory_layout, dtype);
}

template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
@@ -332,10 +408,12 @@ vkapi::VulkanImage allocate_image(
Context* const context_ptr,
utils::uvec3& image_extents,
const utils::StorageType storage_type,
const VkFormat image_format,
const vkapi::ScalarType dtype,
const bool allocate_memory) {
vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr();

const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype);

vkapi::ImageSampler::Properties sampler_props{
VK_FILTER_NEAREST,
VK_SAMPLER_MIPMAP_MODE_NEAREST,
@@ -420,6 +498,7 @@ vkapi::VulkanBuffer allocate_buffer(
vTensorStorage::vTensorStorage(
Context* const context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim,
const std::vector<int64_t>& sizes,
@@ -429,20 +508,22 @@ vTensorStorage::vTensorStorage(
storage_type_{storage_type},
image_extents_(calculate_image_extents(
calculate_padded_sizes(sizes, packed_dim),
memory_layout,
axis_map,
packed_dim)),
buffer_length_{calculate_gpu_buffer_numel(
buffer_length_{calculate_staging_or_gpu_buffer_numel(
context_,
sizes,
image_extents_,
storage_type,
memory_layout,
dtype)},
buffer_offset_{0},
image_(allocate_image(
context_,
image_extents_,
storage_type_,
to_vkformat(dtype),
dtype,
allocate_memory)),
buffer_(allocate_buffer(
context_,
@@ -553,7 +634,7 @@ vTensor::vTensor(
const utils::GPUMemoryLayout memory_layout,
const bool allocate_memory,
const utils::AxisMapLayout axis_map_layout)
: dtype_(dtype),
: dtype_(get_effective_scalar_type(dtype, memory_layout)),
// Calculate tensor metadata
sizes_(sizes.begin(), sizes.end()),
packed_dim_(utils::to_packed_dim<int32_t>(memory_layout)),
@@ -576,6 +657,7 @@ vTensor::vTensor(
storage_(std::make_shared<vTensorStorage>(
context,
storage_type,
memory_layout,
axis_map_,
packed_dim_,
sizes,
@@ -785,6 +867,16 @@ vkapi::VulkanBuffer& vTensor::buffer(
}

utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
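// Tensors with the kInt8x4 dtype are expected to use one of the packed int8
// memory layouts, so the packed dim is sufficient to recover the layout.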
if (dtype_ == vkapi::kInt8x4) {
switch (packed_dim_) {
case WHCN::kChannelsDim:
return utils::kPackedInt8_4W4C;
case WHCN::kWidthDim:
return utils::kPackedInt8_4H4W;
default:
VK_THROW("Invalid packed dim for Tensor with kInt8x4 type");
}
}
switch (packed_dim_) {
case WHCN::kWidthDim:
return utils::kWidthPacked;
@@ -914,8 +1006,8 @@ void vTensor::update_metadata() {
flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
uniform_data_->strides_v =
flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
uniform_data_->logical_limits.limits =
calculate_logical_limits(sizes_, axis_map_, packed_dim_);
uniform_data_->logical_limits.limits = calculate_logical_limits(
sizes_, estimate_memory_layout(), axis_map_, packed_dim_);

if (sizes_uniform_offset_ != kUniformOffsetUnset) {
uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
@@ -942,11 +1034,15 @@ void vTensor::update_metadata() {
}

void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout();
if (storage_type() != utils::kBuffer) {
// For texture storage check that the current texture is large enough for
// the new sizes of the tensor.
utils::uvec3 virtual_extents = calculate_image_extents(
calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_);
calculate_padded_sizes(sizes_, packed_dim_),
est_memory_layout,
axis_map_,
packed_dim_);

bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0];
valid_resize =
@@ -958,9 +1054,10 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
valid_resize,
"tensor sizes requires a larger texture than the current one.");
} else {
// For buffer storage check that the current buffer is large enough for the
// new sizes of the tensor.
int64_t numel = utils::multiply_integers(sizes);
// For buffer storage check that the current buffer is large enough for
// the new sizes of the tensor.
int64_t numel =
calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_);
bool valid_resize =
numel + storage_->buffer_offset_ <= storage_->buffer_length_;
VK_CHECK_COND(
1 change: 1 addition & 0 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -99,6 +99,7 @@ class vTensorStorage final {
vTensorStorage(
Context* context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim,
const std::vector<int64_t>& sizes,
@@ -564,16 +564,12 @@ void quantized_conv2d_impl(
ValueRef packed_weight_sums = prepack_standard(
graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked);

// Allocate quantized + packed im2col matrix for input
const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0));
const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1));
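// The 4Hx4W block grid is derived inside vTensor from the kPackedInt8_4H4W
// layout, so the im2col sizes can be passed directly.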

TmpTensor input_int_im2col(
&graph,
{num_blocks_M, num_blocks_K * 4},
vkapi::kInt,
input_im2col_sizes,
vkapi::kInt8x4,
utils::kBuffer,
utils::kWidthPacked);
utils::kPackedInt8_4H4W);

add_quantize_and_pack_im2col_node(
graph,
14 changes: 3 additions & 11 deletions backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -802,20 +802,12 @@ void quantized_linear_impl(
graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked);

// Allocate temporary tensor to store quantized and packed input

int64_t num_blocks_M, num_blocks_K;
std::tie(num_blocks_M, num_blocks_K) =
get_quantized_input_num_blocks(graph, fp_input);

const int64_t int_input_height = num_blocks_M;
const int64_t int_input_width = num_blocks_K * 4;
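// The 4Hx4W block grid is derived inside vTensor from the kPackedInt8_4H4W
// layout, so the sizes of fp_input can be passed directly.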

TmpTensor packed_int_input(
&graph,
{int_input_height, int_input_width},
vkapi::kInt,
graph.sizes_of(fp_input),
vkapi::kInt8x4,
utils::kBuffer,
utils::kWidthPacked);
utils::kPackedInt8_4H4W);

// Non dynamically quantized input case
if (!input_quant_config.is_dynamic) {
25 changes: 25 additions & 0 deletions backends/vulkan/runtime/utils/StorageUtils.cpp
@@ -0,0 +1,25 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>

namespace vkcompute {
namespace utils {

bool is_packed_int8_layout(const GPUMemoryLayout layout) {
switch (layout) {
case kPackedInt8_4W4C:
case kPackedInt8_4H4W:
return true;
default:
return false;
}
}

} // namespace utils
} // namespace vkcompute