Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,9 +522,6 @@ def register_view_op(features: OpFeatures):
@update_features(
[
# Shape Manipulation
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.permute_copy.default,
exir_ops.edge.aten.t_copy.default,
# Indexing and lookup
exir_ops.edge.aten.flip.default,
Expand Down Expand Up @@ -556,10 +553,15 @@ def register_ported_op(features: OpFeatures):
return features


# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions
@update_features(
[
# Indexing and lookup
exir_ops.edge.aten.slice_copy.Tensor,
# Shape Manipulation
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
Expand Down
53 changes: 29 additions & 24 deletions backends/vulkan/runtime/graph/ops/glsl/permute.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -21,56 +21,61 @@ layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_i

layout(push_constant) uniform PRECISION restrict Block {
ivec4 out_limits;
ivec4 sizes;
ivec4 in_sizes;
// output dims
ivec4 out_ndims;
// x = output channels aligned to 4, y = input channels aligned to 4
ivec2 ch_info;
ivec2 channel_info;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
layout(constant_id = 3) const int packed_dim = C_DIM;

#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

void main() {
const u16vec3 pos = u16vec3(gl_GlobalInvocationID);
u16vec3 pos = u16vec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, out_limits.xyz))) {
return;
}

const int out_channel_4up = int(ch_info.x);
const int in_channel_4up = int(ch_info.y);
const int out_batch = int(sizes[3]);
VEC4_T outval = VEC4_T(0.0);
ivec4 v = ivec4(0); // holds b,c,h,w

v[out_ndims[2]] = pos.y;
v[out_ndims[3]] = pos.x;
// scale up output position's packed dim
pos[packed_dim] <<= 2;

const int dst_index = pos.z << 2;
int dst_out_index = dst_index / out_channel_4up;
int dst_out_lane = dst_index % out_channel_4up;
// index of packed dim in bchw format
const int in_packed_dim_bchw_index = 3 - packed_dim;

for (int j = 0; j < 4; ++j, ++dst_out_lane) {
if (dst_out_index >= out_batch) {
// out of range
for (int j = 0; j < 4; ++j, pos[packed_dim]++) {
ivec4 in_bchw_pos = ivec4(0); // holds b,c,h,w
// determine input position based on output position and permute map
// out_ndims is in BCHW format
in_bchw_pos[out_ndims[0]] = (pos.z / channel_info.x);
in_bchw_pos[out_ndims[1]] = (pos.z % channel_info.x);
in_bchw_pos[out_ndims[2]] = pos.y;
in_bchw_pos[out_ndims[3]] = pos.x;

if (any(greaterThanEqual(in_bchw_pos.wzyx, in_sizes.xyzw))) {
break;
}

if (dst_out_lane == out_channel_4up) {
dst_out_lane = 0;
dst_out_index++;
}
// input tensor's packed dim pos (in xyz format) corresponding to output tensor's pos (which is also in xyz format)
const int in_packed_dim_pos = in_bchw_pos[in_packed_dim_bchw_index];

v[out_ndims[0]] = dst_out_index;
v[out_ndims[1]] = dst_out_lane;
// calculate input position in y axis using batch and channel index which is in_bchw_pos.x and in_bchw_pos.y respectively
in_bchw_pos.y = in_bchw_pos.y + in_bchw_pos.x * channel_info.y;

int src_index = v[0] * in_channel_4up + v[1];
// scale down input tensor's packed dim pos to perform fetch
in_bchw_pos[in_packed_dim_bchw_index] >>= 2;

VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(v[3], v[2], src_index >> 2), 0));
outval[j] = inval[src_index & 0x3];
// fetch input texel
VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(in_bchw_pos.wzy), 0));
outval[j] = inval[in_packed_dim_pos & 0x3];
}

pos[packed_dim] = uint16_t(gl_GlobalInvocationID[packed_dim]);

imageStore(image_out, pos, outval);
}
19 changes: 11 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/Permute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ void check_args(
const api::vTensor& in,
const std::vector<int64_t>& permute_dims,
const api::vTensor& out) {
VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
VK_CHECK_COND(check_same_packed_dim(in, out));

// This implementation does not require the input tensor to have the same
// dim size as the argument. The code will work as long as the input tensor's
Expand Down Expand Up @@ -72,10 +71,14 @@ void add_permute_node(
int32_t out_channels = dim_at<kChannel4D>(t_out->sizes());
int32_t in_channels = dim_at<kChannel4D>(t_in->sizes());

int32_t out_c_aligned = utils::align_up_4(out_channels);
int32_t in_c_aligned = utils::align_up_4(in_channels);
const auto packed_dim = graph.packed_dim_of(in);
ivec2 channel_info = {out_channels, in_channels};
if (packed_dim == WHCN::kChannelsDim) {
channel_info[0] = utils::align_up_4(channel_info[0]);
channel_info[1] = utils::align_up_4(channel_info[1]);
}

const ivec2 ch_info = {out_c_aligned, in_c_aligned};
const vkapi::SpecVarList spec_vars = {packed_dim};

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
Expand All @@ -86,14 +89,14 @@ void add_permute_node(
{in, vkapi::MemoryAccessType::READ}},
{},
// Specialization Constants
{},
spec_vars,
// Resizing Logic
nullptr,
{},
{{graph.logical_limits_pc_of(out),
graph.sizes_pc_of(out),
graph.sizes_pc_of(in),
PushConstantDataInfo(&out_dims, sizeof(out_dims)),
PushConstantDataInfo(&ch_info, sizeof(ch_info))}}));
PushConstantDataInfo(&channel_info, sizeof(channel_info))}}));
}

void add_permute_node(
Expand Down
34 changes: 15 additions & 19 deletions backends/vulkan/test/op_tests/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# LICENSE file in the root directory of this source tree.


import itertools

from collections import namedtuple
from typing import Callable

Expand Down Expand Up @@ -457,26 +459,20 @@ def get_select_int_inputs():

@register_test_suite(["aten.permute.default", "aten.permute_copy.default"])
def get_permute_inputs():
test_suite = VkTestSuite(
[
((9, 2, 9, 4), [0, 1, 2, 3]),
((9, 2, 9, 4), [0, 1, 3, 2]),
((9, 2, 9, 4), [0, 2, 1, 3]),
((9, 2, 9, 4), [0, 2, 3, 1]),
((9, 2, 9, 4), [0, 3, 1, 2]),
((9, 2, 9, 4), [0, 3, 2, 1]),
((9, 2, 9, 4), [3, 0, 1, 2]),
((9, 2, 9, 4), [3, 2, 0, 1]),
((9, 2, 9, 4), [2, 3, 0, 1]),
((9, 2, 9, 4), [2, 0, 3, 1]),
((9, 2, 9), [2, 0, 1]),
((9, 2, 9), [1, 2, 0]),
((9, 2), [0, 1]),
((9, 2), [1, 0]),
]
)
batch_tests = [
((9, 2, 5, 7), out_axis) for out_axis in itertools.permutations([0, 1, 2, 3])
]
channel_tests = [
((9, 2, 5), out_axis) for out_axis in itertools.permutations([0, 1, 2])
]
wh_tests = [((9, 2), out_axis) for out_axis in itertools.permutations([0, 1])]
test_suite = VkTestSuite(batch_tests + channel_tests + wh_tests)

test_suite.layouts = ["utils::kChannelsPacked"]
test_suite.layouts = [
"utils::kWidthPacked",
"utils::kHeightPacked",
"utils::kChannelsPacked",
]
return test_suite


Expand Down
Loading