Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vulkan] Add mean.dim op for vulkan #47312

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 9 additions & 19 deletions aten/src/ATen/native/vulkan/glsl/mean.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,26 @@
#define PRECISION $precision
layout(std430) buffer;
layout(std430) uniform;
layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform constBlock {
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform Block {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please mark this with PRECISION and restrict.

int W;
int H;
int OW;
int OH;
}
uConstBlock;
} uBlock;

layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);
int W = uConstBlock.W;
int H = uConstBlock.H;
int OW = uConstBlock.OW;
int OH = uConstBlock.OH;
vec4 r = vec4(1.0) / float(W) / float(H);
vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check adaptive_avg_pool here https://github.com/pytorch/pytorch/pull/47261/files for another implementation. Divisions are typically slower than multiplications.

vec4 acc = vec4(0);
int xi, yi;
for (xi = 0; xi < W; ++xi) {
for (yi = 0; yi < H; ++yi) {
for (xi = 0; xi < uBlock.W; ++xi) {
for (yi = 0; yi < uBlock.H; ++yi) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Iterate over y in the outer loop, and x in the inner loop. We are dealing with a texture that is packed in an opaque format, so this might not apply, but if and when the memory is laid out linearly that traversal has better locality of access.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check adaptive_avg_pool shader here: https://github.com/pytorch/pytorch/pull/47261/files

acc += texelFetch(uInput, ivec3(xi, yi, pos.z), 0);
}
}
vec4 outValue = r * acc;
for (int vi = 0; vi < 4; ++vi) {
int oy = (4 * pos.z + vi) / OW;
int ox = (4 * pos.z + vi) % OW;
imageStore(uOutput, ivec3(ox, oy, 0), vec4(outValue[vi], 0, 0, 0));
}

imageStore(uOutput, pos, outValue);
}
30 changes: 30 additions & 0 deletions aten/src/ATen/native/vulkan/glsl/mean2d.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#version 450 core
// PRECISION expands to the $precision placeholder; presumably this is
// substituted before the shader is compiled — TODO confirm against the build.
#define PRECISION $precision
layout(std430) buffer;
layout(std430) uniform;
// Binding 0: write-only rgba16f 3D image that main() stores its result into.
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput;
// Binding 1: 3D sampler that main() reads the input texels from.
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
// Binding 2: uniform block carrying the W and H loop bounds used by main().
layout(set = 0, binding = 2) uniform Block {
int W;
int H;
} uBlock;

// Work-group size along x/y/z comes from specialization constants 1/2/3.
layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;

// Per-invocation entry point: sums every texel of one z-slice of uInput over
// x in [0, uBlock.W) and y in [0, uBlock.H), scales the sum by 1 / (W * H),
// and stores a single component of that average at this invocation's output
// position.
void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);
// Reciprocal of the number of texels summed below.
vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
vec4 acc = vec4(0);
int xi, yi;
// Linear index of this output texel (row-major over the output image's x
// extent) split into the input z-slice (index / 4) and the component within
// that slice's vec4 (index % 4).
int zi = (imageSize(uOutput).x*pos.y + pos.x)/4;
int zo = (imageSize(uOutput).x*pos.y + pos.x)%4;
for (xi = 0; xi < uBlock.W; ++xi) {
for (yi = 0; yi < uBlock.H; ++yi) {
acc += texelFetch(uInput, ivec3(xi, yi, zi), 0);
}
}
vec4 outValue = r * acc;

// NOTE(review): `test` is never read anywhere in this function.
int test = (imageSize(uOutput).x*pos.x + pos.x);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clean?

// Only component zo of the averaged vec4 is written, in the first channel;
// the remaining three channels are stored as zero.
imageStore(uOutput, pos, vec4(outValue[zo], 0,0,0));
}
87 changes: 87 additions & 0 deletions aten/src/ATen/native/vulkan/ops/Pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,92 @@ namespace vulkan {
namespace ops {
namespace {

// Wraps a possibly negative dimension index `d` into the range [0, n) for a
// positive dimension count `n`, so that e.g. -1 maps to n - 1.
int64_t normalize_dim(int64_t d, int64_t n) {
  // The first remainder may be negative when d is; adding n lifts it to a
  // non-negative value, and the second remainder folds it back below n.
  const int64_t shifted = d % n + n;
  return shifted % n;
}

// Vulkan implementation of mean over dimensions, limited to reducing a
// 4-dimensional input over dimensions 2 and 3 together.
//
// input_arg: tensor to reduce; must be 4-dimensional.
// dim:       dimensions to reduce. Negative indices are accepted; after
//            normalization the set must be exactly {2, 3}.
// keepdim:   when true the output sizes are {sizes[0], sizes[1], 1, 1} and
//            the `mean` kernel is dispatched; otherwise the output sizes are
//            {sizes[0], sizes[1]} and the `mean2d` kernel is dispatched.
// dtype:     not consulted by this implementation; the output is created
//            with input_arg.options().
//
// Returns the reduced tensor. Unsupported arguments, and inputs that are not
// backed by an image, are rejected through TORCH_CHECK.
Tensor mean(
    const at::Tensor& input_arg,
    const IntArrayRef dim,
    const bool keepdim,
    const optional<ScalarType> dtype) {
  // These conditions depend on caller-supplied arguments, so they are
  // reported with TORCH_CHECK rather than as internal assertion failures.
  TORCH_CHECK(
      input_arg.dim() == 4,
      "vulkan_mean expects 4-dimensional input");
  static const std::unordered_set<int64_t> expected_dims_set({2, 3});
  std::unordered_set<int64_t> dims_set;
  for (const auto& d : dim) {
    dims_set.insert(normalize_dim(d, 4));
  }
  TORCH_CHECK(
      dims_set == expected_dims_set,
      "vulkan_mean currently only supported for image-wide reduction"
  );

  std::vector<int64_t> output_dims{input_arg.sizes()[0], input_arg.sizes()[1]};
  if (keepdim) {
    output_dims.push_back(1);
    output_dims.push_back(1);
  }

  api::Context* const context = api::context();
  const vTensor& v_input = convert(input_arg);
  vTensor v_output{
    context,
    output_dims,
    input_arg.options(),
  };

  api::Command::Buffer command_buffer = context->command().pool.allocate();
  command_buffer.begin();
  {
    if (v_input.has_image()) {
      // Uniform data handed to the shader. Sizes are int64_t, so the
      // conversion to the 32-bit fields is spelled out explicitly instead of
      // relying on an implicit narrowing inside the braced initializer.
      const struct {
        uint32_t input_width, input_height;
      } block {
        static_cast<uint32_t>(input_arg.sizes()[3]),
        static_cast<uint32_t>(input_arg.sizes()[2]),
      };

      if (keepdim) {
        context->dispatch(
            command_buffer,
            {
              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
            },
            VK_KERNEL(mean),
            v_output.extents(),
            v_output.image(command_buffer, vTensor::Access::Write),
            v_input.image(command_buffer),
            context->resource().pool.uniform(block).object);
      }
      else {
        context->dispatch(
            command_buffer,
            {
              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
            },
            VK_KERNEL(mean2d),
            v_output.extents(),
            v_output.image(command_buffer, vTensor::Access::Write),
            v_input.image(command_buffer),
            context->resource().pool.uniform(block).object);
      }
    }
    else {
      TORCH_CHECK(false, "Not implemented!");
    }
  }
  command_buffer.end();
  command_buffer.submit(context->gpu().queue);

  return convert(v_output);
}

Tensor adaptive_avg_pool2d(const at::Tensor& input_arg, IntArrayRef output_size) {
TORCH_INTERNAL_ASSERT(
input_arg.dim() == 4,
Expand Down Expand Up @@ -155,6 +241,7 @@ Tensor avg_pool2d(
#ifdef USE_VULKAN_API

// Registers implementations for the `aten` operators named below under the
// `Vulkan` key, binding each operator name to the function of the same
// purpose via TORCH_FN.
TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
m.impl("mean.dim", TORCH_FN(mean));
m.impl("_adaptive_avg_pool2d", TORCH_FN(adaptive_avg_pool2d));
m.impl("avg_pool2d", TORCH_FN(avg_pool2d));
}
Expand Down
37 changes: 37 additions & 0 deletions aten/src/ATen/test/vulkan_api_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,42 @@ TEST(VulkanTest, mm) {
ASSERT_TRUE(check);
}

// Compares at::mean over dims {-1, -2} with keepdim=false on a Vulkan tensor
// against the same call on the CPU tensor it was created from.
TEST(VulkanTest, mean) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VulkanAPITest

// Random float input on the CPU, and the CPU result used as the reference.
auto t_in =
at::rand({5,3,9,9}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto t_out_expected = at::mean(t_in, {-1,-2}, false);
auto tv_in = t_in.vulkan();

// Same reduction on the Vulkan copy, brought back to the CPU for comparison.
auto tv_out = at::mean(tv_in, {-1,-2}, false);
auto t_out = tv_out.cpu();

const auto check = almostEqual(t_out, t_out_expected);
// On mismatch, print both tensors before the assertion fails.
if (!check) {
//std::cout << "original:\n" << t_in << std::endl;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clean or uncomment?

std::cout << "expected:\n" << t_out_expected << std::endl;
std::cout << "got:\n" << t_out << std::endl;
}
ASSERT_TRUE(check);
}

// Compares at::mean over dims {-1, -2} with keepdim=true on a Vulkan tensor
// against the same call on the CPU tensor it was created from.
TEST(VulkanTest, mean_keep_dim) {
  // Reference: random float input on the CPU and its CPU-side mean.
  const auto in_cpu =
      at::rand({10, 3, 21, 21}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
  const auto out_reference = at::mean(in_cpu, {-1, -2}, true);

  // Same reduction on the Vulkan copy, brought back to the CPU.
  const auto in_vulkan = in_cpu.vulkan();
  const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true);
  const auto out_actual = out_vulkan.cpu();

  const auto matches = almostEqual(out_actual, out_reference);
  if (!matches) {
    // Print both results so a failure can be diagnosed from the log.
    std::cout << "expected:\n" << out_reference << std::endl;
    std::cout << "got:\n" << out_actual << std::endl;
  }
  ASSERT_TRUE(matches);
}

TEST(VulkanTest, adaptive_avg_pool2d) {
auto t_in =
at::rand({1, 2, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
Expand All @@ -180,6 +216,7 @@ TEST(VulkanTest, adaptive_avg_pool2d) {

const auto check = almostEqual(t_out, t_out_expected);
if (!check) {
std::cout << "original:\n" << t_in << std::endl;
std::cout << "expected:\n" << t_out_expected << std::endl;
std::cout << "got:\n" << t_out << std::endl;
}
Expand Down