Optimize Vulkan command buffer submission rate.
ghstack-source-id: c62feab0d4e46987d8389913c3a670d6be92b52c
Pull Request resolved: #49112
Ashkan Aliabadi committed Dec 9, 2020
1 parent 09b974c commit 3545493
Showing 15 changed files with 641 additions and 849 deletions.
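
Before this change, every op in aten/src/ATen/native/vulkan/ops/Add.cpp closed its recording with command_buffer.end() followed by command_buffer.submit(context->gpu().queue), i.e. one Vulkan queue submission per operator invocation. The diffs below delete that pair, so the ops only record, and submission can be coalesced behind the Context (the submission side lives in files not shown here). A minimal sketch of that batching pattern, using invented names (DeferredSubmitter, enqueue, flush) rather than code from this PR:

// Hypothetical sketch: accumulate recorded command buffers and submit them
// with a single vkQueueSubmit call instead of one call per op.
#include <vector>
#include <vulkan/vulkan.h>

class DeferredSubmitter final {
 public:
  explicit DeferredSubmitter(const VkQueue queue) : queue_(queue) {}

  // Called by each op once recording is done; nothing reaches the GPU yet.
  void enqueue(const VkCommandBuffer command_buffer) {
    pending_.push_back(command_buffer);
  }

  // One queue submission for everything recorded since the last flush.
  void flush(const VkFence fence = VK_NULL_HANDLE) {
    if (pending_.empty()) {
      return;
    }

    const VkSubmitInfo submit_info{
      VK_STRUCTURE_TYPE_SUBMIT_INFO,
      nullptr,                                 // pNext
      0u, nullptr, nullptr,                    // no wait semaphores / stages
      static_cast<uint32_t>(pending_.size()),  // commandBufferCount
      pending_.data(),                         // pCommandBuffers
      0u, nullptr,                             // no signal semaphores
    };

    vkQueueSubmit(queue_, 1u, &submit_info, fence);
    pending_.clear();
  }

 private:
  VkQueue queue_;
  std::vector<VkCommandBuffer> pending_;
};

vkQueueSubmit carries a fixed per-call cost in the driver, so folding many recorded command buffers into one VkSubmitInfo amortizes that cost; that is the submission rate the commit title refers to.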
8 changes: 4 additions & 4 deletions aten/src/ATen/native/vulkan/api/Command.cpp
@@ -383,10 +383,10 @@ Command::Buffer Command::Pool::allocate() {
         Configuration::kQuantum);
 
     allocate_command_buffers(
-        device_,
-        command_pool_.get(),
-        buffer_.pool.data() + buffer_.in_use,
-        Configuration::kQuantum);
+      device_,
+      command_pool_.get(),
+      buffer_.pool.data() + buffer_.in_use,
+      Configuration::kQuantum);
   }
 
   return Buffer(buffer_.pool[buffer_.in_use++]);
7 changes: 6 additions & 1 deletion aten/src/ATen/native/vulkan/api/Command.h
@@ -20,14 +20,15 @@ struct Command final {
 
   class Buffer final {
    public:
-    Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
+    explicit Buffer(VkCommandBuffer command_buffer);
     Buffer(const Buffer&) = delete;
     Buffer& operator=(const Buffer&) = delete;
     Buffer(Buffer&&);
     Buffer& operator=(Buffer&&);
     ~Buffer() = default;
 
     operator bool() const;
+    VkCommandBuffer handle() const;
 
     void begin();
     void end();
@@ -129,6 +130,10 @@ inline Command::Buffer::operator bool() const {
   return VK_NULL_HANDLE != command_buffer_;
 }
 
+inline VkCommandBuffer Command::Buffer::handle() const {
+  return command_buffer_;
+}
+
 inline void Command::Buffer::Bound::reset() {
   pipeline = {};
   descriptor_set = VK_NULL_HANDLE;
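
Two interface changes here: the Buffer constructor loses its VK_NULL_HANDLE default and becomes explicit, and a handle() accessor now exposes the underlying VkCommandBuffer. A usage sketch; only allocate() and handle() come from this diff, while the surrounding batching is assumed:

// Sketch, not PR code: gather raw handles so a later flush can batch them.
std::vector<VkCommandBuffer> pending;

api::Command::Buffer command_buffer = context->command().pool.allocate();
// ... record work via context->dispatch(command_buffer, ...) ...
pending.push_back(command_buffer.handle());  // the new accessor

// With the constructor explicit, implicit conversions no longer compile:
// api::Command::Buffer buffer = VK_NULL_HANDLE;  // error after this change
api::Command::Buffer buffer(VK_NULL_HANDLE);      // direct-initialization is fine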
11 changes: 9 additions & 2 deletions aten/src/ATen/native/vulkan/api/Common.h
@@ -6,10 +6,17 @@
 
 #ifdef USE_VULKAN_SHADERC_RUNTIME
 #include <ATen/native/vulkan/glsl.h>
-#define VK_KERNEL(name) { name##_glsl, }
+#define VK_KERNEL(name)                          \
+  ::at::native::vulkan::api::Shader::Descriptor{ \
+    name##_glsl,                                 \
+  }
 #else
 #include <ATen/native/vulkan/spv.h>
-#define VK_KERNEL(name) { name##_spv, name##_spv_len, }
+#define VK_KERNEL(name)                          \
+  ::at::native::vulkan::api::Shader::Descriptor{ \
+    name##_spv,                                  \
+    name##_spv_len,                              \
+  }
 #endif /* USE_VULKAN_SHADERC_RUNTIME */
 
 #ifdef USE_VULKAN_WRAPPER
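
The macro bodies now spell out ::at::native::vulkan::api::Shader::Descriptor, so VK_KERNEL(name) becomes a self-typed expression rather than a bare braced list that only compiles where the destination type is already known. A stand-alone demonstration of the difference, with a stub Descriptor and placeholder SPIR-V words in place of the real headers:

#include <cstdint>

// Stand-in for api::Shader::Descriptor, reduced to the SPIR-V branch.
struct Descriptor {
  const uint32_t* code;
  uint32_t size;
};

// Old shape: an untyped braced list, usable only where Descriptor is expected.
#define VK_KERNEL_OLD(name) { name##_spv, name##_spv_len, }
// New shape: the expression names its own type, so `auto`, ternaries, and
// argument forwarding all work.
#define VK_KERNEL_NEW(name) Descriptor{ name##_spv, name##_spv_len, }

namespace {
const uint32_t add_spv[] = {0x07230203u};  // placeholder SPIR-V word
const uint32_t add_spv_len = 1u;
}

int main() {
  const Descriptor ok = VK_KERNEL_OLD(add);  // fine: target type is known
  const auto also_ok = VK_KERNEL_NEW(add);   // only possible with the new form
  return ok.size == also_ok.size ? 0 : 1;
}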
298 changes: 141 additions & 157 deletions aten/src/ATen/native/vulkan/ops/Add.cpp
@@ -25,47 +25,43 @@ Tensor add_scalar(
   };
 
   api::Command::Buffer command_buffer = context->command().pool.allocate();
-  command_buffer.begin();
-  {
-    if (v_output.has_image() && v_self.has_image()) {
-      const struct {
-        uvec3 extents;
-        float other;
-      } block {
-        v_self.extents(),
-        other.to<float>() * alpha.to<float>(),
-      };
-
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(add_scalar),
-          v_output.extents(),
-          // Write-only access bypasses synchronization but inserts appropriate
-          // barriers if necessary.
-          v_output.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Write),
-          // Read-only access is implied on const tensors and triggers an async
-          // synchronization if necessary.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_buffer.end();
-  command_buffer.submit(context->gpu().queue);
+
+  if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
+    const struct {
+      uvec3 extents;
+      float other;
+    } block {
+      v_self.extents(),
+      other.to<float>() * alpha.to<float>(),
+    };
+
+    context->dispatch(
+        command_buffer,
+        {
+          VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+        },
+        VK_KERNEL(add_scalar),
+        v_output.extents(),
+        // Write-only access bypasses synchronization but inserts appropriate
+        // barriers if necessary.
+        v_output.image(
+            command_buffer,
+            vTensor::Stage::Compute,
+            vTensor::Access::Write),
+        // Read-only access is implied on const tensors and triggers an async
+        // synchronization if necessary.
+        v_self.image(
+            command_buffer,
+            vTensor::Stage::Compute),
+        // Object lifetime is managed by the resource pool.
+        // It is OK not to keep track of the handle.
+        context->resource().pool.uniform(block).object);
+  }
+  else {
+    TORCH_CHECK(false, "Not implemented!");
+  }
 
   return convert(v_output);
 }
@@ -83,41 +79,37 @@ Tensor& add_scalar_(
   vTensor& v_self = convert(self);
 
   api::Command::Buffer command_buffer = context->command().pool.allocate();
-  command_buffer.begin();
-  {
-    if (v_self.has_image()) {
-      const struct {
-        uvec3 extents;
-        float other;
-      } block {
-        v_self.extents(),
-        other.to<float>() * alpha.to<float>(),
-      };
-
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(add_scalar_),
-          v_self.extents(),
-          // Read-Write access triggers an async synchronization if necessary
-          // and inserts appropriate barriers if hazards are detected.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Read | vTensor::Access::Write),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_buffer.end();
-  command_buffer.submit(context->gpu().queue);
+
+  if C10_LIKELY(v_self.has_image()) {
+    const struct {
+      uvec3 extents;
+      float other;
+    } block {
+      v_self.extents(),
+      other.to<float>() * alpha.to<float>(),
+    };
+
+    context->dispatch(
+        command_buffer,
+        {
+          VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+        },
+        VK_KERNEL(add_scalar_),
+        v_self.extents(),
+        // Read-Write access triggers an async synchronization if necessary
+        // and inserts appropriate barriers if hazards are detected.
+        v_self.image(
+            command_buffer,
+            vTensor::Stage::Compute,
+            vTensor::Access::Read | vTensor::Access::Write),
+        // Object lifetime is managed by the resource pool.
+        // It is OK not to keep track of the handle.
+        context->resource().pool.uniform(block).object);
+  }
+  else {
+    TORCH_CHECK(false, "Not implemented!");
+  }
 
   return self;
 }
@@ -141,53 +133,49 @@ Tensor add_tensor(
   };
 
   api::Command::Buffer command_buffer = context->command().pool.allocate();
-  command_buffer.begin();
-  {
-    if (v_self.has_image() && v_other.has_image()) {
-      const struct {
-        uvec3 extents;
-        float alpha;
-      } block {
-        v_output.extents(),
-        alpha.to<float>(),
-      };
-
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(add),
-          v_output.extents(),
-          // Write-only access bypasses synchronization but inserts appropriate
-          // barriers if necessary.
-          v_output.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Write),
-          // Read-only access is implied on const tensors and triggers an async
-          // synchronization if necessary.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute),
-          // Read-only access is implied on const tensors and triggers an async
-          // synchronization if necessary.
-          v_other.image(
-              command_buffer,
-              vTensor::Stage::Compute),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_buffer.end();
-  command_buffer.submit(context->gpu().queue);
+
+  if C10_LIKELY(v_self.has_image() && v_other.has_image()) {
+    const struct {
+      uvec3 extents;
+      float alpha;
+    } block {
+      v_output.extents(),
+      alpha.to<float>(),
+    };
+
+    context->dispatch(
+        command_buffer,
+        {
+          VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+        },
+        VK_KERNEL(add),
+        v_output.extents(),
+        // Write-only access bypasses synchronization but inserts appropriate
+        // barriers if necessary.
+        v_output.image(
+            command_buffer,
+            vTensor::Stage::Compute,
+            vTensor::Access::Write),
+        // Read-only access is implied on const tensors and triggers an async
+        // synchronization if necessary.
+        v_self.image(
+            command_buffer,
+            vTensor::Stage::Compute),
+        // Read-only access is implied on const tensors and triggers an async
+        // synchronization if necessary.
+        v_other.image(
+            command_buffer,
+            vTensor::Stage::Compute),
+        // Object lifetime is managed by the resource pool.
+        // It is OK not to keep track of the handle.
+        context->resource().pool.uniform(block).object);
+  }
+  else {
+    TORCH_CHECK(false, "Not implemented!");
+  }
 
   return convert(v_output);
 }
@@ -208,47 +196,43 @@ Tensor& add_tensor_(
   const vTensor& v_other = convert(other);
 
   api::Command::Buffer command_buffer = context->command().pool.allocate();
-  command_buffer.begin();
-  {
-    if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
-      const struct {
-        uvec3 extents;
-        float alpha;
-      } block {
-        v_self.extents(),
-        alpha.to<float>(),
-      };
-
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(add_),
-          v_self.extents(),
-          // Read-Write access triggers an async synchronization if necessary
-          // and inserts appropriate barriers if hazards are detected.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Read | vTensor::Access::Write),
-          // Read-only access is implied on const tensors and triggers an async
-          // synchronization if necessary.
-          v_other.image(
-              command_buffer,
-              vTensor::Stage::Compute),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_buffer.end();
-  command_buffer.submit(context->gpu().queue);
+
+  if C10_LIKELY(v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
+    const struct {
+      uvec3 extents;
+      float alpha;
+    } block {
+      v_self.extents(),
+      alpha.to<float>(),
+    };
+
+    context->dispatch(
+        command_buffer,
+        {
+          VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+        },
+        VK_KERNEL(add_),
+        v_self.extents(),
+        // Read-Write access triggers an async synchronization if necessary
+        // and inserts appropriate barriers if hazards are detected.
+        v_self.image(
+            command_buffer,
+            vTensor::Stage::Compute,
+            vTensor::Access::Read | vTensor::Access::Write),
+        // Read-only access is implied on const tensors and triggers an async
+        // synchronization if necessary.
+        v_other.image(
+            command_buffer,
+            vTensor::Stage::Compute),
+        // Object lifetime is managed by the resource pool.
+        // It is OK not to keep track of the handle.
+        context->resource().pool.uniform(block).object);
+  }
+  else {
+    TORCH_CHECK(false, "Not implemented!");
+  }
 
   return self;
 }
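
All four ops above follow the same rewrite: begin(), end(), and submit() disappear, and the extra scope collapses into an if C10_LIKELY(...) fast path. The macro-in-if spelling parses because C10_LIKELY(expr) expands, on GCC and Clang, to a parenthesized __builtin_expect expression, so the if statement gets its parentheses from the macro itself. A self-contained illustration with a local stand-in for the macro:

// LIKELY mirrors the GCC/Clang shape of c10's C10_LIKELY; the real macro
// lives in c10/macros/Macros.h.
#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#else
#define LIKELY(expr) (expr)
#endif

int main() {
  const bool has_image = true;
  if LIKELY(has_image) {  // same shape as the call sites in this diff
    std::puts("fast path");
  }
  return 0;
}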
