5 changes: 5 additions & 0 deletions aten/src/ATen/native/Copy.cpp
@@ -5,6 +5,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/quantized/Copy.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/quantized/Quantizer.h>
#include <ATen/vulkan/Context.h>
#include <ATen/metal/Context.h>
@@ -131,7 +132,11 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
}

if (self.device().type() == at::kVulkan || src.device().type() == at::kVulkan) {
#ifdef USE_VULKAN_API
return vulkan::ops::copy_(self, src);
#else
return at::vulkan::vulkan_copy_(self, src);
#endif
}

if (self.device().type() == at::kMetal || src.device().type() == at::kMetal) {
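Not part of the diff: a minimal round trip that exercises the dispatch above, assuming a build with USE_VULKAN_API enabled and a Vulkan-capable device.

#include <ATen/ATen.h>

int main() {
  // CPU -> Vulkan -> CPU; both hops route through copy_impl() and, with
  // USE_VULKAN_API defined, end up in at::native::vulkan::ops::copy_().
  const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat));
  const auto roundtrip = cpu.vulkan().cpu();
  return cpu.equal(roundtrip) ? 0 : 1;
}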
149 changes: 149 additions & 0 deletions aten/src/ATen/native/vulkan/ops/Copy.cpp
@@ -0,0 +1,149 @@
#include <ATen/native/vulkan/ops/Common.h>

namespace at {
namespace native {
namespace vulkan {
namespace ops {

Tensor& copy_(Tensor& self, const Tensor& src) {
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);

// CPU -> Vulkan
if (at::kCPU == src.device().type()) {
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Examples of such scenarios include any of these async
// operations being ongoing in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA systems, buffer is an alias for staging and is accessible on
//   both host and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking

using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>();

// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.

Future::Payload v_self_payload = v_self_future.wait();

memcpy(
v_self_payload.get(),
src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
}
// Vulkan -> Vulkan
else if (at::kVulkan == src.device().type()) {
api::Command::Buffer command_buffer = api::context()->command().pool.allocate();
command_buffer.begin();

command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(command_buffer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(command_buffer, vTensor::Access::Write));

command_buffer.end();
command_buffer.submit(api::context()->gpu().queue);
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
}
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
const vTensor& v_src = convert(src);

{
// Similar notes as above apply, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then accessed as another resource type on the same
// or another processor. The same considerations regarding hazard
// avoidance as above apply.

using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>();

// Vulkan -> CPU
if (at::kCPU == self.device().type()) {
// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.

const Future::Payload v_src_payload = v_src_future.wait();

memcpy(
self.data_ptr<float>(),
v_src_payload.get(),
std::min(src.nbytes(), self.nbytes()));
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
}

//
// WARNING
//

// This is not great. We almost never want to flush the GPU pipeline, as
// that has far-reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, the
// synchronization mechanisms above should be enough to ensure correctness
// at a more modest cost, as there is no need to flush the entirety of jobs
// in flight if one is only interested in waiting for computation affecting
// a single tensor to finish.
//
// Having said that, we still do need to release all pool resources at some
// point in each inference run, or we will run out of memory. There is no
// perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
//    itself allowing us to place pool purges intelligently. Best option
//    for performance and memory consumption. Not without its downsides if
//    flexibility is a top priority.
// 2) If in eager mode, where we see operations one at a time, expose this
//    release of resources to the user as a Python / C++ function. This
//    makes for a suboptimal user experience but is efficient in terms of
//    performance. (A sketch of this option follows this file.)
// 3) If in eager mode, and interested in keeping this bookkeeping transparent
//    to the user, release all resources somewhere ... like here. This is
//    not ideal since it requires a pipeline flush to make sure these objects
//    are not already in use by a workload in flight. Cannot do much better
//    within the constraints of this approach. Good for user experience,
//    suboptimal for performance.
// 4) If in eager mode, and interested in keeping this bookkeeping transparent
//    to the user, and performance does not matter, make the CPU and GPU run
//    in lockstep. Obviously this is just bad. Mentioned for the sake of
//    completeness.

api::context()->flush();
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}

return self;
}

} // namespace ops
} // namespace vulkan
} // namespace native
} // namespace at
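Not part of the diff: a rough sketch of design option (2) from the comment above. The function name is hypothetical; it only assumes the same api::Context accessor that copy_() already uses via Common.h.

#include <ATen/native/vulkan/ops/Common.h>

namespace at {
namespace native {
namespace vulkan {

// Hypothetical user-facing hook (illustrative only): lets the caller purge
// pooled GPU resources once per inference run, instead of copy_() flushing
// the pipeline on every Vulkan -> CPU read-back.
void release_resources() {
  api::context()->flush();
}

} // namespace vulkan
} // namespace native
} // namespace at

The trade-off is the one option (2) describes: better performance than flushing inside copy_(), at the cost of an explicit call the user has to remember to make.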
19 changes: 19 additions & 0 deletions aten/src/ATen/native/vulkan/ops/Copy.h
@@ -0,0 +1,19 @@
#pragma once

#ifdef USE_VULKAN_API

#include <ATen/native/vulkan/ops/Common.h>

namespace at {
namespace native {
namespace vulkan {
namespace ops {

Tensor& copy_(Tensor& self, const Tensor& src);

} // namespace ops
} // namespace vulkan
} // namespace native
} // namespace at

#endif /* USE_VULKAN_API */
32 changes: 30 additions & 2 deletions aten/src/ATen/test/vulkan_api_test.cpp
@@ -1,11 +1,39 @@
#include <gtest/gtest.h>
#ifdef USE_VULKAN_API

#include <gtest/gtest.h>
#include <ATen/ATen.h>

#ifdef USE_VULKAN_API
// TODO: These functions should move to a common place.

namespace {

bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
float maxValue = 0.0f;

for (const auto& tensor : inputs) {
maxValue = fmax(tensor.abs().max().item<float>(), maxValue);
}

return diff.abs().max().item<float>() < (2e-6 * maxValue);
}

bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
return checkRtol(a - b, {a, b});
}

bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) {
return (a - b).abs().max().item<float>() == 0.0f;
}

} // namespace
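Not part of the diff: an illustrative use of almostEqual(), which tolerates floating-point rounding, in contrast with the bit-exact check the copy test below relies on. The test name is hypothetical.

TEST(VulkanAPITest, copy_almost_equal_sketch) {
  // Same round trip as the copy test below, compared with a relative
  // tolerance instead of exact equality.
  const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat));
  ASSERT_TRUE(almostEqual(cpu, cpu.vulkan().cpu()));
}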

namespace {

TEST(VulkanAPITest, copy) {
const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat));
ASSERT_TRUE(exactlyEqual(cpu, cpu.vulkan().cpu()));
}

TEST(VulkanAPITest, empty) {
ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat)));
}