From eb48b31344eca6482d2f42c609e7d6368e6dc61e Mon Sep 17 00:00:00 2001 From: Mor Tzur Date: Wed, 13 Nov 2019 09:37:53 -0800 Subject: [PATCH] Device Resident Tensors - API & Framework (#3745) Summary: Taking over https://github.com/pytorch/glow/issues/3671, but spinning out the API and Glow-core level changes associated with the DRT plan in https://github.com/pytorch/glow/issues/3629. This does not implement DRT support on any device. Documentation: See https://github.com/pytorch/glow/issues/3629. Pull Request resolved: https://github.com/pytorch/glow/pull/3745 Test Plan: Ran tests, added two simple new sanity checks to DeviceManagerTest. The first `DeviceResidentTensors` should run only for backends that support resident tensors (none currently). The second `CanHandleDeviceResidentTensors` should run on all devices. Differential Revision: D18378905 Pulled By: nickgg fbshipit-source-id: 887c290dae5a6b9b75e9b41a415958d499bc5402 --- include/glow/Backends/DeviceManager.h | 30 ++- .../glow/Base/DeviceTensorTransferManager.h | 50 ++++ include/glow/Base/Tensor.h | 132 ++++++++++- .../glow/ExecutionEngine/ExecutionEngine.h | 7 + include/glow/Graph/PlaceholderBindings.h | 7 + include/glow/Support/Error.h | 2 + .../CPU/tests/CPUDeviceManagerTest.cpp | 4 +- lib/Backends/Habana/HabanaDeviceManager.cpp | 4 + .../Habana/tests/HabanaDeviceManagerTest.cpp | 4 +- .../Interpreter/InterpreterFunction.cpp | 2 + .../tests/InterpreterDeviceManagerTest.cpp | 4 +- lib/Backends/NNPI/NNPIDeviceManager.cpp | 3 + .../NNPI/tests/NNPIDeviceManagerTest.cpp | 1 + lib/Backends/OpenCL/OpenCLDeviceManager.cpp | 9 +- .../OpenCL/tests/OpenCLDeviceManagerTest.cpp | 4 +- lib/Base/Tensor.cpp | 18 ++ lib/ExecutionEngine/ExecutionEngine.cpp | 23 +- lib/LLVMIRCodeGen/LLVMCompiledFunction.cpp | 4 + lib/Support/Error.cpp | 2 + tests/unittests/BackendTestUtils.cpp | 4 + tests/unittests/DeviceManagerTest.cpp | 216 ++++++++++++++---- 21 files changed, 464 insertions(+), 66 deletions(-) create mode 100644 
include/glow/Base/DeviceTensorTransferManager.h diff --git a/include/glow/Backends/DeviceManager.h b/include/glow/Backends/DeviceManager.h index 4df41ef270..c1c47d989b 100644 --- a/include/glow/Backends/DeviceManager.h +++ b/include/glow/Backends/DeviceManager.h @@ -17,6 +17,7 @@ #define GLOW_BACKENDS_DEVICEMANAGER_H #include "glow/Backend/CompiledFunction.h" +#include "glow/Base/DeviceTensorTransferManager.h" #include "glow/ExecutionContext/ExecutionContext.h" #include "glow/Graph/Graph.h" #include "glow/Runtime/RuntimeTypes.h" @@ -42,7 +43,7 @@ using ReadyCBTy = std::function; using FunctionMapTy = std::map; /// Interface managing a specific instance of a device. -class DeviceManager { +class DeviceManager : public DeviceTensorTransferManager { protected: /// Configuration object for the device. DeviceConfig config_; @@ -162,6 +163,33 @@ class DeviceManager { /// \returns the DeviceInfo for this device containing peak limits for /// compute and bandwidths (used in partitioning). virtual DeviceInfo getDeviceInfo() const { return DeviceInfo(); } + + /// Copies the contents of \p tensor from the host to the \p location + /// address on this device. Updates the tensor residency info. + virtual void transferToDevice(Tensor &tensor, void *locationContext, + std::function resultCB = + [](Error) {}) { + DCHECK("Not Implemented"); + resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED, + "Direct transfer not supported on this device")); + } + + /// Copies the device buffer associated with \p tensor to the host. + /// The tensor must be resident on this device. If \p release is true, + /// frees the device memory. Updates the tensor residency info. 
+ virtual void transferFromDevice(Tensor &tensor, bool release = true, + std::function resultCB = + [](Error) {}) { + DCHECK("Not Implemented"); + resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED, + "Direct transfer not supported on this device")); + } + + /// Releases the device buffer associated with \p tensor. + virtual bool releaseDeviceTensor(void *locationContext) { + DCHECK("Not Implemented"); + return false; + } }; } // namespace runtime diff --git a/include/glow/Base/DeviceTensorTransferManager.h b/include/glow/Base/DeviceTensorTransferManager.h new file mode 100644 index 0000000000..1ae77a85ea --- /dev/null +++ b/include/glow/Base/DeviceTensorTransferManager.h @@ -0,0 +1,50 @@ +/** + * Copyright (c) Glow Contributors. See CONTRIBUTORS file. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H +#define GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H + +#include "glow/Base/Tensor.h" +#include "glow/Support/Error.h" + +#include + +namespace glow { + +class Tensor; + +class DeviceTensorTransferManager { +public: + virtual ~DeviceTensorTransferManager() {} + /// Copies the contents of \p tensor from the host to the \p location address + /// on this device. Updates the tensor residency info. + virtual void transferToDevice(Tensor &tensor, void *locationContext = nullptr, + std::function resultCB = + [](Error) {}) = 0; + + /// Copies the device buffer associated with \p tensor to the host. 
+ /// The tensor must be resident on this device. If \p release is true, frees + /// the device memory. Updates the tensor residency info. + virtual void transferFromDevice(Tensor &tensor, bool release = true, + std::function resultCB = + [](Error) {}) = 0; + + /// Releases the device buffer associated with \p tensor. + virtual bool releaseDeviceTensor(void *locationContext) = 0; +}; + +} // namespace glow + +#endif // GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H diff --git a/include/glow/Base/Tensor.h b/include/glow/Base/Tensor.h index 0f0937a185..109019753c 100644 --- a/include/glow/Base/Tensor.h +++ b/include/glow/Base/Tensor.h @@ -20,6 +20,7 @@ #include #include +#include "glow/Base/DeviceTensorTransferManager.h" #include "glow/Base/Type.h" #include "glow/Support/Compiler.h" #include "glow/Support/Memory.h" @@ -48,6 +49,71 @@ void genericTranspose(const Tensor *src, Tensor *dest, /// returned dims. For example, input {2,1,4} would result in {2,1,4,1,1,1}. ShapeVector expandDimsToMax(llvm::ArrayRef currDims); +namespace runtime { +class DeviceManager; +} + +/// Holds information regarding whether this Tensor exists in a device-specific +/// form, either resident or specific for a device, and what device holds it. +class DeviceResidencyInfo final { + enum class TensorResidency { + Host, + Device, + }; + + // A pointer to the device manager of the device on which the tensor + // resides. + DeviceTensorTransferManager *deviceManager_{nullptr}; + /// The residency status of the tensor. + TensorResidency tensorResidency_{TensorResidency::Host}; + // A pointer to a context structure, containing the required info to access + // tensor data and perform transfers. + void *locationContext_{nullptr}; + +public: + DeviceResidencyInfo() + : deviceManager_(nullptr), tensorResidency_(TensorResidency::Host), + locationContext_(nullptr) {} + + /// Move ctor. + DeviceResidencyInfo(DeviceResidencyInfo &&other) = delete; + + /// Move assignment operator. 
+ DeviceResidencyInfo &operator=(DeviceResidencyInfo &&other) = delete; + + ~DeviceResidencyInfo() { + // If a tensor is device resident, let its device manager free the device + // buffer. + if (isDeviceResident()) { + deviceManager_->releaseDeviceTensor(locationContext_); + } + } + + /// Removes all device specific state. + void clear() { + deviceManager_ = nullptr; + locationContext_ = nullptr; + tensorResidency_ = TensorResidency::Host; + } + + /// \returns true if this Tensor is resident or specific for a device. + bool isDeviceResident() const { + assert((tensorResidency_ == TensorResidency::Host || deviceManager_) && + "Device resident tensor must have an assigned device manager."); + return tensorResidency_ == TensorResidency::Device; + } + + /// \returns the DeviceManager this tensor is resident on, if any. + DeviceTensorTransferManager *getDeviceManager() const { + return deviceManager_; + } + + /// \returns the device specific location context for a resident Tensor. + void *getLocationContext() const { return locationContext_; } + + friend class Tensor; +}; + /// A class that represents a contiguous n-dimensional array (a tensor). class Tensor final { public: @@ -71,6 +137,10 @@ class Tensor final { /// The TensorPool that is managing this Tensor (if any). TensorPool *tensorPool_{nullptr}; + /// The device residency info accosiated with the tensor. + std::shared_ptr residencyInfoP_{ + new DeviceResidencyInfo()}; + /// Size in bytes of the unpadded region memory. This is useful communicating /// the actual size of the data, this allows for copying only inputs and not /// padding to the device. @@ -119,6 +189,7 @@ class Tensor final { /// Set the content of the tensor to zero. If \p resetFusedScalesOffsets, then /// fused scales/offsets will be set to 1.0/0.0 as well. 
void zero(bool resetFusedScalesOffsets = false) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); size_t size = actualSize(); // Quantized tensors should go to their offset. switch (type_.getElementType()) { @@ -298,7 +369,7 @@ class Tensor final { unownedTensor.isUnowned_ = true; unownedTensor.type_ = Type::newShape(getType(), dims); unownedTensor.unpaddedSize_ = unpaddedSize_; - + unownedTensor.residencyInfoP_ = residencyInfoP_; if (offsets.size() == 0) { assert(actualSize() == unownedTensor.actualSize() && "The size of the unowned tensor " @@ -321,6 +392,7 @@ class Tensor final { /// element to start a subview from. Tensor getOwnedSlice(llvm::ArrayRef dims, llvm::ArrayRef offsets = {}) const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); return getUnowned(dims, offsets).clone(); } @@ -341,6 +413,7 @@ class Tensor final { /// Assigns a new shape to the tensor and allocates a new buffer. void reset(const Type &T) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); // If the new size is identical to the allocated size then there is no need // to re-allocate the buffer. if (type_ == T && getData()) { @@ -390,6 +463,7 @@ class Tensor final { std::swap(isUnowned_, other.isUnowned_); std::swap(tensorPool_, other.tensorPool_); std::swap(unpaddedSize_, other.unpaddedSize_); + std::swap(residencyInfoP_, other.residencyInfoP_); } /// Move assignment operator. @@ -399,6 +473,7 @@ class Tensor final { std::swap(isUnowned_, other.isUnowned_); std::swap(tensorPool_, other.tensorPool_); std::swap(unpaddedSize_, other.unpaddedSize_); + std::swap(residencyInfoP_, other.residencyInfoP_); return *this; } @@ -429,6 +504,14 @@ class Tensor final { /// elements exceeding allowed error; maximum error and location found; etc.). 
bool isEqual(const Tensor &other, float allowedError = 0.0001, bool verbose = true) const { + if (isDeviceResident()) { + if (!other.isDeviceResident()) { + return false; + } + + return getDeviceManager() == other.getDeviceManager() && + getLocationContext() == other.getLocationContext(); + } return isEqualImpl(other, /*isBitwise=*/false, allowedError, verbose); } @@ -513,6 +596,7 @@ class Tensor final { /// Update the content and type of the tensor from the tensor \p t. void assign(const Tensor *t) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); assert(this != t && "Copying to self"); reset(t); size_t bufferSize = type_.getSizeInBytes(); @@ -521,6 +605,7 @@ class Tensor final { /// Update the raw data of the tensor from the tensor \p t. void copyRawFrom(const Tensor *t) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); assert(this != t && "Copying to self"); assert(actualSize() == t->actualSize()); assert(getElementType() == t->getElementType() && "Invalid element type"); @@ -531,6 +616,7 @@ class Tensor final { /// Update the content of the tensor with a slice from tensor \p t. A slice /// is one index from the first dimension of the tensor. void copySlice(const Tensor *t, size_t slice) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); auto dim = t->dims().slice(1); (void)dim; assert(dim == dims() && "Invalid slice size"); @@ -546,6 +632,7 @@ class Tensor final { /// The copying operation may overlap the end of the tensor \p t one or more /// times. This means that the data in the input tensor may be duplicated. void copyConsecutiveSlices(const Tensor *t, size_t startSliceIdx) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); auto onceSliceDim = t->dims().slice(1); (void)onceSliceDim; assert(onceSliceDim == dims().slice(1) && "Invalid slice size"); @@ -571,6 +658,7 @@ class Tensor final { /// and cast them to DestElemType in this. 
template void copyWithCast(const Tensor *t) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); static_assert(!std::is_same::value, "Use copyRawFrom instead"); assert(this != t && "Copying to self"); @@ -599,11 +687,13 @@ class Tensor final { /// Transpose the tensor \p src into the empty tensor \p dest. Shuffle the /// axis based on the list \p shuffle, where each element is the src index. void transpose(Tensor *dest, llvm::ArrayRef shuffle) const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); genericTranspose(this, dest, shuffle); } /// Create a new copy of the current tensor. Tensor clone() const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); Tensor slice; slice.assign(this); return slice; @@ -612,6 +702,40 @@ class Tensor final { /// Return the raw unsafe pointer to the tensor payload. char *getUnsafePtr() const { return getData(); } + /// \returns true if tensor data is stored on a device + bool isDeviceResident() const { return residencyInfoP_->isDeviceResident(); } + + /// Update device residency info with new device manager and context + void moveToDevice(DeviceTensorTransferManager *deviceManager, + void *locationContext); + + /// If device resident, copy Tensor contents back to host memory and release + /// associated device memory. + void ensureOnHost(); + + /// \returns the pointer to the device manager where the tensor resides. + DeviceTensorTransferManager *getDeviceManager() const { + assert(residencyInfoP_->isDeviceResident() && + "Tensor must be device resident"); + return residencyInfoP_->getDeviceManager(); + } + + /// \returns the pointer to the location context of where the tensor resides. + void *getLocationContext() const { + assert(residencyInfoP_->isDeviceResident() && + "Tensor must be device resident"); + return residencyInfoP_->getLocationContext(); + } + + /// Clears DeviceResidencyInfo. 
+ /// Note that this does not affect the associated DeviceManager or device + /// memory. + void clearDeviceResidency() { + assert(residencyInfoP_->isDeviceResident() && + "Tensor must be device resident"); + residencyInfoP_->clear(); + } + /// \return a new handle that points and manages this tensor. template Handle getHandle() &; @@ -623,12 +747,14 @@ class Tensor final { private: /// \returns a pointer to the raw data, of type \p ElemTy. template ElemTy *getRawDataPointer() { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); assert(type_.isType() && "Asking for the wrong ptr type."); return reinterpret_cast(data_); } /// \returns a const pointer to the raw data, of type \p ElemTy. template const ElemTy *getRawDataPointer() const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); assert(type_.isType() && "Asking for the wrong ptr type."); return reinterpret_cast(data_); } @@ -636,6 +762,7 @@ class Tensor final { template bool isEqualImpl(const Tensor &other, float allowedError, bool verbose) const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); auto const *myData = getRawDataPointer(); auto const *otherData = other.getRawDataPointer(); double maxFoundError = 0.0; @@ -668,6 +795,7 @@ class Tensor final { } bool isBitwiseEqualImpl(const Tensor &other) const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); auto const *myData = getUnsafePtr(); auto const *otherData = other.getUnsafePtr(); for (size_t i = 0, e = getSizeInBytes(); i < e; i++) { @@ -1283,11 +1411,13 @@ template class Handle final { }; template Handle Tensor::getHandle() & { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); assert(type_.isType() && "Getting a handle to the wrong type."); return Handle(this); } template const Handle Tensor::getHandle() const & { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); 
assert(type_.isType() && "Getting a handle to the wrong type."); return Handle(const_cast(this)); } diff --git a/include/glow/ExecutionEngine/ExecutionEngine.h b/include/glow/ExecutionEngine/ExecutionEngine.h index f80ba45eb9..29eb0e9003 100644 --- a/include/glow/ExecutionEngine/ExecutionEngine.h +++ b/include/glow/ExecutionEngine/ExecutionEngine.h @@ -59,6 +59,10 @@ class ExecutionEngine final { /// Glow functions compiled for this ExecutionEngine's backend. std::set compiledFunctions_; + /// Whether to move all Device Resident Tensors on to the host at the end of + /// the run. + bool ensureOutputsOnHost_{true}; + /// Single execution of the given function, \p name with the given context /// \bindings. void runInternal(ExecutionContext &context, llvm::StringRef name); @@ -85,6 +89,9 @@ class ExecutionEngine final { setBackendName(backendName_); } + // Set whether or not to ensure outputs are in host memory. + void ensureOutputsOnHost(bool should) { ensureOutputsOnHost_ = should; } + /// Get the name of the current backend in use. llvm::StringRef getBackendName() const; diff --git a/include/glow/Graph/PlaceholderBindings.h b/include/glow/Graph/PlaceholderBindings.h index 29d33cdbd7..ba6ffbec7e 100644 --- a/include/glow/Graph/PlaceholderBindings.h +++ b/include/glow/Graph/PlaceholderBindings.h @@ -114,6 +114,13 @@ class PlaceholderBindings final { /// PlaceholderBindings. uint64_t getDataSize() const; + /// Copies all Device Resident Tensors back to the host. + void ensureOnHost() { + for (auto &ph : pairs()) { + ph.second->ensureOnHost(); + } + } + PlaceholderBindings() = default; /// Construct the PlaceholderBindings with an initial mapping between \p diff --git a/include/glow/Support/Error.h b/include/glow/Support/Error.h index bb7caf8e5c..0064c13a12 100644 --- a/include/glow/Support/Error.h +++ b/include/glow/Support/Error.h @@ -262,6 +262,8 @@ class GlowErrorValue final { RUNTIME_DEVICE_NOT_FOUND, // Runtime error, network busy to perform any operation on it. 
RUNTIME_NET_BUSY, + // Device error, not supported. + DEVICE_FEATURE_NOT_SUPPORTED, // Compilation error; node unsupported after optimizations. COMPILE_UNSUPPORTED_NODE_AFTER_OPTIMIZE, // Compilation error; Compilation context not correctly setup. diff --git a/lib/Backends/CPU/tests/CPUDeviceManagerTest.cpp b/lib/Backends/CPU/tests/CPUDeviceManagerTest.cpp index bdf4d1d0f6..b7df8c30b4 100644 --- a/lib/Backends/CPU/tests/CPUDeviceManagerTest.cpp +++ b/lib/Backends/CPU/tests/CPUDeviceManagerTest.cpp @@ -17,4 +17,6 @@ using namespace glow; -std::set glow::backendTestBlacklist = {}; +std::set glow::backendTestBlacklist = { + "DeviceResidentTensors/0", +}; diff --git a/lib/Backends/Habana/HabanaDeviceManager.cpp b/lib/Backends/Habana/HabanaDeviceManager.cpp index 55dac1e4cf..6d1732e538 100644 --- a/lib/Backends/Habana/HabanaDeviceManager.cpp +++ b/lib/Backends/Habana/HabanaDeviceManager.cpp @@ -284,6 +284,10 @@ void HabanaDeviceManager::runFunctionImpl(RunIdentifierTy runId, TRACE_EVENT_SCOPE_NAMED(ctx->getTraceContext(), TraceLevel::RUNTIME, "HabanaDM::runnerThread", trEvent); + + /// Habana DeviceManager doesn't support Device Resident Tensors. 
+ ctx->getPlaceholderBindings()->ensureOnHost(); + if (ctx->getTraceContext()) { ctx->getTraceContext()->setThreadName( llvm::formatv("Habana {0} (enqueue)", deviceId_).str()); diff --git a/lib/Backends/Habana/tests/HabanaDeviceManagerTest.cpp b/lib/Backends/Habana/tests/HabanaDeviceManagerTest.cpp index bdf4d1d0f6..b7df8c30b4 100644 --- a/lib/Backends/Habana/tests/HabanaDeviceManagerTest.cpp +++ b/lib/Backends/Habana/tests/HabanaDeviceManagerTest.cpp @@ -17,4 +17,6 @@ using namespace glow; -std::set glow::backendTestBlacklist = {}; +std::set glow::backendTestBlacklist = { + "DeviceResidentTensors/0", +}; diff --git a/lib/Backends/Interpreter/InterpreterFunction.cpp b/lib/Backends/Interpreter/InterpreterFunction.cpp index 73ec2203c0..cd9a606f38 100644 --- a/lib/Backends/Interpreter/InterpreterFunction.cpp +++ b/lib/Backends/Interpreter/InterpreterFunction.cpp @@ -192,6 +192,8 @@ Error BoundInterpreterFunction::execute(IRFunction *F, ExecutionContext *context) { { TRACE_EVENT_SCOPE(context, TraceLevel::RUNTIME, "registerTensors"); + // Make sure all referenced tensors are on the host. + context->getPlaceholderBindings()->ensureOnHost(); // Find all virtually padded tensors so they can be replaced. 
std::vector virtualPadded; diff --git a/lib/Backends/Interpreter/tests/InterpreterDeviceManagerTest.cpp b/lib/Backends/Interpreter/tests/InterpreterDeviceManagerTest.cpp index bdf4d1d0f6..b7df8c30b4 100644 --- a/lib/Backends/Interpreter/tests/InterpreterDeviceManagerTest.cpp +++ b/lib/Backends/Interpreter/tests/InterpreterDeviceManagerTest.cpp @@ -17,4 +17,6 @@ using namespace glow; -std::set glow::backendTestBlacklist = {}; +std::set glow::backendTestBlacklist = { + "DeviceResidentTensors/0", +}; diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp index d52fee7587..417a0c6222 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.cpp +++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp @@ -193,6 +193,9 @@ NNPIDeviceManager::runFunction(std::string functionName, runtime::ResultCBTy resultCB) { RunIdentifierTy runId = runIdentifier_++; + /// NNPI DeviceManager doesn't support Device Resident Tensors. + ctx->getPlaceholderBindings()->ensureOnHost(); + // Get thread env. 
auto infEnv = inferenceEnvs_.find(functionName); if (infEnv == inferenceEnvs_.end()) { diff --git a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp index cb69e520f3..00cf00f2e2 100644 --- a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp @@ -19,4 +19,5 @@ using namespace glow; std::set glow::backendTestBlacklist = { "MultiFunction/0", + "DeviceResidentTensors/0", }; diff --git a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp index 4d51453573..c511f9e345 100644 --- a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp +++ b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp @@ -462,6 +462,7 @@ void OpenCLDeviceManager::copyInputsToDevice( runtime::OpenCLDeviceBindings *devBindings) { TRACE_EVENT_SCOPE(context->getTraceContext(), TraceLevel::RUNTIME, "copyInputsToDevice"); + bool profilingEnabled = context->getTraceContext() && (context->getTraceContext()->getTraceLevel() & TraceLevel::COPY); @@ -491,6 +492,7 @@ void OpenCLDeviceManager::copyInputsToDevice( devBindings->kernelLaunches.emplace_back(name, "copy", event); } } + // Do it! clFinish(devBindings->commandQueue); } @@ -500,9 +502,11 @@ void OpenCLDeviceManager::copyOutputsFromDevice( runtime::OpenCLDeviceBindings *devBindings) { TRACE_EVENT_SCOPE(context->getTraceContext(), TraceLevel::RUNTIME, "copyOutputsFromDevice"); + bool profilingEnabled = context->getTraceContext() && (context->getTraceContext()->getTraceLevel() & TraceLevel::COPY); + auto &symbolTable = runtimeBundle.getSymbolTable(); for (auto PH : context->getPlaceholderBindings()->pairs()) { auto it = symbolTable.find(PH.first->getName()); @@ -528,6 +532,7 @@ void OpenCLDeviceManager::copyOutputsFromDevice( devBindings->kernelLaunches.emplace_back(name, "copy", event); } } + // Do it! 
clFinish(devBindings->commandQueue); } @@ -662,9 +667,11 @@ void OpenCLDeviceManager::runFunctionImpl( RunIdentifierTy id, std::string function, std::unique_ptr context, ResultCBTy resultCB) { DCHECK(resultCB != nullptr); - TRACE_EVENT_SCOPE_NAMED(context->getTraceContext(), TraceLevel::RUNTIME, "DeviceManager::run", dmRun); + /// OpenCL DeviceManager doesn't support Device Resident Tensors. + context->getPlaceholderBindings()->ensureOnHost(); + auto funcIt = functions_.find(function); if (funcIt == functions_.end()) { dmRun.addArg("reason", "function not found"); diff --git a/lib/Backends/OpenCL/tests/OpenCLDeviceManagerTest.cpp b/lib/Backends/OpenCL/tests/OpenCLDeviceManagerTest.cpp index bdf4d1d0f6..b7df8c30b4 100644 --- a/lib/Backends/OpenCL/tests/OpenCLDeviceManagerTest.cpp +++ b/lib/Backends/OpenCL/tests/OpenCLDeviceManagerTest.cpp @@ -17,4 +17,6 @@ using namespace glow; -std::set glow::backendTestBlacklist = {}; +std::set glow::backendTestBlacklist = { + "DeviceResidentTensors/0", +}; diff --git a/lib/Base/Tensor.cpp b/lib/Base/Tensor.cpp index 38f0b06715..adccff17b7 100644 --- a/lib/Base/Tensor.cpp +++ b/lib/Base/Tensor.cpp @@ -473,6 +473,7 @@ ShapeVector glow::expandDimsToMax(llvm::ArrayRef currDims) { } void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); switch (init) { case InitKind::Zero: zero(); @@ -562,10 +563,12 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { } void Tensor::convertToType(ElemKind newTy) { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); *this = this->getCopyConvertedToType(newTy); } Tensor Tensor::getCopyConvertedToType(ElemKind newKind) const { + assert(!isDeviceResident() && "Tensor must reside on host to access data."); const ElemKind origKind = getElementType(); DCHECK((origKind == ElemKind::FloatTy && newKind == ElemKind::Float16Ty) || (origKind == ElemKind::Float16Ty && newKind == 
ElemKind::FloatTy) || @@ -638,4 +641,19 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor *t) { return os; } +void Tensor::moveToDevice(DeviceTensorTransferManager *deviceManager, + void *locationContext) { + residencyInfoP_->deviceManager_ = deviceManager; + residencyInfoP_->locationContext_ = locationContext; + residencyInfoP_->tensorResidency_ = + DeviceResidencyInfo::TensorResidency::Device; +} + +void Tensor::ensureOnHost() { + if (residencyInfoP_->isDeviceResident()) { + residencyInfoP_->deviceManager_->transferFromDevice(*this); + } + assert(!isDeviceResident()); +} + } // namespace glow diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index b873e9171b..d20b410191 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -114,20 +114,25 @@ void glow::updateInputPlaceholdersByName(PlaceholderBindings &bindings, void ExecutionEngine::runInternal(ExecutionContext &context, llvm::StringRef name) { std::unique_ptr contextPtr(&context); + std::unique_ptr contextOut; std::promise runPromise; auto fut = runPromise.get_future(); Error runErr = Error::empty(); - hostManager_->runNetwork( - name, std::move(contextPtr), - [&runPromise, &runErr](runtime::RunIdentifierTy, Error err, - std::unique_ptr contextPtr) { - // Don't delete context. - contextPtr.release(); - runErr = std::move(err); - runPromise.set_value(); - }); + hostManager_->runNetwork(name, std::move(contextPtr), + [&runPromise, &runErr, &contextOut]( + runtime::RunIdentifierTy, Error err, + std::unique_ptr contextPtr) { + contextOut = std::move(contextPtr); + runErr = std::move(err); + runPromise.set_value(); + }); fut.wait(); + if (ensureOutputsOnHost_) { + contextOut->getPlaceholderBindings()->ensureOnHost(); + } + // Don't delete context. 
+ contextOut.release(); EXIT_ON_ERR(std::move(runErr)); } diff --git a/lib/LLVMIRCodeGen/LLVMCompiledFunction.cpp b/lib/LLVMIRCodeGen/LLVMCompiledFunction.cpp index 9f61b4fb8f..97c5cefad5 100644 --- a/lib/LLVMIRCodeGen/LLVMCompiledFunction.cpp +++ b/lib/LLVMIRCodeGen/LLVMCompiledFunction.cpp @@ -33,6 +33,9 @@ void LLVMCompiledFunction::collectConstants(const Module *module) { void LLVMCompiledFunction::loadPlaceholders( PlaceholderBindings *bindings, uint8_t *baseMutableWeightVarsAddress) { + // Make sure our inputs are on the host. + bindings->ensureOnHost(); + // Copy Placeholders into allocated memory. auto &symbolTable = runtimeBundle_.getSymbolTable(); for (auto PH : bindings->pairs()) { @@ -40,6 +43,7 @@ void LLVMCompiledFunction::loadPlaceholders( if (it == symbolTable.end()) { continue; } + assert(!PH.second->isDeviceResident()); auto symbolInfo = it->second; auto payload = PH.second->getUnsafePtr(); auto addr = symbolInfo.offset; diff --git a/lib/Support/Error.cpp b/lib/Support/Error.cpp index e211a39cca..7f085b954b 100644 --- a/lib/Support/Error.cpp +++ b/lib/Support/Error.cpp @@ -87,6 +87,8 @@ std::string GlowErrorValue::errorCodeToString(const ErrorCode &ec) { return "RUNTIME_DEVICE_NOT_FOUND"; case ErrorCode::RUNTIME_NET_BUSY: return "RUNTIME_NET_BUSY"; + case ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED: + return "DEVICE_FEATURE_NOT_SUPPORTED"; case ErrorCode::COMPILE_UNSUPPORTED_NODE_AFTER_OPTIMIZE: return "COMPILE_UNSUPPORTED_NODE_AFTER_OPTIMIZE"; case ErrorCode::COMPILE_CONTEXT_MALFORMED: diff --git a/tests/unittests/BackendTestUtils.cpp b/tests/unittests/BackendTestUtils.cpp index ac57aaebb4..1074714c01 100644 --- a/tests/unittests/BackendTestUtils.cpp +++ b/tests/unittests/BackendTestUtils.cpp @@ -212,6 +212,10 @@ void dispatchInference(const std::string &fname, for (auto &future : futures) { future.wait(); } + + for (auto &c : contexts) { + c->getPlaceholderBindings()->ensureOnHost(); + } // Release the original context passed in by reference so we 
don't free it. contexts[0].release(); } diff --git a/tests/unittests/DeviceManagerTest.cpp b/tests/unittests/DeviceManagerTest.cpp index d23cadf55a..e5bb7ec1d3 100644 --- a/tests/unittests/DeviceManagerTest.cpp +++ b/tests/unittests/DeviceManagerTest.cpp @@ -32,11 +32,26 @@ using namespace glow; using namespace glow::runtime; +template +std::pair, std::future> getFutureHelper() { + std::promise promise; + auto future = promise.get_future(); + return std::make_pair(std::move(promise), std::move(future)); +} + +template +void callbackHelper(std::promise &promise, ResultType res, + Error err) { + promise.set_value(!ERR_TO_BOOL(std::move(err)) ? std::move(res) + : ResultType()); +} + class DeviceManagerTest : public ::testing::TestWithParam { public: void SetUp() override { backendName = GetParam(); - device.reset(DeviceManager::createDeviceManager(DeviceConfig(backendName))); + DeviceConfig config(backendName); + device.reset(DeviceManager::createDeviceManager(config)); ASSERT_TRUE(device.get()); ASSERT_FALSE(ERR_TO_BOOL(device->init())); } @@ -45,6 +60,40 @@ class DeviceManagerTest : public ::testing::TestWithParam { std::string backendName; std::unique_ptr device{nullptr}; + + void addToDevice(Module *module, FunctionMapTy functions) { + + std::promise promise; + std::future future; + std::tie(promise, future) = getFutureHelper(); + + device->addNetwork(module, std::move(functions), + [&promise](const Module *module, Error err) { + callbackHelper(promise, module, std::move(err)); + }); + + future.wait_for(std::chrono::seconds(2)); + EXPECT_EQ(future.get(), module); + } + + std::unique_ptr + runFunction(std::string name, std::unique_ptr context) { + std::promise> runPromise; + std::future> runFuture; + + std::tie(runPromise, runFuture) = + getFutureHelper>(); + device->runFunction( + name, std::move(context), + [&runPromise](RunIdentifierTy, Error err, + std::unique_ptr context) { + callbackHelper(runPromise, std::move(context), std::move(err)); + }); + + 
runFuture.wait_for(std::chrono::seconds(2)); + context = runFuture.get(); + return context; + } }; std::unique_ptr makeBasicModule(std::string functionName = "main") { @@ -83,42 +132,18 @@ compileFunctions(llvm::StringRef backendName, Module *module, return results; } -template -std::pair, std::future> getFutureHelper() { - std::promise promise; - auto future = promise.get_future(); - return std::make_pair(std::move(promise), std::move(future)); -} - -template -void callbackHelper(std::promise &promise, ResultType res, - Error err) { - promise.set_value(!ERR_TO_BOOL(std::move(err)) ? std::move(res) - : ResultType()); -} - TEST_P(DeviceManagerTest, Basic) { auto module = makeBasicModule(); std::vector> backing; FunctionMapTy functions = compileFunctions(backendName, module.get(), backing); - std::promise promise; - std::future future; - std::tie(promise, future) = getFutureHelper(); - - device->addNetwork(module.get(), std::move(functions), - [&promise](const Module *module, Error err) { - callbackHelper(promise, module, std::move(err)); - }); - - future.wait_for(std::chrono::seconds(2)); - EXPECT_EQ(future.get(), module.get()); - std::unique_ptr context = glow::make_unique(); context->getPlaceholderBindings()->allocate(module->getPlaceholders()); + addToDevice(module.get(), std::move(functions)); + Tensor input1(ElemKind::FloatTy, {1}); Tensor output1(ElemKind::FloatTy, {1}); input1.getHandle().clear(0.5); @@ -128,21 +153,11 @@ TEST_P(DeviceManagerTest, Basic) { {module->getPlaceholderByName("main_input")}, {&input1}); - std::promise> runPromise; - std::future> runFuture; - - std::tie(runPromise, runFuture) = - getFutureHelper>(); - device->runFunction("main", std::move(context), - [&runPromise](RunIdentifierTy, Error err, - std::unique_ptr context) { - callbackHelper(runPromise, std::move(context), - std::move(err)); - }); - - runFuture.wait_for(std::chrono::seconds(2)); - context = runFuture.get(); + context = runFunction("main", std::move(context)); 
ASSERT_TRUE(context); + // We must ensure results are on host since we're using DeviceManager + // directly. + context->getPlaceholderBindings()->ensureOnHost(); Tensor *result1 = context->getPlaceholderBindings()->get( module->getPlaceholderByName("main_output")); ASSERT_TRUE(result1); @@ -211,8 +226,13 @@ TEST_P(DeviceManagerTest, PartialTensorCopy) { runFuture.wait_for(std::chrono::seconds(2)); context = runFuture.get(); ASSERT_TRUE(context); + // We must ensure results are on host since we're using DeviceManager + // directly. + context->getPlaceholderBindings()->ensureOnHost(); + Tensor *result1 = context->getPlaceholderBindings()->get( module->getPlaceholderByName("main_output")); + ASSERT_TRUE(result1); EXPECT_FLOAT_EQ(result1->getHandle().at({0}), std::max(std::tanh(0.5), 0.25)); } @@ -223,15 +243,7 @@ TEST_P(DeviceManagerTest, MultiRun) { FunctionMapTy functions = compileFunctions(backendName, module.get(), backing); - std::promise promise; - std::future future; - std::tie(promise, future) = getFutureHelper(); - device->addNetwork(module.get(), std::move(functions), - [&promise](const Module *module, Error err) { - callbackHelper(promise, module, std::move(err)); - }); - future.wait_for(std::chrono::seconds(2)); - EXPECT_EQ(future.get(), module.get()); + addToDevice(module.get(), std::move(functions)); std::unique_ptr context1 = glow::make_unique(); @@ -281,6 +293,10 @@ TEST_P(DeviceManagerTest, MultiRun) { ASSERT_TRUE(context1); ASSERT_TRUE(context2); EXPECT_NE(context1, context2); + // We must ensure results are on host since we're using DeviceManager + // directly. 
+  context1->getPlaceholderBindings()->ensureOnHost();
+  context2->getPlaceholderBindings()->ensureOnHost();
 
   Tensor *result1 = context1->getPlaceholderBindings()->get(
       module->getPlaceholderByName("main_output"));
@@ -376,6 +392,8 @@ TEST_P(DeviceManagerTest, MultiFunction) {
   ASSERT_TRUE(context1);
   ASSERT_TRUE(context2);
   EXPECT_NE(context1, context2);
+  context1->getPlaceholderBindings()->ensureOnHost();
+  context2->getPlaceholderBindings()->ensureOnHost();
 
   Tensor *result1 = context1->getPlaceholderBindings()->get(
       module->getPlaceholderByName("func1_output"));
@@ -458,6 +476,8 @@ TEST_P(DeviceManagerTest, MultiModule) {
   ASSERT_TRUE(context1);
   ASSERT_TRUE(context2);
   EXPECT_NE(context1, context2);
+  context1->getPlaceholderBindings()->ensureOnHost();
+  context2->getPlaceholderBindings()->ensureOnHost();
 
   Tensor *result1 = context1->getPlaceholderBindings()->get(
       module1->getPlaceholderByName("func1_output"));
@@ -738,4 +758,100 @@ TEST(DeviceManagerTest, DummyDeviceManager) {
 
 #endif // GLOW_WITH_CPU
 
+/// Check that the device can move data to and from the host.
+/// Disable if your device does not support Device Resident Tensors.
+TEST_P(DeviceManagerTest, DeviceResidentTensors) {
+  CHECK_IF_ENABLED();
+  Tensor T = {1.2f, 12.1f, 51.0f, 1515.2f};
+  Tensor R = {1.2f, 12.1f, 51.0f, 1515.2f};
+
+  ASSERT_FALSE(T.isDeviceResident());
+
+  device->transferToDevice(T, nullptr);
+
+  ASSERT_TRUE(T.isDeviceResident());
+
+  device->transferFromDevice(T);
+
+  ASSERT_FALSE(T.isDeviceResident());
+
+  ASSERT_TRUE(T.isEqual(R));
+}
+
+/// A mock DeviceManager for use in Device Resident Tensor tests.
+class MockDM : public DeviceManager {
+public:
+  MockDM() : DeviceManager(DeviceConfig("MockDM")) {}
+  void addNetwork(const Module *module, FunctionMapTy functions,
+                  ReadyCBTy readyCB) override {}
+
+  void evictNetwork(std::string functionName,
+                    EvictFunctionCBTy evictCB = [](std::string, Error) {
+                    }) override {}
+
+  runtime::RunIdentifierTy
+  runFunction(std::string functionName,
+              std::unique_ptr<ExecutionContext> context,
+              runtime::ResultCBTy resultCB) override {
+    return 0;
+  }
+
+  uint64_t getMaximumMemory() const override { return 0; }
+
+  uint64_t getAvailableMemory() const override { return 0; }
+
+  bool isMemoryAvailable(uint64_t estimate) const override { return false; }
+
+  void transferToDevice(Tensor &tensor, void *locationContext = nullptr,
+                        std::function<void(Error)> resultCB = [](Error) {
+                        }) override {
+    if (locationContext == nullptr) {
+      locationContext = this;
+    }
+    tensor.moveToDevice(this, locationContext);
+  }
+
+  void transferFromDevice(Tensor &tensor, bool release = true,
+                          std::function<void(Error)> resultCB = [](Error) {
+                          }) override {
+    tensor.clearDeviceResidency();
+  }
+
+  bool releaseDeviceTensor(void *locationContext) override { return true; }
+};
+
+TEST_P(DeviceManagerTest, CanHandleDeviceResidentTensors) {
+  MockDM mockDM;
+
+  auto module = makeBasicModule();
+  std::vector<std::unique_ptr<CompiledFunction>> backing;
+  FunctionMapTy functions =
+      compileFunctions(backendName, module.get(), backing);
+
+  addToDevice(module.get(), std::move(functions));
+
+  std::unique_ptr<ExecutionContext> context =
+      glow::make_unique<ExecutionContext>();
+  context->getPlaceholderBindings()->allocate(module->getPlaceholders());
+
+  Tensor input1(ElemKind::FloatTy, {1});
+  Tensor output1(ElemKind::FloatTy, {1});
+  input1.getHandle().clear(0.5);
+  output1.getHandle().clear(std::max(std::tanh(0.5), 0.25));
+
+  updateInputPlaceholders(*context->getPlaceholderBindings(),
+                          {module->getPlaceholderByName("main_input")},
+                          {&input1});
+
+  mockDM.transferToDevice(*context->getPlaceholderBindings()->get(
+      module->getPlaceholderByName("main_input")));
+
+  context = 
runFunction("main", std::move(context)); + ASSERT_TRUE(context); + Tensor *result1 = context->getPlaceholderBindings()->get( + module->getPlaceholderByName("main_output")); + ASSERT_TRUE(result1); + EXPECT_TRUE(result1->isEqual(output1)); +} + INSTANTIATE_BACKEND_TEST(DeviceManagerTest);