Device Resident Tensors - API & Framework (#3745)
Summary:
This takes over #3671, spinning out just the API- and Glow-core-level changes associated with the device-resident tensor (DRT) plan in #3629. It does not implement DRT support on any device.

Documentation: See #3629.
Pull Request resolved: #3745

Test Plan: Ran the existing tests and added two new sanity checks to DeviceManagerTest. The first, `DeviceResidentTensors`, should run only for backends that support resident tensors (currently none). The second, `CanHandleDeviceResidentTensors`, should run on all devices.
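A minimal sketch of what the device-agnostic check might look like. The DeviceManagerTest changes themselves are not among the hunks shown below; the `createDeviceManager()` helper, the TEST_P fixture details, and the `ERR_TO_BOOL` error handling here are assumptions, not a copy of the real test.

```cpp
// Hypothetical sketch only; the real check lives in DeviceManagerTest.
TEST_P(DeviceManagerTest, CanHandleDeviceResidentTensors) {
  auto device = createDeviceManager(GetParam()); // hypothetical helper

  glow::Tensor input(glow::ElemKind::FloatTy, {4, 4});
  // Freshly created tensors live on the host.
  EXPECT_FALSE(input.isDeviceResident());

  // On a backend without DRT support (currently all of them), the default
  // hooks report DEVICE_FEATURE_NOT_SUPPORTED via the result callback rather
  // than crashing.
  device->transferToDevice(input, /*locationContext=*/nullptr,
                           [](glow::Error err) {
                             EXPECT_TRUE(ERR_TO_BOOL(std::move(err)));
                           });
}
```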

Differential Revision: D18378905

Pulled By: nickgg

fbshipit-source-id: 887c290dae5a6b9b75e9b41a415958d499bc5402
mortzur authored and facebook-github-bot committed Nov 13, 2019
1 parent ec46f24 commit eb48b31
Showing 21 changed files with 464 additions and 66 deletions.
30 changes: 29 additions & 1 deletion include/glow/Backends/DeviceManager.h
@@ -17,6 +17,7 @@
#define GLOW_BACKENDS_DEVICEMANAGER_H

#include "glow/Backend/CompiledFunction.h"
#include "glow/Base/DeviceTensorTransferManager.h"
#include "glow/ExecutionContext/ExecutionContext.h"
#include "glow/Graph/Graph.h"
#include "glow/Runtime/RuntimeTypes.h"
@@ -42,7 +43,7 @@ using ReadyCBTy = std::function<void(const Module *, Error)>;
using FunctionMapTy = std::map<std::string, CompiledFunction *>;

/// Interface managing a specific instance of a device.
class DeviceManager {
class DeviceManager : public DeviceTensorTransferManager {
protected:
/// Configuration object for the device.
DeviceConfig config_;
@@ -162,6 +163,33 @@ class DeviceManager {
/// \returns the DeviceInfo for this device containing peak limits for
/// compute and bandwidths (used in partitioning).
virtual DeviceInfo getDeviceInfo() const { return DeviceInfo(); }

/// Copies the contents of \p tensor from the host to the location described
/// by \p locationContext on this device. Updates the tensor residency info.
virtual void transferToDevice(Tensor &tensor, void *locationContext,
std::function<void(Error)> resultCB =
[](Error) {}) {
DCHECK("Not Implemented");
resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED,
"Direct transfer not supported on this device"));
}

/// Copies the device buffer associated with \p tensor to the host.
/// The tensor must be resident on this device. If \p release is true,
/// frees the device memory. Updates the tensor residency info.
virtual void transferFromDevice(Tensor &tensor, bool release = true,
std::function<void(Error)> resultCB =
[](Error) {}) {
DCHECK("Not Implemented");
resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED,
"Direct transfer not supported on this device"));
}

/// Releases the device buffer described by \p locationContext.
virtual bool releaseDeviceTensor(void *locationContext) {
DCHECK("Not Implemented");
return false;
}
};

} // namespace runtime
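On a DRT-capable backend, the intent (per the plan in #3629) is for that backend's DeviceManager subclass to override these hooks. A rough sketch of the shape of such an override — `FooDeviceManager`, `FooBufferContext`, and the `memcpy` stand-ins for real DMA are illustrative only, and the remaining pure-virtual DeviceManager methods are omitted, so the class stays abstract:

```cpp
// Hypothetical sketch of a DRT-capable backend's DeviceManager. Only the new
// transfer hooks are shown; everything named Foo* is illustrative.
#include <cstring>

#include "glow/Backends/DeviceManager.h"

namespace glow {
namespace runtime {

/// Illustrative device-buffer descriptor; a real backend would wrap whatever
/// handle its driver exposes.
struct FooBufferContext {
  void *deviceAddress;
};

class FooDeviceManager : public DeviceManager {
public:
  using DeviceManager::DeviceManager;

  void transferToDevice(Tensor &tensor, void *locationContext,
                        std::function<void(Error)> resultCB) override {
    auto *ctx = static_cast<FooBufferContext *>(locationContext);
    // Stand-in for a real host-to-device DMA.
    std::memcpy(ctx->deviceAddress, tensor.getUnsafePtr(),
                tensor.getSizeInBytes());
    // Record residency so later host-side accesses assert until the tensor
    // is transferred back.
    tensor.moveToDevice(this, locationContext);
    resultCB(Error::success());
  }

  void transferFromDevice(Tensor &tensor, bool release,
                          std::function<void(Error)> resultCB) override {
    auto *ctx = static_cast<FooBufferContext *>(tensor.getLocationContext());
    // Stand-in for a real device-to-host DMA.
    std::memcpy(tensor.getUnsafePtr(), ctx->deviceAddress,
                tensor.getSizeInBytes());
    tensor.clearDeviceResidency();
    if (release) {
      releaseDeviceTensor(ctx);
    }
    resultCB(Error::success());
  }

  bool releaseDeviceTensor(void *locationContext) override {
    // Free whatever the location context describes.
    delete static_cast<FooBufferContext *>(locationContext);
    return true;
  }
};

} // namespace runtime
} // namespace glow
```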
50 changes: 50 additions & 0 deletions include/glow/Base/DeviceTensorTransferManager.h
@@ -0,0 +1,50 @@
/**
* Copyright (c) Glow Contributors. See CONTRIBUTORS file.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H
#define GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H

#include "glow/Base/Tensor.h"
#include "glow/Support/Error.h"

#include <functional>

namespace glow {

class Tensor;

class DeviceTensorTransferManager {
public:
virtual ~DeviceTensorTransferManager() {}
/// Copies the contents of \p tensor from the host to the location described
/// by \p locationContext on this device. Updates the tensor residency info.
virtual void transferToDevice(Tensor &tensor, void *locationContext = nullptr,
std::function<void(Error)> resultCB =
[](Error) {}) = 0;

/// Copies the device buffer associated with \p tensor to the host.
/// The tensor must be resident on this device. If \p release is true, frees
/// the device memory. Updates the tensor residency info.
virtual void transferFromDevice(Tensor &tensor, bool release = true,
std::function<void(Error)> resultCB =
[](Error) {}) = 0;

/// Releases the device buffer described by \p locationContext.
virtual bool releaseDeviceTensor(void *locationContext) = 0;
};

} // namespace glow

#endif // GLOW_BASE_DEVICETENSORTRANSFERMANAGER_H
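The interface depends only on `Tensor` and `Error`, so callers can move payloads without knowing anything about the backend behind the pointer. A hypothetical round trip through the interface — the `dttm` pointer and `devCtx` argument are assumptions of this sketch, and it presumes an implementation that actually supports DRT:

```cpp
// Hypothetical caller-side round trip; not part of this change.
#include <cassert>

#include "glow/Base/DeviceTensorTransferManager.h"
#include "glow/Base/Tensor.h"

void roundTrip(glow::DeviceTensorTransferManager *dttm, glow::Tensor &t,
               void *devCtx) {
  // Push the payload to the device; the default callback ignores errors.
  dttm->transferToDevice(t, devCtx);
  assert(t.isDeviceResident() && "Implementations update residency info.");

  // ... device-side work would happen here ...

  // Copy the payload back and free the device buffer.
  dttm->transferFromDevice(t, /*release=*/true);
  assert(!t.isDeviceResident());
}
```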
132 changes: 131 additions & 1 deletion include/glow/Base/Tensor.h
@@ -20,6 +20,7 @@
#include <cassert>
#include <vector>

#include "glow/Base/DeviceTensorTransferManager.h"
#include "glow/Base/Type.h"
#include "glow/Support/Compiler.h"
#include "glow/Support/Memory.h"
@@ -48,6 +49,71 @@ void genericTranspose(const Tensor *src, Tensor *dest,
/// returned dims. For example, input {2,1,4} would result in {2,1,4,1,1,1}.
ShapeVector expandDimsToMax(llvm::ArrayRef<size_t> currDims);

namespace runtime {
class DeviceManager;
}

/// Holds information on whether this Tensor exists in a device-specific form
/// (resident on, or specific to, a device), and which device holds it.
class DeviceResidencyInfo final {
enum class TensorResidency {
Host,
Device,
};

// A pointer to the device manager of the device on which the tensor
// resides.
DeviceTensorTransferManager *deviceManager_{nullptr};
/// The residency status of the tensor.
TensorResidency tensorResidency_{TensorResidency::Host};
// A pointer to a context structure, containing the required info to access
// tensor data and perform transfers.
void *locationContext_{nullptr};

public:
DeviceResidencyInfo()
: deviceManager_(nullptr), tensorResidency_(TensorResidency::Host),
locationContext_(nullptr) {}

/// Move ctor.
DeviceResidencyInfo(DeviceResidencyInfo &&other) = delete;

/// Move assignment operator.
DeviceResidencyInfo &operator=(DeviceResidencyInfo &&other) = delete;

~DeviceResidencyInfo() {
// If a tensor is device resident, let its device manager free the device
// buffer.
if (isDeviceResident()) {
deviceManager_->releaseDeviceTensor(locationContext_);
}
}

/// Removes all device specific state.
void clear() {
deviceManager_ = nullptr;
locationContext_ = nullptr;
tensorResidency_ = TensorResidency::Host;
}

/// \returns true if this Tensor is resident on, or specific to, a device.
bool isDeviceResident() const {
assert((tensorResidency_ == TensorResidency::Host || deviceManager_) &&
"Device resident tensor must have an assigned device manager.");
return tensorResidency_ == TensorResidency::Device;
}

/// \returns the DeviceManager this tensor is resident on, if any.
DeviceTensorTransferManager *getDeviceManager() const {
return deviceManager_;
}

/// \returns the device specific location context for a resident Tensor.
void *getLocationContext() const { return locationContext_; }

friend class Tensor;
};

/// A class that represents a contiguous n-dimensional array (a tensor).
class Tensor final {
public:
@@ -71,6 +137,10 @@ class Tensor final {
/// The TensorPool that is managing this Tensor (if any).
TensorPool *tensorPool_{nullptr};

/// The device residency info associated with the tensor.
std::shared_ptr<DeviceResidencyInfo> residencyInfoP_{
new DeviceResidencyInfo()};

/// Size in bytes of the unpadded memory region. This is useful for
/// communicating the actual size of the data, which allows copying only the
/// inputs (and not the padding) to the device.
@@ -119,6 +189,7 @@
/// Set the content of the tensor to zero. If \p resetFusedScalesOffsets, then
/// fused scales/offsets will be set to 1.0/0.0 as well.
void zero(bool resetFusedScalesOffsets = false) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
size_t size = actualSize();
// Quantized tensors should go to their offset.
switch (type_.getElementType()) {
@@ -298,7 +369,7 @@ class Tensor final {
unownedTensor.isUnowned_ = true;
unownedTensor.type_ = Type::newShape(getType(), dims);
unownedTensor.unpaddedSize_ = unpaddedSize_;

unownedTensor.residencyInfoP_ = residencyInfoP_;
if (offsets.size() == 0) {
assert(actualSize() == unownedTensor.actualSize() &&
"The size of the unowned tensor "
@@ -321,6 +392,7 @@
/// element to start a subview from.
Tensor getOwnedSlice(llvm::ArrayRef<size_t> dims,
llvm::ArrayRef<size_t> offsets = {}) const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
return getUnowned(dims, offsets).clone();
}

@@ -341,6 +413,7 @@

/// Assigns a new shape to the tensor and allocates a new buffer.
void reset(const Type &T) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
// If the new size is identical to the allocated size then there is no need
// to re-allocate the buffer.
if (type_ == T && getData()) {
@@ -390,6 +463,7 @@
std::swap(isUnowned_, other.isUnowned_);
std::swap(tensorPool_, other.tensorPool_);
std::swap(unpaddedSize_, other.unpaddedSize_);
std::swap(residencyInfoP_, other.residencyInfoP_);
}

/// Move assignment operator.
@@ -399,6 +473,7 @@
std::swap(isUnowned_, other.isUnowned_);
std::swap(tensorPool_, other.tensorPool_);
std::swap(unpaddedSize_, other.unpaddedSize_);
std::swap(residencyInfoP_, other.residencyInfoP_);
return *this;
}

@@ -429,6 +504,14 @@
/// elements exceeding allowed error; maximum error and location found; etc.).
bool isEqual(const Tensor &other, float allowedError = 0.0001,
bool verbose = true) const {
if (isDeviceResident()) {
if (!other.isDeviceResident()) {
return false;
}

return getDeviceManager() == other.getDeviceManager() &&
getLocationContext() == other.getLocationContext();
}
return isEqualImpl(other, /*isBitwise=*/false, allowedError, verbose);
}

@@ -513,6 +596,7 @@

/// Update the content and type of the tensor from the tensor \p t.
void assign(const Tensor *t) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(this != t && "Copying to self");
reset(t);
size_t bufferSize = type_.getSizeInBytes();
@@ -521,6 +605,7 @@

/// Update the raw data of the tensor from the tensor \p t.
void copyRawFrom(const Tensor *t) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(this != t && "Copying to self");
assert(actualSize() == t->actualSize());
assert(getElementType() == t->getElementType() && "Invalid element type");
@@ -531,6 +616,7 @@
/// Update the content of the tensor with a slice from tensor \p t. A slice
/// is one index from the first dimension of the tensor.
void copySlice(const Tensor *t, size_t slice) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
auto dim = t->dims().slice(1);
(void)dim;
assert(dim == dims() && "Invalid slice size");
@@ -546,6 +632,7 @@
/// The copying operation may overlap the end of the tensor \p t one or more
/// times. This means that the data in the input tensor may be duplicated.
void copyConsecutiveSlices(const Tensor *t, size_t startSliceIdx) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
auto onceSliceDim = t->dims().slice(1);
(void)onceSliceDim;
assert(onceSliceDim == dims().slice(1) && "Invalid slice size");
@@ -571,6 +658,7 @@
/// and cast them to DestElemType in this.
template <typename DestElemType, typename SrcElemType>
void copyWithCast(const Tensor *t) {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
static_assert(!std::is_same<DestElemType, SrcElemType>::value,
"Use copyRawFrom instead");
assert(this != t && "Copying to self");
@@ -599,11 +687,13 @@
/// Transpose the tensor \p src into the empty tensor \p dest. Shuffle the
/// axis based on the list \p shuffle, where each element is the src index.
void transpose(Tensor *dest, llvm::ArrayRef<unsigned_t> shuffle) const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
genericTranspose(this, dest, shuffle);
}

/// Create a new copy of the current tensor.
Tensor clone() const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
Tensor slice;
slice.assign(this);
return slice;
@@ -612,6 +702,40 @@
/// Return the raw unsafe pointer to the tensor payload.
char *getUnsafePtr() const { return getData(); }

/// \returns true if tensor data is stored on a device
bool isDeviceResident() const { return residencyInfoP_->isDeviceResident(); }

/// Update device residency info with new device manager and context
void moveToDevice(DeviceTensorTransferManager *deviceManager,
void *locationContext);

/// If device resident, copy Tensor contents back to host memory and release
/// associated device memory.
void ensureOnHost();

/// \returns the pointer to the device manager where the tensor resides.
DeviceTensorTransferManager *getDeviceManager() const {
assert(residencyInfoP_->isDeviceResident() &&
"Tensor must be device resident");
return residencyInfoP_->getDeviceManager();
}

/// \returns the pointer to the location context of where the tensor resides.
void *getLocationContext() const {
assert(residencyInfoP_->isDeviceResident() &&
"Tensor must be device resident");
return residencyInfoP_->getLocationContext();
}

/// Clears DeviceResidencyInfo.
/// Note that this does not affect the associated DeviceManager or device
/// memory.
void clearDeviceResidency() {
assert(residencyInfoP_->isDeviceResident() &&
"Tensor must be device resident");
residencyInfoP_->clear();
}

/// \return a new handle that points and manages this tensor.
template <class ElemTy = float> Handle<ElemTy> getHandle() &;

@@ -623,19 +747,22 @@
private:
/// \returns a pointer to the raw data, of type \p ElemTy.
template <class ElemTy> ElemTy *getRawDataPointer() {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
return reinterpret_cast<ElemTy *>(data_);
}

/// \returns a const pointer to the raw data, of type \p ElemTy.
template <class ElemTy> const ElemTy *getRawDataPointer() const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
return reinterpret_cast<const ElemTy *>(data_);
}

template <class ElemTy>
bool isEqualImpl(const Tensor &other, float allowedError,
bool verbose) const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
auto const *myData = getRawDataPointer<ElemTy>();
auto const *otherData = other.getRawDataPointer<ElemTy>();
double maxFoundError = 0.0;
@@ -668,6 +795,7 @@
}

bool isBitwiseEqualImpl(const Tensor &other) const {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
auto const *myData = getUnsafePtr();
auto const *otherData = other.getUnsafePtr();
for (size_t i = 0, e = getSizeInBytes(); i < e; i++) {
@@ -1283,11 +1411,13 @@ template <class ElemTy> class Handle final {
};

template <class ElemTy> Handle<ElemTy> Tensor::getHandle() & {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
return Handle<ElemTy>(this);
}

template <class ElemTy> const Handle<ElemTy> Tensor::getHandle() const & {
assert(!isDeviceResident() && "Tensor must reside on host to access data.");
assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
return Handle<ElemTy>(const_cast<Tensor *>(this));
}
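`moveToDevice` and `ensureOnHost` are only declared in this header; their definitions live in Tensor.cpp, which is not among the hunks shown above. A plausible sketch, under the assumption that they simply update the shared `DeviceResidencyInfo` and delegate the copy-back to the owning transfer manager:

```cpp
// Hypothetical sketch of the out-of-line definitions; the real bodies in
// Tensor.cpp are not shown in this diff.
#include <cassert>

#include "glow/Base/Tensor.h"

namespace glow {

void Tensor::moveToDevice(DeviceTensorTransferManager *deviceManager,
                          void *locationContext) {
  // Tensor is a friend of DeviceResidencyInfo, so it can update the fields
  // directly. Views created via getUnowned() share residencyInfoP_, so they
  // observe the move as well.
  residencyInfoP_->deviceManager_ = deviceManager;
  residencyInfoP_->locationContext_ = locationContext;
  residencyInfoP_->tensorResidency_ =
      DeviceResidencyInfo::TensorResidency::Device;
}

void Tensor::ensureOnHost() {
  if (!isDeviceResident()) {
    return; // Already on the host; nothing to do.
  }
  // transferFromDevice() copies the device buffer back into this tensor's
  // host storage, releases the device memory, and clears the residency info.
  getDeviceManager()->transferFromDevice(*this, /*release=*/true);
  assert(!isDeviceResident());
}

} // namespace glow
```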
