
Commit

Update on "Make CUDAFuture remember and restore current device in cal…
Browse files Browse the repository at this point in the history
…lback"

CUDAFuture aims to "capture" the current CUDA state when the future is marked complete (e.g., by looking at the current streams and recording events on them) and then "replicate" a similar state when users synchronize with the result of the future (by synchronizing the current streams with these events).

However, one "contextual" aspect of CUDA that we weren't capturing/replicating was the current device. This diff tries to fix that. I must mention that we can only do this for callbacks, while we cannot do it for the wait() method. I don't know if such a discrepancy between the two actually makes the overall behavior _worse_. I'd love to hear people's opinions on this.
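Below is a minimal sketch of the idea only, not the actual CUDAFuture code: the future remembers which device was current when it was marked complete and makes that device current again, via a scoped guard, while running user callbacks. The class name `DeviceRestoringFuture` and its members are illustrative, not PyTorch API; only `c10::cuda::CUDAGuard` and `c10::cuda::current_device()` are real.

```cpp
// Illustrative sketch, not the CUDAFuture implementation.
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAGuard.h>
#include <functional>
#include <utility>
#include <vector>

class DeviceRestoringFuture {
 public:
  void addCallback(std::function<void()> cb) {
    callbacks_.emplace_back(std::move(cb));
  }

  void markCompleted() {
    // Capture the device that is current at completion time.
    device_ = c10::cuda::current_device();
    for (const auto& cb : callbacks_) {
      // CUDAGuard makes the captured device current and restores the
      // previous device when it goes out of scope.
      c10::cuda::CUDAGuard guard(device_);
      cb();
    }
  }

 private:
  c10::DeviceIndex device_ = 0;
  std::vector<std::function<void()>> callbacks_;
};
```

This mirrors the callback path only; as noted above, the same cannot be done for `wait()`.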

Differential Revision: [D25210335](https://our.internmc.facebook.com/intern/diff/D25210335/)

[ghstack-poisoned]
lw committed Dec 9, 2020
2 parents 08dcb5b + 2a91991 commit 4f76484
Showing 93 changed files with 2,239 additions and 1,151 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -75,7 +75,7 @@ jobs:
- name: Run flake8
run: |
set -eux
- pip install flake8==3.8.2 flake8-bugbear==20.1.4 flake8-comprehensions==3.3.0 flake8-executable==2.0.4 flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
+ pip install -r requirements-flake8.txt
flake8 --version
flake8 | tee ${GITHUB_WORKSPACE}/flake8-output.txt
- name: Add annotations
31 changes: 0 additions & 31 deletions .travis.aten.yml

This file was deleted.

2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -891,7 +891,7 @@ which is in PyTorch's `requirements.txt`.
## Pre-commit tidy/linting hook

We use clang-tidy and flake8 (installed with flake8-bugbear,
- flake8-comprehensions, flake8-mypy, and flake8-pyi) to perform additional
+ flake8-comprehensions, flake8-pyi, and others) to perform additional
formatting and semantic checking of code. We provide a pre-commit git hook for
performing these checks, before a commit is created:

2 changes: 1 addition & 1 deletion README.md
@@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex
On Linux
```bash
# Add LAPACK support for the GPU if needed
- conda install -c pytorch magma-cuda102 # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version
+ conda install -c pytorch magma-cuda110 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
```

On MacOS
4 changes: 2 additions & 2 deletions aten/src/ATen/BatchingRegistrations.cpp
@@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule(
size.size(), ") must match dimensionality of strides (",
stride.size(), ")");
auto storage_size = native::storage_size_for(size, stride);
- for (int64_t idx = 0; idx < physical_strides.size(); ++idx) {
- physical_strides[idx] *= storage_size;
+ for (auto& physical_stride : physical_strides) {
+ physical_stride *= storage_size;
}

// physical_strides = [B1 * B2 * S, B2 * S, S] + strides
1 change: 1 addition & 0 deletions aten/src/ATen/Config.h.in
@@ -8,6 +8,7 @@

#define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@
#define AT_MKL_ENABLED() @AT_MKL_ENABLED@
+ #define AT_FFTW_ENABLED() @AT_FFTW_ENABLED@
#define AT_NNPACK_ENABLED() @AT_NNPACK_ENABLED@
#define CAFFE2_STATIC_LINK_CUDA() @CAFFE2_STATIC_LINK_CUDA_INT@
#define AT_BUILD_WITH_BLAS() @USE_BLAS@
4 changes: 2 additions & 2 deletions aten/src/ATen/NamedTensorUtils.cpp
@@ -264,11 +264,11 @@ static std::vector<Dimname> compute_dot_product_outnames(
}
std::vector<Dimname> outnames(num_outnames, Dimname::wildcard());
int64_t index = 0;
- for (int64_t j = 0; j < tensor_names.size(); ++j) {
+ for (size_t j = 0; j < tensor_names.size(); ++j) {
if (j == tensor_dotted_dim) continue;
outnames[index++] = tensor_names[j];
}
- for (int64_t j = 0; j < other_names.size(); ++j) {
+ for (size_t j = 0; j < other_names.size(); ++j) {
if (j == other_dotted_dim) continue;
outnames[index++] = other_names[j];
}
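Several of the loop changes in this commit (here and in BatchingRegistrations.cpp, TensorIterator.cpp, TensorNames.cpp, ForeachOpsKernels.cpp) swap a signed index for `size_t` or a range-based loop. The following is a hedged, standalone illustration of the warning these presumably silence: comparing a signed index against an unsigned `.size()` trips `-Wsign-compare`. The functions below are illustrative, not PyTorch code.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t sum_indexed(const std::vector<int64_t>& v) {
  int64_t total = 0;
  // The old pattern warns under -Wsign-compare:
  //   for (int64_t i = 0; i < v.size(); ++i) ...
  // Fix 1: use the container's unsigned size type for the index.
  for (std::size_t i = 0; i < v.size(); ++i) {
    total += v[i];
  }
  return total;
}

int64_t sum_ranged(const std::vector<int64_t>& v) {
  int64_t total = 0;
  // Fix 2: drop the explicit index with a range-based for loop.
  for (const auto& x : v) {
    total += x;
  }
  return total;
}
```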
2 changes: 2 additions & 0 deletions aten/src/ATen/SparseTensorImpl.cpp
@@ -46,6 +46,8 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ
AT_ASSERT(values_.sizes() == IntArrayRef({0}));
AT_ASSERT(values_.device() == indices_.device());
AT_ASSERT(values_.device() == device());

+ is_non_overlapping_and_dense_ = false;
}

IntArrayRef SparseTensorImpl::strides() const {
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorIterator.cpp
@@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso
}

void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
- for (int i = 0; i < config.tensors_.size(); i++) {
- operands_.emplace_back(std::move(config.tensors_[i]));
+ for (auto& tensor: config.tensors_) {
+ operands_.emplace_back(std::move(tensor));
}
num_outputs_ = config.num_outputs_;
}
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorNames.cpp
@@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
}

TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) {
- int64_t size_diff = std::labs(names_.size() - other.names_.size());
+ size_t size_diff = std::labs(names_.size() - other.names_.size());

if (names_.size() > other.names_.size()) {
- for (int64_t idx = size_diff; idx < names_.size(); ++idx) {
+ for (size_t idx = size_diff; idx < names_.size(); ++idx) {
names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name);
}
} else {
1 change: 1 addition & 0 deletions aten/src/ATen/ThreadLocalState.cpp
@@ -19,6 +19,7 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
grad_mode_enabled_ = GradMode::is_enabled();
}
#endif
+ bumped_record_all_functions_ = at::checkRecordAllFunctions();
}

/* static */
24 changes: 23 additions & 1 deletion aten/src/ATen/ThreadLocalState.h
@@ -38,25 +38,47 @@ class TORCH_API ThreadLocalState {
bool grad_mode_enabled_;
#endif

+ // Whether pre-sampling RecordFunction optimization was enabled
+ bool bumped_record_all_functions_ = false;

friend class ThreadLocalStateGuard;
};

// Guard to set and reset the thread local state
class TORCH_API ThreadLocalStateGuard {
public:
explicit ThreadLocalStateGuard(const ThreadLocalState& state)
- : prev_state_(ThreadLocalState()) {
+ : prev_state_(ThreadLocalState()),
+ bumped_record_all_functions_(state.bumped_record_all_functions_) {
+ // Special handling of RecordFunction pre-sampling optimization:
+ // pre-samping is enabled (bumped) when there're non-sampled
+ // (or high-frequency) global or TLS callbacks.
+ //
+ // ThreadLocalStateGuard simply resets RecordFunction's TLS and
+ // hence its thread local callbacks.
+ //
+ // Checking if the pre-sampling was enabled and preserving it in the
+ // async task by calling bumpRecordAllFunctions() and the corresponding
+ // releaseRecordAllFunctions()
+ if (bumped_record_all_functions_) {
+ at::bumpRecordAllFunctions();
+ }
// set the given state across the thread boundary
ThreadLocalState::setThreadLocalState(state);
}

~ThreadLocalStateGuard() {
// restore previously set variables
ThreadLocalState::setThreadLocalState(prev_state_);
+ if (bumped_record_all_functions_) {
+ at::releaseRecordAllFunctions();
+ }
}

private:
const ThreadLocalState prev_state_;
+ // Whether pre-sampling RecordFunction optimization was enabled
+ bool bumped_record_all_functions_ = false;
};

template <typename T>
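A hedged usage sketch of how these pieces typically fit together when work is handed to another thread; `launch_async` is illustrative, not PyTorch API, and the include path is assumed from the file location above:

```cpp
#include <functional>
#include <thread>
#include <utility>
#include <ATen/ThreadLocalState.h>

// Capture thread-local state (now including the RecordFunction pre-sampling
// "bump") on the submitting thread and re-apply it on the worker thread.
void launch_async(std::function<void()> task) {
  at::ThreadLocalState state;  // snapshot of the caller's TLS
  std::thread([state, task = std::move(task)]() {
    // Re-applies the snapshot; if pre-sampling was bumped at capture time,
    // it is bumped again for the lifetime of the guard.
    at::ThreadLocalStateGuard guard(state);
    task();
  }).detach();
}
```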
81 changes: 49 additions & 32 deletions aten/src/ATen/core/dispatch/Dispatcher.h
@@ -371,28 +371,39 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(A
const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);

#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// Check if we need to run callbacks registered with RecordFunction
// If true and callbacks need inputs, we box the arguments and pass
// them into the callbacks and also into the kernel call

// Note: for perf reasons we wouldn't want to pass arguments into
// the function call or prematurely box them
at::RecordFunction guard(at::RecordScope::FUNCTION);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
int64_t seq_num = -1;
// Setting sequence number in the Autograd case to associate
// the forward range with the coresponding Autograd's node
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
torch::jit::Stack stack = impl::boxArgs(args...);
guard.before(op, stack, seq_num);
} else {
guard.before(op, seq_num);
// By default, when there're no high-frequency or non-sampled callbacks,
// RecordFunction is pre-sampled as a perf optimization;
// shouldRunRecordFunction checks whether RecordFunction should be executed,
// and sets pre_sampled boolean argument value to whether pre-sampling was used -
// this boolean is passed into RecordFunction to adjust the sampling rates of
// the callbacks
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
// Check if we need to run callbacks registered with RecordFunction
// If true and callbacks need inputs, we box the arguments and pass
// them into the callbacks and also into the kernel call

// Note: for perf reasons we wouldn't want to pass arguments into
// the function call or prematurely box them
at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
int64_t seq_num = -1;
// Setting sequence number in the Autograd case to associate
// the forward range with the coresponding Autograd's node
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
torch::jit::Stack stack = impl::boxArgs(args...);
guard.before(op, stack, seq_num);
} else {
guard.before(op, seq_num);
}
}
}
// keeping the guard alive while executing the kernel
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
}
#endif // PYTORCH_DISABLE_PER_OP_PROFILING
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
@@ -429,20 +440,26 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
const auto& kernel = entry.lookup(dispatchKey);

#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// using already existing stack to record function execution in observers
at::RecordFunction guard(at::RecordScope::FUNCTION);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && entry.isObserved()) {
int64_t seq_num = -1;
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
guard.before(op, *stack, seq_num);
} else {
guard.before(op, seq_num);
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
// using already existing stack to record function execution in observers
at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && entry.isObserved()) {
int64_t seq_num = -1;
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
guard.before(op, *stack, seq_num);
} else {
guard.before(op, seq_num);
}
}
}
// keeping the guard alive while executing the kernel
kernel.callBoxed(op, stack);
return;
}
#endif // PYTORCH_DISABLE_PER_OP_PROFILING
kernel.callBoxed(op, stack);
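Condensed to its skeleton, the pre-sampling protocol used in both call paths above looks roughly like the following hedged sketch; `run_kernel`, `call_with_profiling`, and `"my_op"` are placeholders, and the header path is an assumption:

```cpp
#include <ATen/record_function.h>

void run_kernel();  // hypothetical stand-in for the dispatched kernel call

void call_with_profiling() {
  bool pre_sampled = false;
  // Cheap check first: only build the RecordFunction guard when there may be
  // callbacks to run for this invocation.
  if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
    // pre_sampled tells RecordFunction that a global pre-sampling step
    // already happened, so callback sampling rates are adjusted accordingly.
    at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
    if (C10_UNLIKELY(guard.isActive())) {
      guard.before("my_op");
    }
    run_kernel();  // keep the guard alive while the kernel executes
    return;
  }
  run_kernel();  // fast path: no profiling overhead
}
```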
14 changes: 11 additions & 3 deletions aten/src/ATen/core/ivalue.cpp
@@ -157,8 +157,12 @@ void IValue::visit(const std::function<bool (const IValue &)>& visitor) const {
break;
}
case Tag::PyObject: {
- IValue contained_value = toPyObjectHolder()->toIValue();
- contained_value.visit(visitor);
+ c10::intrusive_ptr<at::ivalue::PyObjectHolder> py_obj = toPyObjectHolder();
+ auto match = py_obj->tryToInferType();
+ if (match.success()) {
+ auto contained_value = py_obj->toIValue(match.type());
+ contained_value.visit(visitor);
+ }
break;
}
default:
@@ -206,7 +210,11 @@ void IValue::getSubValues(HashAliasedIValues& subValues) const {
}
case Tag::PyObject: {
subValues.insert(*this);
- IValue contained_value = toPyObjectHolder()->toIValue();
+ c10::intrusive_ptr<at::ivalue::PyObjectHolder> py_obj = toPyObjectHolder();
+ auto match = py_obj->tryToInferType();
+ TORCH_INTERNAL_ASSERT(match.success(),
+ "Tracer cannot infer type of ", py_obj->toStr(), "\n:", match.reason());
+ auto contained_value = py_obj->toIValue(match.type());
contained_value.getSubValues(subValues);
break;
}
26 changes: 25 additions & 1 deletion aten/src/ATen/core/ivalue_inl.h
@@ -637,8 +637,32 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
// see concrete implementation in python_ivalue.h
struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
public:
+ struct InferredType {
+ InferredType(TypePtr type) : type_(std::move(type)) {}
+ InferredType(std::string reason)
+ : type_(nullptr), reason_(std::move(reason)) {}
+ TypePtr type() const {
+ TORCH_INTERNAL_ASSERT(type_);
+ return type_;
+ }
+ bool success() const {
+ return type_ != nullptr;
+ }
+ const std::string& reason() const {
+ TORCH_INTERNAL_ASSERT(!type_);
+ return reason_;
+ }
+
+ private:
+ TypePtr type_;
+ std::string reason_;
+ };

virtual PyObject* getPyObject() = 0;
- virtual IValue toIValue() = 0;
+ virtual InferredType tryToInferType() = 0;
+ virtual IValue toIValue(const TypePtr& type, c10::optional<int32_t> N = c10::nullopt) = 0;
+ virtual std::string toStr() = 0;

virtual ~PyObjectHolder(){};
};

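A hedged sketch of how a caller might use the new two-step interface, mirroring the ivalue.cpp changes above: first try to infer a type, then convert only on success. `tryConvert` is illustrative, and the accessors `isPyObject()`/`toPyObjectHolder()` are assumed to be available to the caller.

```cpp
#include <ATen/core/ivalue.h>

c10::optional<c10::IValue> tryConvert(const c10::IValue& value) {
  if (!value.isPyObject()) {
    return c10::nullopt;
  }
  auto holder = value.toPyObjectHolder();
  auto match = holder->tryToInferType();
  if (!match.success()) {
    return c10::nullopt;  // match.reason() says why inference failed
  }
  return holder->toIValue(match.type());
}
```

Callers that treat failure as a bug can assert on `match.success()` instead, as `IValue::getSubValues` now does.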
8 changes: 2 additions & 6 deletions aten/src/ATen/native/Convolution.cpp
@@ -177,14 +177,10 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at
int64_t outsize = 1;
if (transposed) {
std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
- for (int64_t i = 1; i < o.size(); i++) {
- outsize *= o[i];
- }
+ outsize = prod_intlist(o.begin() + 1, o.end());
} else {
std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
- for (int64_t i = 1; i < o.size(); i++) {
- outsize *= o[i];
- }
+ outsize = prod_intlist(o.begin() + 1, o.end());
}
return outsize > int_max;
}
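For readers unfamiliar with the helper this switches to, here is a hedged sketch of what `prod_intlist` over an iterator range presumably amounts to; `product_of` below is an illustrative stand-in, not the ATen implementation:

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Multiply the elements of an iterator range, starting from 1.
template <typename Iter>
int64_t product_of(Iter begin, Iter end) {
  return std::accumulate(
      begin, end, static_cast<int64_t>(1), std::multiplies<int64_t>());
}

// Usage matching the change above: skip the batch dimension o[0].
// int64_t outsize = product_of(o.begin() + 1, o.end());
```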
2 changes: 1 addition & 1 deletion aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -204,7 +204,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList
\
std::vector<Tensor> result; \
result.reserve(tensors1.size()); \
- for (int i = 0; i < tensors1.size(); i++) { \
+ for (size_t i = 0; i < tensors1.size(); i++) { \
result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \
} \
\
1 change: 1 addition & 0 deletions aten/src/ATen/native/Normalization.cpp
@@ -415,6 +415,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(

bool use_cudnn = false;
use_cudnn = (input.is_cuda()
+ && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16
&& (input.scalar_type() != at::kHalf
|| weight.scalar_type() == at::kFloat)
&& weight.defined() && bias.defined()
