Update on "Factor out numerical logic"
This change is similar to #54049 in that it helps us factor out some code that can be used in both the fast and slow versions of gradcheck.
- `compute_gradient` and `compute_numerical_jacobian_cols` have fewer responsibilities:
  - `compute_numerical_jacobian_cols` essentially handles only the complexity of complex derivatives
  - `compute_gradient` handles only finite differencing (and doesn't worry about different layouts or indexing into the input tensor)
- we again have two stages, where we first compute the columns separately and then combine them (sketched below)
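As a rough sketch of the two-stage structure for a real-valued input (the notation below is illustrative and does not use the actual gradcheck identifiers): with perturbation size $\varepsilon$, the $i$-th column of the numerical Jacobian comes from central differencing along the $i$-th input direction, and the columns are then stacked:

$$J_{:,i} \approx \frac{f(x + \varepsilon e_i) - f(x - \varepsilon e_i)}{2\varepsilon}, \qquad J = \begin{bmatrix} J_{:,1} & J_{:,2} & \cdots & J_{:,n} \end{bmatrix}$$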

[ghstack-poisoned]
soulitzer committed Apr 1, 2021
2 parents 387d332 + ca5c463 commit cd846df
Showing 96 changed files with 2,050 additions and 1,380 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/auto_label.yml
@@ -0,0 +1,44 @@
name: Label PRs & Issues

on:
  issues:
    types: [opened, edited]
  pull_request_target:
    types: [edited, opened, synchronize, reopened]

jobs:
  auto-label-rocm:
    runs-on: ubuntu-18.04
    steps:
      - name: Retrieve information
        id: vars
        run: |
          set -eux
          IS_PR=${{ github.event.pull_request }}
          if [[ -n "${IS_PR}" ]]; then
            TITLE="${{ github.event.pull_request.title }}"
            ISSUE_NUMBER=${{ github.event.pull_request.number }}
          else
            TITLE="${{ github.event.issue.title }}"
            ISSUE_NUMBER=${{ github.event.issue.number }}
          fi
          echo ::set-output name=TITLE::"${TITLE}"
          echo ::set-output name=ISSUE_NUMBER::"${ISSUE_NUMBER}"
          echo ::set-output name=OWNER::"${{ github.repository_owner }}"
          echo ::set-output name=REPO::"${{ github.event.repository.name }}"
      - name: Auto-label ROCm
        run: |
          set -eux
          if [[ "${TITLE,,}" == *rocm* ]]; then
            curl \
              -X POST \
              -H "Authorization: token ${GITHUB_TOKEN}" \
              "https://api.github.com/repos/${OWNER}/${REPO}/issues/${ISSUE_NUMBER}/labels" \
              -d '{"labels":["ROCm"]}'
          fi
        env:
          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
          TITLE: "${{ steps.vars.outputs.TITLE }}"
          ISSUE_NUMBER: "${{ steps.vars.outputs.ISSUE_NUMBER }}"
          OWNER: "${{ steps.vars.outputs.OWNER }}"
          REPO: "${{ steps.vars.outputs.REPO }}"
8 changes: 7 additions & 1 deletion README.md
@@ -264,9 +264,15 @@ Each CUDA version only supports one particular XCode version. The following comb

On Windows

Choose Correct Visual Studio Version.

Visual Studio is upgraded frequently, and new versions sometimes introduce regressions.
It is best to use the same Visual Studio version as [PyTorch CI's](https://github.com/pytorch/pytorch/blob/a1bd7918cc5a06cbef6c5178259bf0a7b5ab1ce3/.circleci/scripts/vs_install.ps1#L4).
You can use Visual Studio Enterprise, Professional, or Community, though PyTorch CI uses Visual Studio BuildTools.

Build with CPU

It's fairly easy to build with CPU. Visual Studio 2019 version 16.7.6 (MSVC toolchain version 14.27) or higher is recommended.
It's fairly easy to build with CPU.

Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instructions [here](https://github.com/pytorch/pytorch/blob/master/docs/source/notes/windows.rst#building-from-source) are an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, the Microsoft Visual C OpenMP runtime (vcomp) will be used.

2 changes: 1 addition & 1 deletion android/gradle.properties
@@ -1,6 +1,6 @@
ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64

VERSION_NAME=1.8.0-SNAPSHOT
VERSION_NAME=1.9.0-SNAPSHOT
GROUP=org.pytorch
MAVEN_GROUP=org.pytorch
SONATYPE_STAGING_PROFILE=orgpytorch
10 changes: 10 additions & 0 deletions aten/src/ATen/core/dispatch/Dispatcher.cpp
@@ -87,6 +87,16 @@ OperatorHandle Dispatcher::findSchemaOrThrow(const char* name, const char* overl
return it.value();
}

const std::vector<OperatorName> Dispatcher::getAllOpNames() {
  return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> std::vector<OperatorName> {
    std::vector<OperatorName> allOpNames;
    for (const auto& op : operatorLookupTable) {
      allOpNames.push_back(op.first);
    }
    return allOpNames;
  });
}

// Postcondition: caller is responsible for disposing of registration when they
// are done
OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
3 changes: 3 additions & 0 deletions aten/src/ATen/core/dispatch/Dispatcher.h
@@ -138,6 +138,9 @@ class TORCH_API Dispatcher final {
// Like findSchema, but also returns OperatorHandle even if there is no schema
c10::optional<OperatorHandle> findOp(const OperatorName& operator_name);

// Returns a list of all operator names present in the operatorLookupTable_
const std::vector<OperatorName> getAllOpNames();

// ------------------------------------------------------------------------
//
// Invoking operators
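As a usage sketch (not part of this diff): the new API is reachable through the dispatcher singleton. The helper name `print_all_ops` and the printed format are illustrative assumptions, not existing PyTorch code.

#include <ATen/core/dispatch/Dispatcher.h>
#include <iostream>

// Hypothetical diagnostic helper: enumerate every operator name currently
// registered in the dispatcher's lookup table and print "name.overload_name".
void print_all_ops() {
  const auto names = c10::Dispatcher::singleton().getAllOpNames();
  for (const auto& op : names) {
    std::cout << op.name << "." << op.overload_name << "\n";
  }
}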
14 changes: 8 additions & 6 deletions aten/src/ATen/core/ivalue_inl.h
@@ -366,7 +366,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target {
/**
* Explicitly mark the future as completed with the output value and DataPtrs.
* The data_ptrs contains storage pointers for all tensors in IValue, which
* will be passed to postMarkCompletedHook. Some subclass, like CUDAFuture,
* will be passed to preMarkCompletedHook. Some subclass, like CUDAFuture,
* uses these DataPtrs to synchronize CUDA streams. You only need to provide
* data_ptrs when 1) DataPtrs cannot be extracted through
* IValue::getSubValues() or 2) customized DataPtrs extraction is more
@@ -381,10 +381,12 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target {
!completed(),
"Attempting to mark a completed Future as complete again. Note that "
"a Future can only be marked completed once.");
completed_ = true;
value_ = std::move(value);

postMarkCompletedHook(value_, std::move(data_ptrs));
preMarkCompletedHook(value, std::move(data_ptrs));
// Only set value_ and completed_ flag once preMarkCompletedHook has
// returned successfully to allow for proper error propagation.
value_ = std::move(value);
completed_ = true;

std::vector<std::function<void(void)>> cbs;
cbs.swap(callbacks_);
@@ -527,7 +529,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target {
// resides on and record an event in those devices' current streams.
// The data_ptrs field contains storage pointers of all tensors in the value,
// which is used by the CUDAFuture subclass to synchronize streams.
virtual void postMarkCompletedHook(
virtual void preMarkCompletedHook(
const at::IValue& value,
c10::optional<std::vector<std::reference_wrapper<const at::DataPtr>>>
data_ptrs) {}
@@ -567,7 +569,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target {
completed_ = true;
eptr_ = std::move(eptr);

// Do not call postMarkCompletedHook() here as there isn't any value.
// Do not call preMarkCompletedHook() here as there isn't any value.

std::vector<std::function<void(void)>> cbs;
cbs.swap(callbacks_);
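As an illustration of why the hook was renamed and reordered (not part of this diff): because value_ and completed_ are now set only after the hook returns, an exception thrown inside a subclass hook surfaces before the Future is ever observed as complete. The subclass name ThrowingFuture below is hypothetical; the override signature is copied from the hunk above, assuming the hook remains overridable by subclasses as it is for CUDAFuture.

#include <ATen/core/ivalue.h>
#include <functional>

// Hypothetical subclass: if preMarkCompletedHook throws here, markCompleted()
// has not yet stored the value or flipped completed_, so callers see the error
// rather than a half-completed Future.
struct ThrowingFuture : c10::ivalue::Future {
  using c10::ivalue::Future::Future;

 protected:
  void preMarkCompletedHook(
      const at::IValue& value,
      c10::optional<std::vector<std::reference_wrapper<const at::DataPtr>>>
          data_ptrs) override {
    TORCH_CHECK(value.isTensor(), "expected a tensor value");
  }
};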
61 changes: 61 additions & 0 deletions aten/src/ATen/core/op_registration/adaption.h
@@ -0,0 +1,61 @@
#pragma once

#include <c10/core/TensorOptions.h>

/*
* [Note: hacky wrapper removal for optional tensor]
*
* The kernel implementation takes an optional tensor marked in the schema as
* Tensor? but the C++ function takes Tensor instead of the optional<Tensor>
* expected by the dispatcher.
*
* To remove the hacky wrapper, the C++ function is changed to take
* optional<Tensor> and unwrap the Tensor value at the beginning of
* the function, e.g.:
* > const Tensor& weight =
* >     c10::value_or_else(weight_opt, [] { return Tensor(); });
*
* We may want to make the kernel handle optional directly without going through
* the creation of a default-constructed tensor.
*/

/*
* [Note: hacky wrapper removal for TensorOptions]
*
* The kernel implementation takes a TensorOptions argument but the dispatcher
* expects separate arguments for dtype, layout, device, pin_memory.
*
* To remove the hacky wrapper, the kernel implementation is changed to take
* the 4 arguments (dtype, layout, device, pin_memory), and assemble the
* TensorOptions value at the beginning of the function, e.g.:
* > TensorOptions options = TensorOptions().dtype(dtype).layout(layout)
* > .device(device).pinned_memory(pin_memory);
*
* We may want to make the kernel handle these parameters directly without going
* through the creation of a TensorOptions value.
*/

namespace c10 {
namespace impl {

inline c10::optional<MemoryFormat>
check_tensor_options_and_extract_memory_format(
    const TensorOptions& options,
    c10::optional<MemoryFormat> memory_format) {
  TORCH_CHECK(
      options.requires_grad_opt() == c10::nullopt ||
          options.requires_grad_opt().value() == false,
      "Operators taking TensorOptions cannot take a TensorOptions with "
      "options.requires_grad set as true. This isn't implemented yet.");
  TORCH_CHECK(
      !(options.has_memory_format() && memory_format.has_value()),
      "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
      "the redundant setter.");
  if (memory_format.has_value()) {
    return memory_format;
  } else {
    return options.memory_format_opt();
  }
}
} // namespace impl
} // namespace c10
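A minimal usage sketch (not part of this diff), showing what the helper returns in the two non-conflicting cases; the snippet and its values are illustrative only.

#include <ATen/ATen.h>
#include <ATen/core/op_registration/adaption.h>
#include <iostream>

int main() {
  at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat);

  // Memory format supplied as the explicit argument: it is returned directly.
  auto mf1 = c10::impl::check_tensor_options_and_extract_memory_format(
      opts, at::MemoryFormat::ChannelsLast);

  // Memory format carried inside TensorOptions: the options value is returned.
  auto mf2 = c10::impl::check_tensor_options_and_extract_memory_format(
      opts.memory_format(at::MemoryFormat::Contiguous), c10::nullopt);

  std::cout << mf1.has_value() << " " << mf2.has_value() << std::endl;  // prints "1 1"
}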
