From 7d4c1a5ab07243f9c087bb957e79e4e6fcb5469d Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 24 Oct 2020 12:35:23 -0700 Subject: [PATCH 001/169] Fix type warning (#46770) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46770 Test Plan: Standard pre-commit test rig. Reviewed By: malfet Differential Revision: D24480898 fbshipit-source-id: a5031f1e20f4b1ea5954e7cabd54502300d5a916 --- aten/src/ATen/Context.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index fed5e88e5314..0597f993b608 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -314,10 +314,12 @@ static inline void manual_seed(uint64_t seed) { } // NB: Sometimes we build with CUDA, but we don't have any GPUs // available. In that case, we must not seed CUDA; it will fail! - int num_gpus = detail::getCUDAHooks().getNumGPUs(); + const auto num_gpus = detail::getCUDAHooks().getNumGPUs(); if (hasCUDA() && num_gpus > 0) { for (int i = 0; i < num_gpus; i++) { - auto cuda_gen = globalContext().defaultGenerator(Device(at::kCUDA, i)); + auto cuda_gen = globalContext().defaultGenerator( + Device(at::kCUDA, static_cast(i)) + ); { // See Note [Acquire lock when using random generators] std::lock_guard lock(cuda_gen.mutex()); From f9b94301529f1089f29a25958d2e754a986bfe75 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Sat, 24 Oct 2020 12:49:17 -0700 Subject: [PATCH 002/169] Support doc_string for TorchBind custom classes (#46576) Summary: With this PR, users can optionally provide a "doc_string" to describe a class or its method. doc_string for TorchBind classes and methods are stored as `doc_string` properties in `Function` and `ScriptClass`. These `dos_string` properties are then exposed in Python layer via PyBind for doc generation. 
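For illustration, a registration that supplies the new optional doc_strings could look like the sketch below (the class, namespace, and strings are hypothetical; only the argument positions follow the updated `torch::class_` API in this PR). The class-level string is what the new `__doc__` property on `ScriptClass` returns, and the per-method strings are stored on the corresponding `Function`s.

```cpp
#include <torch/custom_class.h>

// Hypothetical custom class used only for this sketch.
struct Counter : torch::jit::CustomClassHolder {
  int64_t value_ = 0;
  int64_t increment() {
    return ++value_;
  }
};

// The trailing string arguments are the new optional doc_strings added in this PR.
static auto register_counter =
    torch::class_<Counter>(
        "my_namespace",
        "Counter",
        "Counts how many times increment() has been called.")  // class doc_string
        .def("increment",
             &Counter::increment,
             "Bumps the counter by one and returns the new value.");  // method doc_string
```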
Fixes https://github.com/pytorch/pytorch/issues/46047 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46576 Reviewed By: wanchaol Differential Revision: D24440636 Pulled By: gmagogsfm fbshipit-source-id: bfa9b270a6c2d8bc769a88fad6be939cc6310412 --- aten/src/ATen/core/builtin_function.h | 12 +++++- aten/src/ATen/core/function.h | 5 +++ aten/src/ATen/core/jit_type.h | 12 +++++- aten/src/ATen/core/type.cpp | 12 +++--- test/cpp/jit/test_custom_class.cpp | 42 +++++++++++++++++++ torch/csrc/jit/python/python_custom_class.cpp | 5 ++- torch/csrc/jit/python/script_init.cpp | 8 +++- torch/custom_class.h | 22 +++++----- 8 files changed, 96 insertions(+), 22 deletions(-) diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index b4804cfebcbe..3d7f70d86877 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -10,13 +10,19 @@ struct BuiltinOpFunction : public Function { BuiltinOpFunction( c10::QualifiedName qualname, c10::FunctionSchema schema, - std::function callable) + std::function callable, + std::string doc_string = "") : name_(std::move(qualname)), callable_(std::move(callable)), - schema_(std::move(schema)) { + schema_(std::move(schema)), + doc_string_(std::move(doc_string)) { TORCH_INTERNAL_ASSERT(schema_.returns().size() == 1); } + const std::string& doc_string() const override { + return doc_string_; + } + bool isGraphFunction() const override { return false; } @@ -110,6 +116,8 @@ struct BuiltinOpFunction : public Function { std::function callable_; c10::FunctionSchema schema_; + + std::string doc_string_; }; } // namespace jit diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index 0cf658b0f701..8264bc57e8e8 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -25,6 +25,11 @@ TORCH_API void preoptimizeGraph(std::shared_ptr& graph); // execution of the function. Method is a wrapper around an // underlying Function that also provides a `self` object. struct TORCH_API Function { + virtual const std::string& doc_string() const { + static const std::string no_doc_string = ""; + return no_doc_string; + } + virtual bool isGraphFunction() const = 0; virtual void run(Stack& stack) = 0; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index a1b21ee1ba21..06900064e266 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1989,7 +1989,8 @@ struct CAFFE2_API ClassType : public NamedType { static ClassTypePtr create( c10::optional qualifiedName, std::weak_ptr cu, - bool is_module = false); + bool is_module = false, + std::string doc_string = ""); bool operator==(const Type& rhs) const override { if (auto user_rhs = rhs.cast()) { @@ -2175,6 +2176,9 @@ struct CAFFE2_API ClassType : public NamedType { return constantNames_[slot]; } + const std::string& doc_string() const { + return doc_string_; + } IValue getConstant(const std::string& name) const; @@ -2271,7 +2275,8 @@ struct CAFFE2_API ClassType : public NamedType { ClassType( c10::optional name, std::weak_ptr cu, - bool is_module); + bool is_module, + std::string doc_string); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { const auto& n = name().value(); @@ -2306,6 +2311,9 @@ struct CAFFE2_API ClassType : public NamedType { std::vector properties_; bool isModule_ = false; + + // Doc string of class. 
+ std::string doc_string_ = ""; }; struct InterfaceType; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 634d706091af..f5478d040060 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -1211,19 +1211,21 @@ InterfaceType::~InterfaceType() = default; ClassTypePtr ClassType::create( c10::optional qualifiedName, std::weak_ptr cu, - bool is_module) { + bool is_module, + std::string doc_string) { return ClassTypePtr( - new ClassType(std::move(qualifiedName), std::move(cu), is_module)); + new ClassType(std::move(qualifiedName), std::move(cu), is_module, std::move(doc_string))); } ClassType::ClassType( c10::optional name, std::weak_ptr cu, - bool is_module = false) + bool is_module = false, + std::string doc_string = "") : NamedType(TypeKind::ClassType, std::move(name)), compilation_unit_(std::move(cu)), - isModule_(is_module) { -} + isModule_(is_module), + doc_string_(std::move(doc_string)) {} const std::vector& ClassType::methods() const { return methods_; diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index a96a3b4a5635..776df23e1737 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -44,5 +44,47 @@ TEST(CustomClassTest, TorchbindIValueAPI) { test_with_obj(new_stack_ivalue, "boo"); } +class TorchBindTestClass : public torch::jit::CustomClassHolder { + public: + std::string get() { + return "Hello, I am your test custom class"; + } +}; + +constexpr char class_doc_string[] = R"( + I am docstring for TorchBindTestClass + Args: + What is an argument? Oh never mind, I don't take any. + + Return: + How would I know? I am just a holder of some meaningless test methods. + )"; +constexpr char method_doc_string[] = + "I am docstring for TorchBindTestClass get_with_docstring method"; + +namespace { +static auto reg = + torch::class_( + "_TorchBindTest", + "_TorchBindTestClass", + class_doc_string) + .def("get", &TorchBindTestClass::get) + .def("get_with_docstring", &TorchBindTestClass::get, method_doc_string); + +} // namespace + +// Tests DocString is properly propagated when defining CustomClasses. +TEST(CustomClassTest, TestDocString) { + auto class_type = getCustomClass( + "__torch__.torch.classes._TorchBindTest._TorchBindTestClass"); + AT_ASSERT(class_type); + AT_ASSERT(class_type->doc_string() == class_doc_string); + + AT_ASSERT(class_type->getMethod("get").doc_string().empty()); + AT_ASSERT( + class_type->getMethod("get_with_docstring").doc_string() == + method_doc_string); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/python/python_custom_class.cpp b/torch/csrc/jit/python/python_custom_class.cpp index 49c85c8c3c7f..9809b854e6ac 100644 --- a/torch/csrc/jit/python/python_custom_class.cpp +++ b/torch/csrc/jit/python/python_custom_class.cpp @@ -28,7 +28,10 @@ void initPythonCustomClassBindings(PyObject* module) { auto m = py::handle(module).cast(); py::class_(m, "ScriptClass") - .def("__call__", &ScriptClass::__call__); + .def("__call__", &ScriptClass::__call__) + .def_property_readonly("__doc__", [](const ScriptClass& self) { + return self.class_type_.type_->expect()->doc_string(); + }); // This function returns a ScriptClass that wraps the constructor // of the given class, specified by the qualified name passed in. 
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 74e0e75362a6..a99f7469ac65 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -1187,9 +1187,13 @@ void initJitScriptBindings(PyObject* module) { "name", [](const StrongFunctionPtr& self) { return self.function_->name(); }) .def_property_readonly( - "qualified_name", [](const StrongFunctionPtr& self) { + "qualified_name", + [](const StrongFunctionPtr& self) { return self.function_->qualname().qualifiedName(); - }); + }) + .def_property_readonly("__doc__", [](const StrongFunctionPtr& self) { + return self.function_->doc_string(); + }); py::class_(m, "ScriptMethod", py::dynamic_attr()) .def( diff --git a/torch/custom_class.h b/torch/custom_class.h index 3805cfafc91a..571a584294db 100644 --- a/torch/custom_class.h +++ b/torch/custom_class.h @@ -58,14 +58,16 @@ class class_ { /// see this class exposed as in Python and TorchScript. For example, if /// you pass `foo` as the namespace name and `Bar` as the className, the /// class will appear as `torch.classes.foo.Bar` in Python and TorchScript - explicit class_(const std::string& namespaceName, const std::string& className) { + explicit class_(const std::string& namespaceName, const std::string& className, std::string doc_string = "") { detail::checkValidIdent(namespaceName, "Namespace name"); detail::checkValidIdent(className, "Class name"); qualClassName = std::string("__torch__.torch.classes.") + namespaceName + "." + className; classTypePtr = at::ClassType::create( c10::QualifiedName(qualClassName), - std::weak_ptr()); + std::weak_ptr(), + /*is_module=*/false, + std::move(doc_string)); classTypePtr->addAttribute("capsule", at::CapsuleType::get()); c10::getCustomClassTypeMap().insert( @@ -81,7 +83,7 @@ class class_ { /// `torch::init()` would register a two-argument constructor /// taking an `int` and a `std::string` as argument. template - class_& def(detail::types) { // Used in combination with + class_& def(detail::types, std::string doc_string = "") { // Used in combination with // torch::init<...>() auto func = [](c10::tagged_capsule self, Types... args) { auto classObj = c10::make_intrusive(args...); @@ -89,7 +91,7 @@ class class_ { object->setSlot(0, c10::IValue::make_capsule(std::move(classObj))); }; - defineMethod("__init__", std::move(func)); + defineMethod("__init__", std::move(func), std::move(doc_string)); return *this; } @@ -112,18 +114,18 @@ class class_ { /// // do something /// }) template - class_& def(std::string name, Func f) { + class_& def(std::string name, Func f, std::string doc_string = "") { auto wrapped_f = detail::wrap_func(std::move(f)); - defineMethod(std::move(name), std::move(wrapped_f)); + defineMethod(std::move(name), std::move(wrapped_f), std::move(doc_string)); return *this; } /// This is an unsafe method registration API added for adding custom JIT backend support via custom /// C++ classes. It is not for general purpose use. - class_& _def_unboxed(std::string name, std::function func, c10::FunctionSchema schema) { + class_& _def_unboxed(std::string name, std::function func, c10::FunctionSchema schema, std::string doc_string = "") { auto qualMethodName = qualClassName + "." 
+ name; auto method = std::make_unique( - qualMethodName, std::move(schema), std::move(func)); + qualMethodName, std::move(schema), std::move(func), std::move(doc_string)); classTypePtr->addMethod(method.get()); registerCustomClassMethod(std::move(method)); return *this; @@ -228,7 +230,7 @@ class class_ { private: template - void defineMethod(std::string name, Func func) { + void defineMethod(std::string name, Func func, std::string doc_string = "") { auto qualMethodName = qualClassName + "." + name; auto schema = c10::inferFunctionSchemaSingleReturn(std::move(name), ""); @@ -241,7 +243,7 @@ class class_ { detail::BoxedProxy()(stack, func); }; auto method = std::make_unique( - qualMethodName, std::move(schema), std::move(wrapped_func)); + qualMethodName, std::move(schema), std::move(wrapped_func), std::move(doc_string)); // Register the method here to keep the Method alive. // ClassTypes do not hold ownership of their methods (normally it From 9cbdd84e151263cd2efb95436c6ed2c4e75b17df Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 24 Oct 2020 14:20:57 -0700 Subject: [PATCH 003/169] Fix compiler warning Summary: `sizeof` returns an unsigned, so comparison against `-1` is a warning. This fixes that. Test Plan: Standard pre-commit test rig. Reviewed By: bhosmer Differential Revision: D24506390 fbshipit-source-id: cdb2887d319c6730a90b9f8d74a248527dd6c2ab --- aten/src/ATen/core/boxing/impl/boxing.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index d40823555c65..484d462b8ad9 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -97,7 +97,13 @@ using can_unbox = // template struct BoxedKernelWrapper { - static_assert(sizeof(FuncType) == -1, + // The reason we're not just doing straight up static_assert(false, ...) here: + // Basically, the way to make sure a static_assert only fires if a template + // is actually instantiated (rather than every time the file is parsed) is to use + // template parameters in the expression, e.g. FuncType here. However, since + // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same + // effect. + static_assert(sizeof(FuncType) != sizeof(FuncType), "Function signature contains one or more unsupported parameter and/or return types. " "Look for a nearby error like " "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " From fa8cd06a5c95dfdce464c7720c58eaf8d2dabfd7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 24 Oct 2020 16:07:30 -0700 Subject: [PATCH 004/169] Perform explicit cast (#46771) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46771 `std::ceil` returns a `float` which is cast to `size_t` by the `max` operation. We convert to `int64_t` to suppress the warning while matching the type of `newDims[0]`. Since the types match, we don't need an explicit template type for `max`. This allows `max` to take `int64_t` as its values, matching the type of `newCapacity`. Test Plan: Standard pre-commit test rig. 
Reviewed By: malfet Differential Revision: D24481684 fbshipit-source-id: aed7cabc1e9d395b2662cb633f3ace19c279ab4c --- c10/core/TensorImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 871867c9e2c2..ad636e51ff12 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1040,8 +1040,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return; } auto newCapacity = sizes_; - newCapacity[0] = std::max( - newDims[0], std::ceil(sizes_[0] * (growthPct + 100) / 100)); + newCapacity[0] = std::max( + newDims[0], static_cast(std::ceil(sizes_[0] * (1 + growthPct / 100)))); auto oldData = std::move(storage_.data_ptr()); auto oldSize = numel_; auto oldDims = sizes_; From edbc84aa4ae6b9d0410e581c31e8c309426b2d29 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 24 Oct 2020 16:09:00 -0700 Subject: [PATCH 005/169] Fix hash type (#46769) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46769 Value to be hashed is `int64_t`, but hash is `int`. This results in a downconversion which throws out bits which would otherwise be hashed. Test Plan: Standard pre-commit test rig Reviewed By: malfet Differential Revision: D24480962 fbshipit-source-id: 497b1d8bc3f6d2119a6ba16e6ae92911bd34b916 --- aten/src/ATen/core/Dict_inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index f5c997a55b74..6cfbea286f00 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -38,7 +38,7 @@ namespace detail { inline size_t DictKeyHash::operator()(const IValue& ivalue) const { if (ivalue.isInt()) { - return std::hash()(ivalue.toInt()); + return std::hash()(ivalue.toInt()); } else if (ivalue.isString()) { return std::hash()(ivalue.toStringRef()); } else if (ivalue.isDouble()) { From d94bd998ece87a1af996ad047e98b1ccba2c3a85 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Sun, 25 Oct 2020 19:38:00 -0700 Subject: [PATCH 006/169] Update backward formulas (Re #44444) (#46275) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46275 Re #44444 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D24285785 Pulled By: anjali411 fbshipit-source-id: c60ecd4fe4f144132085f2c91d3b950e92b2a491 --- aten/src/ATen/native/Pow.cpp | 10 +-- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 22 ++++- .../cuda/BinaryMiscBackwardOpsKernels.cu | 16 +++- test/test_autograd.py | 6 +- test/test_cuda.py | 3 +- test/test_jit.py | 2 +- test/test_ops.py | 18 ++--- tools/autograd/derivatives.yaml | 30 +++---- tools/autograd/gen_variable_type.py | 5 +- torch/csrc/autograd/FunctionsManual.cpp | 80 ++++++++++++------- torch/csrc/autograd/FunctionsManual.h | 1 + .../_internal/common_methods_invocations.py | 37 ++++++--- 12 files changed, 144 insertions(+), 86 deletions(-) diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp index db33de9a9475..ca5d1848a4b8 100644 --- a/aten/src/ATen/native/Pow.cpp +++ b/aten/src/ATen/native/Pow.cpp @@ -31,13 +31,11 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) { "result type ", common_dtype, "can't be cast to the desired output type ", result.scalar_type()); - if (exp.isComplex() && (exp.toComplexDouble() == 0.0) ) { - result.resize_as_(base).fill_(1); - } else if (exp.isComplex() && (exp.toComplexDouble() == 1.0) ) { - result.resize_as_(base).fill_(base); - } else if (!exp.isComplex() && (exp.toDouble() == 0.0)) { + auto exponent = 
(exp.isComplex()) ? exp.toComplexDouble() : exp.toDouble(); + + if (exponent == 0.0) { result.resize_as_(base).fill_(1); - } else if (!exp.isComplex() && (exp.toDouble() == 1.0)) { + } else if (exponent == 1.0) { result.resize_as_(base).copy_(base); } else { auto iter = TensorIterator::unary_op(result, base.to(common_dtype)); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index fce8c348919b..4e2d00e8347b 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -589,17 +589,31 @@ void logit_backward_kernel(TensorIterator& iter, Scalar eps_scalar) { } void tanh_backward_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() { - auto one_vec = Vec256(scalar_t{1}); + if (isComplexType(iter.dtype())) { + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() { + auto one_vec = Vec256(scalar_t{1}); cpu_kernel_vec( iter, [=](scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t{1} - b * b); + return a * std::conj(scalar_t{1} - b * b); }, [=](Vec256 a, Vec256 b) { - return a * (one_vec - b * b); + return a * (one_vec - b * b).conj(); }); }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() { + auto one_vec = Vec256(scalar_t{1}); + cpu_kernel_vec( + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t{1} - b * b); + }, + [=](Vec256 a, Vec256 b) { + return a * (one_vec - b * b); + }); + }); + } } void mse_kernel(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index 9b7bc28a829e..ed7e2190f75e 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -60,13 +60,21 @@ void logit_backward_kernel_cuda(TensorIterator& iter, Scalar eps_scalar) { } void tanh_backward_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "tanh_backward_cuda", [&] { + if(isComplexType(iter.dtype())) { + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_complex_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t{1.} - b * b); + return a * std::conj(scalar_t{1.} - b * b); }); }); - }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { + AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "tanh_backward_cuda", [&] { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t{1.} - b * b); + }); + }); + }); + } } REGISTER_DISPATCH(sigmoid_backward_stub, &sigmoid_backward_kernel_cuda); diff --git a/test/test_autograd.py b/test/test_autograd.py index 211834f9e82d..37cccdbbfbd6 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4917,8 +4917,8 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, # the tests for these ops which do not have 'complex' in variant should not run for complex # and only run for floating point -# TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition -separate_complex_tests = ['view_as_real', 'real', 'imag', 'asin', 'acos'] # ['log', 'log10', 'log1p', 'log2', 
'reciprocal', 'tan'] +separate_complex_tests = ['view_as_real', 'real', 'imag', 'asin', 'acos', 'div', 'log', + 'log10', 'log1p', 'log2', 'pow', 'tan', 'reciprocal', 'rsqrt'] # NOTE: Some non-holomorphic are separately tested in TestAutogradComplex until gradcheck works properly # for non-holomorphic functions @@ -4930,7 +4930,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', - 'matmul', 'bmm', 'mv', 'ger', 'diagonal', ] + separate_complex_tests + 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle'] + separate_complex_tests # this list corresponds to cases that are not currently implemented skip_cuda_list = ['bmm_complex', 'matmul_4d_4d_complex'] diff --git a/test/test_cuda.py b/test/test_cuda.py index 577cd5caa36a..3fa44ee43b20 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -23,7 +23,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices from torch.testing._internal.common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ - NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, IS_WINDOWS, \ slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY from torch.testing._internal.autocast_test_lists import AutocastTestLists @@ -2151,6 +2151,7 @@ def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): self._run_scaling_case(run, unskipped=3, skipped=1) + @unittest.skipIf(IS_WINDOWS, 'FIXME: fix this test for Windows') def test_grad_scaling_penalty(self): def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): for i, (input, target) in enumerate(data): diff --git a/test/test_jit.py b/test/test_jit.py index c5c4fe52d724..dbe3fff20245 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15631,7 +15631,7 @@ def add_autograd_test( # Disable complex tests # TODO: Add complex support for jit - if 'complex' in variant_name or name in ['view_as_complex', 'complex']: + if 'complex' in variant_name or name in ['view_as_complex', 'complex', 'angle']: return # Skips aliases, which are tested in test_op_aliases.py diff --git a/test/test_ops.py b/test/test_ops.py index 5be450d4d41f..1d85f86113e9 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -89,41 +89,35 @@ def _gradgrad_test_helper(self, device, dtype, op, variant): return self._check_helper(device, dtype, op, variant, 'gradgradcheck') # Tests that gradients are computed correctly - # TODO(@anjali411) enable this for torch.cdouble. - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_grad(self, device, dtype, op): self._grad_test_helper(device, dtype, op, op.get_op()) - # TODO(@anjali411) enable this for torch.cdouble. - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_grad(self, device, dtype, op): self._grad_test_helper(device, dtype, op, op.get_method()) - # TODO(@anjali411) enable this for torch.cdouble. - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_grad(self, device, dtype, op): if not op.test_inplace_grad: self.skipTest("Skipped! 
Inplace gradcheck marked to skip.") self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) - # TODO(@anjali411) enable this for torch.cdouble. # Test that gradients of gradients are computed correctly - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_gradgrad(self, device, dtype, op): self._gradgrad_test_helper(device, dtype, op, op.get_op()) - # TODO(@anjali411) enable this for torch.cdouble. - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_gradgrad(self, device, dtype, op): self._gradgrad_test_helper(device, dtype, op, op.get_method()) - # TODO(@anjali411) enable this for torch.cdouble. - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_gradgrad(self, device, dtype, op): if not op.test_inplace_grad: diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 7fbc3e9fd4b3..fa03db91694c 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -162,7 +162,7 @@ self: grad * self.sgn() - name: acos(Tensor self) -> Tensor - self: grad * -((-self * self + 1).rsqrt()) + self: grad * -((-self * self + 1).rsqrt()).conj() - name: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor self: grad @@ -213,7 +213,7 @@ self: grad - name: angle(Tensor self) -> Tensor - self: grad.to(self.scalar_type()) * (self*Scalar(c10::complex{0.0, 1.0})).conj() / self.abs().pow(2) + self: angle_backward(grad, self) # The four items below are necessary because TensorIterator doesn't work on # Variables (codegen does not unwrap the input Tensor for all() and any() ). @@ -230,19 +230,19 @@ self: not_implemented("all") - name: acosh(Tensor self) -> Tensor - self: grad * (self.pow(2) - 1).rsqrt() + self: grad * (self.pow(2) - 1).rsqrt().conj() - name: acosh_(Tensor(a!) self) -> Tensor(a!) self: not_implemented("inplace version of acosh") - name: asinh(Tensor self) -> Tensor - self: grad * (self.pow(2) + 1).rsqrt() + self: grad * (self.pow(2) + 1).rsqrt().conj() - name: asinh_(Tensor(a!) self) -> Tensor(a!) self: not_implemented("inplace version of asinh") - name: atanh(Tensor self) -> Tensor - self: grad * 1 / (1 - self.pow(2)) + self: grad * 1 / (1 - self.pow(2)).conj() - name: atanh_(Tensor(a!) self) -> Tensor(a!) 
self: not_implemented("inplace version of atanh") @@ -251,10 +251,10 @@ self: as_strided_backward(grad, TensorGeometry(self), size, stride, storage_offset) - name: asin(Tensor self) -> Tensor - self: grad * (-self * self + 1).rsqrt() + self: grad * (-self * self + 1).rsqrt().conj() - name: atan(Tensor self) -> Tensor - self: grad / (self * self + 1) + self: grad / (self * self + 1).conj() - name: atan2(Tensor self, Tensor other) -> Tensor self, other: atan2_backward(grad, self, other, grad_input_mask) @@ -610,16 +610,16 @@ self: grad * polygamma(n + 1, self) - name: log(Tensor self) -> Tensor - self: grad.div(self) + self: grad.div(self.conj()) - name: log10(Tensor self) -> Tensor - self: grad / (self * 2.3025850929940456) + self: grad / (self.conj() * 2.3025850929940456) - name: log1p(Tensor self) -> Tensor self: log1p_backward(grad, self) - name: log2(Tensor self) -> Tensor - self: grad / (self * 0.6931471805599453) + self: grad / (self.conj() * 0.6931471805599453) - name: logaddexp(Tensor self, Tensor other) -> Tensor self: grad / (1 + exp(other - self)) @@ -884,7 +884,7 @@ self: zeros_like(grad) - name: reciprocal(Tensor self) -> Tensor - self: -grad * result * result + self: -grad * (result * result).conj() - name: remainder.Scalar(Tensor self, Scalar other) -> Tensor self: grad @@ -909,7 +909,7 @@ self: zeros_like(grad) - name: rsqrt(Tensor self) -> Tensor - self: -0.5 * grad * result.pow(3) + self: -0.5 * grad * result.pow(3).conj() - name: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) self: grad.clone().scatter_(dim, index, 0) @@ -1046,7 +1046,7 @@ index: non_differentiable - name: tan(Tensor self) -> Tensor - self: grad * (1 + result.pow(2)) + self: grad * (1 + result.pow(2)).conj() - name: tanh(Tensor self) -> Tensor self: tanh_backward(grad, result) @@ -1670,8 +1670,8 @@ output: grad * grad_output * (-2 * output + 1) - name: tanh_backward(Tensor grad_output, Tensor output) -> Tensor - grad_output: tanh_backward(grad, output) - output: -2 * output * grad * grad_output + grad_output: tanh_backward(grad, output.conj()) + output: grad.conj() * (-2 * output.conj() * grad_output) # cudnn - name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index aa65a68dff87..0c41310b457a 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -165,7 +165,10 @@ 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', 'dot', 'vdot', 'cholesky', 'triangular_solve', 'mm', '_unsafe_view', 'mv', 'ger', - 'bmm', 'diagonal' + 'bmm', 'diagonal', 'cholesky', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', + 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh' + 'dot', 'vdot', 'cholesky', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', + 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh' } # Some operators invalidate the grad_accumulator. Let's reset it. 
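For the complex entries above, the updated formulas all appear to follow a single convention: for an elementwise op \(y = f(z)\), the incoming gradient is multiplied by the conjugate of the local derivative,

\[
\texttt{grad\_input} \;=\; \texttt{grad\_output}\cdot\overline{f'(z)}.
\]

For example, \(f(z)=\log z\) has \(f'(z)=1/z\), giving `grad.div(self.conj())`; \(y=1/z\) has \(f'(z)=-y^{2}\), giving `-grad * (result * result).conj()`; and \(y=z^{-1/2}\) has \(f'(z)=-\tfrac{1}{2}y^{3}\), giving `-0.5 * grad * result.pow(3).conj()`.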
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 689d66c6f405..0fb0e44e9450 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -95,6 +95,14 @@ Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result) { return gradient_result; } +Tensor handle_r_to_c(Tensor self, Tensor gradient_result) { + if (!self.is_complex() && gradient_result.is_complex()) { + // R -> C + return at::real(gradient_result); + } + return gradient_result; +} + Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim) { if (keepdim) { return output; @@ -177,16 +185,18 @@ Tensor norm_backward(Tensor grad, const Tensor & self, const optional & } Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent_) { - double exponent = exponent_.toDouble(); + auto exponent = (exponent_.isComplex()) ? exponent_.toComplexDouble() : exponent_.toDouble(); if (exponent == 0.0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else { - return grad * exponent * self.pow(exponent - 1); + auto out = grad * (exponent * self.pow(exponent - 1)).conj(); + return handle_r_to_c(self, out); } } Tensor pow_backward_self(Tensor grad, const Tensor & self, const Tensor & exponent) { - return at::where(exponent == 0.0, at::zeros({}, grad.options()), grad * exponent * self.pow(exponent - 1)); + auto out = at::where(exponent == 0.0, at::zeros({}, grad.options()), grad * (exponent * self.pow(exponent - 1)).conj()); + return handle_r_to_c(self, out); } // Caveats: @@ -198,18 +208,46 @@ Tensor pow_backward_self(Tensor grad, const Tensor & self, const Tensor & expone // d(a^b)/db = 0 for a > 0 and b -> +0. // Currently, tensorflow agrees with us. Tensor pow_backward_exponent(Tensor grad, const Tensor& self, const Tensor& exponent, Tensor result) { - return grad * at::where(at::logical_and(self == 0, exponent >= 0), + Tensor cond; + if (exponent.is_complex()) { + auto is_real_exp = at::logical_and(at::imag(exponent) == 0, at::real(exponent) >= 0); + cond = at::logical_and(self == 0, is_real_exp); + } else { + cond = at::logical_and(self == 0, exponent >= 0); + } + auto out = grad * at::where(cond, at::zeros({}, grad.options()), - result * self.log()); + (result * self.log()).conj()); + return handle_r_to_c(exponent, out); } Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exponent, Tensor result) { - if (base.toDouble() == 0) { - return grad * at::where(exponent >= 0, + auto base_ = base.isComplex() ? 
base.toComplexDouble() : base.toDouble(); + auto grad_lambda = [](auto a, auto b) { return (a * std::log(b)).conj(); }; + if (base_ == 0.0) { + auto cond = [](auto exp) { + if (exp.is_complex()) { + return at::logical_and(at::imag(exp) == 0, at::real(exp) >= 0); + } else { + return exp >=0; + } + }; + auto out = grad * at::where(cond(exponent), at::zeros({}, grad.options()), - result * std::log(base.toDouble())); + grad_lambda(result, base_)); + return handle_r_to_c(exponent, out); + } else { + auto out = grad * grad_lambda(result, base_); + return handle_r_to_c(exponent, out); + } +} + +Tensor angle_backward(Tensor grad, const Tensor& self) { + if (self.is_complex()) { + return at::where(self == 0.0, at::zeros({}, self.options()), + grad * self / self.abs().pow(2) * Scalar(c10::complex{0.0, 1.0})); } else { - return grad * result * std::log(base.toDouble()); + return at::zeros_like(self, at::MemoryFormat::Preserve); } } @@ -226,35 +264,23 @@ Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) { // https://arxiv.org/pdf/1701.00392.pdf Section 4.20 return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result))); } else { - return at::zeros_like(grad, at::MemoryFormat::Preserve); + return at::zeros_like(self, at::MemoryFormat::Preserve); } } Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { - auto result = grad * other.conj(); - if (!at::isComplexType(self_st) && result.is_complex()) { - // R -> C - result = at::real(result); - } - return result; + auto out = grad * other.conj(); + return handle_r_to_c(self_st, out); } Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) { auto result = grad / other.conj(); - if (!at::isComplexType(self_st) && result.is_complex()) { - // R -> C - result = at::real(result); - } - return result; + return handle_r_to_c(self_st, result); } Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) { auto result = -grad * ((self / other) / other).conj(); - if (!other.is_complex() && result.is_complex()) { - // R -> C - result = at::real(result); - } - return result; + return handle_r_to_c(other, result); } Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) { @@ -2647,7 +2673,7 @@ Tensor log1p_backward(const Tensor& grad, const Tensor& self) { "Use a different mathematical operation which preserves sparsity of gradients, ", "or report a bug if you think this is an error."); } - return grad / (self + 1); + return grad / (self + 1).conj(); } Tensor sparse_constructor_values_backward(const Tensor& sparse_grad_out, const Tensor& indices, IntArrayRef values_shape) { diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index ffd65081f678..1f5ba99e83a2 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -44,6 +44,7 @@ at::Tensor pow_backward(at::Tensor grad, const at::Tensor & self, const at::Scal at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at::Tensor & exponent); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Scalar & base, const at::Tensor& exponent, at::Tensor result); +at::Tensor angle_backward(at::Tensor grad, const at::Tensor& self); at::Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st); at::Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType 
self_st); at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other); diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index bf80473dae38..2bd110480b88 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -574,6 +574,15 @@ def method_tests(): (True, [], ['aten::mul', 'aten::reciprocal'])), ('__rdiv__', uniform_scalar(1e-1, requires_grad=True), (3.14,), 'scalar_constant', (True, [], ['aten::mul', 'aten::reciprocal'])), + ('div', (S, S, S), (torch.rand(S, S, S, dtype=torch.cdouble) + 0.1,), 'complex', (True,)), + ('div', (S, S, S), (torch.rand(S, S, dtype=torch.cdouble) + 0.1,), 'complex_broadcast_rhs', (True,)), + ('div', (S, S), (torch.rand(S, S, S, dtype=torch.cdouble) + 0.1,), 'complex_broadcast_lhs', (True,)), + ('div', (S, 1, S), (torch.rand(M, S, dtype=torch.cdouble) + 0.1,), 'complex_broadcast_all', (True,)), + ('div', (), (uniform_scalar(0.1j),), 'complex_scalar', (True,)), + ('div', (S, S, S), (uniform_scalar(0.1j),), 'complex_scalar_broadcast_rhs', (True,)), + ('div', (), (uniform_scalar(0.1j),), 'complex_scalar_broadcast_lhs', (True,)), + ('div', torch.rand(S, S, S, dtype=torch.cdouble) + 1e-1, (3.14j,), 'complex_constant', (True,)), + ('div', uniform_scalar(1e-1j, requires_grad=True), (3.14j,), 'complex_scalar_constant', (True,)), ('pow', torch.rand(S, S, S) + 1e-3, (torch.rand(S, S, S) + 0.1,), '', (True,)), ('pow', torch.rand(S, S, S) + 1e-3, (torch.rand(1,) + 0.1,), 'broadcast_rhs', (True,)), ('pow', torch.rand(1,) + 1e-3, (torch.rand(S, S, S) + 0.1,), 'broadcast_lhs', (True,)), @@ -582,8 +591,11 @@ def method_tests(): ('pow', torch.rand(S, S, S) + 1e-3, (uniform_scalar(0.1),), 'scalar_broadcast_rhs', (True,)), ('pow', uniform_scalar(1e-3, requires_grad=True), (torch.rand(S, S, S) + 0.1,), 'scalar_broadcast_lhs', (True,)), ('pow', torch.rand(S, S, S) + 1e-3, (3.14,), 'constant', (True,)), + ('pow', torch.rand(S, S, S, dtype=torch.cdouble) + 1e-3 * (1 + 1j), (3.14,), 'complex_constant', (True,)), ('__rpow__', torch.rand(S, S, S) + 1e-3, (3.14,), 'constant', (True, 'aten::pow')), ('pow', uniform_scalar(1e-3, requires_grad=True), (3.14,), 'scalar_constant', (True,)), + ('pow', uniform_scalar(1e-3 * (1 + 1j), requires_grad=True), (3.14,), 'complex_scalar_constant', (True,)), + ('pow', uniform_scalar(1e-3 * (1 + 1j), requires_grad=True), (3.14j,), 'complex_imaginary_exponent', (True,)), ('__rpow__', uniform_scalar(1e-3, requires_grad=True), (3.14,), 'scalar_constant', (True, 'aten::pow')), ('transpose', (1, 2, 3), (1, 2), 'dim', (False,), [0, 1]), ('transpose', (), (0, 0), 'scalar', (False,)), @@ -655,13 +667,12 @@ def method_tests(): ('log1p', uniform_scalar(requires_grad=True), NO_ARGS, 'scalar', (True,)), ('log2', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), ('log2', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), - # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition. 
- # ('log', torch.randn(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), - # ('log', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), - # ('log10', torch.randn(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), - # ('log10', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), - # ('log2', torch.randn(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), - # ('log2', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), + ('log', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), + ('log', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), + ('log10', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), + ('log10', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), + ('log2', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), + ('log2', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), ('tanh', (S, S, S), NO_ARGS, '', (True,)), ('tanh', (), NO_ARGS, 'scalar', (True,)), ('sigmoid', (S, S, S), NO_ARGS, '', (True,)), @@ -682,6 +693,8 @@ def method_tests(): ('complex', (S, S, S), ((S, S, S),), ''), ('abs', (S, S, S), NO_ARGS, '', (True,)), ('abs', (), NO_ARGS, 'scalar', (True,)), + ('angle', (S, S, S), NO_ARGS, '', (True,)), + ('angle', (), NO_ARGS, 'scalar', (True,)), ('clamp', (S, S, S), (0, 1), '', (True,)), ('clamp', (S, S, S), (None, 0.5), 'min', (True,)), ('clamp', (S, S, S), (0.5, None), 'max', (True,)), @@ -696,8 +709,7 @@ def method_tests(): ('cos', (S, S, S), NO_ARGS, '', (True,)), ('cos', (), NO_ARGS, 'scalar', (True,)), ('tan', torch.randn(S, S, S).clamp(-1, 1), NO_ARGS, '', (True,)), - # TODO(@anjali411): add the commented test back after updating the formula based on tensorflow definition. - # ('tan', (S, S, S), NO_ARGS, 'complex', (True,)), + ('tan', (S, S, S), NO_ARGS, 'complex', (True,)), ('asin', torch.randn(S, S, S).clamp(-0.9, 0.9), NO_ARGS, '', (True,)), ('acos', torch.randn(S, S, S).clamp(-0.9, 0.9), NO_ARGS, '', (True,)), ('atan', (S, S, S), NO_ARGS, '', (True,)), @@ -709,9 +721,8 @@ def method_tests(): ('atan2', (S, 1, S), ((S, S),), 'broadcast_all'), ('reciprocal', torch.rand(S, S, S) + 0.1, NO_ARGS, '', (True,)), ('reciprocal', uniform_scalar(0.1, requires_grad=True), NO_ARGS, 'scalar', (True,)), - # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition. 
- # ('reciprocal', torch.randn(S, S, S, dtype=torch.cdouble) + 0.1, NO_ARGS, 'complex', (True,)), - # ('reciprocal', uniform_scalar(0.1j), NO_ARGS, 'complex_scalar', (True,)), + ('reciprocal', torch.randn(S, S, S, dtype=torch.cdouble) + 0.1, NO_ARGS, 'complex', (True,)), + ('reciprocal', uniform_scalar(0.1j), NO_ARGS, 'complex_scalar', (True,)), ('round', (S, S, S), NO_ARGS, '', (True,)), ('round', (), NO_ARGS, 'scalar', (True,)), ('sign', (S, S, S), NO_ARGS), @@ -728,6 +739,8 @@ def method_tests(): ('deg2rad', (S, S, S), NO_ARGS), ('rsqrt', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), ('rsqrt', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), + ('rsqrt', torch.rand(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), + ('rsqrt', uniform_scalar(1e-2 * (1 + 1j), requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), ('frac', (S, S, S), NO_ARGS, '', (True,)), ('frac', (), NO_ARGS, 'scalar', (True,)), ('fmod', (S, S, S), (1.5,), '', (True,)), From b61671ccd2f8664c306ccd661e030d8762cca693 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Mon, 26 Oct 2020 02:56:12 -0700 Subject: [PATCH 007/169] Enable dtype arg for torch.linalg.norm with order 'fro' and 'nuc' (#46637) Summary: Fixes https://github.com/pytorch/pytorch/issues/46255 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46637 Reviewed By: gchanan Differential Revision: D24459097 Pulled By: mruberry fbshipit-source-id: 7f207a23de902c27f8313ee80f452687a97e8f6f --- aten/src/ATen/native/LinearAlgebra.cpp | 12 ++++++------ test/test_linalg.py | 9 +-------- torch/linalg/__init__.py | 3 +-- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index e3f50db11b6c..0dd727fb3197 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1391,9 +1391,8 @@ static std::vector make_dim_list(int64_t ndim) { } // Checks for valid arguments to linalg_norm when type(ord) == str -static void check_str_ord_valid(const std::string& str_ord, optional opt_dim, int64_t ndim, optional opt_dtype) { +static void check_str_ord_valid(const std::string& str_ord, optional opt_dim, int64_t ndim) { TORCH_CHECK((str_ord == "nuc") || (str_ord == "fro"), "Invalid norm order: ", str_ord); - TORCH_CHECK(!opt_dtype.has_value(), "ord=\'", str_ord, "\' does not yet support the dtype argument"); bool dims_valid = (ndim == 2 && !opt_dim.has_value()) || (opt_dim.has_value() && opt_dim.value().size() == 2); TORCH_CHECK(dims_valid, "order \"", str_ord, "\" can only be used if either len(dim) == 2 or (self.dim() == 2 and dim is None)"); @@ -1553,14 +1552,15 @@ static Tensor& linalg_norm_out_impl(Tensor& result, const Tensor& self, optional if (opt_str_ord.has_value()) { // 'ord' is string auto str_ord = opt_str_ord.value(); - check_str_ord_valid(str_ord, opt_dim, ndim, opt_dtype); + check_str_ord_valid(str_ord, opt_dim, ndim); + Tensor self_ = opt_dtype.has_value() ? 
self.to(opt_dtype.value()) : self; if (str_ord == "fro") { - at::frobenius_norm_out(result, self, opt_dim.value_or(IntArrayRef({0, 1})), keepdim); + at::frobenius_norm_out(result, self_, opt_dim.value_or(IntArrayRef({0, 1})), keepdim); } else if (str_ord == "nuc") { if (opt_dim.has_value()) { - at::nuclear_norm_out(result, self, opt_dim.value(), keepdim); + at::nuclear_norm_out(result, self_, opt_dim.value(), keepdim); } else { - at::nuclear_norm_out(result, self, keepdim); + at::nuclear_norm_out(result, self_, keepdim); } } } else { diff --git a/test/test_linalg.py b/test/test_linalg.py index 0220dff476c1..127c674e5b05 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -202,7 +202,7 @@ def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype) self.assertEqual(result_converted, result_out_converted, msg=msg) ord_vector = [0, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] - ord_matrix = [1, -1, 2, -2, inf, -inf, None] + ord_matrix = ['fro', 'nuc', 1, -1, 2, -2, inf, -inf, None] S = 10 test_cases = [ ((S, ), ord_vector), @@ -230,13 +230,6 @@ def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype) with self.assertRaisesRegex(RuntimeError, r'provided dtype must match dtype of result'): torch.linalg.norm(input, ord=ord, keepdim=keepdim, dtype=dtype, out=result) - # TODO: Once dtype arg is supported in nuclear and frobenius norms, remove the following test - # and add 'nuc' and 'fro' to ord_matrix above - for ord in ['nuc', 'fro']: - input = torch.randn(10, 10, device=device) - with self.assertRaisesRegex(RuntimeError, f"ord=\'{ord}\' does not yet support the dtype argument"): - torch.linalg.norm(input, ord, dtype=torch.float) - # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their vector norm results match @unittest.skipIf(not TEST_NUMPY, "NumPy not found") diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index 5e2b59c45c80..074bb47faaac 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -68,8 +68,7 @@ :attr:`dtype` before performing the operation, and the returned tensor's type will be :attr:`dtype`. If this argument is used in conjunction with the :attr:`out` argument, the output tensor's type must match this argument or a - RuntimeError will be raised. This argument is not currently supported for - :attr:`ord='nuc'` or :attr:`ord='fro'`. Default: ``None`` + RuntimeError will be raised. 
Default: ``None`` Examples:: From 83d358da7c9957db6cc6456f69f43a4c2a4d54e4 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 26 Oct 2020 08:32:09 -0700 Subject: [PATCH 008/169] Fix LAPACK functionality detection from static OpenBLAS (#46710) Summary: BLAS `sgemm_` only depends on pthreads, but LAPACK `cheev_` also depends on libm Pull Request resolved: https://github.com/pytorch/pytorch/pull/46710 Reviewed By: walterddr Differential Revision: D24476082 Pulled By: malfet fbshipit-source-id: e0b91116f18bbcdabb1f99c2ec9d98283df4393f --- cmake/Dependencies.cmake | 1 + cmake/Modules/FindBLAS.cmake | 2 +- cmake/Modules/FindLAPACK.cmake | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a52dbc050cbf..0ce2d8b44a32 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1635,6 +1635,7 @@ if(NOT INTERN_BUILD_MOBILE) find_package(LAPACK) if(LAPACK_FOUND) set(USE_LAPACK 1) + list(APPEND Caffe2_PRIVATE_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) endif() if(NOT USE_CUDA) diff --git a/cmake/Modules/FindBLAS.cmake b/cmake/Modules/FindBLAS.cmake index e93e98a6095d..38e826d1f5f4 100644 --- a/cmake/Modules/FindBLAS.cmake +++ b/cmake/Modules/FindBLAS.cmake @@ -153,7 +153,7 @@ if((NOT BLAS_LIBRARIES) BLAS sgemm "" - "openblas;pthread") + "openblas;pthread;m") if(BLAS_LIBRARIES) set(BLAS_INFO "open") endif(BLAS_LIBRARIES) diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake index c057f207132f..b0e607d90587 100644 --- a/cmake/Modules/FindLAPACK.cmake +++ b/cmake/Modules/FindLAPACK.cmake @@ -123,6 +123,30 @@ if(BLAS_FOUND) IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "open")) SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) check_function_exists("cheev_" OPEN_LAPACK_WORKS) + if(OPEN_LAPACK_WORKS) + check_function_exists("cgesdd_" LAPACK_CGESDD_WORKS) + if(NOT LAPACK_CGESDD_WORKS) + find_library(GFORTRAN_LIBRARY + NAMES libgfortran.a gfortran + PATHS /usr/lib/gcc/aarch64-linux-gnu/9/ + /usr/lib/gcc/x86_64-redhat-linux/9/ + /usr/lib/gcc/aarch64-linux-gnu/8/ + /usr/lib/gcc/x86_64-redhat-linux/8/ + /usr/lib/gcc/aarch64-linux-gnu/7/ + /usr/lib/gcc/x86_64-redhat-linux/7/ + ) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${GFORTRAN_LIBRARY}") + unset(LAPACK_CGESDD_WORKS CACHE) + check_function_exists("cgesdd_" LAPACK_CGESDD_WORKS) + if(LAPACK_CGESDD_WORKS) + list(APPEND LAPACK_LIBRARIES "${GFORTRAN_LIBRARY}") + else() + message(WARNING "OpenBlas has been compiled with Lapack support, but cgesdd can not be used") + set(OPEN_LAPACK_WORKS NO) + endif() + endif() + endif() + set(CMAKE_REQUIRED_LIBRARIES) if(OPEN_LAPACK_WORKS) SET(LAPACK_INFO "open") From 3e606da0aff41711822ad22708f6b1ddcd3f6e78 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Mon, 26 Oct 2020 10:08:57 -0700 Subject: [PATCH 009/169] Upgrading lcov install to install v1.15 to be compatible with GCC9 (#46847) Summary: According to [this issue](https://github.com/linux-test-project/lcov/issues/58), LCOV 1.14 and below are not compatible with GCC9 when gathering coverage. Instead of installing `lcov` with `apt-get`, which installs version 1.13, this PR would install v1.15 from source onto the ubuntu Docker images. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46847 Reviewed By: seemethere Differential Revision: D24540444 Pulled By: janeyx99 fbshipit-source-id: 0ac2a37241d94cdd8fea2fded7984c495a64cedc --- .circleci/docker/common/install_lcov.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.circleci/docker/common/install_lcov.sh b/.circleci/docker/common/install_lcov.sh index eea6e1a5bc21..b4364698318a 100644 --- a/.circleci/docker/common/install_lcov.sh +++ b/.circleci/docker/common/install_lcov.sh @@ -2,5 +2,7 @@ set -ex -sudo apt-get -qq update -sudo apt-get -qq install lcov +git clone --branch v1.15 https://github.com/linux-test-project/lcov.git +pushd lcov +sudo make install # will be installed in /usr/local/bin/lcov +popd From 5a2b537b5485f3b296b1312d24339e5f43b3afcb Mon Sep 17 00:00:00 2001 From: Jinwoo Park Date: Mon, 26 Oct 2020 10:43:23 -0700 Subject: [PATCH 010/169] Add error messages and workaround for RET failure of containers with a torch class type (#46543) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46543 Add error messages and workaround for RET failure of containers with a torch class type. - Error case condition 1) ins.op == RET 2) input_type == TypeKind::ListType or TypeKind::DictType 3) Any(input_type's element type) == TypeKind::ClassType ghstack-source-id: 114618426 Test Plan: buck test mode/dev caffe2/test:mobile -- 'test' Summary Pass: 13 ListingSuccess: 1 Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/7318349417617713 Reviewed By: iseeyuan Differential Revision: D24388483 fbshipit-source-id: 7d30f6684a999054d0163e691422797cb818bb6a --- test/mobile/test_lite_script_module.py | 46 +++++++++++++++++++ .../csrc/jit/serialization/export_module.cpp | 12 +++++ 2 files changed, 58 insertions(+) diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py index 16558953611b..ca67875b107f 100644 --- a/test/mobile/test_lite_script_module.py +++ b/test/mobile/test_lite_script_module.py @@ -170,5 +170,51 @@ def forward(self): r"a pytorch class \(class Foo\(torch\.nn\.Module\)\)\'s attributes."): script_module._save_to_buffer_for_lite_interpreter() + def test_unsupported_return_list_with_module_class(self): + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + + class MyTestModuleForListWithModuleClass(torch.nn.Module): + def __init__(self): + super(MyTestModuleForListWithModuleClass, self).__init__() + self.foo = Foo() + + def forward(self): + my_list: List[Foo] = [self.foo] + return my_list + + script_module = torch.jit.script(MyTestModuleForListWithModuleClass()) + with self.assertRaisesRegex(RuntimeError, + r"^Returining a list or dictionary with pytorch class type " + r"is not supported in mobile module " + r"\(List\[Foo\] or Dict\[int\, Foo\] for class Foo\(torch\.nn\.Module\)\)\. 
" + r"Workaround\: instead of using pytorch class as their element type\, " + r"use a combination of list\, dictionary\, and single types\.$"): + script_module._save_to_buffer_for_lite_interpreter() + + def test_unsupported_return_dict_with_module_class(self): + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + + class MyTestModuleForDictWithModuleClass(torch.nn.Module): + def __init__(self): + super(MyTestModuleForDictWithModuleClass, self).__init__() + self.foo = Foo() + + def forward(self): + my_dict: Dict[int, Foo] = {1: self.foo} + return my_dict + + script_module = torch.jit.script(MyTestModuleForDictWithModuleClass()) + with self.assertRaisesRegex(RuntimeError, + r"^Returining a list or dictionary with pytorch class type " + r"is not supported in mobile module " + r"\(List\[Foo\] or Dict\[int\, Foo\] for class Foo\(torch\.nn\.Module\)\)\. " + r"Workaround\: instead of using pytorch class as their element type\, " + r"use a combination of list\, dictionary\, and single types\.$"): + script_module._save_to_buffer_for_lite_interpreter() + if __name__ == '__main__': unittest.main() diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index 548c816bac6a..8eea619cd535 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -131,6 +131,18 @@ std::pair> getFunctionTuple( "use a dictionary type's key-value pair itmes or ", "a pytorch class (class Foo(torch.nn.Module))'s attributes.'"); } + } else if ( + input_type->kind() == TypeKind::ListType || + input_type->kind() == TypeKind::DictType) { + for (const TypePtr& element_type : input_type->containedTypes()) { + TORCH_CHECK( + element_type->kind() != TypeKind::ClassType, + "Returining a list or dictionary with pytorch class type ", + "is not supported in mobile module " + "(List[Foo] or Dict[int, Foo] for class Foo(torch.nn.Module)). 
" + "Workaround: instead of using pytorch class as their element type, ", + "use a combination of list, dictionary, and single types."); + } } } } else { From b5662ba0f0cb6d99805a44dd5b6bcda128468c83 Mon Sep 17 00:00:00 2001 From: Huan Gui Date: Mon, 26 Oct 2020 11:05:26 -0700 Subject: [PATCH 011/169] [uhm][0/n] add cuda Mod Op (#46732) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46732 as titled Test Plan: unittest buck test mode/dev-nosan //caffe2/caffe2/python/operator_test:mod_op_test Reviewed By: xianjiec Differential Revision: D24368100 fbshipit-source-id: 1232d22a67ac268986043911d548fa9d657470ec --- caffe2/operators/mod_op.cc | 3 -- caffe2/operators/mod_op.cu | 63 ++++++++++++++++++++++ caffe2/python/operator_test/mod_op_test.py | 13 ++--- 3 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 caffe2/operators/mod_op.cu diff --git a/caffe2/operators/mod_op.cc b/caffe2/operators/mod_op.cc index 48c1eea5a415..8faaac51572f 100644 --- a/caffe2/operators/mod_op.cc +++ b/caffe2/operators/mod_op.cc @@ -25,8 +25,6 @@ bool ModOp::DoRunWithType() { return true; } -namespace { - REGISTER_CPU_OPERATOR(Mod, ModOp); OPERATOR_SCHEMA(Mod) .NumInputs(1) @@ -95,5 +93,4 @@ X after running op: .Output(0, "Y", "*(type: Tensor``)* Output tensor of data with modulo operation applied."); SHOULD_NOT_DO_GRADIENT(ModOp); -} // namespace } // namespace caffe2 diff --git a/caffe2/operators/mod_op.cu b/caffe2/operators/mod_op.cu new file mode 100644 index 000000000000..90043a29389f --- /dev/null +++ b/caffe2/operators/mod_op.cu @@ -0,0 +1,63 @@ +#include "caffe2/operators/mod_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +__global__ void ModOpSimpleKernel(const int N, const int64_t divisor_, + const T* data_ptr, T* output_ptr) { + CUDA_1D_KERNEL_LOOP(i, N) { + output_ptr[i] = data_ptr[i] % divisor_; + } +} + + +template +__global__ void ModOpKernel(const int N, const int64_t divisor_, + const T* data_ptr, T* output_ptr) { + CUDA_1D_KERNEL_LOOP(i, N) { + output_ptr[i] = data_ptr[i] % divisor_; + if (output_ptr[i] && ((output_ptr[i] > 0) != (divisor_ > 0))) { + output_ptr[i] += divisor_; + } + } +} + +} // namespace + +template <> +template +bool ModOp::DoRunWithType() { + auto& data = Input(DATA); + auto N = data.numel(); + const auto* data_ptr = data.template data(); + + auto* output = Output(0, data.sizes(), at::dtype()); + auto* output_ptr = output->template mutable_data(); + + if (sign_follow_divisor_) { + ModOpKernel<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + N, divisor_, data_ptr, output_ptr); + } else { + ModOpSimpleKernel<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + N, divisor_, data_ptr, output_ptr); + } + + return true; + +} + +REGISTER_CUDA_OPERATOR(Mod, ModOp); + +} // namespace caffe2 diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py index 914bffd2067c..03ff766c11e4 100644 --- a/caffe2/python/operator_test/mod_op_test.py +++ b/caffe2/python/operator_test/mod_op_test.py @@ -1,12 +1,7 @@ - - - - - import numpy from caffe2.python import core -from hypothesis import given +from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st @@ -16,7 +11,8 @@ @st.composite def _data(draw): return draw( - hu.tensor(dtype=np.int64, + hu.tensor( + dtype=np.int64, elements=st.integers( min_value=np.iinfo(np.int64).min, 
max_value=np.iinfo(np.int64).max ) @@ -25,6 +21,7 @@ def _data(draw): class TestMod(hu.HypothesisTestCase): + @settings(deadline=None) @given( data=_data(), divisor=st.integers( @@ -32,7 +29,7 @@ class TestMod(hu.HypothesisTestCase): ), inplace=st.booleans(), sign_follow_divisor=st.booleans(), - **hu.gcs_cpu_only + **hu.gcs ) def test_mod( self, data, divisor, inplace, sign_follow_divisor, gc, dc From e927b62e739cb433bbf19ce56d78e6b4c9da8b70 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 26 Oct 2020 11:59:13 -0700 Subject: [PATCH 012/169] [quant][graphmode][fx] Support sigmoid/hardsigmoid/tanh in qat (#46738) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46738 Test Plan: Imported from OSS Reviewed By: raghuramank100 Differential Revision: D24486972 fbshipit-source-id: c9f139bfdd54973da1a93a45e32937595dbe67fc --- test/quantization/test_quantize.py | 33 ++++- test/quantization/test_quantize_fx.py | 99 ++++++++++++--- test/test_quantization.py | 3 +- torch/quantization/fake_quantize.py | 22 +++- torch/quantization/fx/pattern_utils.py | 18 ++- .../quantization/fx/quantization_patterns.py | 34 ++++-- torch/quantization/fx/quantize.py | 113 ++++++++++-------- torch/quantization/quantization_mappings.py | 23 ++++ torch/quantization/quantize.py | 45 +++++-- 9 files changed, 292 insertions(+), 98 deletions(-) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 480ca18128c5..1ef6e9b89c0c 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -25,7 +25,8 @@ float_qparams_dynamic_qconfig, PerChannelMinMaxObserver, QConfigDynamic, - default_dynamic_quant_observer + default_dynamic_quant_observer, + FixedQParamsFakeQuantize, ) from torch.testing._internal.common_quantization import ( @@ -1247,6 +1248,36 @@ def forward(self, x): def test_leaky_relu(self): self._test_activation_op_impl(nn.LeakyReLU, nnq.LeakyReLU, {'negative_slope': 0.1, 'inplace': False}) + +class TestEagerModeQATOps(QuantizationTestCase): + def test_fixed_qparam_ops(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + self.hardsigmoid = torch.nn.Hardsigmoid() + self.tanh = torch.nn.Tanh() + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.sigmoid(x) + x = self.hardsigmoid(x) + x = self.tanh(x) + x = self.dequant(x) + return x + + m = M().train() + m.qconfig = default_qat_qconfig + m = prepare_qat(m) + for attr in ['sigmoid', 'hardsigmoid', 'tanh']: + self.assertEqual(type(getattr(m, attr).activation_post_process), FixedQParamsFakeQuantize) + m = convert(m) + # make sure activation post process is removed + for attr in ['sigmoid', 'hardsigmoid', 'tanh']: + self.assertFalse(hasattr(getattr(m, attr), 'activation_post_process')) + class TestFunctionalModule(QuantizationTestCase): # Histogram Observers are slow, so have no-deadline to ensure test doesn't time out @given(train_mode=st.booleans()) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 4e352a80daa3..4113b754a3e8 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -23,6 +23,7 @@ convert, PerChannelMinMaxObserver, QConfigDynamic, + FixedQParamsFakeQuantize, ) # test utils @@ -1410,7 +1411,7 @@ def test_general_value_ops(self): """ class M(torch.nn.Module): def __init__(self): - super(M, self).__init__() + super().__init__() self.conv = torch.nn.Conv2d(3, 3, 3) 
self.avg_pool1d = torch.nn.AvgPool1d(3) self.avg_pool2d = torch.nn.AvgPool2d(3) @@ -1418,9 +1419,6 @@ def __init__(self): self.adaptive_avg_pool1d = torch.nn.AdaptiveAvgPool1d((1)) self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d((1, 1)) self.adaptive_avg_pool3d = torch.nn.AdaptiveAvgPool3d((1, 1, 1)) - self.hardsigmoid = torch.nn.Hardsigmoid() - self.sigmoid = torch.nn.Sigmoid() - self.tanh = torch.nn.Tanh() def forward(self, x): x = self.conv(x) @@ -1442,21 +1440,6 @@ def forward(self, x): x = x.mean([2, 3], True) x = F.interpolate(x, 4, mode='nearest') x = F.interpolate(x, 4, mode='linear') - x = self.hardsigmoid(x) - x = F.hardsigmoid(x) - x = F.hardsigmoid(x, inplace=True) - x = x.hardsigmoid() - x.hardsigmoid_() - x = self.sigmoid(x) - x = torch.sigmoid(x) - # F.sigmoid is deprecated - x = x.sigmoid() - x.sigmoid_() - x = self.tanh(x) - # F.tanh is deprecated - x = torch.tanh(x) - x = x.tanh() - x.tanh_() x = self.conv(x) return x @@ -1488,6 +1471,84 @@ def forward(self, x): expected_node_occurrence=count_check, expected_node_list=order_check) + @skipIfNoFBGEMM + def test_fixed_qparams_ops(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.sigmoid = torch.nn.Sigmoid() + self.hardsigmoid = torch.nn.Hardsigmoid() + self.tanh = torch.nn.Tanh() + + def forward(self, x): + x = self.conv(x) + # F.sigmoid is deprecated + x = self.sigmoid(x) + x = torch.sigmoid(x) + x = x.sigmoid() + x.sigmoid_() + x = self.hardsigmoid(x) + x = F.hardsigmoid(x) + x = F.hardsigmoid(x, inplace=True) + x = x.hardsigmoid() + x.hardsigmoid_() + x = self.tanh(x) + # F.tanh is deprecated + x = torch.tanh(x) + x = x.tanh() + x.tanh_() + x = self.conv(x) + return x + + for eval_mode in [True, False]: + # This model is not executable since we just put all ops + # in the same forward + m = M() + if eval_mode: + m.eval() + qconfig = default_qconfig + prepare = prepare_fx + fq_count = 0 + else: + m.train() + qconfig = default_qat_qconfig + prepare = prepare_qat_fx + fq_count = 13 + + # nothing to fuse so skipping the fuse step + qconfig_dict = {'': qconfig} + prepared = prepare(m, qconfig_dict) + # check the correct number of activation_post_process is inserted + count_check = { + ns.call_module(FixedQParamsFakeQuantize) : fq_count, + } + self.checkGraphModuleNodes( + prepared, + expected_node_occurrence=count_check) + # not runnable + quantized = convert_fx(prepared) + + # This checks that the dequantize from the output of first conv + # is being propagated to the end, so that we don't insert extra + # observers + # check exact counts of quantize and dequantize + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_method('dequantize') : 1 + } + order_check = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nn.Sigmoid), + ns.call_module(nnq.Conv2d), + ns.call_method('dequantize'), + ] + self.checkGraphModuleNodes( + quantized, + expected_node_occurrence=count_check, + expected_node_list=order_check) + def test_float_functional(self): class TorchAdd(nn.Module): """Wrapper around torch.add so that all ops can be found at build""" diff --git a/test/test_quantization.py b/test/test_quantization.py index 8a6f05cb19de..26d3ee020983 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -44,6 +44,7 @@ from quantization.test_quantize import TestPostTrainingDynamic # noqa: F401 from quantization.test_quantize import TestQuantizationAwareTraining # noqa: F401 from 
quantization.test_quantize import TestEagerModeOps # noqa: F401 +from quantization.test_quantize import TestEagerModeQATOps # noqa: F401 # TODO: merge with other tests in test_quantize.py? from quantization.test_quantize import TestFunctionalModule # noqa: F401 @@ -64,7 +65,7 @@ from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 -# Tooling: numric_suite +# Tooling: numeric_suite from quantization.test_numeric_suite import TestEagerModeNumericSuite # noqa: F401 # Backward Compatibility diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index dd2333883325..cacaad6bea0b 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -159,12 +159,12 @@ def forward(self, X): @torch.jit.export def extra_repr(self): - return 'fake_quant_enabled={}, observer_enabled={},\ - quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, \ - scale={}, zero_point={}'.format( - self.fake_quant_enabled, self.observer_enabled, - self.quant_min, self.quant_max, - self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) + return 'fake_quant_enabled={}, observer_enabled={}, ' \ + 'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \ + 'scale={}, zero_point={}'.format( + self.fake_quant_enabled, self.observer_enabled, + self.quant_min, self.quant_max, + self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) def _save_to_state_dict(self, destination, prefix, keep_vars): # We cannot currently register scalar values as buffers, so need to manually @@ -226,9 +226,19 @@ def forward(self, X): self.quant_max) return X + @torch.jit.export def calculate_qparams(self): return self.scale, self.zero_point + @torch.jit.export + def extra_repr(self): + return 'fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, ' \ + 'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format( + self.fake_quant_enabled, self.observer_enabled, + self.scale, self.zero_point, self.dtype, + self.quant_min, self.quant_max, self.qscheme) + + default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True) default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127, diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py index efc1cd492d1b..ccbd7cc2a2c4 100644 --- a/torch/quantization/fx/pattern_utils.py +++ b/torch/quantization/fx/pattern_utils.py @@ -14,10 +14,17 @@ def get_default_fusion_patterns(): return DEFAULT_FUSION_PATTERNS DEFAULT_QUANTIZATION_PATTERNS = OrderedDict() +# a map from pattern to activation_post_process(observer/fake_quant) consstructor for output activation +# e.g. 
pattern: torch.sigmoid, +# output_activation_post_process: default_affine_fixed_qparam_fake_quant +DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP = dict() + # Register pattern for both static quantization and qat -def register_quant_pattern(pattern): +def register_quant_pattern(pattern, output_activation_post_process=None): def insert(fn): DEFAULT_QUANTIZATION_PATTERNS[pattern] = fn + if output_activation_post_process is not None: + DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP[pattern] = output_activation_post_process return fn return insert @@ -25,6 +32,12 @@ def insert(fn): def get_default_quant_patterns(): return DEFAULT_QUANTIZATION_PATTERNS +# a map from pattern to output activation post process constructor +# e.g. torch.sigmoid -> default_affine_fixed_qparam_fake_quant +def get_default_output_activation_post_process_map(): + return DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP + + # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) # class ConvBNReLUFusion(): @@ -62,6 +75,9 @@ def is_match(modules, node, pattern, max_uses=sys.maxsize): elif node.target is getattr: if node.args[1] != pattern[1]: return False + elif isinstance(self_match, str): + if node.op != 'call_method' or node.target != self_match: + return False elif node.target != self_match: return False diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 12787e4a87db..2898a07a17c1 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -4,6 +4,10 @@ ) import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd +from torch.quantization import ( + default_affine_fixed_qparams_fake_quant, + default_symmetric_fixed_qparams_fake_quant, +) from ..quantization_mappings import ( get_static_quant_module_class, @@ -487,6 +491,22 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ return quantizer.quantized_graph.create_node( 'call_function', quantized_op, args, kwargs) +@register_quant_pattern(torch.nn.Hardsigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.functional.hardsigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('hardsigmoid', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('hardsigmoid_', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.Sigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.sigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('sigmoid', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('sigmoid_', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.Tanh, default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern(torch.tanh, default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern('tanh', default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern('tanh_', default_symmetric_fixed_qparams_fake_quant) +class FixedQParamsOpQuantizeHandler(QuantizeHandler): + def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + # these ops have quantized equivalents that do not need any extra information @register_quant_pattern(torch.nn.AdaptiveAvgPool1d) @register_quant_pattern(torch.nn.AdaptiveAvgPool2d) @@ -495,20 +515,16 @@ def convert(self, quantizer, node, 
load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.nn.AvgPool2d) @register_quant_pattern(torch.nn.AvgPool3d) @register_quant_pattern(torch.nn.Dropout) -@register_quant_pattern(torch.nn.Hardsigmoid) @register_quant_pattern(torch.nn.Hardtanh) @register_quant_pattern(torch.nn.MaxPool1d) @register_quant_pattern(torch.nn.MaxPool2d) @register_quant_pattern(torch.nn.MaxPool3d) @register_quant_pattern(torch.nn.ReLU) @register_quant_pattern(torch.nn.ReLU6) -@register_quant_pattern(torch.nn.Sigmoid) -@register_quant_pattern(torch.nn.Tanh) @register_quant_pattern(torch.adaptive_avg_pool1d) @register_quant_pattern(torch.nn.functional.adaptive_avg_pool2d) @register_quant_pattern(torch.nn.functional.adaptive_avg_pool3d) @register_quant_pattern(torch.nn.functional.dropout) -@register_quant_pattern(torch.nn.functional.hardsigmoid) @register_quant_pattern(torch.nn.functional.hardtanh) @register_quant_pattern(torch.nn.functional.hardtanh_) @register_quant_pattern(torch.nn.functional.interpolate) @@ -528,11 +544,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.mean) @register_quant_pattern(torch.min) @register_quant_pattern(torch.repeat_interleave) -@register_quant_pattern(torch.sigmoid) @register_quant_pattern(torch.sort) @register_quant_pattern(torch.squeeze) @register_quant_pattern(torch.stack) -@register_quant_pattern(torch.tanh) @register_quant_pattern(torch.unsqueeze) @register_quant_pattern(operator.getitem) @register_quant_pattern(operator.floordiv) @@ -541,8 +555,6 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('contiguous') @register_quant_pattern('detach') @register_quant_pattern('detach_') -@register_quant_pattern('hardsigmoid') -@register_quant_pattern('hardsigmoid_') @register_quant_pattern('mean') @register_quant_pattern('numel') @register_quant_pattern('permute') @@ -553,13 +565,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('reshape') @register_quant_pattern('resize_') @register_quant_pattern('shape') -@register_quant_pattern('sigmoid') -@register_quant_pattern('sigmoid_') @register_quant_pattern('size') @register_quant_pattern('squeeze') @register_quant_pattern('squeeze_') -@register_quant_pattern('tanh') -@register_quant_pattern('tanh_') @register_quant_pattern('transpose') @register_quant_pattern('unsqueeze') @register_quant_pattern('unsqueeze_') @@ -570,7 +578,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # Default quantization handler, used for quantization of input and output # of quantizable objects (e.g. 
modules and functionals) -class DefaultQuant(QuantizeHandler): +class DefaultQuantizeHandler(QuantizeHandler): def convert(self, quantizer, node): assert self.all_nodes root_module = quantizer.modules[''] diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 3eebef4ff10a..afa87a83badf 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -19,11 +19,15 @@ get_default_qat_module_mappings, ) -from ..quantize import _remove_qconfig +from ..quantize import ( + _remove_qconfig, + is_activation_post_process +) from .pattern_utils import ( is_match, get_default_quant_patterns, + get_default_output_activation_post_process_map, ) from .standalone_module import ( @@ -131,11 +135,6 @@ def assert_and_get_unique_device(module): device = next(iter(devices)) if len(devices) > 0 else None return device -def is_activation_post_process(module): - return (isinstance(module, torch.quantization.ObserverBase) or - isinstance(module, torch.quantization.FakeQuantize) or - isinstance(module, torch.quantization.FakeQuantizeBase)) - def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) @@ -353,7 +352,7 @@ def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_ # find _inputs_ to matched nodes that are not quantized, these # have to be quantized, which requires measuring stats, - # initialize an DefaultQuant object for each + # initialize an DefaultQuantizeHandler object for each quants = self._find_quants(model.graph, matches) self.activation_post_process_map = dict() @@ -383,7 +382,7 @@ def load_arg(a): continue prefix = node.name + '_activation_post_process_' - root_node, _, obj, qconfig = matches.get(node.name, (None, None, None, None)) + root_node, _, pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) if root_node is None: env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: @@ -430,9 +429,19 @@ def insert_observer(node, observer, device): if not activation_is_statically_quantized(qconfig): continue - # inserting observers for output of observed module, or mark the output - # as observed - if isinstance(obj, CopyNode): + if isinstance(obj, FixedQParamsOpQuantizeHandler) and model.training: + # we only insert fake quantize module in qat + activation_post_process_ctr = \ + get_default_output_activation_post_process_map().get(pattern, None) + assert activation_post_process_ctr is not None, \ + 'activation_post_process constructor not provided for ' + \ + 'pattern:' + str(pattern) + device = assert_and_get_unique_device(model) + insert_observer(node, activation_post_process_ctr(), device) + elif (isinstance(obj, FixedQParamsOpQuantizeHandler) and not model.training) \ + or isinstance(obj, CopyNode): + # inserting observers for output of observed module, or mark the output + # as observed assert node.op in [ 'call_module', 'call_function', @@ -472,6 +481,7 @@ def is_observed(input_arg): else: env[node.name] = observed_graph.node_copy(node, load_arg) + # insert observer for output of the node if node.name not in observed_node_names_set and node.name in quants: if is_standalone_module and node.name in graph_inputs: # we'll insert observer for input of standalone module @@ -480,11 +490,11 @@ def is_observed(input_arg): continue get_new_observer_name = get_new_attr_name_with_prefix(prefix) observer_name = get_new_observer_name(model) - _, qconfig, is_weight = 
quants[node.name] - if qconfig is not None: + _, activation_post_process_ctr = quants[node.name] + if activation_post_process_ctr is not None: # TODO: use insert_observer - new_observer = \ - qconfig.weight() if is_weight else qconfig.activation() + new_observer = activation_post_process_ctr() + # respect device affinity when adding observers device = assert_and_get_unique_device(model) if device: @@ -669,7 +679,7 @@ def is_quantized(node): graph_output = map_arg(node.args[0], load_non_quantized) self.quantized_graph.output(graph_output) continue - root_node, matched, obj, qconfig = matches.get(node.name, (None, None, None, None)) + root_node, matched, matched_pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) if root_node is node: if qconfig is None: result = self.quantized_graph.node_copy(node, load_non_quantized) @@ -682,7 +692,10 @@ def is_quantized(node): quantized = True # Need to get correct quantized/non-quantized state for the output of CopyNode - if isinstance(obj, CopyNode): + if type(obj) in [ + CopyNode, + FixedQParamsOpQuantizeHandler + ]: assert node.op in [ 'call_module', 'call_function', @@ -751,13 +764,7 @@ def load_arg(a): else: env[node.name] = act_post_process_removed_graph.node_copy(node, load_arg) - module_dict = dict(model.named_modules()) - to_be_removed = [] - for name, module in model.named_modules(): - if is_activation_post_process(module) and not is_submodule_of_fake_quant(name, module, module_dict): - to_be_removed.append(name) - for n in to_be_removed: - delattr(model, n) + # removes qconfig and activation_post_process modules _remove_qconfig(model) model = GraphModule(model, act_post_process_removed_graph) return model @@ -832,10 +839,10 @@ def _find_matches( Outputs a map of node_name -> - (node, matched_values, QuantizeHandler instance, qconfig) + (node, matched_values, matched_pattern, QuantizeHandler instance, qconfig) For example, { - 'relu_1': (relu_1, [relu_1], , QConfig(...)), + 'relu_1': (relu_1, [relu_1], torch.nn.functional.relu, , QConfig(...)), ... 
} """ @@ -862,7 +869,7 @@ def record_match(pattern, node, matched): matched = [] record_match(pattern, node, matched) for n in matched: - match_map[n.name] = (node, matched, value(self, node), self.qconfig_map[n.name]) + match_map[n.name] = (node, matched, pattern, value(self, node), self.qconfig_map[n.name]) all_matched.add(n.name) # break after finding the first match break @@ -873,7 +880,7 @@ def record_match(pattern, node, matched): type(self.modules[node.target]) in custom_module_class_mapping: custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], CustomModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], None, CustomModuleQuantizeHandler(self, node), custom_module_qconfig) def is_standalone_module(module_path): if standalone_module_names is None: @@ -888,7 +895,7 @@ def is_standalone_module(module_path): # add node to matched nodes custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], None, StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) return match_map @@ -902,16 +909,13 @@ def _find_quants(self, graph, matches): - matches: output of self._find_matches function Outputs a map of - node_name -> (QuantizeHandler instance (always DefaultQuant), qconfig) + node_name -> (QuantizeHandler instance (always DefaultQuantizeHandler), + activation_post_process (observer/fake_quantize module) constructor) """ quants = {} - def visit(node, qconfig): + def visit(node, matched_pattern, qconfig): def visit_arg(arg): - # note: we have to measure quantization information - # even for nodes where we might not use it because it is already - # quantized. This is because each match has the option to - # say NotImplemented (if for instance, it is an __add__ and the data type is not appropriate) is_weight = False if isinstance(node, Node) and node.op == 'call_function' and node.target in WEIGHT_INDEX_DICT: for i, node_arg in enumerate(node.args): @@ -919,26 +923,39 @@ def visit_arg(arg): is_weight = True if qconfig is not None and \ (activation_is_statically_quantized(qconfig) or is_weight): - # overwrite previous quant config - quants[arg.name] = (DefaultQuant(self, arg), qconfig, is_weight) + act_post_process_ctr = qconfig.weight if is_weight else qconfig.activation + quants[arg.name] = (DefaultQuantizeHandler(self, arg), qconfig, is_weight) + # overwrite the constructor from qconfig + act_post_process_ctr = \ + get_default_output_activation_post_process_map().get( + matched_pattern, + act_post_process_ctr) + # overwrite previous activation post process constructor if necessary + quants[arg.name] = (DefaultQuantizeHandler(self, arg), act_post_process_ctr) return visit_arg for node in graph.nodes: if node.name in matches: - root_node, matched, obj, qconfig = matches[node.name] + root_node, matched_nodes, matched_pattern, quantize_handler, qconfig = matches[node.name] # don't attach observer/fake_quant for CopyNode - if isinstance(obj, CopyNode): + if isinstance(quantize_handler, CopyNode): qconfig = None if root_node is node: - # matched[-1] is the first op in the sequence and - # matched[0] is the last op in the sequence + # matched_nodes[-1] is the first op in the sequence and + # matched_nodes[0] is the last op in the sequence # inputs - map_arg(matched[-1].args, visit(matched[-1], qconfig)) - map_arg(matched[-1].kwargs, visit(matched[-1], qconfig)) + # matched_pattern is set to None for inputs because + 
# we only want to select QuantizeHandler object based + # on pattern for output, inputs will always use + # DefaultQuantizeHandler + map_arg(matched_nodes[-1].args, visit(matched_nodes[-1], None, qconfig)) + map_arg(matched_nodes[-1].kwargs, visit(matched_nodes[-1], None, qconfig)) + # output - if isinstance(obj, StandaloneModuleQuantizeHandler): - # we don't insert observer for output of custom - # module - continue - map_arg(matched[0], visit(None, qconfig)) + # we don't insert observer for output of standalone module + if not isinstance(quantize_handler, StandaloneModuleQuantizeHandler): + # passing in matched_pattern here so that we can customize + # activation_post_process constructor for output based on the pattern, e.g. + # for sigmoid op we'll use default_affine_fixed_qparam_fake_quant + map_arg(matched_nodes[0], visit(None, matched_pattern, qconfig)) return quants diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 40fef784ce2d..82340a49309c 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -10,6 +10,10 @@ import torch.nn.qat as nnqat from .stubs import QuantStub, DeQuantStub +from .fake_quantize import ( + default_affine_fixed_qparams_fake_quant, + default_symmetric_fixed_qparams_fake_quant, +) # Default map for swapping float module to quantized ones DEFAULT_STATIC_QUANT_MODULE_MAPPINGS = { @@ -91,6 +95,13 @@ F.leaky_relu: torch._ops.ops.quantized.leaky_relu, } +# mapping from module to output activation post process class +DEFAULT_MODULE_TO_ACT_POST_PROCESS = { + nn.Hardsigmoid: default_affine_fixed_qparams_fake_quant, + nn.Sigmoid: default_affine_fixed_qparams_fake_quant, + nn.Tanh: default_symmetric_fixed_qparams_fake_quant, +} + def get_default_static_quant_module_mappings(): ''' Get module mapping for post training static quantization ''' @@ -158,3 +169,15 @@ def get_quantized_operator(float_op): assert quantized_op is not None, \ 'Operator {} does not have corresponding quantized op'.format(str(float_op)) return quantized_op + +def get_default_special_act_post_process(module_cls): + r""" Get the special activation post process for `module`, this has + higher priority than the activation post process in `qconfig` + e.g. 
+ input: torch.nn.Sigmoid + output: default_affine_fixed_qparam_fake_quant + """ + return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(module_cls, None) + +def has_special_act_post_process(module_cls): + return module_cls in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 9aa52d373ff6..f217c5ffb65c 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -9,14 +9,23 @@ import torch.nn.quantized as nnq import torch.nn.intrinsic.qat as nniqat -from .quantization_mappings import (get_default_dynamic_quant_module_mappings, - get_default_static_quant_module_mappings, - get_default_qat_module_mappings, - get_default_qconfig_propagation_list) +from .quantization_mappings import ( + get_default_dynamic_quant_module_mappings, + get_default_static_quant_module_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + has_special_act_post_process, + get_default_special_act_post_process, +) from .stubs import DeQuantStub, QuantWrapper from .qconfig import default_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig +def is_activation_post_process(module): + return (isinstance(module, torch.quantization.ObserverBase) or + isinstance(module, torch.quantization.FakeQuantize) or + isinstance(module, torch.quantization.FakeQuantizeBase)) + def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, qconfig_parent=None, prefix=''): r"""This is a helper function for `propagate_qconfig_` @@ -107,8 +116,8 @@ def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=No ) device = next(iter(devices)) if len(devices) > 0 else None - def get_activation_post_process(qconfig, device): - activation = qconfig.activation() + def get_activation_post_process(qconfig, device, special_act_post_process=None): + activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() if device is not None: activation.to(device) return activation @@ -116,13 +125,13 @@ def get_activation_post_process(qconfig, device): def needs_observation(m): return hasattr(m, 'qconfig') and m.qconfig is not None - def insert_activation_post_process(m): + def insert_activation_post_process(m, special_act_post_process=None): """ Adds an activation post process module and register a post hook that calls the module """ if needs_observation(m): # observer and hook will be gone after we swap the module - m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device)) + m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device, special_act_post_process)) # Register observer as the first entry in the hook list # All post forward hooks are preserved and will be executed after the observer before convert handle = register_activation_post_process_hook(m) @@ -130,8 +139,11 @@ def insert_activation_post_process(m): for name, child in module.named_children(): if type(child) == nnq.FloatFunctional or type(child) == nnq.QFunctional: - if hasattr(child, 'qconfig') and child.qconfig is not None: + if needs_observation(child): child.activation_post_process = get_activation_post_process(child.qconfig, device) + elif has_special_act_post_process(type(child)): + special_act_post_process = get_default_special_act_post_process(type(child)) + insert_activation_post_process(child, special_act_post_process) elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: insert_activation_post_process(child) 
elif needs_observation(child) and type(child) in custom_module_class_mapping: @@ -229,6 +241,19 @@ def prepare(model, inplace=False, allow_list=None, custom_module_class_mapping=custom_module_class_mapping) return model +def _remove_activation_post_process(module): + # TODO: maybe we should change activation_post_process to _activation_post_process + # to prevent it from being used by user + if hasattr(module, 'activation_post_process') and \ + is_activation_post_process(module.activation_post_process): + delattr(module, 'activation_post_process') + + # remove activation_post_proceess hook + for handle_id, hook_fn in module._forward_hooks.items(): + if hook_fn is _observer_forward_hook: + module._forward_hooks.pop(handle_id) + +# TODO: rename to something more general def _remove_qconfig(module): r"""Clean up the qconfig left in the module so that new qconfig can be propagated. @@ -242,6 +267,8 @@ def _remove_qconfig(module): if hasattr(module, "qconfig"): del module.qconfig + _remove_activation_post_process(module) + def quantize(model, run_fn, run_args, mapping=None, inplace=False): r"""Quantize the input float model with post training static quantization. From 56a3831bc623a628ecf5f11b06b11dbbe28ae0d5 Mon Sep 17 00:00:00 2001 From: shmsong Date: Mon, 26 Oct 2020 12:20:24 -0700 Subject: [PATCH 013/169] [NVFuser]Benchmark minor update (#46778) Summary: This is a tiny PR for two minor fixes: 1. Added `torch._C._jit_set_texpr_fuser_enabled(False)` to enable shape inference on nv fuser runs. 2. Renamed dynamic benchmark module names to avoid multiple matching. i.e. `simple_element` with `dynamic_simple_element`. I guess it'd be much easier if the pattern matching was based on `startswith`. Would be happy to update that if agreed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46778 Reviewed By: zhangguanheng66 Differential Revision: D24516911 Pulled By: bertmaher fbshipit-source-id: 839f9a3e058f9d7aca17b2e6eb8b558e0e48e8f4 --- benchmarks/tensorexpr/__main__.py | 1 + benchmarks/tensorexpr/elementwise.py | 2 +- benchmarks/tensorexpr/reduction.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/tensorexpr/__main__.py b/benchmarks/tensorexpr/__main__.py index 62c58de600d2..a1f0a5ee2fed 100644 --- a/benchmarks/tensorexpr/__main__.py +++ b/benchmarks/tensorexpr/__main__.py @@ -133,6 +133,7 @@ def main(): elif args.cuda_fuser == "nvf": import torch torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_set_nvfuser_enabled(True) torch._C._jit_set_profiling_mode(True) else : diff --git a/benchmarks/tensorexpr/elementwise.py b/benchmarks/tensorexpr/elementwise.py index 6a45a84a1787..af1352dfa949 100644 --- a/benchmarks/tensorexpr/elementwise.py +++ b/benchmarks/tensorexpr/elementwise.py @@ -219,7 +219,7 @@ def __init__(self, mode, device, dtype, N): @classmethod def module(cls): - return "dynamic_simple_element" + return "simple_dynamic_element" def instantiate_input(self): N, = self.rand_shape([self.N]) diff --git a/benchmarks/tensorexpr/reduction.py b/benchmarks/tensorexpr/reduction.py index 57027745a6a7..bc3e4e158a17 100644 --- a/benchmarks/tensorexpr/reduction.py +++ b/benchmarks/tensorexpr/reduction.py @@ -175,7 +175,7 @@ def instantiate_input(self): @staticmethod def module(): - return "dynamic_reduce2d" + return "dynamicreduce2d" class DynamicReduce2DInnerBench(DynamicReduce2DBench): @@ -184,7 +184,7 @@ def __init__(self, mode, device, dtype, dim0, dim1): @staticmethod def module(): - return 
"dynamic_reduce2d_inner" + return "reduce2d_dynamic_inner" class DynamicReduce2DOuterBench(DynamicReduce2DBench): @@ -193,7 +193,7 @@ def __init__(self, mode, device, dtype, dim0, dim1): @staticmethod def module(): - return "dynamic_reduce2d_outer" + return "reduce2d_dynamic_outer" benchmark.register_benchmark_class(DynamicReduce2DInnerBench) benchmark.register_benchmark_class(DynamicReduce2DOuterBench) From 0c74b43a3f047862bbef3e991c3e5b8b8e83a23e Mon Sep 17 00:00:00 2001 From: Shen Li Date: Mon, 26 Oct 2020 12:37:50 -0700 Subject: [PATCH 014/169] Update TensorPipe submodule (#46842) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46842 Reviewed By: mrshenli Differential Revision: D24539899 Pulled By: lw fbshipit-source-id: 8731165c6ecd3c97433b4dfa469989f5b9019e36 --- third_party/tensorpipe | 2 +- third_party/tensorpipe.BUILD | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 95ff9319161f..82a114882e21 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 95ff9319161fcdb3c674d2bb63fac3e94095b343 +Subproject commit 82a114882e21b176916e2f12a7b566af3d63df71 diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index 29b2cd1d0d0c..66c7b1c7a1ab 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -74,6 +74,7 @@ header_template_rule( "#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT": "", "#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL": "", "#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL": "", + "#cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT": "", "#cmakedefine01 TENSORPIPE_SUPPORTS_CUDA": "", }, ) From 25db74bf5e6ed580172d1f9aa637cc9243c6f09d Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 26 Oct 2020 12:40:55 -0700 Subject: [PATCH 015/169] Revert D24486972: [quant][graphmode][fx] Support sigmoid/hardsigmoid/tanh in qat Test Plan: revert-hammer Differential Revision: D24486972 (https://github.com/pytorch/pytorch/commit/e927b62e739cb433bbf19ce56d78e6b4c9da8b70) Original commit changeset: c9f139bfdd54 fbshipit-source-id: 2a75f5ec93d55a62b40d1cdd49adcf65436058f7 --- test/quantization/test_quantize.py | 33 +---- test/quantization/test_quantize_fx.py | 99 +++------------ test/test_quantization.py | 3 +- torch/quantization/fake_quantize.py | 22 +--- torch/quantization/fx/pattern_utils.py | 18 +-- .../quantization/fx/quantization_patterns.py | 34 ++---- torch/quantization/fx/quantize.py | 113 ++++++++---------- torch/quantization/quantization_mappings.py | 23 ---- torch/quantization/quantize.py | 45 ++----- 9 files changed, 98 insertions(+), 292 deletions(-) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 1ef6e9b89c0c..480ca18128c5 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -25,8 +25,7 @@ float_qparams_dynamic_qconfig, PerChannelMinMaxObserver, QConfigDynamic, - default_dynamic_quant_observer, - FixedQParamsFakeQuantize, + default_dynamic_quant_observer ) from torch.testing._internal.common_quantization import ( @@ -1248,36 +1247,6 @@ def forward(self, x): def test_leaky_relu(self): self._test_activation_op_impl(nn.LeakyReLU, nnq.LeakyReLU, {'negative_slope': 0.1, 'inplace': False}) - -class TestEagerModeQATOps(QuantizationTestCase): - def test_fixed_qparam_ops(self): - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.sigmoid = torch.nn.Sigmoid() - self.hardsigmoid = torch.nn.Hardsigmoid() - self.tanh = 
torch.nn.Tanh() - self.quant = QuantStub() - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - x = self.sigmoid(x) - x = self.hardsigmoid(x) - x = self.tanh(x) - x = self.dequant(x) - return x - - m = M().train() - m.qconfig = default_qat_qconfig - m = prepare_qat(m) - for attr in ['sigmoid', 'hardsigmoid', 'tanh']: - self.assertEqual(type(getattr(m, attr).activation_post_process), FixedQParamsFakeQuantize) - m = convert(m) - # make sure activation post process is removed - for attr in ['sigmoid', 'hardsigmoid', 'tanh']: - self.assertFalse(hasattr(getattr(m, attr), 'activation_post_process')) - class TestFunctionalModule(QuantizationTestCase): # Histogram Observers are slow, so have no-deadline to ensure test doesn't time out @given(train_mode=st.booleans()) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 4113b754a3e8..4e352a80daa3 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -23,7 +23,6 @@ convert, PerChannelMinMaxObserver, QConfigDynamic, - FixedQParamsFakeQuantize, ) # test utils @@ -1411,7 +1410,7 @@ def test_general_value_ops(self): """ class M(torch.nn.Module): def __init__(self): - super().__init__() + super(M, self).__init__() self.conv = torch.nn.Conv2d(3, 3, 3) self.avg_pool1d = torch.nn.AvgPool1d(3) self.avg_pool2d = torch.nn.AvgPool2d(3) @@ -1419,6 +1418,9 @@ def __init__(self): self.adaptive_avg_pool1d = torch.nn.AdaptiveAvgPool1d((1)) self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d((1, 1)) self.adaptive_avg_pool3d = torch.nn.AdaptiveAvgPool3d((1, 1, 1)) + self.hardsigmoid = torch.nn.Hardsigmoid() + self.sigmoid = torch.nn.Sigmoid() + self.tanh = torch.nn.Tanh() def forward(self, x): x = self.conv(x) @@ -1440,6 +1442,21 @@ def forward(self, x): x = x.mean([2, 3], True) x = F.interpolate(x, 4, mode='nearest') x = F.interpolate(x, 4, mode='linear') + x = self.hardsigmoid(x) + x = F.hardsigmoid(x) + x = F.hardsigmoid(x, inplace=True) + x = x.hardsigmoid() + x.hardsigmoid_() + x = self.sigmoid(x) + x = torch.sigmoid(x) + # F.sigmoid is deprecated + x = x.sigmoid() + x.sigmoid_() + x = self.tanh(x) + # F.tanh is deprecated + x = torch.tanh(x) + x = x.tanh() + x.tanh_() x = self.conv(x) return x @@ -1471,84 +1488,6 @@ def forward(self, x): expected_node_occurrence=count_check, expected_node_list=order_check) - @skipIfNoFBGEMM - def test_fixed_qparams_ops(self): - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 3, 3) - self.sigmoid = torch.nn.Sigmoid() - self.hardsigmoid = torch.nn.Hardsigmoid() - self.tanh = torch.nn.Tanh() - - def forward(self, x): - x = self.conv(x) - # F.sigmoid is deprecated - x = self.sigmoid(x) - x = torch.sigmoid(x) - x = x.sigmoid() - x.sigmoid_() - x = self.hardsigmoid(x) - x = F.hardsigmoid(x) - x = F.hardsigmoid(x, inplace=True) - x = x.hardsigmoid() - x.hardsigmoid_() - x = self.tanh(x) - # F.tanh is deprecated - x = torch.tanh(x) - x = x.tanh() - x.tanh_() - x = self.conv(x) - return x - - for eval_mode in [True, False]: - # This model is not executable since we just put all ops - # in the same forward - m = M() - if eval_mode: - m.eval() - qconfig = default_qconfig - prepare = prepare_fx - fq_count = 0 - else: - m.train() - qconfig = default_qat_qconfig - prepare = prepare_qat_fx - fq_count = 13 - - # nothing to fuse so skipping the fuse step - qconfig_dict = {'': qconfig} - prepared = prepare(m, qconfig_dict) - # check the correct number of activation_post_process 
is inserted - count_check = { - ns.call_module(FixedQParamsFakeQuantize) : fq_count, - } - self.checkGraphModuleNodes( - prepared, - expected_node_occurrence=count_check) - # not runnable - quantized = convert_fx(prepared) - - # This checks that the dequantize from the output of first conv - # is being propagated to the end, so that we don't insert extra - # observers - # check exact counts of quantize and dequantize - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_method('dequantize') : 1 - } - order_check = [ - ns.call_function(torch.quantize_per_tensor), - ns.call_module(nnq.Conv2d), - ns.call_module(nn.Sigmoid), - ns.call_module(nnq.Conv2d), - ns.call_method('dequantize'), - ] - self.checkGraphModuleNodes( - quantized, - expected_node_occurrence=count_check, - expected_node_list=order_check) - def test_float_functional(self): class TorchAdd(nn.Module): """Wrapper around torch.add so that all ops can be found at build""" diff --git a/test/test_quantization.py b/test/test_quantization.py index 26d3ee020983..8a6f05cb19de 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -44,7 +44,6 @@ from quantization.test_quantize import TestPostTrainingDynamic # noqa: F401 from quantization.test_quantize import TestQuantizationAwareTraining # noqa: F401 from quantization.test_quantize import TestEagerModeOps # noqa: F401 -from quantization.test_quantize import TestEagerModeQATOps # noqa: F401 # TODO: merge with other tests in test_quantize.py? from quantization.test_quantize import TestFunctionalModule # noqa: F401 @@ -65,7 +64,7 @@ from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 -# Tooling: numeric_suite +# Tooling: numric_suite from quantization.test_numeric_suite import TestEagerModeNumericSuite # noqa: F401 # Backward Compatibility diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index cacaad6bea0b..dd2333883325 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -159,12 +159,12 @@ def forward(self, X): @torch.jit.export def extra_repr(self): - return 'fake_quant_enabled={}, observer_enabled={}, ' \ - 'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \ - 'scale={}, zero_point={}'.format( - self.fake_quant_enabled, self.observer_enabled, - self.quant_min, self.quant_max, - self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) + return 'fake_quant_enabled={}, observer_enabled={},\ + quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, \ + scale={}, zero_point={}'.format( + self.fake_quant_enabled, self.observer_enabled, + self.quant_min, self.quant_max, + self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) def _save_to_state_dict(self, destination, prefix, keep_vars): # We cannot currently register scalar values as buffers, so need to manually @@ -226,19 +226,9 @@ def forward(self, X): self.quant_max) return X - @torch.jit.export def calculate_qparams(self): return self.scale, self.zero_point - @torch.jit.export - def extra_repr(self): - return 'fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, ' \ - 'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format( - self.fake_quant_enabled, self.observer_enabled, - self.scale, self.zero_point, self.dtype, - self.quant_min, self.quant_max, self.qscheme) - - default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, 
quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True) default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127, diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py index ccbd7cc2a2c4..efc1cd492d1b 100644 --- a/torch/quantization/fx/pattern_utils.py +++ b/torch/quantization/fx/pattern_utils.py @@ -14,17 +14,10 @@ def get_default_fusion_patterns(): return DEFAULT_FUSION_PATTERNS DEFAULT_QUANTIZATION_PATTERNS = OrderedDict() -# a map from pattern to activation_post_process(observer/fake_quant) consstructor for output activation -# e.g. pattern: torch.sigmoid, -# output_activation_post_process: default_affine_fixed_qparam_fake_quant -DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP = dict() - # Register pattern for both static quantization and qat -def register_quant_pattern(pattern, output_activation_post_process=None): +def register_quant_pattern(pattern): def insert(fn): DEFAULT_QUANTIZATION_PATTERNS[pattern] = fn - if output_activation_post_process is not None: - DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP[pattern] = output_activation_post_process return fn return insert @@ -32,12 +25,6 @@ def insert(fn): def get_default_quant_patterns(): return DEFAULT_QUANTIZATION_PATTERNS -# a map from pattern to output activation post process constructor -# e.g. torch.sigmoid -> default_affine_fixed_qparam_fake_quant -def get_default_output_activation_post_process_map(): - return DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP - - # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) # class ConvBNReLUFusion(): @@ -75,9 +62,6 @@ def is_match(modules, node, pattern, max_uses=sys.maxsize): elif node.target is getattr: if node.args[1] != pattern[1]: return False - elif isinstance(self_match, str): - if node.op != 'call_method' or node.target != self_match: - return False elif node.target != self_match: return False diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 2898a07a17c1..12787e4a87db 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -4,10 +4,6 @@ ) import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd -from torch.quantization import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, -) from ..quantization_mappings import ( get_static_quant_module_class, @@ -491,22 +487,6 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ return quantizer.quantized_graph.create_node( 'call_function', quantized_op, args, kwargs) -@register_quant_pattern(torch.nn.Hardsigmoid, default_affine_fixed_qparams_fake_quant) -@register_quant_pattern(torch.nn.functional.hardsigmoid, default_affine_fixed_qparams_fake_quant) -@register_quant_pattern('hardsigmoid', default_affine_fixed_qparams_fake_quant) -@register_quant_pattern('hardsigmoid_', default_affine_fixed_qparams_fake_quant) -@register_quant_pattern(torch.nn.Sigmoid, default_affine_fixed_qparams_fake_quant) -@register_quant_pattern(torch.sigmoid, default_affine_fixed_qparams_fake_quant) -@register_quant_pattern('sigmoid', default_affine_fixed_qparams_fake_quant) -@register_quant_pattern('sigmoid_', default_affine_fixed_qparams_fake_quant) -@register_quant_pattern(torch.nn.Tanh, default_symmetric_fixed_qparams_fake_quant) 
-@register_quant_pattern(torch.tanh, default_symmetric_fixed_qparams_fake_quant) -@register_quant_pattern('tanh', default_symmetric_fixed_qparams_fake_quant) -@register_quant_pattern('tanh_', default_symmetric_fixed_qparams_fake_quant) -class FixedQParamsOpQuantizeHandler(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) - # these ops have quantized equivalents that do not need any extra information @register_quant_pattern(torch.nn.AdaptiveAvgPool1d) @register_quant_pattern(torch.nn.AdaptiveAvgPool2d) @@ -515,16 +495,20 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.nn.AvgPool2d) @register_quant_pattern(torch.nn.AvgPool3d) @register_quant_pattern(torch.nn.Dropout) +@register_quant_pattern(torch.nn.Hardsigmoid) @register_quant_pattern(torch.nn.Hardtanh) @register_quant_pattern(torch.nn.MaxPool1d) @register_quant_pattern(torch.nn.MaxPool2d) @register_quant_pattern(torch.nn.MaxPool3d) @register_quant_pattern(torch.nn.ReLU) @register_quant_pattern(torch.nn.ReLU6) +@register_quant_pattern(torch.nn.Sigmoid) +@register_quant_pattern(torch.nn.Tanh) @register_quant_pattern(torch.adaptive_avg_pool1d) @register_quant_pattern(torch.nn.functional.adaptive_avg_pool2d) @register_quant_pattern(torch.nn.functional.adaptive_avg_pool3d) @register_quant_pattern(torch.nn.functional.dropout) +@register_quant_pattern(torch.nn.functional.hardsigmoid) @register_quant_pattern(torch.nn.functional.hardtanh) @register_quant_pattern(torch.nn.functional.hardtanh_) @register_quant_pattern(torch.nn.functional.interpolate) @@ -544,9 +528,11 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.mean) @register_quant_pattern(torch.min) @register_quant_pattern(torch.repeat_interleave) +@register_quant_pattern(torch.sigmoid) @register_quant_pattern(torch.sort) @register_quant_pattern(torch.squeeze) @register_quant_pattern(torch.stack) +@register_quant_pattern(torch.tanh) @register_quant_pattern(torch.unsqueeze) @register_quant_pattern(operator.getitem) @register_quant_pattern(operator.floordiv) @@ -555,6 +541,8 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('contiguous') @register_quant_pattern('detach') @register_quant_pattern('detach_') +@register_quant_pattern('hardsigmoid') +@register_quant_pattern('hardsigmoid_') @register_quant_pattern('mean') @register_quant_pattern('numel') @register_quant_pattern('permute') @@ -565,9 +553,13 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('reshape') @register_quant_pattern('resize_') @register_quant_pattern('shape') +@register_quant_pattern('sigmoid') +@register_quant_pattern('sigmoid_') @register_quant_pattern('size') @register_quant_pattern('squeeze') @register_quant_pattern('squeeze_') +@register_quant_pattern('tanh') +@register_quant_pattern('tanh_') @register_quant_pattern('transpose') @register_quant_pattern('unsqueeze') @register_quant_pattern('unsqueeze_') @@ -578,7 +570,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # Default quantization handler, used for quantization of input and output # of quantizable objects (e.g. 
modules and functionals) -class DefaultQuantizeHandler(QuantizeHandler): +class DefaultQuant(QuantizeHandler): def convert(self, quantizer, node): assert self.all_nodes root_module = quantizer.modules[''] diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index afa87a83badf..3eebef4ff10a 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -19,15 +19,11 @@ get_default_qat_module_mappings, ) -from ..quantize import ( - _remove_qconfig, - is_activation_post_process -) +from ..quantize import _remove_qconfig from .pattern_utils import ( is_match, get_default_quant_patterns, - get_default_output_activation_post_process_map, ) from .standalone_module import ( @@ -135,6 +131,11 @@ def assert_and_get_unique_device(module): device = next(iter(devices)) if len(devices) > 0 else None return device +def is_activation_post_process(module): + return (isinstance(module, torch.quantization.ObserverBase) or + isinstance(module, torch.quantization.FakeQuantize) or + isinstance(module, torch.quantization.FakeQuantizeBase)) + def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) @@ -352,7 +353,7 @@ def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_ # find _inputs_ to matched nodes that are not quantized, these # have to be quantized, which requires measuring stats, - # initialize an DefaultQuantizeHandler object for each + # initialize an DefaultQuant object for each quants = self._find_quants(model.graph, matches) self.activation_post_process_map = dict() @@ -382,7 +383,7 @@ def load_arg(a): continue prefix = node.name + '_activation_post_process_' - root_node, _, pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) + root_node, _, obj, qconfig = matches.get(node.name, (None, None, None, None)) if root_node is None: env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: @@ -429,19 +430,9 @@ def insert_observer(node, observer, device): if not activation_is_statically_quantized(qconfig): continue - if isinstance(obj, FixedQParamsOpQuantizeHandler) and model.training: - # we only insert fake quantize module in qat - activation_post_process_ctr = \ - get_default_output_activation_post_process_map().get(pattern, None) - assert activation_post_process_ctr is not None, \ - 'activation_post_process constructor not provided for ' + \ - 'pattern:' + str(pattern) - device = assert_and_get_unique_device(model) - insert_observer(node, activation_post_process_ctr(), device) - elif (isinstance(obj, FixedQParamsOpQuantizeHandler) and not model.training) \ - or isinstance(obj, CopyNode): - # inserting observers for output of observed module, or mark the output - # as observed + # inserting observers for output of observed module, or mark the output + # as observed + if isinstance(obj, CopyNode): assert node.op in [ 'call_module', 'call_function', @@ -481,7 +472,6 @@ def is_observed(input_arg): else: env[node.name] = observed_graph.node_copy(node, load_arg) - # insert observer for output of the node if node.name not in observed_node_names_set and node.name in quants: if is_standalone_module and node.name in graph_inputs: # we'll insert observer for input of standalone module @@ -490,11 +480,11 @@ def is_observed(input_arg): continue get_new_observer_name = get_new_attr_name_with_prefix(prefix) observer_name = get_new_observer_name(model) - _, activation_post_process_ctr = 
quants[node.name] - if activation_post_process_ctr is not None: + _, qconfig, is_weight = quants[node.name] + if qconfig is not None: # TODO: use insert_observer - new_observer = activation_post_process_ctr() - + new_observer = \ + qconfig.weight() if is_weight else qconfig.activation() # respect device affinity when adding observers device = assert_and_get_unique_device(model) if device: @@ -679,7 +669,7 @@ def is_quantized(node): graph_output = map_arg(node.args[0], load_non_quantized) self.quantized_graph.output(graph_output) continue - root_node, matched, matched_pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) + root_node, matched, obj, qconfig = matches.get(node.name, (None, None, None, None)) if root_node is node: if qconfig is None: result = self.quantized_graph.node_copy(node, load_non_quantized) @@ -692,10 +682,7 @@ def is_quantized(node): quantized = True # Need to get correct quantized/non-quantized state for the output of CopyNode - if type(obj) in [ - CopyNode, - FixedQParamsOpQuantizeHandler - ]: + if isinstance(obj, CopyNode): assert node.op in [ 'call_module', 'call_function', @@ -764,7 +751,13 @@ def load_arg(a): else: env[node.name] = act_post_process_removed_graph.node_copy(node, load_arg) - # removes qconfig and activation_post_process modules + module_dict = dict(model.named_modules()) + to_be_removed = [] + for name, module in model.named_modules(): + if is_activation_post_process(module) and not is_submodule_of_fake_quant(name, module, module_dict): + to_be_removed.append(name) + for n in to_be_removed: + delattr(model, n) _remove_qconfig(model) model = GraphModule(model, act_post_process_removed_graph) return model @@ -839,10 +832,10 @@ def _find_matches( Outputs a map of node_name -> - (node, matched_values, matched_pattern, QuantizeHandler instance, qconfig) + (node, matched_values, QuantizeHandler instance, qconfig) For example, { - 'relu_1': (relu_1, [relu_1], torch.nn.functional.relu, , QConfig(...)), + 'relu_1': (relu_1, [relu_1], , QConfig(...)), ... 
} """ @@ -869,7 +862,7 @@ def record_match(pattern, node, matched): matched = [] record_match(pattern, node, matched) for n in matched: - match_map[n.name] = (node, matched, pattern, value(self, node), self.qconfig_map[n.name]) + match_map[n.name] = (node, matched, value(self, node), self.qconfig_map[n.name]) all_matched.add(n.name) # break after finding the first match break @@ -880,7 +873,7 @@ def record_match(pattern, node, matched): type(self.modules[node.target]) in custom_module_class_mapping: custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, CustomModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], CustomModuleQuantizeHandler(self, node), custom_module_qconfig) def is_standalone_module(module_path): if standalone_module_names is None: @@ -895,7 +888,7 @@ def is_standalone_module(module_path): # add node to matched nodes custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) return match_map @@ -909,13 +902,16 @@ def _find_quants(self, graph, matches): - matches: output of self._find_matches function Outputs a map of - node_name -> (QuantizeHandler instance (always DefaultQuantizeHandler), - activation_post_process (observer/fake_quantize module) constructor) + node_name -> (QuantizeHandler instance (always DefaultQuant), qconfig) """ quants = {} - def visit(node, matched_pattern, qconfig): + def visit(node, qconfig): def visit_arg(arg): + # note: we have to measure quantization information + # even for nodes where we might not use it because it is already + # quantized. This is because each match has the option to + # say NotImplemented (if for instance, it is an __add__ and the data type is not appropriate) is_weight = False if isinstance(node, Node) and node.op == 'call_function' and node.target in WEIGHT_INDEX_DICT: for i, node_arg in enumerate(node.args): @@ -923,39 +919,26 @@ def visit_arg(arg): is_weight = True if qconfig is not None and \ (activation_is_statically_quantized(qconfig) or is_weight): - act_post_process_ctr = qconfig.weight if is_weight else qconfig.activation - quants[arg.name] = (DefaultQuantizeHandler(self, arg), qconfig, is_weight) - # overwrite the constructor from qconfig - act_post_process_ctr = \ - get_default_output_activation_post_process_map().get( - matched_pattern, - act_post_process_ctr) - # overwrite previous activation post process constructor if necessary - quants[arg.name] = (DefaultQuantizeHandler(self, arg), act_post_process_ctr) + # overwrite previous quant config + quants[arg.name] = (DefaultQuant(self, arg), qconfig, is_weight) return visit_arg for node in graph.nodes: if node.name in matches: - root_node, matched_nodes, matched_pattern, quantize_handler, qconfig = matches[node.name] + root_node, matched, obj, qconfig = matches[node.name] # don't attach observer/fake_quant for CopyNode - if isinstance(quantize_handler, CopyNode): + if isinstance(obj, CopyNode): qconfig = None if root_node is node: - # matched_nodes[-1] is the first op in the sequence and - # matched_nodes[0] is the last op in the sequence + # matched[-1] is the first op in the sequence and + # matched[0] is the last op in the sequence # inputs - # matched_pattern is set to None for inputs because - # we only want to select QuantizeHandler object based - # on pattern for output, inputs will always use - # 
DefaultQuantizeHandler - map_arg(matched_nodes[-1].args, visit(matched_nodes[-1], None, qconfig)) - map_arg(matched_nodes[-1].kwargs, visit(matched_nodes[-1], None, qconfig)) - + map_arg(matched[-1].args, visit(matched[-1], qconfig)) + map_arg(matched[-1].kwargs, visit(matched[-1], qconfig)) # output - # we don't insert observer for output of standalone module - if not isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # passing in matched_pattern here so that we can customize - # activation_post_process constructor for output based on the pattern, e.g. - # for sigmoid op we'll use default_affine_fixed_qparam_fake_quant - map_arg(matched_nodes[0], visit(None, matched_pattern, qconfig)) + if isinstance(obj, StandaloneModuleQuantizeHandler): + # we don't insert observer for output of custom + # module + continue + map_arg(matched[0], visit(None, qconfig)) return quants diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 82340a49309c..40fef784ce2d 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -10,10 +10,6 @@ import torch.nn.qat as nnqat from .stubs import QuantStub, DeQuantStub -from .fake_quantize import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, -) # Default map for swapping float module to quantized ones DEFAULT_STATIC_QUANT_MODULE_MAPPINGS = { @@ -95,13 +91,6 @@ F.leaky_relu: torch._ops.ops.quantized.leaky_relu, } -# mapping from module to output activation post process class -DEFAULT_MODULE_TO_ACT_POST_PROCESS = { - nn.Hardsigmoid: default_affine_fixed_qparams_fake_quant, - nn.Sigmoid: default_affine_fixed_qparams_fake_quant, - nn.Tanh: default_symmetric_fixed_qparams_fake_quant, -} - def get_default_static_quant_module_mappings(): ''' Get module mapping for post training static quantization ''' @@ -169,15 +158,3 @@ def get_quantized_operator(float_op): assert quantized_op is not None, \ 'Operator {} does not have corresponding quantized op'.format(str(float_op)) return quantized_op - -def get_default_special_act_post_process(module_cls): - r""" Get the special activation post process for `module`, this has - higher priority than the activation post process in `qconfig` - e.g. 
- input: torch.nn.Sigmoid - output: default_affine_fixed_qparam_fake_quant - """ - return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(module_cls, None) - -def has_special_act_post_process(module_cls): - return module_cls in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index f217c5ffb65c..9aa52d373ff6 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -9,23 +9,14 @@ import torch.nn.quantized as nnq import torch.nn.intrinsic.qat as nniqat -from .quantization_mappings import ( - get_default_dynamic_quant_module_mappings, - get_default_static_quant_module_mappings, - get_default_qat_module_mappings, - get_default_qconfig_propagation_list, - has_special_act_post_process, - get_default_special_act_post_process, -) +from .quantization_mappings import (get_default_dynamic_quant_module_mappings, + get_default_static_quant_module_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list) from .stubs import DeQuantStub, QuantWrapper from .qconfig import default_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig -def is_activation_post_process(module): - return (isinstance(module, torch.quantization.ObserverBase) or - isinstance(module, torch.quantization.FakeQuantize) or - isinstance(module, torch.quantization.FakeQuantizeBase)) - def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, qconfig_parent=None, prefix=''): r"""This is a helper function for `propagate_qconfig_` @@ -116,8 +107,8 @@ def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=No ) device = next(iter(devices)) if len(devices) > 0 else None - def get_activation_post_process(qconfig, device, special_act_post_process=None): - activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() + def get_activation_post_process(qconfig, device): + activation = qconfig.activation() if device is not None: activation.to(device) return activation @@ -125,13 +116,13 @@ def get_activation_post_process(qconfig, device, special_act_post_process=None): def needs_observation(m): return hasattr(m, 'qconfig') and m.qconfig is not None - def insert_activation_post_process(m, special_act_post_process=None): + def insert_activation_post_process(m): """ Adds an activation post process module and register a post hook that calls the module """ if needs_observation(m): # observer and hook will be gone after we swap the module - m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device, special_act_post_process)) + m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device)) # Register observer as the first entry in the hook list # All post forward hooks are preserved and will be executed after the observer before convert handle = register_activation_post_process_hook(m) @@ -139,11 +130,8 @@ def insert_activation_post_process(m, special_act_post_process=None): for name, child in module.named_children(): if type(child) == nnq.FloatFunctional or type(child) == nnq.QFunctional: - if needs_observation(child): + if hasattr(child, 'qconfig') and child.qconfig is not None: child.activation_post_process = get_activation_post_process(child.qconfig, device) - elif has_special_act_post_process(type(child)): - special_act_post_process = get_default_special_act_post_process(type(child)) - insert_activation_post_process(child, special_act_post_process) elif non_leaf_module_list is not None and type(child) in 
non_leaf_module_list: insert_activation_post_process(child) elif needs_observation(child) and type(child) in custom_module_class_mapping: @@ -241,19 +229,6 @@ def prepare(model, inplace=False, allow_list=None, custom_module_class_mapping=custom_module_class_mapping) return model -def _remove_activation_post_process(module): - # TODO: maybe we should change activation_post_process to _activation_post_process - # to prevent it from being used by user - if hasattr(module, 'activation_post_process') and \ - is_activation_post_process(module.activation_post_process): - delattr(module, 'activation_post_process') - - # remove activation_post_proceess hook - for handle_id, hook_fn in module._forward_hooks.items(): - if hook_fn is _observer_forward_hook: - module._forward_hooks.pop(handle_id) - -# TODO: rename to something more general def _remove_qconfig(module): r"""Clean up the qconfig left in the module so that new qconfig can be propagated. @@ -267,8 +242,6 @@ def _remove_qconfig(module): if hasattr(module, "qconfig"): del module.qconfig - _remove_activation_post_process(module) - def quantize(model, run_fn, run_args, mapping=None, inplace=False): r"""Quantize the input float model with post training static quantization. From 58ed60c259834e324e86f3e3118e4fcbbfea8dd1 Mon Sep 17 00:00:00 2001 From: Oscar Sandoval Date: Mon, 26 Oct 2020 12:47:32 -0700 Subject: [PATCH 016/169] Added context manager enabling all futures returned by rpc_async and custom build rpc functions to be automatically waited on (#41807) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41807 Test Plan: Make sure ci tests pass, including newly written test Reviewed By: mrshenli Differential Revision: D22640839 Pulled By: osandoval-fb fbshipit-source-id: 3ff98d8e8c6e6d08575e307f05b5e159442d7216 --- torch/distributed/rpc/api.py | 33 +++++++++++- .../_internal/distributed/rpc/rpc_test.py | 54 ++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index 1888241e4da4..7cb99066b507 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -141,6 +141,34 @@ def _broadcast_to_followers(sequence_id, objects_map): states.gathered_objects = objects_map states.proceed_signal.set() +_thread_local_var = threading.local() + +@contextlib.contextmanager +def _wait_all(): + r""" + A context manager that collects all futures returned by ``rpc_async`` and + waits them on the context manager's exit; relieving the user of needing + to explicitly call wait. 
+ + + Example:: + >>> # On worker 0: + >>> import torch + >>> import torch.distributed.rpc as rpc + >>> rpc.init_rpc("worker0", rank=0, world_size=2) + >>> with rpc._wait_all(): + >>> fut_1 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1)) + >>> fut_2 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1)) + >>> #fut_1 and fut_2 are waited on + """ + _thread_local_var.future_list = [] + try: + yield + finally: + try: + torch.futures.wait_all(_thread_local_var.future_list) + finally: + del _thread_local_var.future_list @_require_initialized def _all_gather(obj, timeout=UNSET_RPC_TIMEOUT): @@ -830,4 +858,7 @@ def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): >>> rpc.init_rpc("worker1", rank=1, world_size=2) >>> rpc.shutdown() """ - return _invoke_rpc(to, func, RPCExecMode.ASYNC, args, kwargs, timeout) + fut = _invoke_rpc(to, func, RPCExecMode.ASYNC, args, kwargs, timeout) + if hasattr(_thread_local_var, "future_list"): + _thread_local_var.future_list.append(fut) + return fut diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 41006be6b39b..82be0fd97731 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -15,7 +15,7 @@ import torch.distributed.rpc as rpc import torch.distributed.autograd as dist_autograd from torch.distributed.rpc import RRef, _get_debug_info, _rref_context_get_debug_info -from torch.distributed.rpc.api import _delete_all_user_and_unforked_owner_rrefs, _use_rpc_pickler +from torch.distributed.rpc.api import _delete_all_user_and_unforked_owner_rrefs, _use_rpc_pickler, _thread_local_var, _wait_all from torch.distributed.rpc.internal import ( PythonUDF, RPCExecMode, @@ -2856,6 +2856,58 @@ class TestPickler: torch.distributed.rpc.api._default_pickler is _internal_rpc_pickler ) + @dist_init + def test_wait_all(self): + with _wait_all(): + self.assertTrue(_thread_local_var.future_list == []) + dst = worker_name((self.rank + 1) % self.world_size) + fut = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1)) + self.assertTrue(len(_thread_local_var.future_list) == 1) + self.assertTrue(isinstance(_thread_local_var.future_list[0], torch._C.Future)) + self.assertTrue(fut.done()) + self.assertEqual(fut.wait(), torch.ones(2, 2) + 1) + self.assertFalse(hasattr(_thread_local_var, "future_list")) + + @dist_init + def test_wait_all_multiple_call(self): + with _wait_all(): + self.assertTrue(_thread_local_var.future_list == []) + dst = worker_name((self.rank + 1) % self.world_size) + for i in range(20): + fut = rpc.rpc_async(dst, torch.add, (torch.ones(i, i), 1)) + res = rpc.rpc_sync(dst, torch.add, (torch.ones(i, i), 1)) + self.assertEqual(res, torch.ones(i, i) + 1) + self.assertEqual(fut.wait(), torch.ones(i, i) + 1) + self.assertTrue(len(_thread_local_var.future_list) == 20) + self.assertFalse(hasattr(_thread_local_var, "future_list")) + + @dist_init + def test_wait_all_timeout(self): + expected_error = self.get_timeout_error_regex() + with self.assertRaisesRegex(RuntimeError, expected_error): + with _wait_all(): + self.assertTrue(_thread_local_var.future_list == []) + dst = worker_name((self.rank + 1) % self.world_size) + timeout = 0.1 # 100 ms + fut = rpc.rpc_async(dst, my_sleep_func, args=(1,), timeout=timeout) + self.assertFalse(hasattr(_thread_local_var, "future_list")) + + @dist_init + def test_wait_all_raise_in_user_func(self): + with self.assertRaises(ValueError): + with _wait_all(): + 
self.assertTrue(_thread_local_var.future_list == []) + dst = worker_name((self.rank + 1) % self.world_size) + fut = rpc.rpc_async(dst, raise_func) + self.assertFalse(hasattr(_thread_local_var, "future_list")) + + @dist_init + def test_wait_all_raise_in_body(self): + with self.assertRaises(ValueError): + with _wait_all(): + raise_func() + self.assertFalse(hasattr(_thread_local_var, "future_list")) + @dist_init def test_function_not_on_callee(self): # test that if a function does not exist on a callee, we don't crash, From 57bf0b596aeb1b3fdb47345e6985edf09b319d8d Mon Sep 17 00:00:00 2001 From: Zafar Date: Mon, 26 Oct 2020 14:28:14 -0700 Subject: [PATCH 017/169] [docs] Changing the wording on quantization versioning and support (#46858) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46858 Test Plan: Imported from OSS Reviewed By: dskhudia Differential Revision: D24542598 Pulled By: z-a-f fbshipit-source-id: 0eb7a2dcc8f8ad52954f2555cf41d5f7524cbc2c --- docs/source/quantization.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index b78ed2c08586..6cfbe186544f 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -45,8 +45,8 @@ The corresponding implementation is chosen automatically based on the PyTorch bu .. note:: - PyTorch 1.3 doesn't provide quantized operator implementations on CUDA yet - - this is direction of future work. Move the model to CPU in order to test the + At the moment PyTorch doesn't provide quantized operator implementations on CUDA - + this is the direction for future work. Move the model to CPU in order to test the quantized functionality. Quantization-aware training (through :class:`~torch.quantization.FakeQuantize`) From 5e2f17d77a1d4592f571349dc952ff3ec42703de Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Mon, 26 Oct 2020 14:39:11 -0700 Subject: [PATCH 018/169] Add NCCL_ASYNC_ERROR_HANDLING to docs (#46856) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46856 Add reference to NCCL_ASYNC_ERROR_HANDLING in the pytorch docs, similar to how NCCL_BLOCKING_WAIT is curently described. ghstack-source-id: 115186877 Test Plan: CI, verifying docs change Reviewed By: jiayisuse Differential Revision: D24541822 fbshipit-source-id: a0b3e843bc6392d2787a4bb270118f2dfda5f4ec --- torch/distributed/constants.py | 6 +++--- torch/distributed/distributed_c10d.py | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/torch/distributed/constants.py b/torch/distributed/constants.py index f7718c4a20e0..dc541c932d11 100644 --- a/torch/distributed/constants.py +++ b/torch/distributed/constants.py @@ -2,7 +2,7 @@ # Default process group wide timeout, if applicable. # This only applies to the gloo and nccl backends -# (only if NCCL_BLOCKING_WAIT is set to 1). To make an attempt at -# backwards compatibility with THD, we use an extraordinarily high default -# timeout, given that THD did not have timeouts. +# (only if NCCL_BLOCKING_WAIT or NCCL_ASYNC_ERROR_HANDLING is set to 1). +# To make an attempt at backwards compatibility with THD, we use an +# extraordinarily high default timeout, given that THD did not have timeouts. 
default_pg_timeout = timedelta(minutes=30) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index e5a67edfae57..32353c5dc023 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -393,7 +393,20 @@ def init_process_group(backend, the process group. Default value equals 30 minutes. This is applicable for the ``gloo`` backend. For ``nccl``, this is applicable only if the environment variable ``NCCL_BLOCKING_WAIT`` - is set to 1. + or ``NCCL_ASYNC_ERROR_HANDLING`` is set to 1. When + ``NCCL_BLOCKING_WAIT`` is set, this is the duration for which the + process will block and wait for collectives to complete before + throwing an exception. When ``NCCL_ASYNC_ERROR_HANDLING`` is set, + this is the duration after which collectives will be aborted + asynchronously and the process will crash. ``NCCL_BLOCKING_WAIT`` + will provide errors to the user which can be caught and handled, + but due to its blocking nature, it has a performance overhead. On + the other hand, ``NCCL_ASYNC_ERROR_HANDLING`` has little + performance overhead, but crashes the process on errors. This is + done since CUDA execution is async and it is no longer safe to + continue executing user code since failed async NCCL operations + might result in subsequent CUDA operations to run on corrupted + data. Only one of these two environment variables should be set. group_name (str, optional, deprecated): Group name. To enable ``backend == Backend.MPI``, PyTorch needs to be built from source From 02dc52f25b9b85eb996b44ef8bb896cd3d931d89 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 26 Oct 2020 15:29:52 -0700 Subject: [PATCH 019/169] vmap fallback: gracefully error out when vmap over dim of size 0 (#46846) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46846 Previously, this would crash with a floating point error. If the user vmaps over a dimension of size 0, ideally we would return a tensor with a batch dim of size 0 and the correct output shape. However, this isn't possible without a shape-checking API. This PR changes the vmap fallback to error out gracefully if it sees vmap occuring over a dimension of size 0. If we want to support vmapping over dimension of size 0 for a specific op, then the guidance is to implement a batching rule for that op that handles 0-sized dims. Test Plan: - new test Reviewed By: ezyang Differential Revision: D24539315 Pulled By: zou3519 fbshipit-source-id: a19c049b46512d77c084cfee145720de8971f658 --- aten/src/ATen/BatchedFallback.cpp | 10 ++++++++++ test/test_vmap.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/aten/src/ATen/BatchedFallback.cpp b/aten/src/ATen/BatchedFallback.cpp index 51f231ee922b..396acc9e0403 100644 --- a/aten/src/ATen/BatchedFallback.cpp +++ b/aten/src/ATen/BatchedFallback.cpp @@ -161,6 +161,11 @@ void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, torch::j batch_sizes.end(), 1, std::multiplies()); + // Without a shape-checking API, we're unable to compute the correct shape of + // the output so we just error out. + TORCH_CHECK(num_batches > 0, + "Batching rule not implemented for ", schema.operator_name(), ". ", + "The fallback path does not support vmap over dims of size 0."); // Strategy: For each batch, we are going to push slices (where applicable) // of the arguments onto `stack`, and call `op`. 
@@ -293,6 +298,11 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta batch_sizes.end(), 1, std::multiplies()); + // Without a shape-checking API, we're unable to compute the correct shape of + // the output so we just error out. + TORCH_CHECK(num_batches > 0, + "Batching rule not implemented for ", schema.operator_name(), ". ", + "The fallback path does not support vmap over dims of size 0."); // Strategy: For each batch, we are going to push slices (where applicable) // of the arguments onto `stack`, call `op`, and store the result in diff --git a/test/test_vmap.py b/test/test_vmap.py index 2400ad1e00ee..69e5da0aa380 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -466,6 +466,37 @@ def _assert_uses_vmap_fallback(self, vmap_args, inputs): self.assertEqual(len(wa), 2) self.assertRegex(str(wa[-1].message), FALLBACK_REGEX) + def test_fallback_zero_dim(self): + # NB: One day we will implement a batching rule for torch.atan2. + # If/when we do, this test should be replaced to test the fallback + # path on another operator to avoid bitrot. + op = torch.atan2 + x = torch.randn(11) + y = torch.randn(11) + self._assert_uses_vmap_fallback((op,), (x, y)) + + B0, B1 = 0, 3 + x = torch.randn(B0, 11) + y = torch.randn(11) + + msg = 'The fallback path does not support vmap over dims of size 0' + + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op, (0, None))(x, y) + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op, (None, 0))(y, x) + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op)(x, x) + + x = torch.randn(B0, B1, 11) + y = torch.randn(B1, 11) + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op, (0, None))(x, y) + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op, (None, 0))(y, x) + with self.assertRaisesRegex(RuntimeError, msg): + vmap(op)(x, x) + def test_fallback_atan2(self): # NB: One day we will implement a batching rule for torch.atan2. 
# If/when we do, this test should be replaced to test the fallback From 13a5be571bf8fc521f8f5c2918f68a8f0fe2a007 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Mon, 26 Oct 2020 15:43:47 -0700 Subject: [PATCH 020/169] Enable complex backward for torch.take() and tensor.fill_() (#46860) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46860 Test Plan: Imported from OSS Reviewed By: izdeby Differential Revision: D24544601 Pulled By: anjali411 fbshipit-source-id: 4e29d48da30da3630cb558ccee464d89780b1ab7 --- test/test_autograd.py | 10 +++------- tools/autograd/gen_variable_type.py | 4 +--- torch/testing/_internal/common_methods_invocations.py | 4 ++++ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 37cccdbbfbd6..eec016cc9623 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4918,7 +4918,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, # and only run for floating point separate_complex_tests = ['view_as_real', 'real', 'imag', 'asin', 'acos', 'div', 'log', - 'log10', 'log1p', 'log2', 'pow', 'tan', 'reciprocal', 'rsqrt'] + 'log10', 'log1p', 'log2', 'pow', 'tan', 'reciprocal', 'rsqrt', '__rdiv__'] # NOTE: Some non-holomorphic are separately tested in TestAutogradComplex until gradcheck works properly # for non-holomorphic functions @@ -4929,16 +4929,12 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', - 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', - 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle'] + separate_complex_tests + 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', + 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub'] + separate_complex_tests # this list corresponds to cases that are not currently implemented skip_cuda_list = ['bmm_complex', 'matmul_4d_4d_complex'] -# TODO(@anjali411): add tests for 'sub', 'div -# TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition - @anjali411 -# complex_list += ['fill_', 't', '__rdiv__', 'tanh'] - def add_test( name, self_size, diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 0c41310b457a..570943755d2e 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -166,9 +166,7 @@ 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', 'dot', 'vdot', 'cholesky', 'triangular_solve', 'mm', '_unsafe_view', 'mv', 'ger', 'bmm', 'diagonal', 'cholesky', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', - 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh' - 'dot', 'vdot', 'cholesky', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', - 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh' + 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_' } # Some operators invalidate the grad_accumulator. Let's reset it. 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 2bd110480b88..7d2a04f10e5a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -574,6 +574,10 @@ def method_tests(): (True, [], ['aten::mul', 'aten::reciprocal'])), ('__rdiv__', uniform_scalar(1e-1, requires_grad=True), (3.14,), 'scalar_constant', (True, [], ['aten::mul', 'aten::reciprocal'])), + ('__rdiv__', torch.rand(S, S, S, dtype=torch.cdouble) + 1e-1, (3.14j,), 'complex_constant', + (True, [], ['aten::mul', 'aten::reciprocal'])), + ('__rdiv__', uniform_scalar(1e-1 * (1 + 1j), requires_grad=True), (3.14j,), 'complex_scalar_constant', + (True, [], ['aten::mul', 'aten::reciprocal'])), ('div', (S, S, S), (torch.rand(S, S, S, dtype=torch.cdouble) + 0.1,), 'complex', (True,)), ('div', (S, S, S), (torch.rand(S, S, dtype=torch.cdouble) + 0.1,), 'complex_broadcast_rhs', (True,)), ('div', (S, S), (torch.rand(S, S, S, dtype=torch.cdouble) + 0.1,), 'complex_broadcast_lhs', (True,)), From 99cf3b1ce41cc1883fac894b938444da5f979323 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 26 Oct 2020 15:51:29 -0700 Subject: [PATCH 021/169] CUDA BFloat16 signal windows (#45155) Summary: Looks like this op is never tested for the support of different dtypes? Pull Request resolved: https://github.com/pytorch/pytorch/pull/45155 Reviewed By: zou3519 Differential Revision: D24438839 Pulled By: ngimel fbshipit-source-id: 103ff609e11811a0705d04520c2b97c456b623ef --- aten/src/ATen/native/cuda/RangeFactories.cu | 102 ++++++++++---------- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 15 +-- test/test_torch.py | 15 ++- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 38f3f8487fc4..4286f05111b6 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -207,62 +207,60 @@ Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "arange_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "arange_cuda", [&] { - using accscalar_t = at::acc_type; - auto xstart = start.to(); - auto xend = end.to(); - auto xstep = step.to(); - - // we use double precision for (start - end) / step - // to compute size_d for consistency across devices. - // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t, - // but double on cpu for the same, - // and the effective output size starts differing on CPU vs GPU because of precision issues, which - // we dont want. 
- // the corner-case we do want to take into account is int64_t, which has higher precision than double - double size_d; - if (std::is_same::value) { - size_d = std::ceil(static_cast(end.to() - start.to()) - / step.to()); - } else { - size_d = std::ceil(static_cast(end.to() - start.to()) - / step.to()); - } - - TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - TORCH_CHECK(std::isfinite(static_cast(xstart)) && - std::isfinite(static_cast(xend)), - "unsupported range: ", xstart, " -> ", xend); - TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), - "upper bound and larger bound inconsistent with step sign"); - - TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), - "invalid size, possible overflow?"); - int64_t size = static_cast(size_d); - int64_t numel = result.numel(); - - if (numel != size) { - if(numel > 0){ - TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(), - " is ", numel, " which does not match the computed number of elements ", size, - ". Note that this may occur as a result of rounding error. " - "The out tensor will be resized to a tensor of shape (", size, ",)."); - } - result.resize_({size}); - } - bool is_contiguous = result.is_contiguous(); - Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); - gpu_kernel_with_index(r, [xstart, xstep]GPU_LAMBDA(int64_t ind) -> scalar_t { - accscalar_t inc = xstep * static_cast(ind); - accscalar_t val = xstart + inc; - return static_cast(val); - }); + // we use double precision for (start - end) / step + // to compute size_d for consistency across devices. + // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t, + // but double on cpu for the same, + // and the effective output size starts differing on CPU vs GPU because of precision issues, which + // we dont want. + // the corner-case we do want to take into account is int64_t, which has higher precision than double + double size_d; + if (std::is_same::value) { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } else { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } - if(!is_contiguous) { - result.copy_(r); + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && + std::isfinite(static_cast(xend)), + "unsupported range: ", xstart, " -> ", xend); + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + "invalid size, possible overflow?"); + int64_t size = static_cast(size_d); + int64_t numel = result.numel(); + + if (numel != size) { + if(numel > 0){ + TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(), + " is ", numel, " which does not match the computed number of elements ", size, + ". Note that this may occur as a result of rounding error. " + "The out tensor will be resized to a tensor of shape (", size, ",)."); } + result.resize_({size}); + } + bool is_contiguous = result.is_contiguous(); + Tensor r = !is_contiguous ? 
at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; + + gpu_kernel_with_index(r, [xstart, xstep]GPU_LAMBDA(int64_t ind) -> scalar_t { + accscalar_t inc = xstep * static_cast(ind); + accscalar_t val = xstart + inc; + return static_cast(val); }); + + if(!is_contiguous) { + result.copy_(r); + } }); AT_CUDA_CHECK(cudaGetLastError()); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index c3c8dd1e5094..4b1f0c1a6aa3 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -223,13 +223,16 @@ void nan_to_num_kernel_cuda( }); } -void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, double beta){ +void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, double beta_){ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "kaiser_window_cuda", [&] { - const scalar_t alpha = static_cast((window_length - 1) / 2.0); - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { - return calc_i0(static_cast(beta) * ::sqrt(1 - ::pow((a - alpha) / alpha, static_cast(2.0)))) / calc_i0(static_cast(beta)); - }); + using T_ACC = acc_type; + const T_ACC inv_alpha = static_cast(2.0 / (window_length - 1)); + const T_ACC beta = static_cast(beta_); + const T_ACC inv_i0_beta = 1.0 / calc_i0(beta); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { + T_ACC x = static_cast(a) * inv_alpha - 1; + T_ACC y = std::max(0, 1 - x * x); + return calc_i0(beta * ::sqrt(y)) * inv_i0_beta; }); }); } diff --git a/test/test_torch.py b/test/test_torch.py index 96ac4872289f..d44f74440d1a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8096,21 +8096,26 @@ def test_complex_rot90(self, device, dtype): self.compare_with_numpy(torch_fn, np_fn, data) @onlyOnCPUAndCUDA + @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") - def test_signal_window_functions(self, device): + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) + @dtypesIfCPU(torch.float, torch.double, torch.long) + def test_signal_window_functions(self, device, dtype): def test(name, kwargs): torch_method = getattr(torch, name + '_window') + if not dtype.is_floating_point: + with self.assertRaisesRegex(RuntimeError, r'floating point'): + torch_method(3, dtype=dtype) + return for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: for periodic in [True, False]: - res = torch_method(size, periodic=periodic, **kwargs, device=device) - # NB: scipy always returns a float32 result + res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) + # NB: scipy always returns a float64 result ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) self.assertEqual(res, ref, exact_dtype=False) with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): torch_method(3, layout=torch.sparse_coo) - with self.assertRaisesRegex(RuntimeError, r'floating point'): - torch_method(3, dtype=torch.long) self.assertTrue(torch_method(3, requires_grad=True).requires_grad) self.assertFalse(torch_method(3).requires_grad) From 7731370e718ad16994d8da25fbdc9de64a22ec44 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 26 Oct 2020 15:59:12 -0700 Subject: [PATCH 022/169] CUDA BFloat16 gelu, hardswish, hardsigmoid (#44997) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/44997 Reviewed By: izdeby Differential Revision: D24547748 Pulled By: ngimel fbshipit-source-id: 34639dfe6ca41c3f59fd2af861e5e3b1bb86757a --- aten/src/ATen/native/cuda/Activation.cu | 120 ++++++++++++------------ test/test_nn.py | 34 ++++--- test/test_torch.py | 18 ++-- 3 files changed, 86 insertions(+), 86 deletions(-) diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 145fc990daeb..9926d8b05d95 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -341,12 +341,10 @@ namespace { void GeluCUDAKernelImpl(TensorIterator& it) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "GeluCUDAKernelImpl", [&] { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { - return static_cast(x) * - c10::cuda::compat::normcdf(static_cast(x)); - }); + using T_ACC = acc_type; + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + return static_cast(x) * + c10::cuda::compat::normcdf(static_cast(x)); }); }); } @@ -354,17 +352,15 @@ void GeluCUDAKernelImpl(TensorIterator& it) { void GeluBackwardCUDAKernelImpl(TensorIterator& it) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "GeluBackwardCUDAKernelImpl", [&] { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - constexpr T_ACC kBeta = M_2_SQRTPI * M_SQRT1_2 * T_ACC(0.5); - const T_ACC cdf = c10::cuda::compat::normcdf(static_cast(x)); - const T_ACC pdf = - c10::cuda::compat::exp( - T_ACC(-0.5) * static_cast(x) * static_cast(x)) * - kBeta; - return static_cast(dy) * (cdf + static_cast(x) * pdf); - }); + using T_ACC = acc_type; + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + constexpr T_ACC kBeta = M_2_SQRTPI * M_SQRT1_2 * T_ACC(0.5); + const T_ACC cdf = c10::cuda::compat::normcdf(static_cast(x)); + const T_ACC pdf = + c10::cuda::compat::exp( + T_ACC(-0.5) * static_cast(x) * static_cast(x)) * + kBeta; + return static_cast(dy) * (cdf + static_cast(x) * pdf); }); }); } @@ -389,68 +385,70 @@ void leaky_relu_backward_kernel(TensorIterator& iter, Scalar negval_) { void hardswish_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardswish_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "hardswish_cuda", [&] { - const scalar_t zero(0.0f); - const scalar_t one_sixth(1.0f / 6.0f); - const scalar_t three(3.0f); - const scalar_t six(6.0f); - gpu_kernel(iter, [zero, one_sixth, three, six]GPU_LAMBDA(scalar_t self_val) -> scalar_t { - return self_val * std::min(std::max(self_val + three, zero), six) * one_sixth; - }); + using T_ACC = acc_type; + const T_ACC zero(0.0f); + const T_ACC one_sixth(1.0f / 6.0f); + const T_ACC three(3.0f); + const T_ACC six(6.0f); + gpu_kernel(iter, [zero, one_sixth, three, six]GPU_LAMBDA(scalar_t self_val) -> scalar_t { + T_ACC x = static_cast(self_val); + return x * std::min(std::max(x + three, zero), six) * one_sixth; }); }); } void hardswish_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardswish_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "hardswish_backward_cuda", [&] { - const 
scalar_t zero(0.0f); - const scalar_t three(3.0f); - const scalar_t neg_three(-3.0f); - const scalar_t one_half(0.5f); - gpu_kernel( - iter, - [zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val, scalar_t self_val) -> scalar_t { - if (self_val < neg_three) { - return zero; - } else if (self_val <= three) { - return grad_val * ((self_val / three) + one_half); - } else { - return grad_val; - } - }); + using T_ACC = acc_type; + const T_ACC zero(0.0f); + const T_ACC three(3.0f); + const T_ACC neg_three(-3.0f); + const T_ACC one_half(0.5f); + gpu_kernel( + iter, + [zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t { + T_ACC grad_val = static_cast(grad_val_); + T_ACC self_val = static_cast(self_val_); + if (self_val < neg_three) { + return zero; + } else if (self_val <= three) { + return grad_val * ((self_val / three) + one_half); + } else { + return grad_val; + } }); }); } void hardsigmoid_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardsigmoid_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "hardsigmoid_cuda", [&] { - const scalar_t zero(0.0f); - const scalar_t one_sixth(1.0f / 6.0f); - const scalar_t three(3.0f); - const scalar_t six(6.0f); - gpu_kernel(iter, [zero, one_sixth, three, six]GPU_LAMBDA(scalar_t self_val) -> scalar_t { - return std::min(std::max(self_val + three, zero), six) * one_sixth; - }); + using T_ACC = acc_type; + const T_ACC zero(0.0f); + const T_ACC one_sixth(1.0f / 6.0f); + const T_ACC three(3.0f); + const T_ACC six(6.0f); + gpu_kernel(iter, [zero, one_sixth, three, six]GPU_LAMBDA(scalar_t self_val) -> scalar_t { + T_ACC x = static_cast(self_val); + return std::min(std::max(x + three, zero), six) * one_sixth; }); }); } void hardsigmoid_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardsigmoid_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "hardsigmoid_backward_cuda", [&] { - const scalar_t zero(0.0f); - const scalar_t three(3.0f); - const scalar_t neg_three(-3.0f); - const scalar_t one_sixth(1.0f / 6.0f); - gpu_kernel( - iter, - [zero, three, neg_three, one_sixth]GPU_LAMBDA(scalar_t grad_val, scalar_t self_val) -> scalar_t { - return (self_val >= neg_three && self_val <= three) - ? grad_val * one_sixth - : zero; - }); + using T_ACC = acc_type; + const T_ACC zero(0.0f); + const T_ACC three(3.0f); + const T_ACC neg_three(-3.0f); + const T_ACC one_sixth(1.0f / 6.0f); + gpu_kernel( + iter, + [zero, three, neg_three, one_sixth]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t { + T_ACC grad_val = static_cast(grad_val_); + T_ACC self_val = static_cast(self_val_); + return (self_val >= neg_three && self_val <= three) + ? 
grad_val * one_sixth + : zero; }); }); } diff --git a/test/test_nn.py b/test/test_nn.py index 784786196914..25960d817ab8 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -6503,27 +6503,31 @@ def test_PReLU_backward_requires_grad_false(self): @unittest.skipIf( not TEST_NUMPY or not TEST_SCIPY, "Numpy or Scipy not found") def test_gelu(self): - def _test_gelu(n, m, dtype, contiguous): + def _test_gelu(n, m, dtype, contiguous, atol=None, rtol=None): + numpy_dtype = { + torch.bfloat16: torch.float, torch.float: torch.float, torch.double: torch.double + }[dtype] + devices = ['cpu'] if dtype != torch.bfloat16 else [] + \ + ['cuda'] if TEST_CUDA else [] + def _gelu_ref(X): return X * stats.norm.cdf(X) - if contiguous: - X = torch.rand(n, m, dtype=dtype, requires_grad=True) - else: - X = torch.rand(n, m, dtype=dtype, requires_grad=True)[:, ::2] - res = F.gelu(X) - ref = _gelu_ref(X.detach().numpy()) - self.assertEqual(res, ref) - gradcheck(F.gelu, [X], eps=1e-4) - - if TEST_CUDA: - X_cuda = X.cuda() - res_cuda = F.gelu(X_cuda) - self.assertEqual(res_cuda.cpu(), ref) - gradcheck(F.gelu, [X_cuda], eps=1e-4) + for d in devices: + if contiguous: + X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d) + else: + X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d)[:, ::2] + res = F.gelu(X) + ref = _gelu_ref(X.to(numpy_dtype).cpu().detach().numpy()) + self.assertEqual(res, ref, rtol=rtol, atol=atol) + if dtype != torch.bfloat16: + gradcheck(F.gelu, [X], eps=1e-4) for n in range(1, 10): for m in range(1, 10): + _test_gelu(n, m, torch.bfloat16, True, 1e-2, 0) + _test_gelu(n, m, torch.bfloat16, False, 1e-2, 0) _test_gelu(n, m, torch.float32, True) _test_gelu(n, m, torch.float32, False) _test_gelu(n, m, torch.float64, True) diff --git a/test/test_torch.py b/test/test_torch.py index d44f74440d1a..4a5c1f3f168b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -17018,6 +17018,8 @@ def test_exp_slow(self, device, dtype): b = torch.exp(torch.ones(1, dtype=dtype, device=device)) self.assertEqual(a, b.expand(2 ** 31)) + @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) @dtypes(torch.float, torch.double) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_hardswish(self, device, dtype): @@ -17025,7 +17027,6 @@ def test_hardswish(self, device, dtype): expectedOutput = np.multiply( inputValues, np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0) - precision_4dps = 0.0002 inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) expectedOutputTensor = \ @@ -17033,14 +17034,12 @@ def test_hardswish(self, device, dtype): # normal self.assertEqual(torch.nn.functional.hardswish(inputTensor), - expectedOutputTensor, - atol=precision_4dps, rtol=0) + expectedOutputTensor) # inplace inputTensorCpy = inputTensor.clone().detach() torch.nn.functional.hardswish(inputTensorCpy, inplace=True) - self.assertEqual(inputTensorCpy, expectedOutputTensor, - atol=precision_4dps, rtol=0) + self.assertEqual(inputTensorCpy, expectedOutputTensor) @onlyCPU @dtypes(torch.float, torch.double) @@ -17054,6 +17053,8 @@ def test_sigmoid(self, device, dtype): torch.tensor(expectedOutput, dtype=dtype, device=device), atol=precision_4dps, rtol=0) + @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) @dtypes(torch.float, torch.double) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def 
test_hardsigmoid(self, device, dtype): @@ -17061,18 +17062,15 @@ def test_hardsigmoid(self, device, dtype): expectedOutput = np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0 inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) - precision_4dps = 0.0002 # normal self.assertEqual(torch.nn.functional.hardsigmoid(inputTensor), - torch.tensor(expectedOutput, dtype=dtype, device=device), - atol=precision_4dps, rtol=0) + torch.tensor(expectedOutput, dtype=dtype, device=device)) # inplace inputTensorCpy = inputTensor.clone().detach() self.assertEqual(torch.nn.functional.hardsigmoid(inputTensorCpy, inplace=True), - torch.tensor(expectedOutput, dtype=dtype, device=device), - atol=precision_4dps, rtol=0) + torch.tensor(expectedOutput, dtype=dtype, device=device)) @skipIfNoSciPy @dtypes(torch.float, torch.double) From adafd3d4b2d138ec020171d077c9bb01157ca9b1 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Mon, 26 Oct 2020 17:27:12 -0700 Subject: [PATCH 023/169] Support RRef.backward() for local RRefs. (#46568) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46568 This PR adds support for an RRef.backward() API. This would be useful in applications like pipeline parallelism as described here: https://github.com/pytorch/pytorch/issues/44827 This PR only adds support for local RRefs, remote RRef support will be added in a follow up PR. ghstack-source-id: 115100729 Test Plan: 1) unit tests. 2) waitforbuildbot Reviewed By: mrshenli Differential Revision: D24406311 fbshipit-source-id: fb0b4e185d9721bf57f4dea9847e0aaa66b3e513 --- torch/csrc/distributed/rpc/init.cpp | 37 +++++++++++++++++++ torch/csrc/distributed/rpc/py_rref.cpp | 21 +++++++++++ torch/csrc/distributed/rpc/py_rref.h | 3 ++ .../_internal/distributed/rpc/rpc_test.py | 35 ++++++++++++++++++ 4 files changed, 96 insertions(+) diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index d278ac59ca75..2f608bd6fd33 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -393,6 +393,43 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { Set future that is completed when the profiling event corresponding to the creation of this RRef on the remote node has been recorded. )") + .def( + "backward", + [](PyRRef& self, + int64_t dist_autograd_ctx_id, + bool retain_graph) { + self.backward(dist_autograd_ctx_id, retain_graph); + }, + py::arg("dist_autograd_ctx_id") = -1, + py::arg("retain_graph") = false, + py::call_guard(), + R"( + Runs the backward pass using the RRef as the root of the + backward pass. If ``dist_autograd_ctx_id`` is provided, + we perform a distributed backward pass using the provided + ctx_id starting from the owner of the RRef. In this case, + :meth:`~torch.distributed.autograd.get_gradients` should be + used to retrieve the gradients. If ``dist_autograd_ctx_id`` + is ``None``, it is assumed that this is a local autograd graph + and we only perform a local backward pass. The value of the + RRef is expected to be a scalar Tensor. + + Arguments: + dist_autograd_ctx_id (int, optional): The distributed + autograd context id for which we should retrieve the + gradients (default: -1). + retain_graph(bool, optional): If ``False``, the graph used to + compute the grad will be freed. Note that in nearly all + cases setting this option to ``True`` is not needed and + often can be worked around in a much more efficient way. + Usually, you need to set this to ``True`` to run backward + multiple times (default: False). 
+ + Example:: + >>> import torch.distributed.autograd as dist_autograd + >>> with dist_autograd.context() as context_id: + >>> rref.backward(context_id) + )") // not releasing GIL to avoid context switch .def("__repr__", &PyRRef::str); diff --git a/torch/csrc/distributed/rpc/py_rref.cpp b/torch/csrc/distributed/rpc/py_rref.cpp index 823e21d20b4b..6243d0b78e83 100644 --- a/torch/csrc/distributed/rpc/py_rref.cpp +++ b/torch/csrc/distributed/rpc/py_rref.cpp @@ -1,5 +1,7 @@ #include +#include +#include #include #include #include @@ -283,6 +285,25 @@ c10::IValue PyRRef::toIValue() const { return IValue(rrefPtr); } +void PyRRef::backward(int64_t dist_autograd_ctx_id, bool retain_graph) { + if (rref_->isOwner()) { + const auto& value = + c10::static_intrusive_pointer_cast(rref_)->getValue(); + TORCH_CHECK( + value.isTensor(), "RRef should contain a tensor for .backward()"); + auto root = value.toTensor(); + + if (dist_autograd_ctx_id == -1) { + torch::autograd::backward({root}); + } else { + torch::distributed::autograd::backward( + dist_autograd_ctx_id, {value.toTensor()}, retain_graph); + } + } else { + // TODO + } +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/py_rref.h b/torch/csrc/distributed/rpc/py_rref.h index 3cc7ab73f019..6af75cd45c3e 100644 --- a/torch/csrc/distributed/rpc/py_rref.h +++ b/torch/csrc/distributed/rpc/py_rref.h @@ -51,6 +51,9 @@ class PYBIND11_EXPORT PyRRef { // get the type of the data object referenced by this RRef. py::object getRRefType(); + // Run the backward pass with the RRef as the root. + void backward(int64_t dist_autograd_ctx_id, bool retain_graph); + private: c10::intrusive_ptr rref_; c10::optional> profilingFuture_; diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 82be0fd97731..8dc25a6a56da 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -3700,6 +3700,41 @@ def test_cannot_infer_backend_from_options(self): rpc_backend_options=rpc_backend_options, ) + @dist_init + def test_local_rref_backward(self): + dst = worker_name((self.rank + 1) % self.world_size) + t1 = torch.rand(10, 10, requires_grad=True) + rref = rpc.RRef(t1.sum() + t1.sum()) + rref.backward() + expected_grad = torch.ones_like(t1) * 2 + self.assertEqual(expected_grad, t1.grad) + + with dist_autograd.context() as context_id: + t2 = rpc.rpc_sync(dst, torch.add, args=(t1, t1)) + rref = rpc.RRef(t2.sum()) + rref.backward(context_id) + self.assertEqual(expected_grad, dist_autograd.get_gradients(context_id)[t1]) + + # Double backward. + with dist_autograd.context() as context_id: + t2 = rpc.rpc_sync(dst, torch.add, args=(t1, t1)) + rref = rpc.RRef(t2.sum()) + rref.backward(context_id, retain_graph=True) + rref.backward(context_id) + self.assertEqual(expected_grad * 2, dist_autograd.get_gradients(context_id)[t1]) + + # Test errors. 
+ with self.assertRaisesRegex(RuntimeError, "tensors does not require grad and does not have a grad_fn"): + rpc.RRef(torch.rand(10)).backward() + + with self.assertRaisesRegex(RuntimeError, "grad can be implicitly created only for scalar outputs"): + rpc.RRef(torch.rand(10, requires_grad=True)).backward() + + with self.assertRaisesRegex(RuntimeError, "Could not find autograd context with id: 100"): + rpc.RRef(torch.rand(10, requires_grad=True).sum()).backward(100) + + with self.assertRaisesRegex(RuntimeError, "RRef should contain a tensor for .backward()"): + rpc.RRef("foo").backward() class ProcessGroupAgentRpcTest(RpcAgentTestFixture): From a6cd294c9bb2c3ab6f82af0265102ae557d25390 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 26 Oct 2020 18:05:39 -0700 Subject: [PATCH 024/169] [Gradient Compression] Refactor CommHookInterface and PythonCommHook. (#46512) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46512 1. Merge 1-line PythonCommHook constructor into the header for simplicity. 2. Move the implementation of PythonCommHook destructor from the header file to cpp file. 3. Rename processFuture method as parseHookResult for readability. 4. Simplify some comments. Original PR issue: C++ DDP Communication Hook https://github.com/pytorch/pytorch/issues/46348 ghstack-source-id: 115161086 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_hook_nccl buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_sparse_gradients buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_with_then_hook_nccl buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_future_passing_gpu_gloo Reviewed By: jiayisuse Differential Revision: D24374282 fbshipit-source-id: c8dbdd764bca5b3fa247708f1218cb5ff3e321bb --- torch/csrc/distributed/c10d/comm.cpp | 28 +++++++++---- torch/csrc/distributed/c10d/comm.h | 55 +++++++++++-------------- torch/csrc/distributed/c10d/reducer.cpp | 2 +- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp index 2eb79283a088..6d1181514aeb 100644 --- a/torch/csrc/distributed/c10d/comm.cpp +++ b/torch/csrc/distributed/c10d/comm.cpp @@ -85,8 +85,16 @@ void broadcast_coalesced( } } -PythonCommHook::PythonCommHook(py::object state, py::object hook) - : state_(std::move(state)), hook_(std::move(hook)){}; +PythonCommHook::~PythonCommHook() { + py::gil_scoped_acquire ag; + state_.dec_ref(); + hook_.dec_ref(); + // Explicitly set state_ and hook_ to nullptr to prevent py::object's dtor + // to decref on the PyObject again. + // See Note [Destructing py::object] in python_ivalue.h + state_.ptr() = nullptr; + hook_.ptr() = nullptr; +} c10::intrusive_ptr PythonCommHook::runHook( const GradBucket& bucket) { @@ -109,20 +117,22 @@ c10::intrusive_ptr PythonCommHook::runHook( } } -std::vector PythonCommHook::processFuture( - c10::IValue future_value) { - // Since we have a Python hook, future_value can be a PyObject. - if (future_value.isPyObject()) { - // We first convert it to an IValue that contains a TensorVector. 
+std::vector PythonCommHook::parseHookResult( + const c10::IValue& result) { + TORCH_INTERNAL_ASSERT( + result.isPyObject() || result.isTensorList(), + "expected the hook result is either a PyObject or TensorList"); + + if (result.isPyObject()) { py::gil_scoped_acquire ag; - py::object obj = torch::jit::toPyObject(future_value); + py::object obj = torch::jit::toPyObject(result); auto value = torch::jit::toIValue( obj, c10::ListType::create(c10::TensorType::get())); return value.toTensorVector(); } - return future_value.toTensorVector(); + return result.toTensorVector(); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 2eb626c40232..bb5391a68aaa 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -35,57 +35,48 @@ class GradBucket { std::vector tensors_; }; -// DDP's c10d reducer allows communication hooks defined as a sub class -// of CommHookInterface. CommHookInterface is an abstract class and can -// be used to implement both Python and CPP hooks. -struct TORCH_PYTHON_API CommHookInterface { +// Base class of both `PythonCommHook` and `CppCommHook`. +// Requires implementing 1) `runHook` method that communicates gradients +// asynchronously, and 2) `parseHookResult` method that converts the hook result +// into a tensor vector. +class TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} - // runHook takes a GradBucket type bucket and passes the tensors of - // this grad bucket to hook's callback. This function is called once - // the bucket is ready. The hook can perform whatever processing is - // needed and return a Future that will hold the new value of the grad - // bucket's tensors once ready. + // Passes the input grad bucket to the registered communication hook. + // Once the tensors in the bucket are ready, kicks off the hook asynchronously + // and returns a future that holds the communication results. virtual c10::intrusive_ptr runHook( const GradBucket& bucket) = 0; - // Once the grad bucket of Future is ready, c10d reducer will call this - // function to get the resulting tensors of the grad bucket. Then c10d - // reducer will use these tensors and copy grads to the grads of individual + // Returns the resulting tensors once the communication hook result is ready. + // The resulting tensors will then be copied to the grads of individual // parameters. - virtual std::vector processFuture(c10::IValue future_value) = 0; + virtual std::vector parseHookResult( + const c10::IValue& result) = 0; }; -// PythonCommHook enables registering a python hook to c10d reducer and is a -// sub class of CommHookInterface. class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: - // The constructor takes a state and a callable hook. Inputs are Python - // objects. The state is passed to the hook in runHook function can be used to - // maintain and update any state information that users would like to maintain - // as part of the training process. The hook can perform whatever processing - // user specified and return a Future indicating completion of any async work. - PythonCommHook(py::object state, py::object hook); + // Takes a state and a callable hook. The inputs are Python objects. + // The state is passed to the hook in runHook method, and it can be used to + // maintain and update any state information during the execution of the hook. 
+ // The hook performs user-specified processing and returns a future indicating + // asychronous communication of gradients. + PythonCommHook(py::object state, py::object hook) + : state_(std::move(state)), hook_(std::move(hook)) {} - ~PythonCommHook() override { - py::gil_scoped_acquire ag; - state_.dec_ref(); - hook_.dec_ref(); - // explicitly setting PyObject* state_ and hook_ to nullptr to prevent - // py::object's dtor to decref on the PyObject again. - // See Note [Destructing py::object] in python_ivalue.h - state_.ptr() = nullptr; - hook_.ptr() = nullptr; - } + ~PythonCommHook() override; c10::intrusive_ptr runHook( const GradBucket& bucket) override; - std::vector processFuture(c10::IValue future_value) override; + std::vector parseHookResult(const c10::IValue& result) override; private: + // Only needed for stateful communication. py::object state_; + // Indicates an asynchrounous communication of gradients. py::object hook_; }; diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index f146a2a9be45..ea1eef082a52 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -1165,7 +1165,7 @@ void Reducer::finalize_backward() { bucket.future_work->wait(); auto future_result = - comm_hook_->processFuture(bucket.future_work->value()); + comm_hook_->parseHookResult(bucket.future_work->value()); for (size_t i = 0; i < future_result.size(); i++) { auto& replica = bucket.replicas[i]; From a4adc1b6d719f728b25f64bc461a21e22cae76d5 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 26 Oct 2020 18:10:08 -0700 Subject: [PATCH 025/169] Fix unused variable warning (#46838) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46838 `SCALAR_TYPE` may be unused in some contexts where the macro is used. We use the standard `(void)var` trick to suppress the compiler warning in these instances. Test Plan: Standard pre-commit tests. Reviewed By: jerryzh168 Differential Revision: D24481142 fbshipit-source-id: 4fcde669cc279b8863443d49c51edaee69f4d7bd --- aten/src/ATen/Dispatch.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index e0fc25c394d3..e0d8a9a525b3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -31,6 +31,8 @@ const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \ const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \ toUnderlying(enum_type); \ + (void)SCALAR_TYPE; /* Suppress unused-var compiler warning */ \ + /* TODO: Use [[maybe-unused]] when C++17 becomes the standard */ \ return __VA_ARGS__(); \ } From a602811da748783f310c85abb898bf356a5edf66 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 26 Oct 2020 18:13:55 -0700 Subject: [PATCH 026/169] [ROCm] fix bug in miopen findAlgorithm. (#46852) Summary: findAlgorithm should return if and only if a suitable algorithm is found. The default algorithm is not guaranteed to have been cached. 
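In rough terms, the intended control flow after this change is sketched below (the names come from the diff further down; the cache-hit branch and the search itself are elided, so treat this as an illustration rather than the verbatim code):

```
// Deterministic mode may pre-select the default algorithm, but it must not
// short-circuit the function: the algorithm still has to be looked up (and,
// if missing, found and cached) before it is used.
if (args.params.deterministic && !benchmark) {
  *algo = search::DEFAULT_ALGO;  // no early return here anymore
}
if (cache.find(args.params, algo)) {
  // a previously found (and therefore cached) algorithm exists; use it
}
// ... otherwise fall through to the actual search, which presumably caches
// its result before returning
```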
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46852 Reviewed By: izdeby Differential Revision: D24546748 Pulled By: bhosmer fbshipit-source-id: 171137b377193e0825769b61d42a05016f02c34c --- aten/src/ATen/native/miopen/Conv_miopen.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 3f6e78e77c9f..27e119d377bc 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -468,7 +468,6 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { if (args.params.deterministic && !benchmark) { *algo = search::DEFAULT_ALGO; - return; } if (cache.find(args.params, algo)) { From 2397c8d1f78cb5b48bcec9c180c1eef81c13b887 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 26 Oct 2020 18:14:49 -0700 Subject: [PATCH 027/169] [pytorch] Improve/fix heuristics for using mkldnn vs native conv (#46675) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46675 We've found a few heuristics for using/not using mkldnn that seem to generally improve performance on 2d and 3d conv. - 1x1 convolutions are basically batch matmuls, and mkldnn's implementation appears to usually be slower than using the native conv (which lowers to aten::mm, which in turn calls mkl gemm). - 3d conv was often not using mkldnn even when it's beneficial, because the heuristic was checking the kernel depth rather than height/width. mkldnn seems to be faster for (1, 7, 7) and (3, 7, 7) kernel sizes, which are allowed by the new heuristic. Test Plan: Bento notebooks showing before/after: before: https://www.internalfb.com/intern/anp/view/?id=38089 after: https://www.internalfb.com/intern/anp/view/?id=380893 Also, I've run a conv fuzzer, and it generally supports these heuristics. I'm not sure how to best share the data since there's a lot of it (I tried about 50k parameter combinations). For the 1x1 case, about 70% were faster with "native". I played with constructing a decision tree (using scikit-learn) and found that switching back to MKL for batch size > 16 might be slightly better still, but I'm not sure it's worth complicating the heuristic. 
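For reference, the updated check boils down to roughly the following paraphrase of the new `use_mkldnn` condition for CPU float, non-transposed convolutions (the Convolution.cpp diff below is the authoritative version):

```
// Skip mkldnn for plain (non-strided, non-dilated) 1x1 kernels with a small
// batch, since those lower to a single GEMM that MKL already handles well,
// and otherwise use it only when the problem is large enough to pay off.
bool use_mkldnn =
    (is_strided() || is_dilated() || input.size(0) >= 16 ||
     weight.size(-1) != 1 || weight.size(-2) != 1) &&
    (groups > 1 ||
     (weight.size(-1) > 3 && weight.size(-2) > 3) ||
     input.size(0) > 1 ||
     input.size(0) * input.size(1) * input.size(2) * input.size(3) > 20480);
```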
Results for some popular shapes in tabular format: ``` [------------------------- conv2d_1x1 ------------------------] | base | diff 1 threads: ---------------------------------------------------- [1, 128, 56, 56] [256, 128, 1, 1] | 3665.3 | 2838.4 [1, 512, 14, 14] [1024, 512, 1, 1] | 3174.7 | 3164.0 [1, 64, 56, 56] [256, 64, 1, 1] | 2249.1 | 1468.8 [1, 1024, 14, 14] [512, 1024, 1, 1] | 3158.2 | 3147.7 [1, 1024, 7, 7] [2048, 1024, 1, 1] | 8191.8 | 3973.9 [1, 2048, 7, 7] [1024, 2048, 1, 1] | 7901.2 | 3861.6 [1, 256, 28, 28] [512, 256, 1, 1] | 3103.9 | 2775.9 2 threads: ---------------------------------------------------- [1, 128, 56, 56] [256, 128, 1, 1] | 1973.7 | 1475.8 [1, 512, 14, 14] [1024, 512, 1, 1] | 2265.0 | 1603.0 [1, 64, 56, 56] [256, 64, 1, 1] | 1445.4 | 789.8 [1, 1024, 14, 14] [512, 1024, 1, 1] | 2298.8 | 1620.0 [1, 1024, 7, 7] [2048, 1024, 1, 1] | 6350.7 | 1995.0 [1, 2048, 7, 7] [1024, 2048, 1, 1] | 6471.2 | 1903.7 [1, 256, 28, 28] [512, 256, 1, 1] | 1932.3 | 1524.2 4 threads: ---------------------------------------------------- [1, 128, 56, 56] [256, 128, 1, 1] | 1198.8 | 785.6 [1, 512, 14, 14] [1024, 512, 1, 1] | 1305.0 | 901.6 [1, 64, 56, 56] [256, 64, 1, 1] | 791.0 | 472.9 [1, 1024, 14, 14] [512, 1024, 1, 1] | 1311.2 | 908.5 [1, 1024, 7, 7] [2048, 1024, 1, 1] | 3958.6 | 997.7 [1, 2048, 7, 7] [1024, 2048, 1, 1] | 4099.6 | 1023.1 [1, 256, 28, 28] [512, 256, 1, 1] | 1120.3 | 740.8 Times are in microseconds (us). [--------------------- conv2d_7x7 ---------------------] | base | diff 1 threads: --------------------------------------------- [25, 3, 48, 320] [64, 3, 7, 7] | 209.3 | 229.3 [1, 3, 384, 288] [64, 3, 7, 7] | 68.9 | 72.3 2 threads: --------------------------------------------- [25, 3, 48, 320] [64, 3, 7, 7] | 116.0 | 117.6 [1, 3, 384, 288] [64, 3, 7, 7] | 40.4 | 38.7 4 threads: --------------------------------------------- [25, 3, 48, 320] [64, 3, 7, 7] | 64.2 | 66.5 [1, 3, 384, 288] [64, 3, 7, 7] | 21.4 | 21.9 Times are in milliseconds (ms). [---------------------------- conv3d ---------------------------] | base | diff 1 threads: ------------------------------------------------------ [1, 3, 16, 224, 224] [32, 3, 1, 7, 7] | 602.8 | 296.2 [1, 3, 4, 112, 112] [64, 3, 3, 7, 7] | 52.5 | 26.5 [1, 256, 8, 14, 14] [256, 256, 3, 3, 3] | 50.0 | 50.3 2 threads: ------------------------------------------------------ [1, 3, 16, 224, 224] [32, 3, 1, 7, 7] | 351.0 | 168.1 [1, 3, 4, 112, 112] [64, 3, 3, 7, 7] | 38.5 | 14.9 [1, 256, 8, 14, 14] [256, 256, 3, 3, 3] | 24.8 | 26.2 4 threads: ------------------------------------------------------ [1, 3, 16, 224, 224] [32, 3, 1, 7, 7] | 212.6 | 96.0 [1, 3, 4, 112, 112] [64, 3, 3, 7, 7] | 21.5 | 7.6 [1, 256, 8, 14, 14] [256, 256, 3, 3, 3] | 12.7 | 13.3 Times are in milliseconds (ms). 
``` Reviewed By: jansel Differential Revision: D24452071 fbshipit-source-id: 12687971be531831530dc29bf2fc079a917d0c8d --- aten/src/ATen/native/Convolution.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index f48ad9a4a6cb..6dbf1e5535ed 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -237,7 +237,11 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c (input.options().backend() == at::Backend::CPU && input.scalar_type() == kFloat && // only on CPU Float Tensors !transposed && // or transposed tensors - (groups > 1 || weight.size(2) > 3 || input.size(0) > 1 + (is_strided() || is_dilated() || input.size(0) >= 16 || + weight.size(-1) != 1 || weight.size(-2) != 1) && + (groups > 1 + || (weight.size(-1) > 3 && weight.size(-2) > 3) + || input.size(0) > 1 || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480)); // for some case, native is faster #endif return false; From c9bf03a6c4bbe069f2110175693742e0dc17b523 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Mon, 26 Oct 2020 18:21:18 -0700 Subject: [PATCH 028/169] Add Vulkan Tensor. (#44015) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44015 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820827 Pulled By: AshkanAliabadi fbshipit-source-id: 7691da56a5d0073d078b901d8951437757ab1085 --- aten/src/ATen/native/vulkan/api/Adapter.h | 5 + aten/src/ATen/native/vulkan/api/Command.cpp | 48 +- aten/src/ATen/native/vulkan/api/Command.h | 7 + aten/src/ATen/native/vulkan/api/Context.cpp | 13 +- aten/src/ATen/native/vulkan/api/Context.h | 8 +- aten/src/ATen/native/vulkan/api/Pipeline.cpp | 6 +- aten/src/ATen/native/vulkan/api/Pipeline.h | 19 +- aten/src/ATen/native/vulkan/api/Resource.cpp | 29 +- aten/src/ATen/native/vulkan/api/Resource.h | 120 +- aten/src/ATen/native/vulkan/api/Shader.h | 12 +- aten/src/ATen/native/vulkan/api/Utils.h | 21 + aten/src/ATen/native/vulkan/api/api.h | 1 + aten/src/ATen/native/vulkan/ops/Common.h | 9 + aten/src/ATen/native/vulkan/ops/Tensor.cpp | 1249 ++++++++++++++++++ aten/src/ATen/native/vulkan/ops/Tensor.h | 598 +++++++++ 15 files changed, 2059 insertions(+), 86 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/api/Utils.h create mode 100644 aten/src/ATen/native/vulkan/ops/Common.h create mode 100644 aten/src/ATen/native/vulkan/ops/Tensor.cpp create mode 100644 aten/src/ATen/native/vulkan/ops/Tensor.h diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h index 0efc08010ef5..4ba02a5e9926 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.h +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -4,6 +4,7 @@ #include #include +#include namespace at { namespace native { @@ -30,6 +31,10 @@ struct Adapter final { // for now. 
return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; } + + inline Shader::WorkGroup local_work_group_size() const { + return { 8u, 8u, 1u, }; + } }; } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index cdf96f88f639..3066e815ebed 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace at { namespace native { @@ -69,6 +70,10 @@ VkCommandBuffer allocate_command_buffer( } // namespace +Command::Buffer::Buffer() + : command_buffer_(VK_NULL_HANDLE) { +} + Command::Buffer::Buffer( const VkDevice device, const VkCommandPool command_pool) @@ -113,19 +118,28 @@ void Command::Buffer::barrier( "Potential reason: This command buffer is moved from."); c10::SmallVector global_memory_barriers; - c10::SmallVector image_memory_barriers; + c10::SmallVector image_memory_barriers; - for (const Resource::Buffer::Barrier& barrier : barrier.buffers) { + if (!barrier.buffers.empty()) { // Using global memory barriers instead of buffer memory barriers for // buffers. The consensus seems to be that there is no advantage in - // using the latter. + // using the latter in favor of the former. - global_memory_barriers.push_back({ - VK_STRUCTURE_TYPE_MEMORY_BARRIER, - nullptr, - barrier.memory.src, - barrier.memory.dst, - }); + VkMemoryBarrier global_memory_barrier{ + VK_STRUCTURE_TYPE_MEMORY_BARRIER, + nullptr, + 0u, + 0u, + }; + + // Coalesce all buffer memory barriers into one global memory barrier. + + for (const Resource::Buffer::Barrier& barrier : barrier.buffers) { + global_memory_barrier.srcAccessMask |= barrier.memory.src; + global_memory_barrier.dstAccessMask |= barrier.memory.dst; + } + + global_memory_barriers.push_back(global_memory_barrier); } for (const Resource::Image::Barrier& barrier : barrier.images) { @@ -248,15 +262,17 @@ void Command::Buffer::dispatch( "This command buffer is in an invalid state! 
" "Potential reason: This command buffer is moved from."); - static const auto div_round_up = [](const uint32_t n, const uint32_t d) { - return (n + d - 1u) / d; - }; - vkCmdDispatch( command_buffer_, - div_round_up(global_work_group.x, bound_.pipeline.local_work_group.x), - div_round_up(global_work_group.y, bound_.pipeline.local_work_group.y), - div_round_up(global_work_group.z, bound_.pipeline.local_work_group.z)); + div_up( + global_work_group.width, + bound_.pipeline.local_work_group.width), + div_up( + global_work_group.height, + bound_.pipeline.local_work_group.height), + div_up( + global_work_group.depth, + bound_.pipeline.local_work_group.depth)); } void Command::Buffer::submit( diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 38f251ba4d2b..8e2f235cfa27 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -20,6 +20,7 @@ struct Command final { class Buffer final { public: + Buffer(); Buffer(VkDevice device, VkCommandPool command_pool); Buffer(const Buffer&) = delete; Buffer& operator=(const Buffer&) = delete; @@ -27,6 +28,8 @@ struct Command final { Buffer& operator=(Buffer&&); ~Buffer() = default; + operator bool() const; + void begin(); void end(); void barrier(const Pipeline::Barrier& barrier); @@ -91,6 +94,10 @@ inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) { return *this; } +inline Command::Buffer::operator bool() const { + return VK_NULL_HANDLE != command_buffer_; +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index f28d8819626b..ef638f917956 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -132,11 +132,12 @@ Context* context() { Descriptor::Set dispatch_prologue( Command::Buffer& command_buffer, const Shader::Layout::Signature& shader_layout_signature, - const Shader::Descriptor& shader_descriptor, - const Shader::WorkGroup& local_work_group) { - Descriptor& descriptor = context()->descriptor(); - Pipeline& pipeline = context()->pipeline(); - Shader& shader = context()->shader(); + const Shader::Descriptor& shader_descriptor) { + Context* const context = api::context(); + const GPU gpu = context->gpu(); + Descriptor& descriptor = context->descriptor(); + Pipeline& pipeline = context->pipeline(); + Shader& shader = context->shader(); const Shader::Layout::Object shader_layout = shader.layout.cache.retrieve({ @@ -149,7 +150,7 @@ Descriptor::Set dispatch_prologue( shader_layout.handle, }), shader.cache.retrieve(shader_descriptor), - local_work_group, + gpu.adapter->local_work_group_size(), })); return descriptor.pool.allocate(shader_layout); diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index 95239a1e3042..687ddbbfe931 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -47,7 +47,6 @@ class Context final { Command::Buffer& command_buffer, const Shader::Layout::Signature& shader_layout_signature, const Shader::Descriptor& shader_descriptor, - const Shader::WorkGroup& local_work_group, const Shader::WorkGroup& global_work_group, Arguments&&... 
arguments); @@ -140,22 +139,19 @@ inline void Context::dispatch( Command::Buffer& command_buffer, const Shader::Layout::Signature& shader_layout_signature, const Shader::Descriptor& shader_descriptor, - const Shader::WorkGroup& local_work_group, const Shader::WorkGroup& global_work_group, Arguments&&... arguments) { // Forward declaration Descriptor::Set dispatch_prologue( Command::Buffer&, const Shader::Layout::Signature&, - const Shader::Descriptor&, - const Shader::WorkGroup&); + const Shader::Descriptor&); // Factor out template parameter independent code to minimize code bloat. Descriptor::Set descriptor_set = dispatch_prologue( command_buffer, shader_layout_signature, - shader_descriptor, - local_work_group); + shader_descriptor); detail::bind( descriptor_set, diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 93c954354ab2..9facc3f49e0f 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -101,11 +101,11 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( "Invalid Vulkan shader module!"); constexpr uint32_t x_offset = 0u; - constexpr uint32_t x_size = sizeof(Shader::WorkGroup::x); + constexpr uint32_t x_size = sizeof(Shader::WorkGroup::width); constexpr uint32_t y_offset = x_offset + x_size; - constexpr uint32_t y_size = sizeof(Shader::WorkGroup::y); + constexpr uint32_t y_size = sizeof(Shader::WorkGroup::height); constexpr uint32_t z_offset = y_offset + y_size; - constexpr uint32_t z_size = sizeof(Shader::WorkGroup::z); + constexpr uint32_t z_size = sizeof(Shader::WorkGroup::depth); constexpr VkSpecializationMapEntry specialization_map_entires[3]{ // X diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index 9ef2bf8c3991..50893b709473 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -43,8 +43,10 @@ struct Pipeline final { VkPipelineStageFlags dst; } stage; - c10::SmallVector buffers; - c10::SmallVector images; + c10::SmallVector buffers; + c10::SmallVector images; + + operator bool() const; }; // @@ -169,6 +171,13 @@ struct Pipeline final { // Impl // +inline Pipeline::Barrier::operator bool() const { + return (0u != stage.src) || + (0u != stage.dst) || + !buffers.empty() || + !images.empty(); +} + inline bool operator==( const Pipeline::Layout::Descriptor& _1, const Pipeline::Layout::Descriptor& _2) { @@ -193,9 +202,9 @@ inline size_t Pipeline::Factory::Hasher::operator()( return c10::get_hash( descriptor.pipeline_layout, descriptor.shader_module, - descriptor.local_work_group.x, - descriptor.local_work_group.y, - descriptor.local_work_group.z); + descriptor.local_work_group.width, + descriptor.local_work_group.height, + descriptor.local_work_group.depth); } inline Pipeline::Object::operator bool() const { diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index 436288645e95..9f29ff2cbfd5 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -46,13 +46,13 @@ VmaAllocator create_allocator( } VmaAllocationCreateInfo create_allocation_create_info( - const VmaMemoryUsage usage) { + const Resource::Memory::Descriptor& descriptor) { return VmaAllocationCreateInfo{ 0u, /* VMA_ALLOCATION_CREATE_MAPPED_BIT - MoltenVK Issue #175 */ /* VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT */ - usage, - 0u, - 0u, + descriptor.usage, + 
descriptor.required, + descriptor.preferred, 0u, VK_NULL_HANDLE, nullptr, @@ -101,7 +101,7 @@ void* map(const Resource::Memory& memory) { Resource::Memory::Scope::Scope( const VmaAllocator allocator, const VmaAllocation allocation, - const Access access) + const Access::Flags access) : allocator_(allocator), allocation_(allocation), access_(access) { @@ -121,7 +121,7 @@ void Resource::Memory::Scope::operator()(const void* const data) const { vmaUnmapMemory(allocator_, allocation_); - if (Access::Write == access_) { + if (access_ & Access::Write) { // Call will be ignored by implementation if the memory type this allocation // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior // we want. @@ -177,6 +177,23 @@ Resource::Image::Sampler::Factory::operator()( }; } +VkFence Resource::Fence::handle(const bool add_to_waitlist) const { + if (!pool) { + return VK_NULL_HANDLE; + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + id < pool->fence_.pool.size(), + "Invalid Vulkan fence!"); + + const VkFence fence = pool->fence_.pool[id].get(); + if (add_to_waitlist) { + pool->fence_.waitlist.push_back(fence); + } + + return fence; +} + void Resource::Fence::wait(const uint64_t timeout_nanoseconds) { const VkFence fence = handle(/* add_to_waitlist = */ false); diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index afe008c14b20..a9428d272782 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -12,6 +12,7 @@ namespace native { namespace vulkan { namespace api { + struct Resource final { class Pool; @@ -29,22 +30,49 @@ struct Resource final { VkAccessFlags dst; }; + /* + Descriptor + */ + + struct Descriptor final { + VmaMemoryUsage usage; + VkMemoryPropertyFlags /* optional */ required; + VkMemoryPropertyFlags /* optional */ preferred; + }; + VmaAllocator allocator; VmaAllocation allocation; + struct Access final { + typedef uint8_t Flags; + + enum Type : Flags { + Read = 1u << 0u, + Write = 1u << 1u, + }; + + template + using Pointer = std::add_pointer_t< + std::conditional_t< + 0u != (access & Write), + Type, + std::add_const_t>>; + }; + class Scope; template - using Data = Handle; + using Handle = Handle; template< typename Type, - typename Pointer = std::add_pointer_t>> - Data map() const &; + typename Pointer = Access::Pointer> + Handle map() const &; template< typename Type, - typename Pointer = std::add_pointer_t> - Data map() &; + Access::Flags kAccess, + typename Pointer = Access::Pointer> + Handle map() &; private: // Intentionally disabed to ensure memory access is always properly @@ -54,10 +82,10 @@ struct Resource final { // for seemingly ineffective memory writes and hard to hunt down bugs. 
template - Data map() const && = delete; + Handle map() const && = delete; - template - Data map() && = delete; + template + Handle map() && = delete; }; // @@ -74,7 +102,7 @@ struct Resource final { struct { VkBufferUsageFlags buffer; - VmaMemoryUsage memory; + Memory::Descriptor memory; } usage; }; @@ -171,7 +199,7 @@ struct Resource final { struct { VkImageUsageFlags image; - VmaMemoryUsage memory; + Memory::Descriptor memory; } usage; struct { @@ -241,11 +269,18 @@ struct Resource final { Pool& operator=(Pool&&); ~Pool(); + // Primary + Buffer buffer(const Buffer::Descriptor& descriptor); Image image(const Image::Descriptor& descriptor); Fence fence(); void purge(); + // Helper + + template + Buffer uniform(const Block& block); + private: friend struct Fence; @@ -284,37 +319,42 @@ struct Resource final { class Resource::Memory::Scope final { public: - enum class Access { - Read, - Write, - }; + Scope( + VmaAllocator allocator, + VmaAllocation allocation, + Access::Flags access); - Scope(VmaAllocator allocator, VmaAllocation allocation, Access access); void operator()(const void* data) const; private: VmaAllocator allocator_; VmaAllocation allocation_; - Access access_; + Access::Flags access_; }; template -inline Resource::Memory::Data Resource::Memory::map() const & { +inline Resource::Memory::Handle Resource::Memory::map() const & { void* map(const Memory& memory); - return Data{ + return Handle{ reinterpret_cast(map(*this)), - Scope(allocator, allocation, Scope::Access::Read), + Scope(allocator, allocation, Access::Read), }; } -template -inline Resource::Memory::Data Resource::Memory::map() & { +template +inline Resource::Memory::Handle Resource::Memory::map() & { void* map(const Memory& memory); - return Data{ + static_assert( + (kAccess == Access::Read) || + (kAccess == Access::Write) || + (kAccess == (Access::Read | Access::Write)), + "Invalid memory access!"); + + return Handle{ reinterpret_cast(map(*this)), - Scope(allocator, allocation, Scope::Access::Write), + Scope(allocator, allocation, kAccess), }; } @@ -356,21 +396,29 @@ inline Resource::Fence::operator bool() const { return pool; } -inline VkFence Resource::Fence::handle(const bool add_to_waitlist) const { - if (!pool) { - return VK_NULL_HANDLE; - } - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - id < pool->fence_.pool.size(), - "Invalid Vulkan fence!"); - - const VkFence fence = pool->fence_.pool[id].get(); - if (add_to_waitlist) { - pool->fence_.waitlist.push_back(fence); +template +inline Resource::Buffer Resource::Pool::uniform(const Block& block) { + Buffer uniform = this->buffer({ + sizeof(Block), + { + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + { + VMA_MEMORY_USAGE_CPU_TO_GPU, + 0u, + 0u, + }, + }, + }); + + { + Memory::Handle memory = uniform.memory.template map< + Block, + Memory::Access::Write>(); + + *memory.get() = block; } - return fence; + return uniform; } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index f3e62d9c4d0a..b2238a95de50 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -112,11 +112,7 @@ struct Shader final { // Work Group // - struct WorkGroup final { - uint32_t x; - uint32_t y; - uint32_t z; - }; + typedef VkExtent3D WorkGroup; /* Descriptor @@ -228,9 +224,9 @@ inline void Shader::Layout::Cache::purge() { inline bool operator==( const Shader::WorkGroup& _1, const Shader::WorkGroup& _2) { - return (_1.x == _2.x) && - (_1.y == _2.y) && - (_1.z == _2.z); + return (_1.width == _2.width) 
&& + (_1.height == _2.height) && + (_1.depth == _2.depth); } inline Shader::Descriptor::Descriptor(const char* const glsl) diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h new file mode 100644 index 000000000000..2882d4dc2283 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef USE_VULKAN_API + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +inline uint32_t div_up( + const uint32_t numerator, + const uint32_t denominator) { + return (numerator + denominator - 1u) / denominator; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/api/api.h b/aten/src/ATen/native/vulkan/api/api.h index b46793cbb660..c20d8b71e3c6 100644 --- a/aten/src/ATen/native/vulkan/api/api.h +++ b/aten/src/ATen/native/vulkan/api/api.h @@ -12,5 +12,6 @@ #include #include #include +#include #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h new file mode 100644 index 000000000000..121b40cbdb4b --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -0,0 +1,9 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include +#include + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp new file mode 100644 index 000000000000..a51fd972d19a --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -0,0 +1,1249 @@ +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +VkDeviceSize bytes( + const IntArrayRef sizes, + const caffe2::TypeMeta dtype) { + VkDeviceSize size = c10::elementSize(c10::typeMetaToScalarType(dtype)); + + // Forward declaration + bool requires_image(IntArrayRef); + + if (requires_image(sizes)) { + // Forward declaration + VkExtent3D image_extents(IntArrayRef); + + const VkExtent3D extents = image_extents(sizes); + size *= extents.width * extents.height * (4u * extents.depth); + } + else { + size = std::accumulate( + sizes.cbegin(), + sizes.cend(), + size, + std::multiplies()); + } + + return size; +} + +VkFormat convert(const caffe2::TypeMeta dtype) { + switch (c10::typeMetaToScalarType(dtype)) { + case kFloat: +#ifdef VULKAN_FP16_INFERENCE + return VK_FORMAT_R16G16B16A16_SFLOAT; +#else + return VK_FORMAT_R32G32B32A32_SFLOAT; +#endif /* VULKAN_FP16_INFERENCE */ + + default: + TORCH_CHECK( + false, + "Vulkan tensor format not supported!"); + } + + return VK_FORMAT_UNDEFINED; +} + +vTensor::Access::Flags convert(const VkAccessFlags vk_access) { + vTensor::Access::Flags access = 0u; + + constexpr VkAccessFlags kRead = + VK_ACCESS_HOST_READ_BIT | + VK_ACCESS_MEMORY_READ_BIT | + VK_ACCESS_SHADER_READ_BIT | + VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT; + + constexpr VkAccessFlags kWrite = + VK_ACCESS_HOST_WRITE_BIT | + VK_ACCESS_MEMORY_WRITE_BIT | + VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT; + + if (vk_access & kRead) { + access |= vTensor::Access::Read; + } + + if (vk_access & kWrite) { + access |= vTensor::Access::Write; + } + + return access; +} + +vTensor::Buffer allocate_buffer( + api::Context* const context, + const IntArrayRef sizes, + const TensorOptions& options) { + TORCH_CHECK(!sizes.empty(), "Invalid Vulkan tensor size!"); + verify(options); + + // Forward declaration + bool requires_staging(api::Context*); + + const VkFlags usage = 
[context]() { + VkFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + + if (requires_staging(context)) { + usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; + } + + return usage; + }(); + + const auto memory = [context]() -> api::Resource::Memory::Descriptor { + if (requires_staging(context)) { + return { + VMA_MEMORY_USAGE_GPU_ONLY, + 0u, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + }; + } + + return { + VMA_MEMORY_USAGE_UNKNOWN, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + }; + }(); + + return context->resource().pool.buffer( + vTensor::Buffer::Descriptor{ + bytes(sizes, options.dtype()), + // Usage + { + usage, + memory, + }, + }); +} + +bool requires_image(const IntArrayRef sizes) { + return (1u <= sizes.size()) && (sizes.size() <= 4u); +} + +VkExtent3D image_extents(const IntArrayRef sizes) { + int64_t width = 1; + int64_t height = 1; + int64_t depth = 1; + + switch (sizes.size()) { + case 1: + width = sizes[0]; + break; + + case 2: + width = sizes[1]; + height = sizes[0]; + break; + + case 3: + width = sizes[2]; + height = sizes[1]; + depth = sizes[0]; + break; + + case 4: + width = sizes[3]; + height = sizes[2]; + depth = sizes[0] * sizes[1]; + break; + + default: + TORCH_INTERNAL_ASSERT( + false, + "Only Tensors with 1 <= dim <= 4 can be represented as a Vulkan Image!"); + } + + return { + width, + height, + api::div_up(depth, 4u), + }; +} + +vTensor::Image allocate_image( + api::Context* const context, + const VkExtent3D& extents, + const TensorOptions& options) { + verify(options); + + return context->resource().pool.image( + vTensor::Image::Descriptor{ + VK_IMAGE_TYPE_3D, + convert(options.dtype()), + extents, + // Usage + { + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_STORAGE_BIT, + { + VMA_MEMORY_USAGE_GPU_ONLY, + 0u, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + }, + }, + // View + { + VK_IMAGE_VIEW_TYPE_3D, + convert(options.dtype()), + }, + }); +} + +bool requires_staging(api::Context* const context) { + return !context->gpu().adapter->has_unified_memory(); +} + +vTensor::Buffer allocate_staging( + api::Context* const context, + const IntArrayRef sizes, + const TensorOptions& options) { + TORCH_CHECK(!sizes.empty(), "Invalid Vulkan tensor size!"); + verify(options); + + return context->resource().pool.buffer( + vTensor::Buffer::Descriptor{ + bytes(sizes, options.dtype()), + // Usage + { + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + { + VMA_MEMORY_USAGE_CPU_ONLY, + 0u, + 0u, + }, + }, + }); +} + +vTensor::Fence allocate_fence( + api::Context* const context) { + return context->resource().pool.fence(); +} + +enum class Barrier { + None, + Exectution, + Memory, +}; + +Barrier categorize( + const VkAccessFlags vk_src_access, + const VkAccessFlags vk_dst_access) { + if (0u == vk_src_access) { + return Barrier::None; + } + + const vTensor::Access::Flags src_access = convert(vk_src_access); + const vTensor::Access::Flags dst_access = convert(vk_dst_access); + + if (vTensor::Access::Read == (src_access & vTensor::Access::Read)) { + if (vTensor::Access::Read == (dst_access & vTensor::Access::Read)) { + // RAR (Read after Read) + return Barrier::None; + } + + // WAR (Write after Read) + return Barrier::Exectution; + } + + // RAW (Read after Write), or WAW (Write after Write) + return Barrier::Memory; +}; + +Barrier categorize( + const VkAccessFlags vk_src_access, + const VkAccessFlags vk_dst_access, + const VkImageLayout vk_src_layout, + const VkImageLayout vk_dst_layout) { + if (vk_src_layout != 
vk_dst_layout) { + return Barrier::Memory; + } + + return categorize(vk_src_access, vk_dst_access); +} + +} // namespace + +vTensor::vTensor( + api::Context* const context, + const IntArrayRef sizes, + const TensorOptions& options) + : view_(new View{ + context, + sizes, + options, + }) { +} + +const vTensor* vTensor::host() const { + view_->staging(Access::Read); + return this; +} + +vTensor* vTensor::host(const Access::Flags access) { + view_->staging(access); + return this; +} + +vTensor::Buffer::Object vTensor::buffer() const & { + return view_->buffer(Access::Read).object; +} + +vTensor::Buffer::Object vTensor::buffer( + const Access::Flags access) & { + return view_->buffer(access).object; +} + +vTensor::Buffer::Object vTensor::buffer( + api::Command::Buffer& command_buffer) const & { + return view_->buffer(command_buffer, Access::Read).object; +} + +vTensor::Buffer::Object vTensor::buffer( + api::Command::Buffer& command_buffer, + const Access::Flags access) & { + return view_->buffer(command_buffer, access).object; +} + +vTensor::Image::Object vTensor::image() const & { + return view_->image(Access::Read).object; +} + +vTensor::Image::Object vTensor::image( + const Access::Flags access) & { + return view_->image(access).object; +} + +vTensor::Image::Object vTensor::image( + api::Command::Buffer& command_buffer) const & { + return view_->image(command_buffer, Access::Read).object; +} + +vTensor::Image::Object vTensor::image( + api::Command::Buffer& command_buffer, + const Access::Flags access) & { + return view_->image(command_buffer, access).object; +} + +vTensor::View::View() + // Resources + : buffer_{}, + image_{}, + staging_{}, + fence_{}, + // Context + context_(nullptr), + // State + state_{}, + // Metadata + extents_{} { +} + +vTensor::View::View( + api::Context* const context, + const IntArrayRef sizes, + const TensorOptions& options) + // Resources + : buffer_{}, + image_{}, + staging_{}, + fence_{}, + // Context + context_(context), + // State + state_(context, sizes), + // Metadata + extents_(image_extents(sizes)), + options_(options), + sizes_(sizes), + strides_(sizes.size()) { + ops::verify(options); +} + +// We typically do not know whether we need a command buffer to service a request +// until we have perfomed a bunch of checks in nested logic, and even then we +// may end up with the always issued state transition optimized away under +// certain conditions, which makes a policy of always allocating a command buffer +// up front, only to end up using it at times, a wasteful approach. This class +// answers that need. 
+ +class vTensor::View::CMD final { + public: + CMD(const View&); + CMD(const View&, api::Command::Buffer&); + CMD(const CMD&) = delete; + CMD& operator=(const CMD&) = delete; + CMD(CMD&&) = delete; + CMD& operator=(CMD&&) = delete; + ~CMD() = default; + + typedef api::Resource::Buffer Buffer; + typedef api::Resource::Image Image; + typedef api::Resource::Fence Fence; + + void barrier(State::Transition transition); + + void copy_buffer_to_staging( + State& state, + const Buffer::Object& buffer, + Buffer::Object& staging); + + void copy_staging_to_buffer( + State& state, + const Buffer::Object& staging, + Buffer::Object& buffer); + + void copy_buffer_to_image( + State& state, + const Buffer::Object& buffer, + Image::Object& image); + + void copy_image_to_buffer( + State& state, + const Image::Object& image, + Buffer::Object& buffer); + + void submit(Fence fence = {}); + + private: + api::Command::Buffer& command_buffer(); + + private: + const View& view_; + + enum class Type { + Internal, + External, + } type; + + union { + api::Command::Buffer internal; + api::Command::Buffer* external; + } command_buffer_; +}; + +vTensor::View::CMD::CMD( + const View& view) + : view_(view), + type(Type::Internal), + command_buffer_{} { +} + +vTensor::View::CMD::CMD( + const View& view, + api::Command::Buffer& external) + : view_(view), + type(Type::External), + command_buffer_{ + .external = &external, + } { +} + +api::Command::Buffer& vTensor::View::CMD::command_buffer() { + switch (type) { + case Type::Internal: + if (!command_buffer_.internal) { + command_buffer_.internal = view_.context_->command().pool.allocate(); + command_buffer_.internal.begin(); + } + + return command_buffer_.internal; + + case Type::External: + return *(command_buffer_.external); + + default: + TORCH_INTERNAL_ASSERT(false, "Unknown command buffer type!"); + break; + } +} + +void vTensor::View::CMD::barrier(State::Transition transition) { + // Buffer and Staging are just an alias for the same memory location on UMA. 
+ + if (view_.state_.is_uma()) { + transition.first.buffer.stage |= transition.first.staging.stage; + transition.first.buffer.access |= transition.first.staging.access; + transition.first.staging = {}; + + transition.second.buffer.stage |= transition.second.staging.stage; + transition.second.buffer.access |= transition.second.staging.access; + transition.second.staging = {}; + } + + // Filter out host dependencies out of source, per Vulkan spec host write ordering guarantees: + // https://www.khronos.org/registry/vulkan/specs/1.2/html/vkspec.html#synchronization-submission-host-writes + + const auto filter_stage =[](VkPipelineStageFlags& stage) { + stage &= ~VK_PIPELINE_STAGE_HOST_BIT; + }; + + filter_stage(transition.first.buffer.stage); + filter_stage(transition.first.staging.stage); + + const auto filter_access =[](VkAccessFlags& access) { + access &= ~(VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT); + }; + + filter_access(transition.first.buffer.access); + filter_access(transition.first.staging.access); + + api::Pipeline::Barrier barrier{}; + + if (transition.second.staging) { + const State::Bundle::Buffer from = transition.first.staging; + const State::Bundle::Buffer to = transition.second.staging; + + const Barrier category = categorize( + from.access, + to.access); + + if (Barrier::None != category) { + barrier.stage.src |= from.stage; + barrier.stage.dst |= to.stage; + + if (Barrier::Memory == category) { + barrier.buffers.push_back({ + view_.staging().object, + { + from.access, + to.access, + }, + }); + } + } + } + + if (transition.second.buffer) { + const State::Bundle::Buffer from = transition.first.buffer; + const State::Bundle::Buffer to = transition.second.buffer; + + const Barrier category = categorize( + from.access, + to.access); + + if (Barrier::None != category) { + barrier.stage.src |= from.stage; + barrier.stage.dst |= to.stage; + + if (Barrier::Memory == category) { + barrier.buffers.push_back({ + view_.buffer().object, + { + from.access, + to.access, + }, + }); + } + } + } + + if (transition.second.image) { + const State::Bundle::Image from = transition.first.image; + const State::Bundle::Image to = transition.second.image; + + const Barrier category = categorize( + from.access, + to.access, + from.layout, + to.layout); + + if (Barrier::None != category) { + barrier.stage.src |= from.stage; + barrier.stage.dst |= to.stage; + + if (Barrier::Memory == category) { + TORCH_INTERNAL_ASSERT( + from.layout == view_.image().object.layout, + "Invalid image layout!"); + + barrier.images.push_back({ + view_.image().object, + { + from.access, + to.access, + }, + { + from.layout, + to.layout, + }, + }); + + view_.image().object.layout = to.layout; + } + } + } + + // If we are left with anything meaningful, insert a barrier. + + if (barrier) { + if (0u == barrier.stage.src) { + barrier.stage.src = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } + + if (0u == barrier.stage.dst) { + barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + } + + // Optimization opportunity: delay and batch. 
+ + command_buffer().barrier(barrier); + } +} + +void vTensor::View::CMD::copy_buffer_to_staging( + State& state, + const Buffer::Object& buffer, + Buffer::Object& staging) { + if (state.is_clean(Component::Staging) || state.is_uma()) { + return; + } + + barrier( + state.transition({ + // Staging + { + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + }, + // Buffer + { + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + }, + // Image + {}, + })); + + command_buffer().copy(buffer, staging); +} + +void vTensor::View::CMD::copy_staging_to_buffer( + State& state, + const Buffer::Object& staging, + Buffer::Object& buffer) { + if (state.is_clean(Component::Buffer) || state.is_uma()) { + return; + } + + barrier( + state.transition({ + // Staging + { + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + }, + // Buffer + { + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + }, + // Image + {}, + })); + + command_buffer().copy(staging, buffer); +} + +void vTensor::View::CMD::copy_buffer_to_image( + State& state, + const Buffer::Object& buffer, + Image::Object& image) { + if (state.is_clean(Component::Image)) { + return; + } + + barrier( + state.transition({ + // Staging + {}, + // Buffer + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + }, + // Image + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + }, + })); + + const struct { + uint32_t width; + uint32_t height; + } block { + view_.extents().width, + view_.extents().height, + }; + + view_.context_->dispatch( + command_buffer(), + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(nchw_to_image), + view_.extents(), + image, + buffer, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + view_.context_->resource().pool.uniform(block).object); +} + +void vTensor::View::CMD::copy_image_to_buffer( + State& state, + const Image::Object& image, + Buffer::Object& buffer) { + if (state.is_clean(Component::Buffer)) { + return; + } + + barrier( + state.transition({ + // Staging + {}, + // Buffer + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + }, + // Image + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + }, + })); + + const struct { + uint32_t width; + uint32_t height; + } block { + view_.extents().width, + view_.extents().height, + }; + + view_.context_->dispatch( + command_buffer(), + { + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(image_to_nchw), + view_.extents(), + image, + buffer, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ view_.context_->resource().pool.uniform(block).object); +} + +void vTensor::View::CMD::submit(const api::Resource::Fence fence) { + if ((Type::Internal == type) && command_buffer_.internal) { + command_buffer_.internal.end(); + command_buffer_.internal.submit(view_.context_->gpu().queue, fence); + } +} + +vTensor::Buffer& vTensor::View::buffer() const { + if (!buffer_) { + buffer_ = allocate_buffer( + context_, + sizes(), + options()); + } + + return buffer_; +} + +vTensor::Buffer& vTensor::View::buffer( + const Access::Flags access) const { + CMD command_buffer(*this); + Buffer& buffer = this->buffer(command_buffer, access); + command_buffer.submit(); + + return buffer; +} + +vTensor::Buffer& vTensor::View::buffer( + api::Command::Buffer& command_buffer_, + const Access::Flags access) const { + CMD command_buffer(*this, command_buffer_); + return buffer(command_buffer, access); +} + +vTensor::Buffer& vTensor::View::buffer( + CMD& command_buffer, + const Access::Flags access) const { + if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) { + if (state_.is_clean(Component::Staging)) { + command_buffer.copy_staging_to_buffer( + state_, + staging(command_buffer, Access::Read).object, + buffer().object); + } + else if (state_.is_clean(Component::Image)) { + command_buffer.copy_image_to_buffer( + state_, + image(command_buffer, Access::Read).object, + buffer().object); + } + else { + TORCH_INTERNAL_ASSERT( + false, + "Invalid state!"); + } + } + + command_buffer.barrier( + state_.transition({ + // Staging + {}, + // Buffer + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + [access]() { + VkAccessFlags vk_access = 0u; + + if (access & Access::Read) { + vk_access |= VK_ACCESS_SHADER_READ_BIT; + } + + if (access & Access::Write) { + vk_access |= VK_ACCESS_SHADER_WRITE_BIT; + } + + return vk_access; + }(), + }, + // Image + {}, + })); + + if (access & Access::Write) { + state_.set_dirty(Component::All); + } + + state_.set_clean(Component::Buffer); + + return buffer(); +} + +vTensor::Image& vTensor::View::image() const { + if (!image_ && state_.is_available(Component::Image)) { + image_ = allocate_image( + context_, + extents(), + options()); + } + + return image_; +} + +vTensor::Image& vTensor::View::image( + const Access::Flags access) const { + CMD command_buffer(*this); + Image& image = this->image(command_buffer, access); + command_buffer.submit(); + + return image; +} + +vTensor::Image& vTensor::View::image( + api::Command::Buffer& command_buffer_, + const Access::Flags access) const { + CMD command_buffer(*this, command_buffer_); + return image(command_buffer, access); +} + +vTensor::Image& vTensor::View::image( + CMD& command_buffer, + const Access::Flags access) const { + if ((access & Access::Read) && state_.is_dirty(Component::Image)) { + command_buffer.copy_buffer_to_image( + state_, + buffer(command_buffer, Access::Read).object, + image().object); + } + + command_buffer.barrier( + state_.transition({ + // Staging + {}, + // Buffer + {}, + // Image + { + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + [access]() { + VkAccessFlags vk_access = 0u; + + if (access & Access::Read) { + vk_access |= VK_ACCESS_SHADER_READ_BIT; + } + + if (access & Access::Write) { + vk_access |= VK_ACCESS_SHADER_WRITE_BIT; + } + + return vk_access; + }(), + [access]() { + if (Access::Read == (access & Access::Read)) { + return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + } + + return VK_IMAGE_LAYOUT_GENERAL; + }(), + }, + })); + + if (access & Access::Write) { + state_.set_dirty(Component::All); + } + + 
state_.set_clean(Component::Image); + + return image(); +} + +vTensor::Buffer& vTensor::View::staging() const { + if (!state_.is_available(Component::Staging)) { + return buffer(); + } + + if (!staging_) { + staging_ = allocate_staging( + context_, + sizes(), + options()); + } + + return staging_; +} + +vTensor::Buffer& vTensor::View::staging(const Access::Flags access) const { + CMD command_buffer(*this); + Buffer& staging = this->staging(command_buffer, access); + command_buffer.submit(fence()); + + return staging; +} + +vTensor::Buffer& vTensor::View::staging( + CMD& command_buffer, + const Access::Flags access) const { + if ((access & Access::Read) && state_.is_dirty(Component::Staging)) { + command_buffer.copy_buffer_to_staging( + state_, + buffer(command_buffer, Access::Read).object, + staging().object); + } + + command_buffer.barrier( + state_.transition({ + // Staging + { + VK_PIPELINE_STAGE_HOST_BIT, + [access]() { + VkAccessFlags vk_access = 0u; + + if (access & Access::Read) { + vk_access |= VK_ACCESS_HOST_READ_BIT; + } + + if (access & Access::Write) { + vk_access |= VK_ACCESS_HOST_WRITE_BIT; + } + + return vk_access; + }(), + }, + // Buffer + {}, + // Image + {}, + })); + + if (access & Access::Write) { + state_.set_dirty(Component::All); + } + + state_.set_clean(Component::Staging); + + return staging(); +} + +vTensor::Memory& vTensor::View::wait() const { + if (fence_) { + fence_.wait(); + } + + return staging().memory; +} + +vTensor::Fence& vTensor::View::fence() const { + return (fence_ = allocate_fence(context_)); +} + +void vTensor::View::verify() const { + TORCH_INTERNAL_ASSERT(!image_ || state_.is_available(Component::Image)); + TORCH_INTERNAL_ASSERT(!staging_ || state_.is_discrete()); +} + +vTensor::View::State::State() + : available_{}, + dirty_{}, + bundle_{} { +} + +vTensor::View::State::State( + api::Context* const context, + const IntArrayRef sizes) + : available_{}, + dirty_{}, + bundle_{} { + available_ |= Component::Buffer; + + if (requires_image(sizes)) { + available_ |= Component::Image; + } + + if (requires_staging(context)) { + available_ |= Component::Staging; + } +} + +vTensor::View::State::Transition +vTensor::View::State::transition(const Bundle bundle) { + const Bundle from = bundle_; + Bundle& to = bundle_; + + if (bundle.staging) { + to.staging = bundle.staging; + } + + if (bundle.buffer) { + to.buffer = bundle.buffer; + } + + if (bundle.image) { + to.image = bundle.image; + } + +#ifdef DEBUG + // Forward declaration + std::ostream& operator<<( + std::ostream&, + const View::State::Bundle&); + + std::cout << "From:" << std::endl << from << std::endl; + std::cout << "To:" << std::endl << to << std::endl; +#endif /* DEBUG */ + + return Transition{ + from, + to, + }; +} + +void verify(const TensorOptions& options) { + TORCH_CHECK( + !options.has_requires_grad() || !options.requires_grad(), + "'requires_grad' tensor option is not yet supported under Vulkan!"); + + TORCH_CHECK( + !options.has_pinned_memory() || !options.pinned_memory(), + "'pinned_memory' tensor option is not yet supported under Vulkan!"); + + TORCH_CHECK( + !options.has_layout() || (c10::kStrided == options.layout()), + "'layout' tensor option is not yet supported under Vulkan!"); + + TORCH_CHECK( + !options.has_memory_format(), + "'memory_format' tensor option is not yet supported under Vulkan!"); +} + +// +// Debug +// + +namespace { + +// Considering that VkAccessFlags is a weak typedef of a built-in data type, we +// need to introduce a new type to allow overload resolution 
distinguish between +// the two. + +struct Access final { + VkAccessFlags value; +}; + +std::ostream& operator<<( + std::ostream& stream, + const Access& access) { + stream << "Access: "; + + if (0u == access.value) { + return stream << " 0"; + } + + if (access.value & VK_ACCESS_HOST_READ_BIT) { + stream << " VK_ACCESS_HOST_READ_BIT"; + } + + if (access.value & VK_ACCESS_HOST_WRITE_BIT) { + stream << " VK_ACCESS_HOST_WRITE_BIT"; + } + + if (access.value & VK_ACCESS_MEMORY_READ_BIT) { + stream << " VK_ACCESS_MEMORY_READ_BIT"; + } + + if (access.value & VK_ACCESS_MEMORY_WRITE_BIT) { + stream << " VK_ACCESS_MEMORY_WRITE_BIT"; + } + + if (access.value & VK_ACCESS_SHADER_READ_BIT) { + stream << " VK_ACCESS_SHADER_READ_BIT"; + } + + if (access.value & VK_ACCESS_SHADER_WRITE_BIT) { + stream << " VK_ACCESS_SHADER_WRITE_BIT"; + } + + if (access.value & VK_ACCESS_TRANSFER_READ_BIT) { + stream << " VK_ACCESS_TRANSFER_READ_BIT"; + } + + if (access.value & VK_ACCESS_TRANSFER_WRITE_BIT) { + stream << " VK_ACCESS_TRANSFER_WRITE_BIT"; + } + + return stream; +} + +// Considering that VkImageLayout is a weak typedef of a built-in data type, +// we need to introduce a new type to allow overload resolution distinguish +// between the two. + +struct Layout final { + VkImageLayout value; +}; + +std::ostream& operator<<( + std::ostream& stream, + const Layout& layout) { + stream << "Layout: "; + + switch (layout.value) { + case VK_IMAGE_LAYOUT_UNDEFINED: + stream << " VK_IMAGE_LAYOUT_UNDEFINED"; + break; + + case VK_IMAGE_LAYOUT_GENERAL: + stream << " VK_IMAGE_LAYOUT_GENERAL"; + break; + + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + stream << " VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL"; + break; + + default: + stream << " Unknown!"; + break; + }; + + return stream; +} + +// Considering that VkPipelineStageFlags is a weak typedef of a built-in data +// type, we need to introduce a new type to allow overload resolution distinguish +// between the two. 
+ +struct Stage final { + VkPipelineStageFlags value; +}; + +std::ostream& operator<<( + std::ostream& stream, + const Stage& stage) { + stream << "Stage: "; + + if (0u == stage.value) { + return stream << " 0"; + } + + if (stage.value & VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) { + stream << " VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT"; + } + + if (stage.value & VK_PIPELINE_STAGE_HOST_BIT) { + stream << " VK_PIPELINE_STAGE_HOST_BIT"; + } + + if (stage.value & VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT) { + stream << " VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT"; + } + + if (stage.value & VK_PIPELINE_STAGE_TRANSFER_BIT) { + stream << " VK_PIPELINE_STAGE_TRANSFER_BIT"; + } + + return stream; +} + +} // namespace + +std::ostream& operator<<( + std::ostream& stream, + const vTensor::View::State::Bundle& bundle) { + stream << "Staging\n " << + Stage{ + bundle.staging.stage, + } << "\n " << + Access{ + bundle.staging.access, + } << std::endl; + + stream << "Buffer\n " << + Stage{ + bundle.buffer.stage, + } << "\n " << + Access{ + bundle.buffer.access, + } << std::endl; + + stream << "Image\n " << + Stage{ + bundle.image.stage, + } << "\n " << + Access{ + bundle.image.access, + } << "\n " << + Layout{ + bundle.image.layout, + } << std::endl; + + return stream; +} + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.h b/aten/src/ATen/native/vulkan/ops/Tensor.h new file mode 100644 index 000000000000..08eede9a4f18 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Tensor.h @@ -0,0 +1,598 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { + +// +// This class represents a Vulkan tensor and provides an abstraction layer +// that allows both the CPU, and the GPU, to view a Vulkan (buffer, image) +// pair as one coherent, synchronized unit of storage on both UMA and discrete +// systems. Expanding on the previous sentence, this class tries to address +// two orthogonal implementation complexities that arise as a result of the +// aforementioned goal of memory coherence: +// +// 1) First, synchronization across processors; CPUs and GPUs are separate +// processors, and even though they share the same address space in a system +// with a unified memory architecture, their address spaces only partially +// overlap on systems with a discrete GPU. Consequently on discrete systems, +// while it is still technically possible to take advantage of this shared +// address space to maintain one single copy of the data, different access +// latencies from CPU and GPU to this shared location usually necessitates +// maintaining two copies each in processor-local memory, otherwise memory +// access latency will hurt from the processor to which this data is not +// close. This shared memory is more often than not located in system memory, +// making for slow GPU read and write access over the PCI-e bus on discrete. +// Maintaining two separate copies on the other hand, requires synchronization +// to guarantee coherence. This is not an issue on UMA and this implementation +// accounts for that optimization. +// +// 2) Second, synchronization across resources (i.e. buffers and images); GPU +// drivers pack images in proprietory formats for better locality of access +// and to enable lossless compression. These conversions are both expensive +// (in general) and manual (in Vulkan.) 
This requires a second order of +// synchronization to guarantee coherence between the contents of the buffer +// and image otherwise they will go out of sync. +// +// It is extremely important to keep in mind that the functionality this class +// provides is generally expensive. For optimal performance, the user of this +// class should: +// +// 1) Avoid frequent CPU <=> GPU transfers which will be triggered if data is +// write accessed on one processor and read / write accessed on the other. +// +// 2) Avoid frequent buffer <=> image conversions which will be trigerred if +// data is write accessed as a buffer (image) and read accessed as an +// image (buffer). +// +// 3) When and if a synchronization is unavoidable, place as much distance +// between the synchronization is triggered and the data is accessed since +// all synchronizations this class provides are async. +// +// For optimal performance, access the data as images, and keep the data on GPU, +// and above all understand the expensive data flow that this class abstracts +// away. +// +// vTensor tries to address a specific concern and intentionally does not expose +// GPU tensor memory directly. Please keep that behavior intact as the whole +// data model fundamentally depends on limiting what the user can achieve through +// the interface to guarantee performance and coherence. +// +// A vTensor is associated with an api::Context as preparation for multi-GPU +// support. +// + +class vTensor final { + public: + vTensor() = default; + vTensor( + api::Context* context, + IntArrayRef sizes, + const TensorOptions& options); + + /* + Types + */ + + typedef api::Resource::Memory::Access Access; + typedef api::Resource::Buffer Buffer; + typedef api::Resource::Fence Fence; + typedef api::Resource::Image Image; + typedef api::Resource::Memory Memory; + + /* + Future + */ + + template + class Future final { + template + using is_convertible = std::enable_if_t< + std::is_convertible< + Access::Pointer, + Access::Pointer>::value>; + + public: + explicit Future(const vTensor* tensor); + Future(const Future&) = delete; + Future& operator=(const Future&) = delete; + Future(Future&&); + Future& operator=(Future&&) &; + Future& operator=(Future&&) && = delete; + template> + Future(Future&&); + template> + Future& operator=(Future&&) &; + template + Future& operator=(Future&&) && = delete; + ~Future(); + + typedef Memory::Handle< + Access::Pointer< + Type, + kAccess>> Payload; + + // This is a blocking operation as the name suggests. A call to host() will + // trigger an async copy if pending writes are detected. Consequently, for + // optimal performance, put as much time and distance between the place + // where a vTensor::host() call occurs and the location where the returned + // future is explicitly waited on as a result of a call to this function. + + Payload wait() const &; + + private: + // Intentionally disabed to enforce a usage pattern wherein the Future's + // lifetime exceeds that of the Payload as we use the Future's destructor + // to eagerly (as opposed to lazily and upon first use) upload the + // modifications back onto the GPU in an effort to hide the upload latency. + + Payload wait() const && = delete; + + private: + template + friend class Future; + + private: + const vTensor* tensor_; + }; + + /* + Host access - these functions will be expensive if they trigger a GPU -> CPU + sync due to pending writes. 
A call to host() will trigger an async copy in + such scenarios, which is then explictly waited on as part of Future::wait(). + Consequently, for optimal performance, put as much time and distance between + the place where this function is called, and the location where the future is + waited on. + */ + + template + Future host() const &; + + template + Future host() &; + + /* + Device access - these functions will be expensive if they trigger a buffer + <-> image or CPU -> GPU sync due to pending writes. These functions are + non-blocking on the host as the copy operation is carried out by the GPU + asynchronously. Regardless, they result in extra work that could have been + avoided or at least minimized if all data access had occured through one + single processor (GPU in this case) and on one type of resource (image for + best performance.) Consequently, for optimal performance, avoid mixed reads + and writes across processor boundaries, and do your best to minimize layout + transitions as a result of working with images only (as opposed to mixed + buffer - image usage.) + This implementation intentionally restricts user access to the buffer and + image objects only, as opposed to their underlying memory, for the sake of + predictability of usage and efficiency. + */ + + Buffer::Object buffer() const &; + Buffer::Object buffer(Access::Flags access) &; + Buffer::Object buffer(api::Command::Buffer&) const &; + Buffer::Object buffer(api::Command::Buffer&, Access::Flags) &; + + bool has_image() const; + Image::Object image() const &; + Image::Object image(Access::Flags access) &; + Image::Object image(api::Command::Buffer&) const &; + Image::Object image(api::Command::Buffer&, Access::Flags) &; + + /* + Metadata + */ + + const VkExtent3D& extents() const; + const TensorOptions& options() const; + IntArrayRef sizes() const; + IntArrayRef strides() const; + + private: + // Some overloads below are intentionally disabled to enforce a usage pattern + // that ensures the Tensor's lifetime exceeds that of the scope in which the + // underlying data is accessed. Allowing deleted overloads below to be + // invoked on a temporary would open the door to the possibility of accessing + // the underlying memory out of the expected scope. 
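+  //
+  // For illustration, the enforced pattern looks like this (sketch, with
+  // `tensor` a named vTensor):
+  //
+  //   auto future = tensor.host<float>();  // Future is bound to a name so it
+  //   const auto payload = future.wait();  // outlives the Payload it yields
+  //
+  // whereas `tensor.host<float>().wait()`, or calling host() / buffer() /
+  // image() on a vTensor temporary, fails to compile against the deleted
+  // overloads.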
+ + /* + Host + */ + + const vTensor* host() const; + vTensor* host(Access::Flags access); + + template + Future host() const && = delete; + + template + Future host() && = delete; + + /* + Device + */ + + Buffer::Object buffer() const && = delete; + Buffer::Object buffer(Access::Flags) && = delete; + Buffer::Object buffer(api::Command::Buffer&) const && = delete; + Buffer::Object buffer(api::Command::Buffer&, Access::Flags) && = delete; + + Image::Object image() const && = delete; + Image::Object image(Access::Flags) && = delete; + Image::Object image(api::Command::Buffer&) const && = delete; + Image::Object image(api::Command::Buffer&, Access::Flags) && = delete; + + private: + class View final { + public: + View(); + View( + api::Context* context, + IntArrayRef sizes, + const TensorOptions& options); + View(const View&) = delete; + View& operator=(const View&) = delete; + View(View&&) = default; + View operator=(View&&) = delete; + ~View() = default; + + Buffer& buffer(Access::Flags) const; + Buffer& buffer(api::Command::Buffer&, Access::Flags) const; + + bool has_image() const; + Image& image(Access::Flags) const; + Image& image(api::Command::Buffer&, Access::Flags) const; + + Buffer& staging(Access::Flags) const; + Buffer& staging(api::Command::Buffer&, Access::Flags) const; + vTensor::Memory& wait() const; + + const VkExtent3D& extents() const; + const TensorOptions& options() const; + IntArrayRef sizes() const; + IntArrayRef strides() const; + + private: + class CMD; + + class State final { + public: + State(); + State(api::Context*, IntArrayRef); + + struct Bundle final { + struct Buffer final { + VkPipelineStageFlags stage; + VkAccessFlags access; + + operator bool() const; + } staging, buffer; + + struct Image final { + VkPipelineStageFlags stage; + VkAccessFlags access; + VkImageLayout layout; + + operator bool() const; + } image; + }; + + struct Component final { + typedef uint8_t Flags; + + enum Type : Flags { + Buffer = 1u << 0u, + Image = 1u << 1u, + Staging = 1u << 2u, + All = Buffer | Image | Staging, + }; + }; + + // Availability + bool is_available(Component::Flags) const; + bool is_discrete() const; + bool is_uma() const; + + // Clean / Dirty + bool is_clean(Component::Flags) const; + bool is_dirty(Component::Flags) const; + void set_clean(Component::Flags); + void set_dirty(Component::Flags); + + // Transition + typedef std::pair Transition; + Transition transition(Bundle to); + + private: + Component::Flags available_; + Component::Flags dirty_; + Bundle bundle_; + }; + + typedef State::Component Component; + + private: + // Accessors / Lazy Allocation + Buffer& buffer() const; + Buffer& buffer(CMD&, Access::Flags) const; + Image& image() const; + Image& image(CMD&, Access::Flags) const; + Buffer& staging() const; + Buffer& staging(CMD&, Access::Flags) const; + Fence& fence() const; + + // Validation + void verify() const; + + private: + // Resources + mutable Buffer buffer_; + mutable Image image_; + mutable Buffer staging_; + mutable Fence fence_; + + // Context + api::Context* context_; + + // State + mutable State state_; + + // Metadata + VkExtent3D extents_; + TensorOptions options_; + c10::SmallVector sizes_; + c10::SmallVector strides_; + + private: + // Debug + friend std::ostream& operator<<( + std::ostream&, + const View::State::Bundle&); + }; + + // Even at the cost of a heap allocation plus the resulting negative impact + // on cache locality due to the subsequent pointer chasing, it is still + // critcal to share the view across vTensor implementations 
to minimize + // programmer errors. Ideally this class should have been only made movable, + // and non-copyable - something we cannot do unfortunately due to the inner + // workings of at::TensorImpl requiring copy semantics in + // at::TensorImpl::release_resources() to function as expected. Now that this + // class is made copyable though, a new door to a whole new class of bugs is + // opened, in that there now is a chance of two [shallow] copies, have their + // State objects go out of sync as a result of an operation being performed on + // one shallow copy that is not reflected in the other. Technically, if the + // programmer is very careful, it is possible to avoid this trap and not pay + // the cost of indirection, but the resulting bugs of missing memory barriers + // will be so frustrating to hunt down for those unfamiliar with the internal + // mechanics of this class, that I decided to take the performance pentalty + // of this extra layer of indirection in favor of making this class easier + // to use. + + std::shared_ptr view_; + + private: + // Debug + friend std::ostream& operator<<( + std::ostream&, + const View::State::Bundle&); +}; + +const vTensor& convert(const Tensor& tensor); +vTensor& convert(Tensor& tensor); +Tensor convert(const vTensor& tensor); + +using vTensorImpl = VulkanOpaqueTensorImpl; +void verify(const TensorOptions& options); + +// +// Impl +// + +template +inline vTensor::Future::Future( + const vTensor* const tensor) + : tensor_(tensor) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + tensor_, + "Invalid Vulkan tensor!"); +} + +template +inline vTensor::Future::Future( + Future&& future) + : tensor_(std::move(future.tensor_)) { + future.tensor_ = nullptr; +} + +template +inline vTensor::Future& +vTensor::Future::operator=( + Future&& future) & { + tensor_ = std::move(future.tensor_); + future.tensor_ = nullptr; + return *this; +} + +template +template +inline vTensor::Future::Future( + Future&& future) + : tensor_(std::move(future.tensor_)) { + future.tensor_ = nullptr; +} + +template +template +inline vTensor::Future& +vTensor::Future::operator=( + Future&& future) & { + tensor_ = std::move(future.tensor_); + future.tensor_ = nullptr; + return *this; +} + +template +inline vTensor::Future::~Future() { +#if VULKAN_SYNC_TENSORS_EAGERLY + // Sync eagerly in an effort to hide latency. + // Upside: Kick off the async transfer early on to keep the GPU busy. + // Downside: An extra CPU command submission. + if (tensor_ && (Access::Write & kAccess)) { + if (tensor_->has_image()) { + tensor_->image(); + } + else { + tensor_->buffer(); + } + } +#endif +} + +template +inline typename vTensor::Future::Payload +vTensor::Future::wait() const & { + TORCH_CHECK( + tensor_, + "vTensor::Future is in an invalid state! 
" + "Potential reason: This future is moved from."); + + return tensor_->view_->wait().template map(); +} + +template +inline vTensor::Future vTensor::host() const & { + return Future(host()); +} + +template +inline vTensor::Future vTensor::host() & { + return Future(host(kAccess)); +} + +inline bool vTensor::has_image() const { + return view_->has_image(); +} + +inline const VkExtent3D& vTensor::extents() const { + return view_->extents(); +} + +inline const TensorOptions& vTensor::options() const { + return view_->options(); +} + +inline IntArrayRef vTensor::sizes() const { + return view_->sizes(); +} + +inline IntArrayRef vTensor::strides() const { + return view_->strides(); +} + +inline bool vTensor::View::has_image() const { + return state_.is_available(View::Component::Image); +} + +inline const VkExtent3D& vTensor::View::extents() const { + return extents_; +} + +inline const TensorOptions& vTensor::View::options() const { + return options_; +} + +inline IntArrayRef vTensor::View::sizes() const { + return sizes_; +} + +inline IntArrayRef vTensor::View::strides() const { + return strides_; +} + +inline vTensor::View::State::Bundle::Buffer::operator bool() const { + return (0u != stage) && + (0u != access); +} + +inline vTensor::View::State::Bundle::Image::operator bool() const { + return (0u != stage) && + (0u != access) && + (VK_IMAGE_LAYOUT_UNDEFINED != layout); +} + +inline bool vTensor::View::State::is_available( + const Component::Flags components) const { + return available_ & components; +} + +inline bool vTensor::View::State::is_discrete() const { + return is_available(Component::Staging); +} + +inline bool vTensor::View::State::is_uma() const { + return !is_discrete(); +} + +inline bool vTensor::View::State::is_clean( + const Component::Flags components) const { + return !is_dirty(components); +} + +inline bool vTensor::View::State::is_dirty( + const Component::Flags components) const { + return dirty_ & components; +} + +inline void vTensor::View::State::set_clean( + const Component::Flags components) { + dirty_ &= ~components; +} + +inline void vTensor::View::State::set_dirty( + const Component::Flags components) { + dirty_ |= components; +} + +inline const vTensor& convert(const Tensor& tensor) { + TORCH_INTERNAL_ASSERT( + tensor.is_vulkan(), + "Vulkan tensor expected!"); + + const vTensorImpl* const impl = + static_cast(tensor.unsafeGetTensorImpl()); + + return impl->opaque_handle(); +} + +inline vTensor& convert(Tensor& tensor) { + TORCH_INTERNAL_ASSERT( + tensor.is_vulkan(), + "Vulkan tensor expected!"); + + vTensorImpl* const impl = + static_cast(tensor.unsafeGetTensorImpl()); + + return impl->unsafe_opaque_handle(); +} + +inline Tensor convert(const vTensor& tensor) { + return at::detail::make_tensor( + DispatchKeySet(DispatchKey::Vulkan), + tensor.options().dtype(), + at::Device(at::kVulkan), + tensor, + tensor.sizes(), + tensor.strides()); +} + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ From af27da93dec16fa39d168aa257f8beeab18fc7d6 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Mon, 26 Oct 2020 18:21:18 -0700 Subject: [PATCH 029/169] Add Vulkan Tensor factory. 
(#44016) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44016 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820823 Pulled By: AshkanAliabadi fbshipit-source-id: d007650f255fd79b4a2f4bba0bf8ea00f9a2e6cf --- aten/src/ATen/native/vulkan/VulkanAten.cpp | 4 ++ aten/src/ATen/native/vulkan/ops/Factory.cpp | 58 +++++++++++++++++++++ aten/src/ATen/test/vulkan_api_test.cpp | 6 +++ 3 files changed, 68 insertions(+) create mode 100644 aten/src/ATen/native/vulkan/ops/Factory.cpp diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 72d5e15208ec..6a781c3ab69b 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -537,6 +537,8 @@ Tensor mean( return new_with_vtensor_vulkan(std::move(output), self.options()); } +#ifndef USE_VULKAN_API + TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("slice.Tensor", TORCH_FN(at::native::vulkan::aten::slice)); m.impl("reshape", TORCH_FN(at::native::vulkan::aten::reshape)); @@ -570,6 +572,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); } +#endif /* USE_VULKAN_API */ + Tensor& copy_from_vulkan_(Tensor& self, const Tensor& src) { TORCH_INTERNAL_ASSERT( src.device().type() == DeviceType::Vulkan, diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp new file mode 100644 index 000000000000..fc58fe46d438 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -0,0 +1,58 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +Tensor empty_memory_format( + const IntArrayRef sizes, + const TensorOptions& options_, + const optional memory_format = c10::nullopt) { + TORCH_CHECK( + !(options_.has_memory_format() && memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument!"); + + const TensorOptions options = options_.merge_in( + TensorOptions().memory_format(memory_format)); + verify(options); + + return convert(vTensor{ + api::context(), + sizes, + options, + }); +} + +Tensor empty_strided( + const IntArrayRef sizes, + const IntArrayRef /* strides */, + const optional dtype, + const optional layout, + const optional device, + const optional pin_memory) { + return empty_memory_format( + sizes, + TensorOptions(). + dtype(dtype). + layout(layout). + device(device). 
+ pinned_memory(pin_memory)); +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl_UNBOXED("empty.memory_format", at::native::vulkan::ops::empty_memory_format); + m.impl("empty_strided", TORCH_FN(at::native::vulkan::ops::empty_strided)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 77bbbfc8edbe..c7ac15544a99 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -1,9 +1,15 @@ #include +#include + #ifdef USE_VULKAN_API namespace { +TEST(VulkanAPITest, empty) { + ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat))); +} + } // namespace #endif /* USE_VULKAN_API */ From b3e64c86e06a34fe25be64cd42c24b47a02505e4 Mon Sep 17 00:00:00 2001 From: Oleg Khabinov Date: Mon, 26 Oct 2020 20:34:14 -0700 Subject: [PATCH 030/169] Remove loop_test mode (#46618) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46618 Using D19631971 (https://github.com/pytorch/pytorch/commit/b4b1b100bddf92ab536cac1d05aab84695d06edc) and https://github.com/pytorch/pytorch/pull/32935/files as a reference. Test Plan: ``` $ buck run glow/fb/test:sparsenn_test -- --gtest_filter='SparseNNTest.vanillaC2' --onnxifi_debug_mode --nocaffe2_predictor_use_memonger ``` Generated dot file https://www.internalfb.com/intern/graphviz/?paste=P146216905 Reviewed By: yinghai Differential Revision: D24427800 fbshipit-source-id: 7d1d8768352a52af104e0a75ce982c1eb861aa73 --- caffe2/opt/glow_net_transform.cc | 6 - caffe2/opt/onnxifi_transformer.cc | 245 +----------------------------- caffe2/opt/onnxifi_transformer.h | 3 - 3 files changed, 2 insertions(+), 252 deletions(-) diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index 12bd060c27d6..ece62abea258 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -13,11 +13,6 @@ C10_DEFINE_bool( true, "Attach AdjustBatch ops at input/outputs of the Onnxifi ops"); -C10_DEFINE_bool( - onnxifi_loop_test_mode, - false, - "For test purpose only. Build a dummy net just to test the functionality"); - C10_DEFINE_bool( enforce_fp32_inputs_into_fp16, false, @@ -146,7 +141,6 @@ void onnxifi( opts.load_model_by_blob = load_model_by_blob; opts.enforce_fp32_inputs_into_fp16 = FLAGS_enforce_fp32_inputs_into_fp16; opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16; - opts.loop_test = FLAGS_onnxifi_loop_test_mode; opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten; opts.timeout = FLAGS_onnxifi_timeout_ms; diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index e849f0edb272..c77101984790 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -403,235 +403,6 @@ void mergeFp32InputsAndConvertToFp16( } } -NetDef buildLoopTestNet( - const NetDef& net, - const std::unordered_set& initialization_list, - std::unordered_map* shape_hints, - size_t batch_size) { - NetDef net_dummy; - - // Add non-weigh inputs only - for (const auto& i : net.external_input()) { - if (!initialization_list.count(i)) { - net_dummy.add_external_input(i); - } - } - for (const auto& o : net.external_output()) { - net_dummy.add_external_output(o); - } - - // Now categorize the inputs into the following groups. 
We don't support - // handling of 3d inputs yet, but it can be done easily by converting n-d - // inputs into 2-d with Reshape or ReduceSum - std::unordered_set batched_2d_inputs; - std::unordered_set other_2d_inputs; - std::unordered_set all_1d_inputs; - auto addCast = [&net_dummy]( - const std::string& i, - std::string& in, - caffe2::TensorProto::DataType dtype) mutable { - int multiplier = 1; - if (dtype != caffe2::TensorProto::FLOAT) { - in += "_fp32"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Clip", - "", - {i}, - {in}, - {MakeArgument("min", 0.0), MakeArgument("max", 1.0)})); - if (dtype == caffe2::TensorProto::INT8 || - dtype == caffe2::TensorProto::UINT8) { - multiplier = sizeof(float) / sizeof(int8_t); - } else if ( - dtype == caffe2::TensorProto::INT16 || - dtype == caffe2::TensorProto::UINT16 || - dtype == caffe2::TensorProto::FLOAT16) { - multiplier = sizeof(float) / sizeof(int16_t); - } else if (dtype == caffe2::TensorProto::INT64) { - // Special case, it should really be 0.5 - multiplier = 0; - } - } - return multiplier; - }; - auto adjustDim = [](int d, int m, TensorShape& shape) { - if (m > 1) { - CAFFE_ENFORCE_EQ(shape.dims(d) % m, 0); - shape.set_dims(d, shape.dims(d) / m); - } else if (m == 0) { - shape.set_dims(d, shape.dims(d) * 2); - } - shape.set_data_type(caffe2::TensorProto::FLOAT); - }; - size_t dim2 = 0; - for (const auto& i : net_dummy.external_input()) { - auto it = shape_hints->find(i); - CAFFE_ENFORCE( - it != shape_hints->end(), "Cannot find shape info for input ", i); - auto& shape = it->second.shape; - std::string in = i; - // Trick here: since backend like glow doesn't support non-float - // arithmatics, we need to be creative and bitcast non-float data type into - // float while maintaining the same bit lengths. We do this by changing the - // shape dim. So that we will always load the same amount of bits onto the - // backend. To avoid numeric complication, we add a Clip. 
- if (shape.dims_size() == 2) { - auto m = addCast(i, in, shape.data_type()); - adjustDim(1, m, shape); - if (shape.dims(0) == batch_size) { - batched_2d_inputs.emplace(in); - dim2 += shape.dims(1); - } else { - other_2d_inputs.emplace(in); - } - } else if (shape.dims_size() == 1) { - auto m = addCast(i, in, shape.data_type()); - adjustDim(0, m, shape); - all_1d_inputs.emplace(in); - } else { - const std::string fin = i + "_flatten"; - net_dummy.add_op()->CopyFrom( - CreateOperatorDef("Flatten", "", {i}, {fin}, {})); - in = fin; - auto m = addCast(fin, in, shape.data_type()); - auto last = shape.dims_size() - 1; - adjustDim(last, m, shape); - size_t ndim = 1; - for (unsigned k = 1; k < shape.dims_size(); ++k) { - ndim *= shape.dims(k); - } - if (shape.dims(0) == batch_size) { - batched_2d_inputs.emplace(in); - dim2 += ndim; - } else { - other_2d_inputs.emplace(in); - } - } - } - - // Add adjusted shape hints - auto* shape_arg = net_dummy.add_arg(); - auto* qshape_arg = net_dummy.add_arg(); - shape_arg->set_name("input_shape_info"); - qshape_arg->set_name("input_qshape_info"); - for (const auto& i : net_dummy.external_input()) { - auto info = shape_hints->at(i); - if (!info.is_quantized) { - shape_arg->mutable_tensors()->Add()->CopyFrom( - wrapShapeInfoIntoTensorProto(i, info)); - } else { - qshape_arg->mutable_qtensors()->Add()->CopyFrom( - wrapShapeInfoIntoQTensorProto(i, info)); - } - } - - // Collect all the input together into a 2d tensor of {batch_size, X} - std::vector concat2d_batched( - batched_2d_inputs.begin(), batched_2d_inputs.end()); - const std::string concat_out = "batch_2d_concat"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Concat", - "", - concat2d_batched, - {concat_out, "batch_2d_concat_split_info"}, - {MakeArgument("axis", 1)})); - std::vector scalars; - for (const auto& i : other_2d_inputs) { - std::string o = i + "_reduced"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "ReduceSum", - "", - {i}, - {o}, - {MakeArgument>("axes", {0, 1}), - MakeArgument("keepdims", 0)})); - scalars.emplace_back(std::move(o)); - } - for (const auto& i : all_1d_inputs) { - std::string o = i + "_reduced"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "ReduceSum", - "", - {i}, - {o}, - {MakeArgument>("axes", {0}), - MakeArgument("keepdims", 0)})); - scalars.emplace_back(std::move(o)); - } - const std::string summed = "summed"; - net_dummy.add_op()->CopyFrom( - CreateOperatorDef("Sum", "", scalars, {summed}, {})); - const std::string out = "result_out"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Add", - "", - {concat_out, summed}, - {out}, - {MakeArgument("broadcast", 1)})); - - for (const auto& o : net_dummy.external_output()) { - const auto it = shape_hints->find(o); - CAFFE_ENFORCE( - it != shape_hints->end(), "Cannot find shape info for output ", o); - const auto& shape = it->second.shape; - // TODO: all doable but I'm lazy - if (shape.data_type() != caffe2::TensorProto::FLOAT) { - CAFFE_THROW("We need a Cast op to match the output data type"); - } - if (shape.dims_size() == 2) { - if (shape.dims(0) == batch_size) { - if (shape.dims(1) > dim2) { - CAFFE_THROW( - "We need Tile op to match the output dim ", - shape.dims(1), - " vs ", - dim2); - } else if (shape.dims(1) == dim2) { - net_dummy.add_op()->CopyFrom( - CreateOperatorDef("Copy", "", {out}, {o}, {})); - } else { - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Slice", - "", - {out}, - {o}, - {MakeArgument>("starts", {0, 0}), - MakeArgument>( - "ends", {-1, static_cast(shape.dims(1))})})); - } - } 
- } else if (shape.dims_size() == 1) { - if (shape.dims(0) == batch_size) { - const std::string oi = o + "_pre"; - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Slice", - "", - {out}, - {oi}, - {MakeArgument>("starts", {0, 0}), - MakeArgument>("ends", {-1, 1})})); - net_dummy.add_op()->CopyFrom(CreateOperatorDef( - "Reshape", - "", - {oi}, - {o}, - {MakeArgument>( - "shape", {static_cast(batch_size)})})); - } else { - CAFFE_THROW( - "We need Slice and Tile op to match the output dim ", - shape.dims(0), - " vs ", - batch_size); - } - } else { - CAFFE_THROW("Only support 1D/2D outputs for now"); - } - } - - return net_dummy; -} - } // namespace void splitSparseLengthsSumSparse(NetDef* net, const Workspace& ws) { @@ -928,19 +699,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( } } - // Rewrite the net into a dummy in loop test mode - ShapeInfoMap new_shape_hints; - if (opts_.loop_test) { - new_shape_hints = shape_hints; - onnxifi_net = buildLoopTestNet( - onnxifi_net, - initialization_list, - &new_shape_hints, - opts_.bound_shape_spec.max_batch_size); - initialization_list.clear(); - } - - // Add parition info + // Add partition info for (const auto& p : partition_infos_) { onnxifi_net.add_partition_info()->CopyFrom(p); } @@ -965,7 +724,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( initialization_list, onnxifi_net_inputs, onnxifi_net_outputs, - opts_.loop_test ? new_shape_hints : shape_hints); + shape_hints); NetDef net_opt = composeResultNet(onnxifi_op); // Debugging stuff diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 9af168a4d20e..86e061411dd9 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -39,9 +39,6 @@ struct OnnxifiTransformerOptions final : public BackendTransformOptions { // fp16 or not bool merge_fp32_inputs_into_fp16{false}; - // Enter loop test mode - bool loop_test{false}; - // Whether the net has been ssaRewritten bool predictor_net_ssa_rewritten{false}; From 4a35280ec2a80d8074ee01bf353af1360e3405f3 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Mon, 26 Oct 2020 20:52:06 -0700 Subject: [PATCH 031/169] [c10] fix weak_intrusive_ptr lock() (#46007) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46007 When owner released the object, target will become null and illegal to access refcount_ again. This PR fixes this and return null in that case. 
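
For example, the expected behavior (a sketch mirroring the new unit tests;
SomeClass stands in for any c10::intrusive_ptr_target subclass):

  auto strong = c10::make_intrusive<SomeClass>();
  c10::weak_intrusive_ptr<SomeClass> weak(strong);
  strong.reset();                            // drop the last strong reference
  TORCH_INTERNAL_ASSERT(weak.expired());
  auto locked = weak.lock();                 // now returns the null intrusive_ptr
  TORCH_INTERNAL_ASSERT(!locked.defined());
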
Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D24374846 Pulled By: wanchaol fbshipit-source-id: 741074f59c0904a4d60b7bde956cad2d0925be4e --- c10/test/util/intrusive_ptr_test.cpp | 15 +++++++++++++++ c10/util/intrusive_ptr.h | 22 +++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/c10/test/util/intrusive_ptr_test.cpp b/c10/test/util/intrusive_ptr_test.cpp index 233a4442f2a8..2ea283d1a4f0 100644 --- a/c10/test/util/intrusive_ptr_test.cpp +++ b/c10/test/util/intrusive_ptr_test.cpp @@ -1653,6 +1653,21 @@ TEST(WeakIntrusivePtrTest, givenPtr_whenLocking_thenReturnsCorrectObject) { EXPECT_EQ(var.ptr.get(), locked.get()); } +TEST(WeakIntrusivePtrTest, expiredPtr_whenLocking_thenReturnsNullType) { + IntrusiveAndWeak var = make_weak_intrusive(); + // reset the intrusive_ptr to test if weak pointer still valid + var.ptr.reset(); + EXPECT_TRUE(var.weak.expired()); + intrusive_ptr locked = var.weak.lock(); + EXPECT_FALSE(locked.defined()); +} + +TEST(WeakIntrusivePtrTest, weakNullPtr_locking) { + auto weak_ptr = make_invalid_weak(); + intrusive_ptr locked = weak_ptr.lock(); + EXPECT_FALSE(locked.defined()); +} + TEST( WeakIntrusivePtrTest, givenValidPtr_whenMoveAssigning_thenPointsToSameObject) { diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 308ee883794a..453196510aa8 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -589,15 +589,19 @@ class weak_intrusive_ptr final { } intrusive_ptr lock() const noexcept { - auto refcount = target_->refcount_.load(); - do { - if (refcount == 0) { - // Object already destructed, no strong references left anymore. - // Return nullptr. - return intrusive_ptr(NullType::singleton()); - } - } while (!target_->refcount_.compare_exchange_weak(refcount, refcount + 1)); - return intrusive_ptr(target_); + if (expired()) { + return intrusive_ptr(NullType::singleton()); + } else { + auto refcount = target_->refcount_.load(); + do { + if (refcount == 0) { + // Object already destructed, no strong references left anymore. + // Return nullptr. 
+ return intrusive_ptr(NullType::singleton()); + } + } while (!target_->refcount_.compare_exchange_weak(refcount, refcount + 1)); + return intrusive_ptr(target_); + } } /** From 9858b012ec5cc1bf9775f446f743981daaab9dd1 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Mon, 26 Oct 2020 21:00:40 -0700 Subject: [PATCH 032/169] Fix TripletMarginWithDistanceLoss example code (#46853) Summary: Fixes https://github.com/pytorch/pytorch/issues/45210 Removes `requires_grad=True` from all the `randint` Pull Request resolved: https://github.com/pytorch/pytorch/pull/46853 Reviewed By: izdeby Differential Revision: D24549483 Pulled By: soulitzer fbshipit-source-id: c03576571ed0b2dbb281870f29a28eb6f6209c65 --- torch/nn/modules/loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 796861b0f4b3..29f07543f7a8 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1323,9 +1323,9 @@ class TripletMarginWithDistanceLoss(_Loss): >>> # Initialize embeddings >>> embedding = nn.Embedding(1000, 128) - >>> anchor_ids = torch.randint(0, 1000, (1,), requires_grad=True) - >>> positive_ids = torch.randint(0, 1000, (1,), requires_grad=True) - >>> negative_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> anchor_ids = torch.randint(0, 1000, (1,)) + >>> positive_ids = torch.randint(0, 1000, (1,)) + >>> negative_ids = torch.randint(0, 1000, (1,)) >>> anchor = embedding(anchor_ids) >>> positive = embedding(positive_ids) >>> negative = embedding(negative_ids) From 37da6d26ffc3a5d1c3dcdc48358d4569f6e20123 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Mon, 26 Oct 2020 21:00:47 -0700 Subject: [PATCH 033/169] add fburl link to error message (#46795) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46795 add fburl link to the error message of missing ops so user can debug themselves. Test Plan: fburl.com/missing_ops Reviewed By: iseeyuan Differential Revision: D24519992 fbshipit-source-id: d2d16db7e9d9c84ce2c4600532eb253c30b31971 --- torch/csrc/jit/mobile/import.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index c7f41b902ad6..96b07612c903 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -86,8 +86,7 @@ void print_unsupported_ops_and_throw( TORCH_CHECK( false, "Following ops cannot be found. 
", - "May need to add them explicitly to the selective build operator whitelist, ", - "or re-run the export_opnames to update the whitelist:", + "Check fburl.com/missing_ops for the fix.", error_message); } From bcbb6baccfa12b3ce062d7108652eae284b501a9 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Mon, 26 Oct 2020 21:10:49 -0700 Subject: [PATCH 034/169] Add a warning message that torch.sign would not support complex numbers (#43280) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43280 Test Plan: Imported from OSS Reviewed By: ansley Differential Revision: D24538769 Pulled By: anjali411 fbshipit-source-id: ab2d5283501e4c1d7d401d508e32f685add7ebb1 --- aten/src/ATen/native/UnaryOps.cpp | 6 +++++- test/test_torch.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 99ec62e972c6..08b214846944 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -316,7 +316,11 @@ Tensor& rsqrt_out(Tensor& result, const Tensor& self) { return unary_op_impl_out Tensor rsqrt(const Tensor& self) { return unary_op_impl(self, at::rsqrt_out); } Tensor& rsqrt_(Tensor& self) { return unary_op_impl_(self, at::rsqrt_out); } -Tensor& sign_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sign_stub); } +Tensor& sign_out(Tensor& result, const Tensor& self) { + TORCH_CHECK(!self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + return unary_op_impl_out(result, self, sign_stub); +} Tensor sign(const Tensor& self) { return unary_op_impl(self, at::sign_out); } Tensor& sign_(Tensor& self) { return unary_op_impl_(self, at::sign_out); } diff --git a/test/test_torch.py b/test/test_torch.py index 4a5c1f3f168b..1f3241cf3b79 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5300,7 +5300,10 @@ def test_complex_assert_raises(self, device): self.assertRaises(RuntimeError, lambda: zeros.index_add(0, torch.arange(0, size[0], dtype=torch.long, device=device), tensor)) - self.assertRaises(RuntimeError, lambda: torch.sign(torch.tensor([4j], device=device, dtype=dtype))) + with self.assertRaisesRegex(RuntimeError, + (r'Unlike NumPy, torch.sign is not intended to support complex numbers\. 
' + r'Please use torch.sgn instead\.')): + torch.sign(torch.tensor([4j], device=device, dtype=dtype)) a = torch.rand((2, 2), dtype=dtype, device=device) b = torch.rand((2, 2), dtype=dtype, device=device) From 60eded6c0f34aac79e3adbce3c1d713d5e9bc0b0 Mon Sep 17 00:00:00 2001 From: Jack Montgomery Date: Mon, 26 Oct 2020 22:27:03 -0700 Subject: [PATCH 035/169] Add single element tuple output from to_backend/to_glow (#5029) Summary: Pull Request resolved: https://github.com/pytorch/glow/pull/5029 Support single element tuples in to_backend Test Plan: new unit test for to_glow Reviewed By: andrewmillspaugh Differential Revision: D24539869 fbshipit-source-id: fb385a7448167b2b948e70f6af081bcf78f338dc --- torch/csrc/jit/backends/backend_init.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/backends/backend_init.cpp b/torch/csrc/jit/backends/backend_init.cpp index 17c92cb14023..d23a518d1ca3 100644 --- a/torch/csrc/jit/backends/backend_init.cpp +++ b/torch/csrc/jit/backends/backend_init.cpp @@ -124,7 +124,7 @@ void initJitBackendBindings(PyObject* module) { static const auto method_ct = CodeTemplate(R"( def $method(self${,def_inputs}): typed_inputs: List[Any] = [${fwd_inputs,}] - $ret, = self.__backend.execute(self.__handles["$method"], typed_inputs) + $unpack, = self.__backend.execute(self.__handles["$method"], typed_inputs) ${refine,} return $ret )"); @@ -181,7 +181,9 @@ void initJitBackendBindings(PyObject* module) { out_ss << "_0"; type_check_ss << "assert isinstance(_0, "; - if (auto out_tuple_ty = out_ty->cast()) { + auto out_tuple_ty = out_ty->cast(); + + if (out_tuple_ty) { auto tuple_elements = out_tuple_ty->elements(); type_check_ss << tuple_elements[0]->str() << ")"; type_checks.emplace_back(type_check_ss.str()); @@ -201,6 +203,14 @@ void initJitBackendBindings(PyObject* module) { method_te.v("def_inputs", def_inputs); method_te.v("fwd_inputs", fwd_inputs); method_te.v("refine", type_checks); + method_te.s("unpack", out_ss.str()); + + // If the output type is a single element tuple then add an extra comma + // to ensure the final output maintains this type. 
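+      // (e.g. "_0" becomes "_0,", so the generated `return _0,` stays a
+      // one-element tuple instead of decaying to the bare value.)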
+ if (out_tuple_ty && out_tuple_ty->elements().size() == 1) { + out_ss << ","; + } + method_te.s("ret", out_ss.str()); loweredModule.define( From 6b50ccc41cd9feb624243a518e1362659f2fada4 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 26 Oct 2020 23:49:53 -0700 Subject: [PATCH 036/169] [quant][graphmode][fx] Support sigmoid/hardsigmoid/tanh in qat (#46738) (#46871) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46871 Test Plan: Imported from OSS Imported from OSS Reviewed By: vkuzo Differential Revision: D24547180 fbshipit-source-id: d2eb9aa74c6e5436204376b1a2ebcc6188d3562f --- test/quantization/test_quantize.py | 33 ++++- test/quantization/test_quantize_fx.py | 99 ++++++++++++--- test/test_quantization.py | 3 +- torch/quantization/fake_quantize.py | 22 +++- torch/quantization/fx/pattern_utils.py | 18 ++- .../quantization/fx/quantization_patterns.py | 34 ++++-- torch/quantization/fx/quantize.py | 113 ++++++++++-------- torch/quantization/quantization_mappings.py | 23 ++++ torch/quantization/quantize.py | 45 +++++-- 9 files changed, 292 insertions(+), 98 deletions(-) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 480ca18128c5..1ef6e9b89c0c 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -25,7 +25,8 @@ float_qparams_dynamic_qconfig, PerChannelMinMaxObserver, QConfigDynamic, - default_dynamic_quant_observer + default_dynamic_quant_observer, + FixedQParamsFakeQuantize, ) from torch.testing._internal.common_quantization import ( @@ -1247,6 +1248,36 @@ def forward(self, x): def test_leaky_relu(self): self._test_activation_op_impl(nn.LeakyReLU, nnq.LeakyReLU, {'negative_slope': 0.1, 'inplace': False}) + +class TestEagerModeQATOps(QuantizationTestCase): + def test_fixed_qparam_ops(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + self.hardsigmoid = torch.nn.Hardsigmoid() + self.tanh = torch.nn.Tanh() + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.sigmoid(x) + x = self.hardsigmoid(x) + x = self.tanh(x) + x = self.dequant(x) + return x + + m = M().train() + m.qconfig = default_qat_qconfig + m = prepare_qat(m) + for attr in ['sigmoid', 'hardsigmoid', 'tanh']: + self.assertEqual(type(getattr(m, attr).activation_post_process), FixedQParamsFakeQuantize) + m = convert(m) + # make sure activation post process is removed + for attr in ['sigmoid', 'hardsigmoid', 'tanh']: + self.assertFalse(hasattr(getattr(m, attr), 'activation_post_process')) + class TestFunctionalModule(QuantizationTestCase): # Histogram Observers are slow, so have no-deadline to ensure test doesn't time out @given(train_mode=st.booleans()) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 4e352a80daa3..4113b754a3e8 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -23,6 +23,7 @@ convert, PerChannelMinMaxObserver, QConfigDynamic, + FixedQParamsFakeQuantize, ) # test utils @@ -1410,7 +1411,7 @@ def test_general_value_ops(self): """ class M(torch.nn.Module): def __init__(self): - super(M, self).__init__() + super().__init__() self.conv = torch.nn.Conv2d(3, 3, 3) self.avg_pool1d = torch.nn.AvgPool1d(3) self.avg_pool2d = torch.nn.AvgPool2d(3) @@ -1418,9 +1419,6 @@ def __init__(self): self.adaptive_avg_pool1d = torch.nn.AdaptiveAvgPool1d((1)) self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d((1, 1)) 
self.adaptive_avg_pool3d = torch.nn.AdaptiveAvgPool3d((1, 1, 1)) - self.hardsigmoid = torch.nn.Hardsigmoid() - self.sigmoid = torch.nn.Sigmoid() - self.tanh = torch.nn.Tanh() def forward(self, x): x = self.conv(x) @@ -1442,21 +1440,6 @@ def forward(self, x): x = x.mean([2, 3], True) x = F.interpolate(x, 4, mode='nearest') x = F.interpolate(x, 4, mode='linear') - x = self.hardsigmoid(x) - x = F.hardsigmoid(x) - x = F.hardsigmoid(x, inplace=True) - x = x.hardsigmoid() - x.hardsigmoid_() - x = self.sigmoid(x) - x = torch.sigmoid(x) - # F.sigmoid is deprecated - x = x.sigmoid() - x.sigmoid_() - x = self.tanh(x) - # F.tanh is deprecated - x = torch.tanh(x) - x = x.tanh() - x.tanh_() x = self.conv(x) return x @@ -1488,6 +1471,84 @@ def forward(self, x): expected_node_occurrence=count_check, expected_node_list=order_check) + @skipIfNoFBGEMM + def test_fixed_qparams_ops(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.sigmoid = torch.nn.Sigmoid() + self.hardsigmoid = torch.nn.Hardsigmoid() + self.tanh = torch.nn.Tanh() + + def forward(self, x): + x = self.conv(x) + # F.sigmoid is deprecated + x = self.sigmoid(x) + x = torch.sigmoid(x) + x = x.sigmoid() + x.sigmoid_() + x = self.hardsigmoid(x) + x = F.hardsigmoid(x) + x = F.hardsigmoid(x, inplace=True) + x = x.hardsigmoid() + x.hardsigmoid_() + x = self.tanh(x) + # F.tanh is deprecated + x = torch.tanh(x) + x = x.tanh() + x.tanh_() + x = self.conv(x) + return x + + for eval_mode in [True, False]: + # This model is not executable since we just put all ops + # in the same forward + m = M() + if eval_mode: + m.eval() + qconfig = default_qconfig + prepare = prepare_fx + fq_count = 0 + else: + m.train() + qconfig = default_qat_qconfig + prepare = prepare_qat_fx + fq_count = 13 + + # nothing to fuse so skipping the fuse step + qconfig_dict = {'': qconfig} + prepared = prepare(m, qconfig_dict) + # check the correct number of activation_post_process is inserted + count_check = { + ns.call_module(FixedQParamsFakeQuantize) : fq_count, + } + self.checkGraphModuleNodes( + prepared, + expected_node_occurrence=count_check) + # not runnable + quantized = convert_fx(prepared) + + # This checks that the dequantize from the output of first conv + # is being propagated to the end, so that we don't insert extra + # observers + # check exact counts of quantize and dequantize + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_method('dequantize') : 1 + } + order_check = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nn.Sigmoid), + ns.call_module(nnq.Conv2d), + ns.call_method('dequantize'), + ] + self.checkGraphModuleNodes( + quantized, + expected_node_occurrence=count_check, + expected_node_list=order_check) + def test_float_functional(self): class TorchAdd(nn.Module): """Wrapper around torch.add so that all ops can be found at build""" diff --git a/test/test_quantization.py b/test/test_quantization.py index 8a6f05cb19de..26d3ee020983 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -44,6 +44,7 @@ from quantization.test_quantize import TestPostTrainingDynamic # noqa: F401 from quantization.test_quantize import TestQuantizationAwareTraining # noqa: F401 from quantization.test_quantize import TestEagerModeOps # noqa: F401 +from quantization.test_quantize import TestEagerModeQATOps # noqa: F401 # TODO: merge with other tests in test_quantize.py? 
from quantization.test_quantize import TestFunctionalModule # noqa: F401 @@ -64,7 +65,7 @@ from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 -# Tooling: numric_suite +# Tooling: numeric_suite from quantization.test_numeric_suite import TestEagerModeNumericSuite # noqa: F401 # Backward Compatibility diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index dd2333883325..cacaad6bea0b 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -159,12 +159,12 @@ def forward(self, X): @torch.jit.export def extra_repr(self): - return 'fake_quant_enabled={}, observer_enabled={},\ - quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, \ - scale={}, zero_point={}'.format( - self.fake_quant_enabled, self.observer_enabled, - self.quant_min, self.quant_max, - self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) + return 'fake_quant_enabled={}, observer_enabled={}, ' \ + 'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \ + 'scale={}, zero_point={}'.format( + self.fake_quant_enabled, self.observer_enabled, + self.quant_min, self.quant_max, + self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) def _save_to_state_dict(self, destination, prefix, keep_vars): # We cannot currently register scalar values as buffers, so need to manually @@ -226,9 +226,19 @@ def forward(self, X): self.quant_max) return X + @torch.jit.export def calculate_qparams(self): return self.scale, self.zero_point + @torch.jit.export + def extra_repr(self): + return 'fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, ' \ + 'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format( + self.fake_quant_enabled, self.observer_enabled, + self.scale, self.zero_point, self.dtype, + self.quant_min, self.quant_max, self.qscheme) + + default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True) default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127, diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py index efc1cd492d1b..ccbd7cc2a2c4 100644 --- a/torch/quantization/fx/pattern_utils.py +++ b/torch/quantization/fx/pattern_utils.py @@ -14,10 +14,17 @@ def get_default_fusion_patterns(): return DEFAULT_FUSION_PATTERNS DEFAULT_QUANTIZATION_PATTERNS = OrderedDict() +# a map from pattern to activation_post_process(observer/fake_quant) consstructor for output activation +# e.g. pattern: torch.sigmoid, +# output_activation_post_process: default_affine_fixed_qparam_fake_quant +DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP = dict() + # Register pattern for both static quantization and qat -def register_quant_pattern(pattern): +def register_quant_pattern(pattern, output_activation_post_process=None): def insert(fn): DEFAULT_QUANTIZATION_PATTERNS[pattern] = fn + if output_activation_post_process is not None: + DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP[pattern] = output_activation_post_process return fn return insert @@ -25,6 +32,12 @@ def insert(fn): def get_default_quant_patterns(): return DEFAULT_QUANTIZATION_PATTERNS +# a map from pattern to output activation post process constructor +# e.g. 
torch.sigmoid -> default_affine_fixed_qparam_fake_quant +def get_default_output_activation_post_process_map(): + return DEFAULT_OUTPUT_ACTIVATION_POST_PROCESS_MAP + + # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) # class ConvBNReLUFusion(): @@ -62,6 +75,9 @@ def is_match(modules, node, pattern, max_uses=sys.maxsize): elif node.target is getattr: if node.args[1] != pattern[1]: return False + elif isinstance(self_match, str): + if node.op != 'call_method' or node.target != self_match: + return False elif node.target != self_match: return False diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 12787e4a87db..2898a07a17c1 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -4,6 +4,10 @@ ) import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd +from torch.quantization import ( + default_affine_fixed_qparams_fake_quant, + default_symmetric_fixed_qparams_fake_quant, +) from ..quantization_mappings import ( get_static_quant_module_class, @@ -487,6 +491,22 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ return quantizer.quantized_graph.create_node( 'call_function', quantized_op, args, kwargs) +@register_quant_pattern(torch.nn.Hardsigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.functional.hardsigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('hardsigmoid', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('hardsigmoid_', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.Sigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.sigmoid, default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('sigmoid', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern('sigmoid_', default_affine_fixed_qparams_fake_quant) +@register_quant_pattern(torch.nn.Tanh, default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern(torch.tanh, default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern('tanh', default_symmetric_fixed_qparams_fake_quant) +@register_quant_pattern('tanh_', default_symmetric_fixed_qparams_fake_quant) +class FixedQParamsOpQuantizeHandler(QuantizeHandler): + def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + # these ops have quantized equivalents that do not need any extra information @register_quant_pattern(torch.nn.AdaptiveAvgPool1d) @register_quant_pattern(torch.nn.AdaptiveAvgPool2d) @@ -495,20 +515,16 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.nn.AvgPool2d) @register_quant_pattern(torch.nn.AvgPool3d) @register_quant_pattern(torch.nn.Dropout) -@register_quant_pattern(torch.nn.Hardsigmoid) @register_quant_pattern(torch.nn.Hardtanh) @register_quant_pattern(torch.nn.MaxPool1d) @register_quant_pattern(torch.nn.MaxPool2d) @register_quant_pattern(torch.nn.MaxPool3d) @register_quant_pattern(torch.nn.ReLU) @register_quant_pattern(torch.nn.ReLU6) -@register_quant_pattern(torch.nn.Sigmoid) -@register_quant_pattern(torch.nn.Tanh) @register_quant_pattern(torch.adaptive_avg_pool1d) @register_quant_pattern(torch.nn.functional.adaptive_avg_pool2d) 
@register_quant_pattern(torch.nn.functional.adaptive_avg_pool3d) @register_quant_pattern(torch.nn.functional.dropout) -@register_quant_pattern(torch.nn.functional.hardsigmoid) @register_quant_pattern(torch.nn.functional.hardtanh) @register_quant_pattern(torch.nn.functional.hardtanh_) @register_quant_pattern(torch.nn.functional.interpolate) @@ -528,11 +544,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.mean) @register_quant_pattern(torch.min) @register_quant_pattern(torch.repeat_interleave) -@register_quant_pattern(torch.sigmoid) @register_quant_pattern(torch.sort) @register_quant_pattern(torch.squeeze) @register_quant_pattern(torch.stack) -@register_quant_pattern(torch.tanh) @register_quant_pattern(torch.unsqueeze) @register_quant_pattern(operator.getitem) @register_quant_pattern(operator.floordiv) @@ -541,8 +555,6 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('contiguous') @register_quant_pattern('detach') @register_quant_pattern('detach_') -@register_quant_pattern('hardsigmoid') -@register_quant_pattern('hardsigmoid_') @register_quant_pattern('mean') @register_quant_pattern('numel') @register_quant_pattern('permute') @@ -553,13 +565,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('reshape') @register_quant_pattern('resize_') @register_quant_pattern('shape') -@register_quant_pattern('sigmoid') -@register_quant_pattern('sigmoid_') @register_quant_pattern('size') @register_quant_pattern('squeeze') @register_quant_pattern('squeeze_') -@register_quant_pattern('tanh') -@register_quant_pattern('tanh_') @register_quant_pattern('transpose') @register_quant_pattern('unsqueeze') @register_quant_pattern('unsqueeze_') @@ -570,7 +578,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # Default quantization handler, used for quantization of input and output # of quantizable objects (e.g. 
modules and functionals) -class DefaultQuant(QuantizeHandler): +class DefaultQuantizeHandler(QuantizeHandler): def convert(self, quantizer, node): assert self.all_nodes root_module = quantizer.modules[''] diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 3eebef4ff10a..026bd6214155 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -19,11 +19,15 @@ get_default_qat_module_mappings, ) -from ..quantize import _remove_qconfig +from ..quantize import ( + _remove_qconfig, + is_activation_post_process +) from .pattern_utils import ( is_match, get_default_quant_patterns, + get_default_output_activation_post_process_map, ) from .standalone_module import ( @@ -131,11 +135,6 @@ def assert_and_get_unique_device(module): device = next(iter(devices)) if len(devices) > 0 else None return device -def is_activation_post_process(module): - return (isinstance(module, torch.quantization.ObserverBase) or - isinstance(module, torch.quantization.FakeQuantize) or - isinstance(module, torch.quantization.FakeQuantizeBase)) - def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) @@ -353,7 +352,7 @@ def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_ # find _inputs_ to matched nodes that are not quantized, these # have to be quantized, which requires measuring stats, - # initialize an DefaultQuant object for each + # initialize an DefaultQuantizeHandler object for each quants = self._find_quants(model.graph, matches) self.activation_post_process_map = dict() @@ -383,7 +382,7 @@ def load_arg(a): continue prefix = node.name + '_activation_post_process_' - root_node, _, obj, qconfig = matches.get(node.name, (None, None, None, None)) + root_node, _, pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) if root_node is None: env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: @@ -430,9 +429,19 @@ def insert_observer(node, observer, device): if not activation_is_statically_quantized(qconfig): continue - # inserting observers for output of observed module, or mark the output - # as observed - if isinstance(obj, CopyNode): + if isinstance(obj, FixedQParamsOpQuantizeHandler) and model.training: + # we only insert fake quantize module in qat + activation_post_process_ctr = \ + get_default_output_activation_post_process_map().get(pattern, None) + assert activation_post_process_ctr is not None, \ + 'activation_post_process constructor not provided for ' + \ + 'pattern:' + str(pattern) + device = assert_and_get_unique_device(model) + insert_observer(node, activation_post_process_ctr(), device) + elif (isinstance(obj, FixedQParamsOpQuantizeHandler) and + not model.training) or isinstance(obj, CopyNode): + # inserting observers for output of observed module, or mark the output + # as observed assert node.op in [ 'call_module', 'call_function', @@ -472,6 +481,7 @@ def is_observed(input_arg): else: env[node.name] = observed_graph.node_copy(node, load_arg) + # insert observer for output of the node if node.name not in observed_node_names_set and node.name in quants: if is_standalone_module and node.name in graph_inputs: # we'll insert observer for input of standalone module @@ -480,11 +490,11 @@ def is_observed(input_arg): continue get_new_observer_name = get_new_attr_name_with_prefix(prefix) observer_name = get_new_observer_name(model) - _, qconfig, is_weight = 
quants[node.name] - if qconfig is not None: + _, activation_post_process_ctr = quants[node.name] + if activation_post_process_ctr is not None: # TODO: use insert_observer - new_observer = \ - qconfig.weight() if is_weight else qconfig.activation() + new_observer = activation_post_process_ctr() + # respect device affinity when adding observers device = assert_and_get_unique_device(model) if device: @@ -669,7 +679,7 @@ def is_quantized(node): graph_output = map_arg(node.args[0], load_non_quantized) self.quantized_graph.output(graph_output) continue - root_node, matched, obj, qconfig = matches.get(node.name, (None, None, None, None)) + root_node, matched, matched_pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) if root_node is node: if qconfig is None: result = self.quantized_graph.node_copy(node, load_non_quantized) @@ -682,7 +692,10 @@ def is_quantized(node): quantized = True # Need to get correct quantized/non-quantized state for the output of CopyNode - if isinstance(obj, CopyNode): + if type(obj) in [ + CopyNode, + FixedQParamsOpQuantizeHandler + ]: assert node.op in [ 'call_module', 'call_function', @@ -751,13 +764,7 @@ def load_arg(a): else: env[node.name] = act_post_process_removed_graph.node_copy(node, load_arg) - module_dict = dict(model.named_modules()) - to_be_removed = [] - for name, module in model.named_modules(): - if is_activation_post_process(module) and not is_submodule_of_fake_quant(name, module, module_dict): - to_be_removed.append(name) - for n in to_be_removed: - delattr(model, n) + # removes qconfig and activation_post_process modules _remove_qconfig(model) model = GraphModule(model, act_post_process_removed_graph) return model @@ -832,10 +839,10 @@ def _find_matches( Outputs a map of node_name -> - (node, matched_values, QuantizeHandler instance, qconfig) + (node, matched_values, matched_pattern, QuantizeHandler instance, qconfig) For example, { - 'relu_1': (relu_1, [relu_1], , QConfig(...)), + 'relu_1': (relu_1, [relu_1], torch.nn.functional.relu, , QConfig(...)), ... 
} """ @@ -862,7 +869,7 @@ def record_match(pattern, node, matched): matched = [] record_match(pattern, node, matched) for n in matched: - match_map[n.name] = (node, matched, value(self, node), self.qconfig_map[n.name]) + match_map[n.name] = (node, matched, pattern, value(self, node), self.qconfig_map[n.name]) all_matched.add(n.name) # break after finding the first match break @@ -873,7 +880,7 @@ def record_match(pattern, node, matched): type(self.modules[node.target]) in custom_module_class_mapping: custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], CustomModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], None, CustomModuleQuantizeHandler(self, node), custom_module_qconfig) def is_standalone_module(module_path): if standalone_module_names is None: @@ -888,7 +895,7 @@ def is_standalone_module(module_path): # add node to matched nodes custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( - node, [node], StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) + node, [node], None, StandaloneModuleQuantizeHandler(self, node), custom_module_qconfig) return match_map @@ -902,16 +909,13 @@ def _find_quants(self, graph, matches): - matches: output of self._find_matches function Outputs a map of - node_name -> (QuantizeHandler instance (always DefaultQuant), qconfig) + node_name -> (QuantizeHandler instance (always DefaultQuantizeHandler), + activation_post_process (observer/fake_quantize module) constructor) """ quants = {} - def visit(node, qconfig): + def visit(node, matched_pattern, qconfig): def visit_arg(arg): - # note: we have to measure quantization information - # even for nodes where we might not use it because it is already - # quantized. This is because each match has the option to - # say NotImplemented (if for instance, it is an __add__ and the data type is not appropriate) is_weight = False if isinstance(node, Node) and node.op == 'call_function' and node.target in WEIGHT_INDEX_DICT: for i, node_arg in enumerate(node.args): @@ -919,26 +923,39 @@ def visit_arg(arg): is_weight = True if qconfig is not None and \ (activation_is_statically_quantized(qconfig) or is_weight): - # overwrite previous quant config - quants[arg.name] = (DefaultQuant(self, arg), qconfig, is_weight) + act_post_process_ctr = qconfig.weight if is_weight else qconfig.activation + quants[arg.name] = (DefaultQuantizeHandler(self, arg), qconfig, is_weight) + # overwrite the constructor from qconfig + act_post_process_ctr = \ + get_default_output_activation_post_process_map().get( + matched_pattern, + act_post_process_ctr) + # overwrite previous activation post process constructor if necessary + quants[arg.name] = (DefaultQuantizeHandler(self, arg), act_post_process_ctr) return visit_arg for node in graph.nodes: if node.name in matches: - root_node, matched, obj, qconfig = matches[node.name] + root_node, matched_nodes, matched_pattern, quantize_handler, qconfig = matches[node.name] # don't attach observer/fake_quant for CopyNode - if isinstance(obj, CopyNode): + if isinstance(quantize_handler, CopyNode): qconfig = None if root_node is node: - # matched[-1] is the first op in the sequence and - # matched[0] is the last op in the sequence + # matched_nodes[-1] is the first op in the sequence and + # matched_nodes[0] is the last op in the sequence # inputs - map_arg(matched[-1].args, visit(matched[-1], qconfig)) - map_arg(matched[-1].kwargs, visit(matched[-1], qconfig)) + # matched_pattern is set to None for inputs because + 
# we only want to select QuantizeHandler object based + # on pattern for output, inputs will always use + # DefaultQuantizeHandler + map_arg(matched_nodes[-1].args, visit(matched_nodes[-1], None, qconfig)) + map_arg(matched_nodes[-1].kwargs, visit(matched_nodes[-1], None, qconfig)) + # output - if isinstance(obj, StandaloneModuleQuantizeHandler): - # we don't insert observer for output of custom - # module - continue - map_arg(matched[0], visit(None, qconfig)) + # we don't insert observer for output of standalone module + if not isinstance(quantize_handler, StandaloneModuleQuantizeHandler): + # passing in matched_pattern here so that we can customize + # activation_post_process constructor for output based on the pattern, e.g. + # for sigmoid op we'll use default_affine_fixed_qparam_fake_quant + map_arg(matched_nodes[0], visit(None, matched_pattern, qconfig)) return quants diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 40fef784ce2d..82340a49309c 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -10,6 +10,10 @@ import torch.nn.qat as nnqat from .stubs import QuantStub, DeQuantStub +from .fake_quantize import ( + default_affine_fixed_qparams_fake_quant, + default_symmetric_fixed_qparams_fake_quant, +) # Default map for swapping float module to quantized ones DEFAULT_STATIC_QUANT_MODULE_MAPPINGS = { @@ -91,6 +95,13 @@ F.leaky_relu: torch._ops.ops.quantized.leaky_relu, } +# mapping from module to output activation post process class +DEFAULT_MODULE_TO_ACT_POST_PROCESS = { + nn.Hardsigmoid: default_affine_fixed_qparams_fake_quant, + nn.Sigmoid: default_affine_fixed_qparams_fake_quant, + nn.Tanh: default_symmetric_fixed_qparams_fake_quant, +} + def get_default_static_quant_module_mappings(): ''' Get module mapping for post training static quantization ''' @@ -158,3 +169,15 @@ def get_quantized_operator(float_op): assert quantized_op is not None, \ 'Operator {} does not have corresponding quantized op'.format(str(float_op)) return quantized_op + +def get_default_special_act_post_process(module_cls): + r""" Get the special activation post process for `module`, this has + higher priority than the activation post process in `qconfig` + e.g. 
+ input: torch.nn.Sigmoid + output: default_affine_fixed_qparam_fake_quant + """ + return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(module_cls, None) + +def has_special_act_post_process(module_cls): + return module_cls in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 9aa52d373ff6..f217c5ffb65c 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -9,14 +9,23 @@ import torch.nn.quantized as nnq import torch.nn.intrinsic.qat as nniqat -from .quantization_mappings import (get_default_dynamic_quant_module_mappings, - get_default_static_quant_module_mappings, - get_default_qat_module_mappings, - get_default_qconfig_propagation_list) +from .quantization_mappings import ( + get_default_dynamic_quant_module_mappings, + get_default_static_quant_module_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + has_special_act_post_process, + get_default_special_act_post_process, +) from .stubs import DeQuantStub, QuantWrapper from .qconfig import default_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig +def is_activation_post_process(module): + return (isinstance(module, torch.quantization.ObserverBase) or + isinstance(module, torch.quantization.FakeQuantize) or + isinstance(module, torch.quantization.FakeQuantizeBase)) + def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, qconfig_parent=None, prefix=''): r"""This is a helper function for `propagate_qconfig_` @@ -107,8 +116,8 @@ def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=No ) device = next(iter(devices)) if len(devices) > 0 else None - def get_activation_post_process(qconfig, device): - activation = qconfig.activation() + def get_activation_post_process(qconfig, device, special_act_post_process=None): + activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() if device is not None: activation.to(device) return activation @@ -116,13 +125,13 @@ def get_activation_post_process(qconfig, device): def needs_observation(m): return hasattr(m, 'qconfig') and m.qconfig is not None - def insert_activation_post_process(m): + def insert_activation_post_process(m, special_act_post_process=None): """ Adds an activation post process module and register a post hook that calls the module """ if needs_observation(m): # observer and hook will be gone after we swap the module - m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device)) + m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device, special_act_post_process)) # Register observer as the first entry in the hook list # All post forward hooks are preserved and will be executed after the observer before convert handle = register_activation_post_process_hook(m) @@ -130,8 +139,11 @@ def insert_activation_post_process(m): for name, child in module.named_children(): if type(child) == nnq.FloatFunctional or type(child) == nnq.QFunctional: - if hasattr(child, 'qconfig') and child.qconfig is not None: + if needs_observation(child): child.activation_post_process = get_activation_post_process(child.qconfig, device) + elif has_special_act_post_process(type(child)): + special_act_post_process = get_default_special_act_post_process(type(child)) + insert_activation_post_process(child, special_act_post_process) elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: insert_activation_post_process(child) 
elif needs_observation(child) and type(child) in custom_module_class_mapping: @@ -229,6 +241,19 @@ def prepare(model, inplace=False, allow_list=None, custom_module_class_mapping=custom_module_class_mapping) return model +def _remove_activation_post_process(module): + # TODO: maybe we should change activation_post_process to _activation_post_process + # to prevent it from being used by user + if hasattr(module, 'activation_post_process') and \ + is_activation_post_process(module.activation_post_process): + delattr(module, 'activation_post_process') + + # remove activation_post_proceess hook + for handle_id, hook_fn in module._forward_hooks.items(): + if hook_fn is _observer_forward_hook: + module._forward_hooks.pop(handle_id) + +# TODO: rename to something more general def _remove_qconfig(module): r"""Clean up the qconfig left in the module so that new qconfig can be propagated. @@ -242,6 +267,8 @@ def _remove_qconfig(module): if hasattr(module, "qconfig"): del module.qconfig + _remove_activation_post_process(module) + def quantize(model, run_fn, run_args, mapping=None, inplace=False): r"""Quantize the input float model with post training static quantization. From 4b6e30719112a23553d0bfc524fcb888078ac55d Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Mon, 26 Oct 2020 23:57:04 -0700 Subject: [PATCH 037/169] Replace flatten tensors with flatten loops. (#46737) Summary: This is the second attempt at replacing flatten tensors with flatten loops in `TensorExprKernel::generateStmt`. The first attempt (https://github.com/pytorch/pytorch/pull/46539) resulted in a build failure due to an exception that gets thrown during inline. The reason for the build failure was because there was an inline step, which was supposed to happen on the unflattened tensors. This was necessary earlier because for every flattened tensor there was an unflattened tensor which had to be inlined. That is no longer necessary since we do not have 2 tensors (flattened and unflattened) now. Removed this inline. Checked python and cpp tests on CPU as well as CUDA. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46737 Reviewed By: anjali411, izdeby Differential Revision: D24534529 Pulled By: navahgar fbshipit-source-id: 8b131a6be076fe94ed369550d9f54d3879fdfefd --- torch/csrc/jit/tensorexpr/kernel.cpp | 87 +++++++--------------------- torch/csrc/jit/tensorexpr/kernel.h | 2 - 2 files changed, 20 insertions(+), 69 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index f9b933b78d65..53837184399d 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1355,56 +1355,8 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } } -void TensorExprKernel::flattenTensors(BackendType backendType) { - if (backendType != BackendType::kCudaCodeGen && - backendType != BackendType::kBlockCodeGen) { - // We only need to flatten for GPU, for other backends just use the same - // tensors. 
- flatTensorOutputs_ = tensorOutputs_; - return; - } - - flatTensorOutputs_.resize(tensorOutputs_.size()); - for (size_t tensorIdx = 0; tensorIdx < tensorOutputs_.size(); tensorIdx++) { - Tensor* tensor = tensorOutputs_[tensorIdx]; - ExprHandle totalCount = ExprHandle(tensor->dim(0)); - for (int i = 1; i < tensor->ndim(); i++) { - const IntImm* totalCountImm = totalCount.AsNode(); - const IntImm* tensorDimImm = dynamic_cast(tensor->dim(i)); - if (totalCountImm && tensorDimImm) { - // TODO: switch to real constant folding when it is available. - totalCount = ExprHandle(totalCountImm->value() * tensorDimImm->value()); - } else { - totalCount = totalCount * ExprHandle(tensor->dim(i)); - } - } - // Flatten the index for GPU kernels. - // TODO: move this to fusing axis when it is ready. - Tensor* newOut = Compute( - tensor->buf()->name_hint() + "_flat", - {totalCount}, - [tensor](const VarHandle& index) -> ExprHandle { - std::vector dims; - ExprHandle value = index; - for (int i = tensor->ndim() - 1; i >= 0; i--) { - ExprHandle idx = value; - if (i > 0) { - idx = Mod::make(value, ExprHandle(tensor->dim(i))); - } - dims.push_back(idx); - value = value / ExprHandle(tensor->dim(i)); - } - std::reverse(dims.begin(), dims.end()); - return tensor->call(dims); - }); - flatTensorOutputs_[tensorIdx] = newOut; - } -} - Stmt* TensorExprKernel::generateStmt(BackendType backendType) { - flattenTensors(backendType); - - torch::jit::tensorexpr::LoopNest l(flatTensorOutputs_); + torch::jit::tensorexpr::LoopNest l(tensorOutputs_); GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n"); bool hasReduction = NodeFinder::find(l.root_stmt()).size() != 0; @@ -1417,12 +1369,12 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { l.computeInline(p.second->buf()); } if (backendType == kCudaCodeGen) { - for (size_t i = 0; i < flatTensorOutputs_.size(); i++) { - Tensor* tensor = flatTensorOutputs_[i]; - - // For every output tensor we've created a flattened 1D tensor - let's - // mark the original output tensor with computeInline - l.computeInline(tensorOutputs_[i]->buf()); + for (auto tensor : tensorOutputs_) { + std::vector loops = l.getLoopStmtsFor(tensor); + TORCH_INTERNAL_ASSERT(!loops.empty(), "loops should not be empty"); + For* flattened = nullptr; + LoopNest::flatten(loops, &flattened); + assert(flattened); int loopLevels = getTECudaPointwiseLoopLevels(); const int kDefaultLoopLevels = 2; @@ -1437,8 +1389,7 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { if (blockSize < 0) { blockSize = kDefaultBlockSize; } - std::vector loops = l.getLoopStmtsFor(tensor); - l.splitWithMask(loops[0], blockSize, &outer, &inner); + l.splitWithMask(flattened, blockSize, &outer, &inner); l.setGPUBlockIndex(outer, 0); l.setGPUThreadIndex(inner, 0); } else if (loopLevels == 3) { @@ -1451,8 +1402,7 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { const int kDefaultBlockSize = 256; blockCount = (blockCount > 0) ? blockCount : kDefaultBlockCount; blockSize = (blockSize > 0) ? 
blockSize : kDefaultBlockSize; - std::vector loops = l.getLoopStmtsFor(tensor); - l.splitWithMask(loops[0], blockCount * blockSize, &outer, &inner); + l.splitWithMask(flattened, blockCount * blockSize, &outer, &inner); l.splitWithMask(inner, blockSize, &inner1, &inner2); l.setGPUBlockIndex(inner1, 0); l.setGPUThreadIndex(inner2, 0); @@ -1465,12 +1415,11 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { if (backendType == kBlockCodeGen) { auto block_analysis = std::make_unique(); - for (size_t i = 0; i < flatTensorOutputs_.size(); i++) { + for (auto tensor : tensorOutputs_) { const int default_fp16_blocksize = 16; const int default_uint8_blocksize = 32; int blockSize = default_fp16_blocksize; // We only handle looplevels == 2 for now - Tensor* tensor = flatTensorOutputs_[i]; // Run Block analysis to get multi dim buffer info auto root_stmt = l.root_stmt(); root_stmt->accept(block_analysis.get()); @@ -1478,12 +1427,16 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { if (tensor->buf()->dtype().scalar_type() == ScalarType::Byte) { blockSize = default_uint8_blocksize; } - l.computeInline(l.getLoopBodyFor(tensorOutputs_[i])); - For* outer; - For* inner; + std::vector loops = l.getLoopStmtsFor(tensor); - TORCH_INTERNAL_ASSERT(loops.size() > 0, "loops should not be empty"); - l.splitWithMask(loops[0], blockSize, &outer, &inner); + TORCH_INTERNAL_ASSERT(!loops.empty(), "loops should not be empty"); + For* flattened = nullptr; + LoopNest::flatten(loops, &flattened); + assert(flattened); + + For* outer = nullptr; + For* inner = nullptr; + l.splitWithMask(flattened, blockSize, &outer, &inner); l.setGPUBlockIndex(outer, 0); l.setGPUThreadIndex(inner, 0); l.setBufferMap(outer, block_analysis->getBufferMap()); @@ -1531,7 +1484,7 @@ std::vector TensorExprKernel::prepareBufferArgs() { params.emplace_back(stride.var); } } - for (auto& o : flatTensorOutputs_) { + for (auto& o : tensorOutputs_) { params.emplace_back(o); } return params; diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 028b18112ab2..b21d6607fffb 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -125,7 +125,6 @@ class TORCH_API TensorExprKernel { Tensor* computeValue(const torch::jit::Value* v); - void flattenTensors(BackendType backendType); Stmt* generateStmt(BackendType backendType); std::vector prepareBufferArgs(); @@ -191,7 +190,6 @@ class TORCH_API TensorExprKernel { int64_t nInputs_ = 0; std::vector kernelArgs_; std::vector tensorOutputs_; - std::vector flatTensorOutputs_; std::unordered_map tensors_; std::unordered_map scalars_; std::unique_ptr codegen_; From 8640905088a002dbf149b59333a190c8aed2f5e5 Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Tue, 27 Oct 2020 00:10:09 -0700 Subject: [PATCH 038/169] add sparse_nn_partition (#46390) Summary: WIP: This PR adds sparse_nn_partition into Partitioner class. It includes logical device assignment for all dag nodes. The basic idea is to do size_based_partition separately for embedding nodes and non-embedding nodes. A test unit is also added in test_fx_experimental.py. 
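To make the idea concrete, here is a minimal, self-contained sketch of the separation scheme in plain Python. The `Node`, `greedy_pack` and `toy_sparse_nn_partition` names are hypothetical helpers invented for illustration only; this is not the actual `Partitioner` API (which operates on FX graph nodes, `Partition` objects and `Device` entries), just the packing and device-assignment logic under the assumption that node sizes are known up front:
```
from typing import List, NamedTuple, Tuple

class Node(NamedTuple):
    name: str
    size_bytes: int
    is_embedding: bool

def greedy_pack(nodes: List[Node], budget: int) -> List[List[Node]]:
    """Greedily group nodes into partitions whose total size fits the budget."""
    partitions: List[List[Node]] = [[]]
    used = 0
    for node in nodes:
        if node.size_bytes > budget:
            raise RuntimeError(node.name + ' is too large to fit into a device')
        if used + node.size_bytes > budget:
            # Start a new partition when the current one runs out of memory.
            partitions.append([])
            used = 0
        partitions[-1].append(node)
        used += node.size_bytes
    return [p for p in partitions if p]

def toy_sparse_nn_partition(nodes: List[Node], budget: int) -> List[Tuple[List[Node], List[int]]]:
    """Pack embedding and non-embedding nodes into separate partitions, then
    give every embedding partition its own logical device id; the (smaller)
    non-embedding partitions must fit alongside any embedding partition and
    are assigned to all occupied devices."""
    emb_parts = greedy_pack([n for n in nodes if n.is_embedding], budget)
    dense_parts = greedy_pack([n for n in nodes if not n.is_embedding], budget)
    dense_total = sum(n.size_bytes for p in dense_parts for n in p)
    placed: List[Tuple[List[Node], List[int]]] = []
    for device_id, part in enumerate(emb_parts):
        if sum(n.size_bytes for n in part) + dense_total > budget:
            raise RuntimeError('embedding and non-embedding partitions cannot share one device')
        placed.append((part, [device_id]))
    occupied = list(range(len(emb_parts)))
    for part in dense_parts:
        placed.append((part, occupied))
    return placed
```
The real implementation additionally tracks parent/child edges between partitions and recombines small partitions of the same kind, but the device-assignment invariant is the same: each embedding partition owns a distinct logical device, and every non-embedding partition must be able to coexist with any of them.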
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46390 Reviewed By: gcatron Differential Revision: D24555415 Pulled By: scottxu0730 fbshipit-source-id: 8772af946d5226883759a02a1c827cfdfce66097 --- test/test_fx_experimental.py | 92 ++++- torch/fx/experimental/Partitioner.py | 365 ++++++++++++------ .../experimental/subgraph_creation_example.py | 2 +- 3 files changed, 327 insertions(+), 132 deletions(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 8a6fbe4a3a90..34065987a4b2 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1,7 +1,7 @@ import torch from torch.fx.symbolic_trace import symbolic_trace from torch.fx.experimental import GraphManipulation -from torch.fx.experimental.Partitioner import Partitioner, Device +from torch.fx.experimental.Partitioner import Partitioner, Device, PartitionerConfig from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase @@ -20,12 +20,14 @@ def forward(self, a, b): ) partitioner = Partitioner() devices = [ - Device('dev_0', 125), - Device('dev_1', 125), - Device('dev_2', 125) + Device('dev_0', 125, 0), + Device('dev_1', 125, 1), + Device('dev_2', 125, 2) ] - ret = partitioner.partition_graph(traced, m, devices) + partitioner_config = PartitionerConfig(devices) + ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules + dag = ret.dag self.assertEqual(traced(a, b), module_with_submodules(a, b)) def test_size_based_partition(self): @@ -51,12 +53,14 @@ def forward(self, a, b): ) partitioner = Partitioner() devices = [ - Device('dev_0', 125), - Device('dev_1', 125), - Device('dev_2', 125) + Device('dev_0', 125, 0), + Device('dev_1', 125, 1), + Device('dev_2', 125, 2) ] - ret = partitioner.partition_graph(traced, m, devices) + partitioner_config = PartitionerConfig(devices) + ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules + dag = ret.dag self.assertEqual(traced(a, b), module_with_submodules(a, b)) assert len(module_with_submodules.graph.nodes) == 7 @@ -82,14 +86,76 @@ def forward(self, a, b): ) partitioner = Partitioner() devices = [ - Device('dev_0', 125), - Device('dev_1', 125), - Device('dev_2', 125) + Device('dev_0', 125, 0), + Device('dev_1', 125, 1), + Device('dev_2', 125, 2) ] - ret = partitioner.partition_graph(traced, m, devices) + partitioner_config = PartitionerConfig(devices) + ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules + dag = ret.dag self.assertEqual(traced(a, b), module_with_submodules(a, b)) assert len(module_with_submodules.graph.nodes) == 5 + def test_sparse_nn_partition(self): + class MyRecommendationModule(torch.nn.Module): + def create_mlp(self, num_of_layers: int, input_size: int, output_size: int): + layers = torch.nn.ModuleList() + for _ in range(num_of_layers): + ll = torch.nn.Linear(input_size, output_size) + layers.append(ll) + layers.append(torch.nn.ReLU()) + return layers + + def __init__(self): + super(MyRecommendationModule, self).__init__() + layers = self.create_mlp(4, 4, 4) + self.bottom_layers = torch.nn.Sequential(*layers) + layers = self.create_mlp(3, 24, 24) + self.top_layers = torch.nn.Sequential(*layers) + self.embedding_layers = torch.nn.ModuleList() + el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + self.embedding_layers.append(el) + for i in range(3): + el = 
torch.nn.EmbeddingBag(1000000, 4, mode='sum', sparse=True) + self.embedding_layers.append(el) + el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + self.embedding_layers.append(el) + + def forward(self, a, b, offset): + x = self.bottom_layers(a) + y = [] + c = [] + for i in range(len(self.embedding_layers)): + temp = torch.randint(10, (8, )) + c.append(temp + b) + for i in range(len(self.embedding_layers)): + if i % 2 == 0: + y.append(self.embedding_layers[i](c[i], offset)) + else: + y.append(self.embedding_layers[i](torch.randint(10, (8, )), offset)) + z = torch.cat([x] + y, dim=1) + p = self.top_layers(z) + return p + + m = MyRecommendationModule() + a = torch.rand(2, 4) + b = torch.randint(10, (8, )) + offset = torch.randint(1, (2, )) + traced = symbolic_trace(m) + GraphManipulation.get_size_of_all_nodes(traced, [a, b, offset]) + devices = [ + Device('dev_0', 33000000, 0), + Device('dev_1', 33000000, 1), + Device('dev_2', 33000000, 2) + ] + partitioner_config = PartitionerConfig(devices, is_sparse_nn=True) + partitioner = Partitioner() + ret = partitioner.partition_graph(traced, m, partitioner_config) + module_with_submodules = ret.module_with_submodules + dag = ret.dag + self.assertEqual(traced(a, b, offset), module_with_submodules(a, b, offset)) + assert len(module_with_submodules.graph.nodes) == 24 + if __name__ == '__main__': run_tests() diff --git a/torch/fx/experimental/Partitioner.py b/torch/fx/experimental/Partitioner.py index d01ed6075e2b..fadbac42cdd0 100644 --- a/torch/fx/experimental/Partitioner.py +++ b/torch/fx/experimental/Partitioner.py @@ -1,18 +1,31 @@ from torch.fx.graph_module import GraphModule from torch.fx.node import Node, map_arg -from typing import Dict, List, Union, Set, NamedTuple, Tuple +from typing import Dict, List, Set, NamedTuple, Tuple import torch from torch.fx.experimental.subgraph_creation_example import split_module import operator -class DAGNode(NamedTuple): +class DAGNode(): """ DAGNode class maintains useful information for a partition (submodule). inputs(submodule node) and outputs(submodule node). """ - submodule: Node - input_nodes: List[Node] - output_nodes: List[Node] + def __init__( + self, + submodule_node: Node, + input_nodes: List[Node], + output_nodes: List[Node], + logical_device_ids: List[int], + size_bytes: int + ) -> None: + self.submodule_node: Node = submodule_node + self.input_nodes: List[Node] = input_nodes + self.output_nodes: List[Node] = output_nodes + self.logical_device_ids: List[int] = logical_device_ids + self.size_bytes = size_bytes + + def __str__(self) -> str: + return str(self.submodule_node) class DAG: """DAG class contains all the DAG nodes""" @@ -21,38 +34,36 @@ def __init__(self) -> None: def create_node( self, - submodule: Node, + submodule_node: Node, input_nodes: List[Node], - output_nodes: List[Node] + output_nodes: List[Node], + logical_devices: List[int], + size_bytes: int ) -> None: - node = DAGNode(submodule, input_nodes, output_nodes) + node = DAGNode(submodule_node, input_nodes, output_nodes, logical_devices, size_bytes) self.nodes.append(node) class Partition: """Partition class contains all the information about an individual partition. It also provides necessary methods for manipulation the partition. 
""" - def __init__(self, partition_id: int, fx_module: GraphModule) -> None: - self.graph_module = fx_module + def __init__(self, partition_id: int) -> None: self.nodes: Set[Node] = set() self.partition_id = partition_id self.parents: Set['Partition'] = set() self.children: Set['Partition'] = set() self.bfs_level: int = -1 - - def add_node(self, node: Node) -> None: - """Append a new node into the partition.""" - self.nodes.add(node) - - def add_parent(self, partition: 'Partition') -> None: - self.parents.add(partition) - - def add_child(self, partition: 'Partition') -> None: - self.children.add(partition) + self.used_mem_bytes: int = 0 + self.logical_device_ids: List[int] = [] def __str__(self): return str(self.partition_id) + def recalculate_mem_size(self): + self.used_mem_bytes = 0 + for node in self.nodes: + self.used_mem_bytes += get_extra_size_of(node, self.nodes) + class PartitionResult(NamedTuple): """NameTuple used for returning DAG and a new graph module """ @@ -61,45 +72,68 @@ class PartitionResult(NamedTuple): class Device(NamedTuple): name: str - available_mem_bytes: Union[float, int] + available_mem_bytes: int + logical_id: int + +class PartitionerConfig(NamedTuple): + devices: List[Device] + is_sparse_nn: bool = False + +def get_extra_size_of(node: Node, nodes: Set[Node]) -> int: + """Given a node and a set of nodes, + this function return the extra size that needed + if this node is included in this set. + """ + # Find all its input nodes + input_nodes: Dict[Node, None] = {} + map_arg(node.args, lambda n: input_nodes.setdefault(n)) + map_arg(node.kwargs, lambda n: input_nodes.setdefault(n)) + # Calculate total size of related nodes + total_size_of_input_nodes = 0 + for n in input_nodes: + # Make sure this node hasn't been in this set yet + if n not in nodes: + size_bytes = getattr(n, 'size_bytes', None) + if size_bytes: + total_size_of_input_nodes += size_bytes.output_size + else: + raise RuntimeError('node has no size_bytes attr') + # Don't forget the op node itself + size_bytes = getattr(node, 'size_bytes', None) + if size_bytes: + total_size_of_input_nodes += size_bytes.total_size + else: + raise RuntimeError('node has no size_bytes attr') + return total_size_of_input_nodes class Partitioner: """A graph module may not fit into one device. Partitioner class helps cut one graph into subgraphs (partitions), so that each partition could fit into a different device. The main function of this class is self.partition_graph. - For self.partition_graph, first, it checks the size of the whole graph - and see if the whole graph can fit into one device. - If it does, it goes to self.find_single_partition - If the whole graph is even larger than the combined memory size of all devices, - a RuntimeError is raised. - If the whole graph cannot fit into one devices but - could be split into multiple devices, it goes to self.size_based_partition. - After the size_based_partition, it checks if the number of partitions exceeds - the number of devices. If it does, a RuntimeError is raised. - Otherwise, a DAG structure is returned + It will partition the graph based on the scheme specified in partition_config + A DAG structure is returned along with a new graph module with partitions as submodule nodes. 
""" def __init__(self) -> None: - self.partitions: Set[Partition] = set() + self.partitions: List[Partition] = [] + self.node_to_partitions: Dict[Node, int] = {} self.devices: List[Device] = [] - self.node_to_partitions: Dict[Node, List[int]] = {} - self.partition_to_used_mem_bytes: Dict[Partition, int] = {} def partition_graph( self, fx_module: GraphModule, torch_module: torch.nn.Module, - devices: List[Device] + partitioner_config: PartitionerConfig ) -> PartitionResult: """ - Given the fx module, torch module and devices, + Given the fx module, torch module and partitioner_config, find the partitions, do the partitions, and then return a DAG and a new fx module with submodule nodes (partitions) """ self.graph_module = fx_module - self.devices = devices self.torch_module = torch_module + self.devices = partitioner_config.devices if len(self.devices) == 0: raise RuntimeError('No devices') available_mem_bytes = self.devices[0].available_mem_bytes @@ -114,34 +148,36 @@ def partition_graph( break total_size_of_graph += node.size_bytes.total_size if total_size_of_graph <= available_mem_bytes: - self.find_single_partition() + self.find_single_partition(total_size_of_graph) elif total_size_of_graph > len(self.devices) * available_mem_bytes: raise RuntimeError('Devices have no enough memory for the module') else: if not all(device.available_mem_bytes == available_mem_bytes for device in self.devices): raise RuntimeError('All devices must have same memory size!') - self.size_based_partition(available_mem_bytes) - # Check if enought devices are provided for all partitions - if len(self.partitions) > len(self.devices): - raise RuntimeError('Lack of Devices') + if partitioner_config.is_sparse_nn: + self.sparse_nn_partition(available_mem_bytes) + else: + self.size_based_partition(available_mem_bytes) module_with_submodules = self.do_partition() # The DAG contains DAGNodes with info of each partition's input nodes, output nodes # and how partitions are connected. - dag = self.dump_partition_DAG(module_with_submodules) + dag = self.dump_dag(module_with_submodules) ret = PartitionResult(dag, module_with_submodules) return ret - def find_single_partition(self) -> None: + def find_single_partition(self, total_size_of_graph) -> None: """Only one partition (one graph on one device).""" partition_0 = self.create_partition() for node in self.graph_module.graph.nodes: if node.op == 'output': break - self.node_to_partitions[node] = [partition_0.partition_id] - partition_0.add_node(node) + self.node_to_partitions[node] = partition_0.partition_id + partition_0.nodes.add(node) + partition_0.used_mem_bytes = total_size_of_graph + partition_0.logical_device_ids = [self.devices[0].logical_id] return - def size_based_partition(self, available_mem_bytes: Union[float, int]) -> None: + def size_based_partition(self, available_mem_bytes: int) -> None: """This method partitions the graph based on memory size. We assume all devices have the same memory size. The basic idea is: @@ -159,73 +195,48 @@ def size_based_partition(self, available_mem_bytes: Union[float, int]) -> None: Then through self.combine_partition_based_on_size(), partitions will be combined to keep as less partitions as possible. 
- self.check_partition_dependecy checks if the combination of - partitions leads to a circular dependency """ # Create the first partition partition = self.create_partition() # Track the used mem for the current partition - used_mem_bytes = 0 for node in self.graph_module.graph.nodes: if node.op in {'call_module', 'call_method', 'call_function'}: - # Find all its input nodes - input_nodes: Dict[Node, None] = {} - map_arg(node.args, lambda n: input_nodes.setdefault(n)) - map_arg(node.kwargs, lambda n: input_nodes.setdefault(n)) - # Calculate total size of related nodes - total_size_of_input_nodes = 0 - for n in input_nodes: - # Make sure this node hasn't been in this partition yet - if n not in partition.nodes: - size_bytes = getattr(n, 'size_bytes', None) - if size_bytes: - total_size_of_input_nodes += size_bytes.output_size - else: - raise RuntimeError('node has no size_bytes attr') - # Don't forget the op node itself - size_bytes = getattr(node, 'size_bytes', None) - if size_bytes: - total_size_of_input_nodes += size_bytes.total_size - else: - raise RuntimeError('node has no size_bytes attr') + total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) # The current node with its inputs cannot fit into the current partition - if used_mem_bytes + total_size_of_input_nodes > available_mem_bytes: - self.partition_to_used_mem_bytes[partition] = used_mem_bytes + if total_size_of_input_nodes + partition.used_mem_bytes > available_mem_bytes: partition = self.create_partition() - used_mem_bytes = 0 + total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) # The current node may be too large to fit into a whole new partition - if total_size_of_input_nodes > available_mem_bytes: + if total_size_of_input_nodes + partition.used_mem_bytes > available_mem_bytes: raise RuntimeError(node.target + 'is too large to fit into a device') # Add the current node into the current partition - partition.add_node(node) - # Add all input nodes if they are placeholders or constants - for n in input_nodes: - if (n not in partition.nodes) and (n.op in {'placeholder', 'get_attr'}): - partition.add_node(n) - used_mem_bytes = used_mem_bytes + total_size_of_input_nodes - # Update used mem mapping for the last partition - self.partition_to_used_mem_bytes[partition] = used_mem_bytes + partition.nodes.add(node) + partition.used_mem_bytes += total_size_of_input_nodes # Find parent partitions and child partitions for each partition. self.set_parents_and_children() # Combine small partitions - self.combine_partitions_based_on_size(available_mem_bytes) + self.combine_partitions_based_on_size(self.partitions[:], available_mem_bytes) # Reassign partition ids and update self.node_to_partitions. 
self.reorganize_partitions() + # Check if devices are enough for all partitions + if len(self.partitions) > len(self.devices): + msg = 'Need ' + str(len(self.partitions)) + ' devices, but only ' \ + + str(len(self.devices)) + ' provided' + raise RuntimeError(msg) + for i, partition in enumerate(self.partitions): + partition.logical_device_ids = [self.devices[i].logical_id] return def do_partition(self) -> GraphModule: """Return a module with submodules (partitions).""" - for node in self.graph_module.graph.nodes: - if node.op == 'output': - break module_with_submodules = split_module( self.graph_module, self.torch_module, - lambda node: self.node_to_partitions[node][0] + lambda node: self.node_to_partitions[node] ) return module_with_submodules - def dump_partition_DAG(self, module_with_submodules: GraphModule) -> DAG: + def dump_dag(self, module_with_submodules: GraphModule) -> DAG: dag = DAG() for node in module_with_submodules.graph.nodes: if node.op == 'output': @@ -245,18 +256,24 @@ def dump_partition_DAG(self, module_with_submodules: GraphModule) -> DAG: output_nodes = list(node.users) else: output_nodes = [node] - dag.create_node(node, list(input_nodes), output_nodes) + partition_id = int(node.name.rsplit('_', 1)[-1]) + device_ids = self.partitions[partition_id].logical_device_ids + size_bytes = self.partitions[partition_id].used_mem_bytes + dag.create_node(node, list(input_nodes), output_nodes, device_ids, size_bytes) return dag def create_partition(self) -> Partition: """Create a partition and append it to self.partitions.""" partition_id = len(self.partitions) - assert isinstance(self.graph_module, GraphModule) - partition = Partition(partition_id, self.graph_module) - self.partitions.add(partition) + partition = Partition(partition_id) + self.partitions.append(partition) return partition - def combine_partitions_based_on_size(self, available_mem_bytes) -> None: + def combine_partitions_based_on_size( + self, + partitions: List[Partition], + available_mem_bytes: int + ) -> None: """Combining small partitions together to keep as less partitions as possible. Here is an example of the algorithm to do this: Assume some partitions, we first sort them based on partiiton used memory size. 
@@ -274,36 +291,60 @@ def combine_partitions_based_on_size(self, available_mem_bytes) -> None: find_combination = True while find_combination: # Sort partitions based on memory size - sorted_partitions = sorted(self.partition_to_used_mem_bytes.items(), key=lambda item: item[1]) + sorted_partitions = sorted(partitions, key=lambda p: p.used_mem_bytes) # Mark bfs level self.get_bfs_level_partition() - find_combination = self.find_partition_to_combine_based_on_size(sorted_partitions, available_mem_bytes) + find_combination, partitions = \ + self.find_partition_to_combine_based_on_size( + sorted_partitions, + available_mem_bytes, + partitions + ) return def find_partition_to_combine_based_on_size( self, - sorted_partitions: List[Tuple[Partition, int]], - available_mem_bytes: int - ) -> bool: + sorted_partitions: List[Partition], + available_mem_bytes: int, + partitions: List[Partition] + ) -> Tuple[bool, List[Partition]]: """step 1 in self.combine_partition_based_on_size()""" + + def calculate_mem_bytes_needed(p1, p2): + nodes = p1.nodes.union(p2.nodes) + mem_bytes_needed = 0 + for node in nodes: + mem_bytes_needed += get_extra_size_of(node, nodes) + return mem_bytes_needed + find_combination = False - smallest_partition = sorted_partitions.pop(0)[0] - left_mem = available_mem_bytes - self.partition_to_used_mem_bytes[smallest_partition] - for t in sorted_partitions[::-1]: - if t[1] <= left_mem and abs(smallest_partition.bfs_level - t[0].bfs_level) <= 1: - self.combine_two_partitions(t[0], smallest_partition) - find_combination = True - break - return find_combination + smallest_partition = sorted_partitions.pop(0) + for p in sorted_partitions[::-1]: + if abs(smallest_partition.bfs_level - p.bfs_level) <= 1: + # Calculate how many bytes needed if combined + mem_bytes_needed = calculate_mem_bytes_needed(p, smallest_partition) + if mem_bytes_needed <= available_mem_bytes: + self.combine_two_partitions(p, smallest_partition) + partitions.remove(smallest_partition) + partitions.remove(p) + partitions.append(self.partitions[-1]) + find_combination = True + break + return find_combination, partitions - def combine_two_partitions(self, partition_0: Partition, partition_1: Partition) -> None: + def combine_two_partitions( + self, + partition_0: Partition, + partition_1: Partition, + ) -> None: """Given two partitions, combine them into a new one - and remove the previous two partitions + and remove the previous two partitions from self.partitions """ partition = self.create_partition() partition.nodes = partition_0.nodes.union(partition_1.nodes) partition.parents = partition_0.parents.union(partition_1.parents) partition.children = partition_0.children.union(partition_1.children) + partition.recalculate_mem_size() partition.bfs_level = max(partition_0.bfs_level, partition_1.bfs_level) if partition_0 in partition.children: partition.children.remove(partition_0) @@ -313,10 +354,6 @@ def combine_two_partitions(self, partition_0: Partition, partition_1: Partition) partition.children.remove(partition_1) if partition_1 in partition.parents: partition.parents.remove(partition_1) - self.partition_to_used_mem_bytes[partition] = self.partition_to_used_mem_bytes[partition_0] + \ - self.partition_to_used_mem_bytes[partition_1] - del self.partition_to_used_mem_bytes[partition_0] - del self.partition_to_used_mem_bytes[partition_1] # Replace partition_0 and partition_1 with the new partition in children and parents for p in partition.parents: if partition_0 in p.children: @@ -352,9 +389,9 @@ def 
set_parents_and_children(self) -> None: for p in self.partitions: if p != partition and n in p.nodes and node not in p.nodes: if p not in partition.children: - partition.add_child(p) + partition.children.add(p) if partition not in p.parents: - p.add_parent(partition) + p.parents.add(partition) return def reorganize_partitions(self) -> None: @@ -364,10 +401,7 @@ def reorganize_partitions(self) -> None: # Update self.node_to_partitions accordingly for partition in self.partitions: for node in partition.nodes: - if node not in self.node_to_partitions: - self.node_to_partitions[node] = [partition.partition_id] - else: - self.node_to_partitions[node].append(partition.partition_id) + self.node_to_partitions[node] = partition.partition_id return def get_bfs_level_partition(self) -> None: @@ -393,3 +427,98 @@ def get_bfs_level_partition(self) -> None: next_level = set() level += 1 return + + def sparse_nn_partition(self, available_mem_bytes: int) -> None: + """This method partition a sparse nn module. + It first traverse all the nodes and do the partitions based on memory size. + If the current partition has no enough memory left for a new op node + (call_module, call_method, call_function), a new partition is created. + Different for size_based_partition, when traversing cross the boundary between + non-embedding nodes and embedding nodes, a new partition is created regardlessly. + For example, if the current node is a non-embedding node but the next node is an + embedding node, a new partition is created for the next node. + After the partition, the partitions are combined as much as possible. + The rule is that a non-embedding partition only + combines with another non-embedding one. + So as the embedding partitions. + """ + def reset_partition_in_sparse_nn(partition, new_partition=True): + if in_embedding_region: + embedding_partitions.append(partition) + else: + non_embedding_partitions.append(partition) + if new_partition: + partition = self.create_partition() + partition.left_mem_bytes = available_mem_bytes + return partition + return None + + def is_embedding_node(node: Node) -> bool: + """Check if a node is an embedding node""" + if node.op == 'call_module': + submodule = self.graph_module + for atom in str(node.target).split('.'): + if not hasattr(submodule, atom): + raise RuntimeError(f'Module {submodule} has no attribute {atom}') + submodule = getattr(submodule, atom) + if 'Embedding' in str(submodule): + return True + return False + + # Track embedding partitons and non-embedding partitions separately + embedding_partitions: List[Partition] = [] + non_embedding_partitions: List[Partition] = [] + # A Flag to check the boundary + in_embedding_region: bool = False + partition = self.create_partition() + for node in self.graph_module.graph.nodes: + if node.op in {'call_module', 'call_method', 'call_function'}: + # Check if crossing the boundary between embedding nodes and non embedding nodes + if is_embedding_node(node) != in_embedding_region: + # Crossing the boundary + # Check if the current partition is an empty partition + if partition.used_mem_bytes != 0: + # The current partition isn't an empty partition. Create a new one. 
+ partition = reset_partition_in_sparse_nn(partition) + in_embedding_region = not in_embedding_region + total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) + if total_size_of_input_nodes + partition.used_mem_bytes > available_mem_bytes: + partition = reset_partition_in_sparse_nn(partition) + total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) + if total_size_of_input_nodes > available_mem_bytes: + raise RuntimeError(node.target + 'is too large to fit into a device') + partition.nodes.add(node) + partition.used_mem_bytes += total_size_of_input_nodes + reset_partition_in_sparse_nn(partition, new_partition=False) + # Set parents and children for each partition + self.set_parents_and_children() + # Combining non-embedding partitions + self.combine_partitions_based_on_size(non_embedding_partitions, available_mem_bytes) + # Combining embedding partitions + self.combine_partitions_based_on_size(embedding_partitions, available_mem_bytes) + self.reorganize_partitions() + total_size_of_non_embedding_partitions = 0 + for partition in non_embedding_partitions: + total_size_of_non_embedding_partitions += partition.used_mem_bytes + # Check if devices are enough for all partitions + if len(embedding_partitions) > len(self.devices): + msg = 'Need ' + str(len(embedding_partitions)) + ' devices, but only ' \ + + str(len(self.devices)) + ' provided' + raise RuntimeError(msg) + occupied_devices = [] + for i, partition in enumerate(embedding_partitions): + # Check if all non-embedding partitions can fit into embedding partition devices + if total_size_of_non_embedding_partitions + partition.used_mem_bytes > available_mem_bytes: + raise RuntimeError( + 'partition_' + + str(partition.partition_id) + + '(embedding partition) and non embedding partitions can not fit into one device' + ) + else: + # Add logical device to the partition + partition.logical_device_ids = [self.devices[i].logical_id] + occupied_devices.append(self.devices[i].logical_id) + # Add logical devices to the non_embedding_partitions + for partition in non_embedding_partitions: + partition.logical_device_ids = occupied_devices + return diff --git a/torch/fx/experimental/subgraph_creation_example.py b/torch/fx/experimental/subgraph_creation_example.py index dc473d53505d..930e8f35426e 100644 --- a/torch/fx/experimental/subgraph_creation_example.py +++ b/torch/fx/experimental/subgraph_creation_example.py @@ -115,8 +115,8 @@ def record_cross_partition_use(def_node : torch.fx.node.Node, use_node : Optiona if not hasattr(target_attr, atom): raise RuntimeError(f'Operator target {node.target} not found!') target_attr = getattr(target_attr, atom) - partition.targets[node.target] = target_attr target = target_atoms[-1] + partition.targets[target] = target_attr assert isinstance(gathered_args, tuple) assert isinstance(gathered_kwargs, dict) From d5cd781cd3dbf85b4e4f9824c2764c6d46b34d0f Mon Sep 17 00:00:00 2001 From: Shijun Kong Date: Tue, 27 Oct 2020 06:39:01 -0700 Subject: [PATCH 039/169] Update dper3 to use torch.nan_to_num and nan_to_num_ (#46873) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46873 OSS: Add op benchmark for torch.nan_to_num and torch.nan_to_num_ Test Plan: OSS: `buck run mode/opt caffe2/benchmarks/operator_benchmark/pt:nan_to_num_test` Reviewed By: qizzzh, houseroad Differential Revision: D24521835 fbshipit-source-id: 1fd50a99e5329ffec2d470525ce6976d39424958 --- .../operator_benchmark/pt/nan_to_num_test.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 
100644 benchmarks/operator_benchmark/pt/nan_to_num_test.py diff --git a/benchmarks/operator_benchmark/pt/nan_to_num_test.py b/benchmarks/operator_benchmark/pt/nan_to_num_test.py new file mode 100644 index 000000000000..72f5daf33afa --- /dev/null +++ b/benchmarks/operator_benchmark/pt/nan_to_num_test.py @@ -0,0 +1,59 @@ +import operator_benchmark as op_bench +import torch +import math + + +"""Microbenchmarks for torch.nan_to_num / nan_to_num_ operators""" + +# Configs for PT torch.nan_to_num / nan_to_num_ operators +nan_to_num_long_configs = op_bench.cross_product_configs( + M=[32, 64, 128], + N=range(32, 128, 32), + dtype=[torch.float, torch.double], + op=["nan_to_num", "nan_to_num_"], + replace_inf=[True, False], + tags=["long"], +) + + +nan_to_num_short_configs = op_bench.cross_product_configs( + M=[16, 64], + N=[64, 64], + dtype=[torch.float, torch.double], + op=["nan_to_num", "nan_to_num_"], + replace_inf=[True, False], + tags=["short"], +) + + +class ReplaceNaNBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, dtype, op, replace_inf): + self.input = torch.randn(M, N, dtype=dtype) + self.input[0][0] = float("nan") + self.op = op + self.replace_inf = replace_inf + self.set_module_name("nan_to_num") + + def forward(self): + # compare inplace + if self.op == "nan_to_num": + if self.replace_inf: + output = torch.nan_to_num(self.input, nan=1.0) + else: + output = torch.nan_to_num(self.input, nan=1.0, posinf=math.inf, neginf=-math.inf) + else: + if self.replace_inf: + output = torch.nan_to_num_(self.input, nan=1.0) + else: + output = torch.nan_to_num_(self.input, nan=1.0, posinf=math.inf, neginf=-math.inf) + return output + + +op_bench.generate_pt_test( + nan_to_num_long_configs + nan_to_num_short_configs, + ReplaceNaNBenchmark, +) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() From daf2a6a29da5109157bb44fc5e4dbc4fb5c56c15 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 27 Oct 2020 08:02:09 -0700 Subject: [PATCH 040/169] Increase no-output-timeout for OSX builds (#46891) Summary: Because conda-build native library relocation scripts can take a while. 
From see https://app.circleci.com/pipelines/github/pytorch/pytorch/227245/workflows/e287613d-5e48-4bca-b3d8-b75df2be9f65/jobs/8235584 : ``` Oct 15 08:53:31 Oct 15 09:49:27 INFO: ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/46891 Reviewed By: seemethere Differential Revision: D24553645 Pulled By: malfet fbshipit-source-id: 62b2251f174aec7ff573a8c4f8cb7a920fa3eaca --- .circleci/config.yml | 2 +- .circleci/verbatim-sources/job-specs/binary-job-specs.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2ba5046c9034..2ad408cf6b37 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -969,7 +969,7 @@ jobs: - run: name: Build - no_output_timeout: "1h" + no_output_timeout: "90m" command: | # Do not set -u here; there is some problem with CircleCI # variable expansion with PROMPT_COMMAND diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index 489dfefdbff1..f8d1dde4e5ad 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -174,7 +174,7 @@ - run: name: Build - no_output_timeout: "1h" + no_output_timeout: "90m" command: | # Do not set -u here; there is some problem with CircleCI # variable expansion with PROMPT_COMMAND From bbe5bfaa4f5ec1d80cd971198e4abcfd2ea96a7a Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Tue, 27 Oct 2020 10:21:26 -0700 Subject: [PATCH 041/169] Add GradMode::enabled check to max_pool1d (#46767) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46767 ## Benchmark ``` ------------------------------------------------------------------------------------------ benchmark: 2 tests ------------------------------------------------------------------------------------------ Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- test_grad_disabled 390.0155 (1.0) 533.4131 (1.0) 392.6161 (1.0) 8.5603 (1.0) 390.7457 (1.0) 0.3912 (1.0) 98;319 2,547.0171 (1.0) 2416 1 test_grad_enabled 3,116.7269 (7.99) 4,073.2883 (7.64) 3,178.0827 (8.09) 122.7487 (14.34) 3,142.2675 (8.04) 33.0228 (84.42) 10;22 314.6551 (0.12) 225 1 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ``` *snippet (using pytest benchmark module)* ``` import torch torch.set_num_threads(1) x = torch.randn(1000, 10, 36, requires_grad=True) def test_grad_enabled(benchmark): benchmark(torch.max_pool1d, x, 2) def test_grad_disabled(benchmark): torch.set_grad_enabled(False) benchmark(torch.max_pool1d, x, 2) ``` Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D24565126 Pulled By: heitorschueroff fbshipit-source-id: 91a93be9921f597db21e9dc277f6e36eae85b37a --- aten/src/ATen/native/MaxPooling.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp index 645822f55065..682af63cdafe 100644 --- a/aten/src/ATen/native/MaxPooling.cpp +++ b/aten/src/ATen/native/MaxPooling.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -98,10 +99,11 @@ 
Tensor max_pool1d( IntArrayRef dilation, bool ceil_mode) { if (self.is_quantized()) { - return at::quantized_max_pool1d(self, kernel_size, stride, padding, - dilation, ceil_mode); + return at::quantized_max_pool1d( + self, kernel_size, stride, padding, dilation, ceil_mode); } - if (self.requires_grad() || !self.device().is_cpu()) { + if ((self.requires_grad() && at::GradMode::is_enabled()) || + !self.device().is_cpu()) { // Needs indices for grad and with_indices defines CUDA dispatch return std::get<0>(at::max_pool1d_with_indices( self, kernel_size, stride, padding, dilation, ceil_mode)); From 21e60643c0ad8962fe014c81cf2d87f72466c0f9 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 27 Oct 2020 13:05:27 -0700 Subject: [PATCH 042/169] [numpy] `torch.log{2,10}` : promote integer inputs to float (#46810) Summary: References https://github.com/pytorch/pytorch/issues/42515 cc: mruberry Pull Request resolved: https://github.com/pytorch/pytorch/pull/46810 Reviewed By: izdeby Differential Revision: D24536187 Pulled By: mruberry fbshipit-source-id: b7dd7678d4e996f3dea0245c65055654e02be459 --- aten/src/ATen/native/UnaryOps.cpp | 12 ++++++------ aten/src/ATen/native/cuda/UnaryLogKernels.cu | 6 +++--- test/test_type_promotion.py | 4 ++-- torch/csrc/jit/tensorexpr/kernel.cpp | 15 +++++++++------ .../_internal/common_methods_invocations.py | 15 ++++++++++++--- 5 files changed, 32 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 08b214846944..518032c81b04 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -284,20 +284,20 @@ Tensor& i0_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(re Tensor i0(const Tensor& self) { return unary_op_impl(self, at::i0_out); } Tensor& i0_(Tensor& self) { return unary_op_impl_(self, at::i0_out); } -Tensor& log_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, log_stub); } -Tensor log(const Tensor& self) { return unary_op_impl(self, at::log_out); } +Tensor& log_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, log_stub); } +Tensor log(const Tensor& self) { return unary_op_impl_float(self, log_stub); } Tensor& log_(Tensor& self) { return unary_op_impl_(self, at::log_out); } -Tensor& log10_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, log10_stub); } -Tensor log10(const Tensor& self) { return unary_op_impl(self, at::log10_out); } +Tensor& log10_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, log10_stub); } +Tensor log10(const Tensor& self) { return unary_op_impl_float(self, log10_stub); } Tensor& log10_(Tensor& self) { return unary_op_impl_(self, at::log10_out); } Tensor& log1p_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, log1p_stub); } Tensor log1p(const Tensor& self) { return unary_op_impl(self, at::log1p_out); } Tensor& log1p_(Tensor& self) { return unary_op_impl_(self, at::log1p_out); } -Tensor& log2_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, log2_stub); } -Tensor log2(const Tensor& self) { return unary_op_impl(self, at::log2_out); } +Tensor& log2_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, log2_stub); } +Tensor log2(const Tensor& self) { return unary_op_impl_float(self, log2_stub); } Tensor& log2_(Tensor& self) { return unary_op_impl_(self, at::log2_out); } Tensor& round_out(Tensor& 
result, const Tensor& self) { return unary_op_impl_out(result, self, round_stub); } diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu index a43fa541554b..44e73173af17 100644 --- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu @@ -11,7 +11,7 @@ namespace at { namespace native { void log_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "log_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log(a); }); @@ -19,7 +19,7 @@ void log_kernel_cuda(TensorIterator& iter) { } void log10_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "log10_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log10(a); }); @@ -35,7 +35,7 @@ void log1p_kernel_cuda(TensorIterator& iter) { } void log2_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "log2_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log2(a); }); diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 7f10915a5ac4..c6efa8f0d90d 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -919,8 +919,8 @@ def test_unary_op_out_casting(self, device, dtypes): t = torch.tensor((1), dtype=dtypes[0], device=device) out = torch.empty(0, dtype=dtypes[1], device=device) - ops = (torch.neg, torch.floor, torch.ceil, torch.cos, torch.erf, torch.log) - float_only_ops = {torch.floor, torch.ceil, torch.cos, torch.erf, torch.log} + ops = (torch.neg, torch.floor, torch.ceil, torch.cos, torch.erf) + float_only_ops = {torch.floor, torch.ceil, torch.cos, torch.erf} real_only_ops = {torch.floor, torch.ceil, torch.erf} for op in ops: if dtypes[0] is not dtypes[1]: diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 53837184399d..68ec5c2e304a 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -935,13 +935,15 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::log: { - return computeOneOperand( - "aten_log", v, [](const ExprHandle& a) { return log(a); }); + return computeOneOperand("aten_log", v, [](const ExprHandle& a) { + return log(promoteIntegerToFloat(a)); + }); } break; case aten::log10: { - return computeOneOperand( - "aten_log10", v, [](const ExprHandle& a) { return log10(a); }); + return computeOneOperand("aten_log10", v, [](const ExprHandle& a) { + return log10(promoteIntegerToFloat(a)); + }); } break; case aten::log1p: { @@ -950,8 +952,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::log2: { - return computeOneOperand( - "aten_log2", v, [](const ExprHandle& a) { return log2(a); }); + return computeOneOperand("aten_log2", v, [](const ExprHandle& a) { + return log2(promoteIntegerToFloat(a)); + }); } break; case aten::exp: { diff --git 
a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 7d2a04f10e5a..48b24f1ae499 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -297,7 +297,10 @@ def sample_inputs(self, device, dtype, requires_grad=False): UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + promotes_integers_to_float=True, decorators=(precisionOverride({torch.bfloat16: 5e-2}),), skips=( SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', @@ -312,7 +315,10 @@ def sample_inputs(self, device, dtype, requires_grad=False): ref=np.log10, domain=(0, float('inf')), decorators=(precisionOverride({torch.bfloat16: 5e-2}),), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + promotes_integers_to_float=True, skips=( SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cuda', dtypes=[torch.cfloat, torch.cdouble]), @@ -329,7 +335,10 @@ def sample_inputs(self, device, dtype, requires_grad=False): UnaryUfuncInfo('log2', ref=np.log2, domain=(0, float('inf')), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + promotes_integers_to_float=True, decorators=(precisionOverride({torch.bfloat16: 1e-1}),), skips=( SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', From 8c39f198b48b278c5298f6c3831ddcae4bf35147 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 27 Oct 2020 13:06:12 -0700 Subject: [PATCH 043/169] Fix typo in setup.py (#46921) Summary: Also, be a bit future-proof in support version list Pull Request resolved: https://github.com/pytorch/pytorch/pull/46921 Reviewed By: seemethere Differential Revision: D24568733 Pulled By: malfet fbshipit-source-id: ae34f8da1ed39b80dc34db0b06e4ef142104a3ff --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 54f170e3d6c6..2250fff2616a 100644 --- a/setup.py +++ b/setup.py @@ -731,6 +731,7 @@ def print_box(msg): with open(os.path.join(cwd, "README.md"), encoding="utf-8") as f: long_description = f.read() + version_range_max = max(sys.version_info[1], 8) + 1 setup( name=package_name, version=version, @@ -890,7 +891,7 @@ def print_box(msg): 'Topic :: Software Development :: Libraries :: Python Modules', 'Programming Language :: C++', 'Programming Language :: Python :: 3', - ] + ['Programming Language :: Python :: 3.{}' for i in range(python_min_version[1], 9)], + ] + ['Programming Language :: Python :: 3.{}'.format(i) for i in range(python_min_version[1], version_range_max)], license='BSD-3', keywords='pytorch machine learning', ) From 151f31ba278b39a790044b0e1f60d14aeb71ad6f Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Tue, 27 Oct 2020 14:19:32 -0700 Subject: [PATCH 044/169] remove event not ready assertion from 
TestCuda.test_copy_non_blocking (#46857) Summary: It is incorrect to assume that a newly recorded event will immediately query as False. This test is flaky on ROCm due to this incorrect assumption. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46857 Reviewed By: albanD Differential Revision: D24565581 Pulled By: mrshenli fbshipit-source-id: 0e9ba02cf52554957b29dbeaa5093696dc914b67 --- test/test_cuda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 3fa44ee43b20..d20af4082d03 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -492,7 +492,6 @@ def _test_copy_non_blocking(a, b): event = torch.cuda.Event() a.copy_(b, non_blocking=True) event.record() - self.assertFalse(event.query()) event.synchronize() self.assertEqual(a, b) From 717e6d808114d73003e670169a296ba4eefb5d49 Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Tue, 27 Oct 2020 14:23:14 -0700 Subject: [PATCH 045/169] add type annotations to comm.py (#46736) Summary: Fixes https://github.com/pytorch/pytorch/issues/46735 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46736 Reviewed By: albanD Differential Revision: D24565554 Pulled By: mrshenli fbshipit-source-id: 4e40e4232ebf256af228f9c742ea4d28c626c616 --- mypy.ini | 3 --- torch/_C/__init__.pyi.in | 14 ++++++++++++++ torch/nn/parallel/comm.py | 3 ++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mypy.ini b/mypy.ini index 535310411720..1fd1ce884520 100644 --- a/mypy.ini +++ b/mypy.ini @@ -119,9 +119,6 @@ ignore_errors = True [mypy-torch.nn.parallel._functions] ignore_errors = True -[mypy-torch.nn.parallel.comm] -ignore_errors = True - [mypy-torch.nn.qat.modules.activations] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b603c8386a2c..64c746e7eff2 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -599,6 +599,20 @@ class _CudaDeviceProperties: is_integrated: _int is_multi_gpu_board: _int +# Defined in torch/csrc/cuda/python_comm.cpp +def _broadcast(tensor: Tensor, devices: List[_int]) -> List[Tensor]: ... +def _broadcast_out(tensor: Tensor, out_tensors: List[Tensor]) -> List[Tensor]: ... +def _broadcast_coalesced( + tensors: List[Tensor], + devices: List[_int], + buffer_size: _int +) -> List[List[Tensor]]: ... + +def _scatter(tensor: Tensor, devices: List[_int], chunk_sizes: Optional[List[_int]], dim: _int, streams: Optional[List[Stream]]) -> List[Tensor]: ... +def _scatter_out(tensor: Tensor, out_tensors: List[Tensor], dim: _int, streams: Optional[List[Stream]]) -> List[Tensor]: ... +def _gather(tensors: List[Tensor], dim: _int, destination_index: Optional[_int]) -> Tensor: ... +def _gather_out(tensors: List[Tensor], out_tensor: Tensor, dim: _int) -> Tensor: ... + # Defined in torch/csrc/cuda/Stream.cpp class _CudaStreamBase: _cdata: _int diff --git a/torch/nn/parallel/comm.py b/torch/nn/parallel/comm.py index f38d7fcaafc4..331d3885bd30 100644 --- a/torch/nn/parallel/comm.py +++ b/torch/nn/parallel/comm.py @@ -3,6 +3,7 @@ from torch.cuda import nccl from torch._utils import _take_tensors, _flatten_dense_tensors, \ _unflatten_dense_tensors, _reorder_tensors_as, _get_device_index +from typing import List def broadcast(tensor, devices=None, *, out=None): @@ -121,7 +122,7 @@ def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760): """ # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just # return `inputs`. 
- dense_tensors = [[] for _ in inputs] # shape (num_gpus, num_tensors) + dense_tensors: List[List] = [[] for _ in inputs] # shape (num_gpus, num_tensors) output = [] ref_order = [] # process sparse ones first since they may have different sizes on different gpus From 79a1d2bd781c83591d3c024237243010c723f0ea Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Tue, 27 Oct 2020 14:49:30 -0700 Subject: [PATCH 046/169] [iOS] Bump up the cocoapods version (#46935) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46935 Bump up the cocoapods version ghstack-source-id: 115283786 (Note: this ignores all push blocking failures!) Test Plan: CI Reviewed By: xta0 Differential Revision: D24572715 fbshipit-source-id: 41ffcd43512dc7d4e94af887fb5dfeab703d7602 --- ios/LibTorch.podspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index f74e2dc9f37e..236f1de7988f 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.6.1' + s.version = '1.7.0' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' From dc53eefd256ad0110fae098be96d403ae7b85a97 Mon Sep 17 00:00:00 2001 From: Charles Coulombe Date: Tue, 27 Oct 2020 14:56:16 -0700 Subject: [PATCH 047/169] Conditional requirement for py3.6 only (#46932) Summary: Fixes https://github.com/pytorch/pytorch/issues/46930 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46932 Reviewed By: mrshenli Differential Revision: D24574196 Pulled By: seemethere fbshipit-source-id: 11daf8abe226670277f1b5682fd9890d23576271 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 07127f738ff9..759baf3984c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,4 @@ requests setuptools six typing_extensions -dataclasses +dataclasses; python_version<"3.7" From 64d4b24a12bdb694138c7c6fb978e7621a9db4fb Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 27 Oct 2020 15:05:29 -0700 Subject: [PATCH 048/169] Adding link to gcov depending on GCC_VERSION (#46928) Summary: We already link g++ and gcc to the correct version, but we do not do that for gcov, which is needed for coverage. This PR adds a link for gcov as well. 
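For reference, a minimal sketch of the `update-alternatives` pattern that install_gcc.sh already uses for the compilers and that this change extends to gcov (the concrete `GCC_VERSION` value comes from the Docker build arguments, so treat it as a placeholder here):
```
# existing links: the unversioned gcc/g++ names point at the requested GCC major version
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
# added below: do the same for gcov, so coverage data is processed by a gcov
# that matches the compiler that produced it
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
```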
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46928 Reviewed By: malfet Differential Revision: D24569240 Pulled By: janeyx99 fbshipit-source-id: 4be012bff21ddae0c81339665b58324777b9304f --- .circleci/docker/common/install_gcc.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/docker/common/install_gcc.sh b/.circleci/docker/common/install_gcc.sh index 48f17989f978..0e86df1c778c 100644 --- a/.circleci/docker/common/install_gcc.sh +++ b/.circleci/docker/common/install_gcc.sh @@ -15,6 +15,7 @@ if [ -n "$GCC_VERSION" ]; then update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 + update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 # Cleanup package manager apt-get autoclean && apt-get clean From c20c840c1b6c958cba068c0cc7bdfaae28f1a827 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 27 Oct 2020 15:21:03 -0700 Subject: [PATCH 049/169] Install sccache from source (#46672) Summary: Build `sccache` from https://github.com/pytorch/sccache Also, update sccache wrappers not to call sccache from sccache Pull Request resolved: https://github.com/pytorch/pytorch/pull/46672 Reviewed By: janeyx99 Differential Revision: D24455767 Pulled By: malfet fbshipit-source-id: b475a65e6ad03b9a192ab29a6d9a14280cd76a92 --- .circleci/docker/common/install_cache.sh | 38 +++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/.circleci/docker/common/install_cache.sh b/.circleci/docker/common/install_cache.sh index 17931375b6f0..fc1630272472 100644 --- a/.circleci/docker/common/install_cache.sh +++ b/.circleci/docker/common/install_cache.sh @@ -2,6 +2,28 @@ set -ex +install_ubuntu() { + echo "Preparing to build sccache from source" + apt-get update + apt-get install -y cargo pkg-config libssl-dev + echo "Checking out sccache repo" + git clone https://github.com/pytorch/sccache + cd sccache + echo "Building sccache" + cargo build --release + cp target/release/sccache /opt/cache/bin + echo "Cleaning up" + cd .. + rm -rf sccache + apt-get remove -y cargo rustc + apt-get autoclean && apt-get clean +} + +install_binary() { + echo "Downloading sccache binary from S3 repo" + curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache +} + mkdir -p /opt/cache/bin mkdir -p /opt/cache/lib sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment @@ -11,12 +33,20 @@ export PATH="/opt/cache/bin:$PATH" if [ -n "$ROCM_VERSION" ]; then curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache else - curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache + ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') + case "$ID" in + ubuntu) + install_ubuntu + ;; + *) + install_binary + ;; + esac fi chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { - printf "#!/bin/sh\nexec sccache $(which $1) \"\$@\"" > "/opt/cache/bin/$1" + printf "#!/bin/sh\nif [ \$(ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/opt/cache/bin/$1" chmod a+x "/opt/cache/bin/$1" } @@ -38,8 +68,8 @@ if [ -n "$CUDA_VERSION" ]; then # where CUDA is installed. Instead, we install an nvcc symlink outside # of the PATH, and set CUDA_NVCC_EXECUTABLE so that we make use of it. 
- printf "#!/bin/sh\nexec sccache $(which nvcc) \"\$@\"" > /opt/cache/lib/nvcc - chmod a+x /opt/cache/lib/nvcc + write_sccache_stub nvcc + mv /opt/cache/bin/nvcc /opt/cache/lib/ fi if [ -n "$ROCM_VERSION" ]; then From 1479ed91be56a10e0014c8b1646fbcbf8d741dd9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 27 Oct 2020 15:56:00 -0700 Subject: [PATCH 050/169] Add CUDA 11.1 CI (#46616) Summary: libtorch XImportant now runs on CUDA 11.1, Pull Request resolved: https://github.com/pytorch/pytorch/pull/46616 Reviewed By: gchanan Differential Revision: D24452660 Pulled By: malfet fbshipit-source-id: 3480a2533214f2d986444ff912f619503a75940d --- .circleci/cimodel/data/pytorch_build_data.py | 2 +- .circleci/config.yml | 24 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 5156e69b77af..12a40a17bed3 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -57,7 +57,7 @@ ]), ]), ]), - ("11.0", [ + ("11.1", [ ("3.8", [ X(True), ("libtorch", [ diff --git a/.circleci/config.yml b/.circleci/config.yml index 2ad408cf6b37..38ec1ff25a3a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6806,37 +6806,37 @@ workflows: build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" - pytorch_linux_build: - name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build + name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build requires: - - "docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" + - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" filters: branches: only: - master - /ci-all\/.*/ - /release\/.*/ - build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" + build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - pytorch_linux_test: - name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test + name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test requires: - - pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build + - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build filters: branches: only: - master - /ci-all\/.*/ - /release\/.*/ - build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" + build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" use_cuda_docker_runtime: "1" resource_class: gpu.medium - pytorch_linux_build: - name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build + name: pytorch_libtorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build requires: - - "docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" - build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" + - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + build_environment: 
"pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - pytorch_linux_build: name: pytorch_linux_bionic_py3_6_clang9_build requires: From 8066e89f641eb51860d49f3c5a98336e77673eb2 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 27 Oct 2020 15:59:15 -0700 Subject: [PATCH 051/169] quant: fix bug with copy.deepcopy of FX prepared quantization models (#46895) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46895 Bug: models after the FX graph mode quant prepare step lost information, such as the extra attributes defined in `Quantizer.save_state`, if the user performed `copy.deepcopy` on them. The information was lost because `GraphModule` does not copy attributes which are not present on `nn.Module` by default. Fix: define a custom `__deepcopy__` method on observed models and whitelist the attributes we care about. This is needed because users sometimes run `copy.deepcopy` on their models during non-quantization related preparations, and we should make sure that quantization related state survives these calls. Test Plan: ``` python test/test_quantization.py TestQuantizeFx.test_deepcopy python test/test_quantization.py TestQuantizeFx.test_standalone_module ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D24556035 fbshipit-source-id: f7a6b28b6d2225fa6189016f967f175f6733b124 --- test/quantization/test_quantize_fx.py | 28 ++++++++++++ torch/quantization/fx/observed_module.py | 52 ++++++++++++++++++++++ torch/quantization/fx/quantize.py | 13 +++--- torch/quantization/fx/standalone_module.py | 30 ------------- 4 files changed, 85 insertions(+), 38 deletions(-) create mode 100644 torch/quantization/fx/observed_module.py delete mode 100644 torch/quantization/fx/standalone_module.py diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 4113b754a3e8..d2b2d6c82ddf 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -50,6 +50,7 @@ ) from torch.testing import FileCheck +import copy import itertools import operator import unittest @@ -816,6 +817,33 @@ def forward(self, x): # make sure these modules are not traced self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) + def test_prepared_model_deepcopy(self): + """Ensures that copy.deepcopy works correctly on a prepared model. 
+ """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self._foobar = 'foobar' + self.foobar2 = 'foobar2' + + def forward(self, x): + x = self.conv(x) + return x + + m = M() + print(m.__dict__.keys()) + m.eval() + qconfig_dict = {'': torch.quantization.default_qconfig} + prepared = torch.quantization.prepare_fx(m, qconfig_dict) + # calibrate + prepared(torch.randn(4, 1, 4, 4)) + # copy + prepared_copy = copy.deepcopy(prepared) + # quantize, should run with no errors + quantized = torch.quantization.convert_fx(prepared_copy) + + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py new file mode 100644 index 000000000000..7b55033ecba6 --- /dev/null +++ b/torch/quantization/fx/observed_module.py @@ -0,0 +1,52 @@ +import torch +import copy +from torch.fx import GraphModule + +class ObservedGraphModule(GraphModule): + + def get_preserved_attr_names(self): + return ['_activation_post_process_map', + '_patterns', + '_qconfig_map'] + + def __init__(self, root, graph): + preserved_attrs = dict() + for attr in self.get_preserved_attr_names(): + preserved_attrs[attr] = getattr(root, attr) + super().__init__(root, graph) + for attr in preserved_attrs: + setattr(self, attr, preserved_attrs[attr]) + + # GraphModule does not copy attributes which are not in the __dict__ + # of vanilla nn.Module. So, we override __deepcopy__ in order + # to copy the quantization specific attributes correctly. + def __deepcopy__(self, memo): + fake_mod = torch.nn.Module() + fake_mod.__dict__ = copy.deepcopy(self.__dict__) + return ObservedGraphModule(fake_mod, self.graph) + +def mark_observed_module(module): + return ObservedGraphModule(module, module.graph) + +def is_observed_module(module): + return isinstance(module, ObservedGraphModule) + +class ObservedStandaloneGraphModule(ObservedGraphModule): + + def get_preserved_attr_names(self): + return ['_activation_post_process_map', + '_patterns', + '_qconfig_map', + '_standalone_module_observed_input_idxs', + '_output_is_observed'] + + def __deepcopy__(self, memo): + fake_mod = torch.nn.Module() + fake_mod.__dict__ = copy.deepcopy(self.__dict__) + return ObservedStandaloneGraphModule(fake_mod, self.graph) + +def mark_observed_standalone_module(module): + return ObservedStandaloneGraphModule(module, module.graph) + +def is_observed_standalone_module(module): + return isinstance(module, ObservedStandaloneGraphModule) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 026bd6214155..60da8142ee2d 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -30,7 +30,9 @@ get_default_output_activation_post_process_map, ) -from .standalone_module import ( +from .observed_module import ( + mark_observed_module, + is_observed_module, mark_observed_standalone_module, is_observed_standalone_module, ) @@ -506,6 +508,7 @@ def is_observed(input_arg): model = GraphModule(model, observed_graph) self.save_state(model) + model = mark_observed_module(model) if is_standalone_module: assert result_node is not None assert isinstance(result_node.args[0], Node), \ @@ -523,13 +526,7 @@ def save_state(self, observed): observed._qconfig_map = self.qconfig_map def restore_state(self, observed): - err_msg = 'please make sure the model is produced by prepare' - assert hasattr(observed, '_activation_post_process_map'), 'did not found ' + \ 
- '_activation_post_process attribute ' + err_msg - assert hasattr(observed, '_patterns'), 'did not found ' + \ - '_patterns attribute ' + err_msg - assert hasattr(observed, '_qconfig_map'), 'did not found ' + \ - '_qconfig_map attribute ' + err_msg + assert is_observed_module(observed), 'incoming model must be produced by prepare_fx' self.activation_post_process_map = observed._activation_post_process_map self.patterns = observed._patterns self.qconfig_map = observed._qconfig_map diff --git a/torch/quantization/fx/standalone_module.py b/torch/quantization/fx/standalone_module.py deleted file mode 100644 index 55aa8e21f98f..000000000000 --- a/torch/quantization/fx/standalone_module.py +++ /dev/null @@ -1,30 +0,0 @@ -import torch -import copy -from torch.fx import GraphModule - -class ObservedStandaloneGraphModule(GraphModule): - _PRESERVED_ATTR_NAMES = [ - '_activation_post_process_map', - '_patterns', - '_qconfig_map', - '_standalone_module_observed_input_idxs', - '_output_is_observed'] - - def __init__(self, root, graph): - preserved_attrs = dict() - for attr in self._PRESERVED_ATTR_NAMES: - preserved_attrs[attr] = getattr(root, attr) - super().__init__(root, graph) - for attr in preserved_attrs: - setattr(self, attr, preserved_attrs[attr]) - - def __deepcopy__(self, memo): - fake_mod = torch.nn.Module() - fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return ObservedStandaloneGraphModule(fake_mod, self.graph) - -def mark_observed_standalone_module(module): - return ObservedStandaloneGraphModule(module, module.graph) - -def is_observed_standalone_module(module): - return isinstance(module, ObservedStandaloneGraphModule) From 115bbf9945dfef34d036c07bd329d800ede0c51b Mon Sep 17 00:00:00 2001 From: Dmytro Dzhulgakov Date: Tue, 27 Oct 2020 16:01:28 -0700 Subject: [PATCH 052/169] [caffe2] Disable running full grad check in tests by default Summary: We've been seeing a lot of Hypothesis timeouts and from profiling a few of the failing tests one of the contributing factors is really slow grad checker. In short, it launches the whole op for each of the input elements so the overall complexity is O(numel^2) at least. This applies a very unscientific hack to just run grad check on the first and last few elements. It's not ideal, but it's better than flaky tests. One can still explicitly opt in with the env var. Reviewed By: malfet Differential Revision: D23336220 fbshipit-source-id: f04d8d43c6aa1590c2f3e72fc7ccc6aa674e49d2 --- caffe2/python/gradient_checker.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py index afb8d5071492..5f116bd6107c 100644 --- a/caffe2/python/gradient_checker.py +++ b/caffe2/python/gradient_checker.py @@ -5,6 +5,7 @@ +import os import numpy as np from caffe2.python import core, workspace, net_drawer @@ -292,8 +293,20 @@ def CheckSimple( if ensure_outputs_are_inferred: self._assertInferTensorChecks(op, grad_ops) + full_grad_check = os.getenv('CAFFE2_FULL_GRAD_CHECK') == '1' + dims_to_check = inputs[input_to_check].size for current_dim in range(dims_to_check): + # Grad check is very expensive (as it involves running the op from + # scratch for each of the input tensor elements). Thus, let's + # run it by default only on a small subset of dimensions. Here we + # apply very scientific approach: the first and the last 3 elements + # of each tensor. 
Pass CAFFE2_FULL_GRAD_CHECK=1 env var to enable + # the full check + if not full_grad_check and current_dim >= 3 and \ + current_dim + 3 < dims_to_check: + grad_estimate.flat[current_dim] = grad.flat[current_dim] + continue # Positive gradient inputs[input_to_check].flat[current_dim] += self._stepsize pos_loss, _ = self.GetLossAndGrad( From 53839ac9d740da3cd4ab162cfe2b4628cce4bd6c Mon Sep 17 00:00:00 2001 From: kiyosora Date: Tue, 27 Oct 2020 16:44:56 -0700 Subject: [PATCH 053/169] Fix internal assert for torch.heaviside with cuda tensor and cpu scalar tensor (#46831) Summary: Fixed https://github.com/pytorch/pytorch/issues/46681 ``` >>> x = torch.randn(10, device='cuda') >>> y = torch.tensor(1.) >>> torch.heaviside(x, y) tensor([0., 1., 0., 1., 1., 0., 1., 1., 1., 0.], device='cuda:0') ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/46831 Reviewed By: navahgar Differential Revision: D24567953 Pulled By: izdeby fbshipit-source-id: e5fcf4355b27ce0bdf434963d01863d3b24d0bea --- .../ATen/native/cuda/BinaryMiscOpsKernels.cu | 2 +- test/test_torch.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index fc9aa74f91f4..12b1a0dbf305 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -102,7 +102,7 @@ void nextafter_kernel_cuda(TensorIterator& iter) { void heaviside_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, iter.dtype(), "heaviside_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a == 0 ? 
b : static_cast(a > 0); }); }); diff --git a/test/test_torch.py b/test/test_torch.py index 1f3241cf3b79..84a540ae3937 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6329,6 +6329,25 @@ def test_heaviside(self, device, dtypes): with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): input.heaviside_(values) + @onlyCUDA + def test_heaviside_cross_device(self, device): + x = torch.tensor([-9, 5, 0, 6, -2, 2], device='cuda') + y = torch.tensor(0) + result = torch.heaviside(x, y) + expect = torch.tensor([0, 1, 0, 1, 0, 1], device='cuda') + self.assertEqual(result, expect) + + result = torch.heaviside(y, x) + expect = torch.tensor([-9, 5, 0, 6, -2, 2], device='cuda') + self.assertEqual(result, expect) + + x = torch.tensor([-9, 5, 0, 6, -2, 2]) + y = torch.tensor(0, device='cuda') + with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + torch.heaviside(x, y) + + with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + torch.heaviside(y, x) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @dtypes(*list(product(torch.testing.get_all_complex_dtypes(), From 67c1dc65a342358c050f26107f885d62785c8c41 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 27 Oct 2020 17:49:29 -0700 Subject: [PATCH 054/169] [FX] Fix handling of `inf` and `nan` literals (#46894) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46894 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D24555136 Pulled By: jamesr66a fbshipit-source-id: 22765a4d9d373711e9e6d7b1d3898080ecbcf2f5 --- test/test_fx.py | 19 +++++++++++++++++++ torch/fx/graph.py | 8 ++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index b5d03a86a177..f470cf417958 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -838,6 +838,25 @@ def test_typename_print(self): output : torch.fx.Node = graph.output(b) self.assertTrue('typing.List[float]' in str(graph)) + def test_inf_nan(self): + class FooMod(torch.nn.Module): + def forward(self, x): + return x + float('inf'), x + float('-inf'), x + float('nan') + + fm = FooMod() + self.checkGraphModule(fm, (torch.rand(3, 4),)) + + def test_inf_nan_kwds(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', operator.add, (x, float('inf')), {}, name='inf') + c : torch.fx.Node = graph.create_node('call_function', operator.add, (x, float('nan')), {}, name='nan') + graph.output((b, c)) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + x = torch.rand(3, 4) + self.assertEqual(gm(x), (x + float('inf'), x + float('nan'))) + def test_subgraph_creation(self): class MyModule(torch.nn.Module): def __init__(self): diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 983ba9a90ed7..46215ca9357b 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -8,7 +8,7 @@ import re def _shadows_builtin_name(name: str) -> bool: - return name in builtins.__dict__ or name in keyword.kwlist + return name in builtins.__dict__ or name in keyword.kwlist or name in {'inf', 'nan'} def _is_magic(x: str) -> bool: return x.startswith('__') and x.endswith('__') @@ -380,7 +380,11 @@ def type_repr(o : Any): continue raise NotImplementedError(f'node: {node.op} {node.target}') - import_block = '\n'.join(f'import {name}' for name in sorted(modules_used)) + # repr() for inf and nan floating point values 
aren't parseable by + # python as literals. Explicitly import the names from the `math` module. + import_strs = ['from math import inf, nan'] + import_strs.extend(f'import {name}' for name in sorted(modules_used)) + import_block = '\n'.join(import_strs) code = ''.join(body) code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' From 8e37dcb1f3f5806b96aea3ddff54faf7c0a165b9 Mon Sep 17 00:00:00 2001 From: iurii zdebskyi <47012416+izdeby@users.noreply.github.com> Date: Tue, 27 Oct 2020 17:55:28 -0700 Subject: [PATCH 055/169] Added foreach_zero_ API (#46215) Summary: Adding Added foreach_zero_(TensorList) API Tested via unit tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/46215 Reviewed By: zhangguanheng66 Differential Revision: D24262885 Pulled By: izdeby fbshipit-source-id: 144c283dd00924083096d6d92eb9085cbd6097d3 --- aten/src/ATen/native/ForeachOpsKernels.cpp | 6 ++- aten/src/ATen/native/cuda/ForeachFunctors.cuh | 40 +++++++++++++++++++ aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 19 +++++++++ aten/src/ATen/native/native_functions.yaml | 8 ++++ tools/codegen/model.py | 3 +- torch/optim/optimizer.py | 11 ++++- 6 files changed, 84 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 335473f729f1..6a567835e0e8 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -98,7 +98,8 @@ std::vector foreach_tensor_##OP##_slow(TensorList tensors) { \ \ return result; \ } \ - \ + +#define FOREACH_UNARY_OP_(OP) \ void foreach_tensor_##OP##_slow_(TensorList tensors) { \ check_foreach_api_restrictions(tensors); \ \ @@ -161,6 +162,9 @@ FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); FOREACH_UNARY_OP(exp); +FOREACH_UNARY_OP_(sqrt); +FOREACH_UNARY_OP_(exp); +FOREACH_UNARY_OP_(zero); FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); FOREACH_POINTWISE_OP_SCALARLIST(addcdiv); diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index e58188f846f6..118b36cbbff3 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -245,6 +245,46 @@ struct BinaryOpListAlphaFunctor { // Unary Functors // +template +struct ZeroFunctor { + __device__ __forceinline__ void operator() ( + int chunk_size, + TensorListMetadata<1>& tl) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* args[depth]; + bool all_aligned = init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + // store + load_store(args[0], r_args[0], i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + template struct UnaryOpFunctor 
{ using opmath_t = typename get_opmath_t::opmath_t; diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 74c27ed3ca35..ad45f7aa6971 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -73,4 +73,23 @@ void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ FOREACH_UNARY_OP(exp, Exp); FOREACH_UNARY_OP(sqrt, Sqrt); +void foreach_tensor_zero_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_zero_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() { + multi_tensor_apply<1>(tensor_lists, + ZeroFunctor()); + }); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index b143d39e67e2..7df6fd00d61e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7047,6 +7047,14 @@ CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ +- func: _foreach_zero_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_zero_slow_ + CUDA: foreach_tensor_zero_cuda_ + - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 56605b7130db..ef6fd162938f 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -378,7 +378,8 @@ def __post_init__(self) -> None: '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', - '_foreach_addcdiv_.ScalarList']: + '_foreach_addcdiv_.ScalarList', + '_foreach_zero_']: assert len(self.returns) == 1 def is_out_fn(self) -> bool: diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 7d413b959415..cb3ec4830aa4 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -179,6 +179,7 @@ def zero_grad(self, set_to_none: bool = False): (in one case it does the step with a gradient of 0 and in the other it skips the step altogether). """ + per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) for group in self.param_groups: for p in group['params']: if p.grad is not None: @@ -189,7 +190,15 @@ def zero_grad(self, set_to_none: bool = False): p.grad.detach_() else: p.grad.requires_grad_(False) - p.grad.zero_() + + if p.grad.is_sparse: + p.grad.zero_() + else: + per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(p.grad) + + for _, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + torch._foreach_zero_(grads) def step(self, closure): r"""Performs a single optimization step (parameter update). From 810c68fb1d60a54c32a88b3e55db0717a5ac1775 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 27 Oct 2020 18:50:48 -0700 Subject: [PATCH 056/169] [OpBench] fix jit tracing with quantized op/tensor by enabling `_compare_tensors_internal` to compare quantized tensors (#46772) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46772 When running `buck run caffe2/benchmarks/operator_benchmark/pt:qactivation_test -- --use_jit`, I encountered the following error P146518683. 
The error was traced down to the fact that `torch.allclose` does not work with quantized tensors (the error was triggered by this particular multiplication https://fburl.com/diffusion/8vw647o6 since native mul can not work with a float scalar and a quantized tensor.) Minimum example to reproduce: ```(Pdb) input = torch.ones(5) (Pdb) aa = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) (Pdb) bb = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) (Pdb) torch.allclose(aa, bb) Comparison exception: promoteTypes with quantized numbers is not handled yet; figure out what the correct rules should be, offending types: QUInt8 Float ``` Here the proposed fix is to compare quantized tensors strictly within `_compare_tensors_internal`. The other two possible fixes are: 1. convert quantized tensors to float tensors first before sending them to `torch.allclose` 2. change `torch.allclose` to handle quantized tensor. Test Plan: buck run caffe2/benchmarks/operator_benchmark/pt:qactivation_test -- --use_jit Reviewed By: kimishpatel Differential Revision: D24506723 fbshipit-source-id: 6426ea2a88854b4fb89abef0edd2b49921283796 --- torch/testing/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py index 396d0718efbc..78a6c13a6eeb 100644 --- a/torch/testing/__init__.py +++ b/torch/testing/__init__.py @@ -24,6 +24,9 @@ def is_integral(dtype: torch.dtype) -> bool: dtypes = [x for x in get_all_dtypes() if x not in get_all_complex_dtypes()] return dtype in dtypes and not dtype.is_floating_point +def is_quantized(dtype: torch.dtype) -> bool: + return dtype in (torch.quint8, torch.qint8, torch.qint32, torch.quint4x2) + # Helper function that maps a flattened index back into the given shape # TODO: consider adding torch.unravel_index def _unravel_index(flat_index, shape): @@ -70,7 +73,11 @@ def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, e debug_msg : Optional[str] # Integer (including bool) comparisons are identity comparisons # when rtol is zero and atol is less than one - if (is_integral(a.dtype) and rtol == 0 and atol < 1) or a.dtype is torch.bool: + if ( + (is_integral(a.dtype) and rtol == 0 and atol < 1) + or a.dtype is torch.bool + or is_quantized(a.dtype) + ): if (a == b).all().item(): return (True, None) From 5a8198eb3c594aa18352930fd21f3c25bd7b7100 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 27 Oct 2020 19:57:22 -0700 Subject: [PATCH 057/169] [quant][graphmode][fx][fix] scalar as first input for add/mul (#46751) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46751 Currently we assume the first input for add/mul is node (Tensor), but it might not be the case Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_quantized_add python test/test_quantization.py TestQuantizeFxOps.test_quantized_mul python test/test_quantization.py TestQuantizeFxOps.test_quantized_add_relu python test/test_quantization.py TestQuantizeFxOps.test_quantized_mul_relu Imported from OSS Reviewed By: raghuramank100 Differential Revision: D24494456 fbshipit-source-id: ef5e23ba60eb22a57771791f4934306b25c27c01 --- aten/src/ATen/native/quantized/cpu/qadd.cpp | 11 ++++++ aten/src/ATen/native/quantized/cpu/qmul.cpp | 14 ++++++++ aten/src/ATen/native/quantized/library.cpp | 4 +++ test/quantization/test_quantize_fx.py | 5 +++ .../quantization/fx/quantization_patterns.py | 34 +++++++++++++------ torch/quantization/fx/quantize.py 
| 20 +++++++---- 6 files changed, 72 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp index a12718502dd1..0b9bc6b8e901 100644 --- a/aten/src/ATen/native/quantized/cpu/qadd.cpp +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -243,6 +243,15 @@ Tensor qadd_scalar(Tensor qa, Scalar b) { return _add_scalar_out(qc, qa, b); } +template +Tensor qadd_scalar2(Scalar b, Tensor qa) { + TORCH_CHECK(qa.qscheme() == kPerTensorAffine || + qa.qscheme() == kPerTensorSymmetric, + "Only per tensor quantization is supported in Add."); + auto qc = at::empty_like(qa, qa.suggest_memory_format()); + return _add_scalar_out(qc, qa, b); +} + template Tensor qadd_scalar_out(Tensor qa, Scalar b, Tensor out) { check_inputs(qa, out); @@ -269,10 +278,12 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::add"), TORCH_FN(qadd)); m.impl(TORCH_SELECTIVE_NAME("quantized::add.out"), TORCH_FN(qadd_out)); m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar2"), TORCH_FN(qadd_scalar2)); m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar_out"), TORCH_FN(qadd_scalar_out)); m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"), TORCH_FN(qadd)); m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.out"), TORCH_FN(qadd_out)); m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar2"), TORCH_FN(qadd_scalar2)); m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar_out"), TORCH_FN(qadd_scalar_out)); // deprecated functions, kept for backward compatibility m.impl(TORCH_SELECTIVE_NAME("quantized::add_out"), TORCH_FN(qadd_out)); diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index deeae36dc502..dccc2d718bf1 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -136,6 +136,18 @@ class QMulScalar final { } }; +template +class QMulScalar2 final { + public: + static Tensor run(Scalar b, Tensor qa) { + TORCH_CHECK(qa.qscheme() == kPerTensorAffine || + qa.qscheme() == kPerTensorSymmetric, + "Only per tensor quantization is supported in Mul."); + auto qc = at::empty_like(qa, qa.suggest_memory_format()); + return _mul_scalar_out(qc, qa, b); + } +}; + template class QMulScalarOut final { public: @@ -176,10 +188,12 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::mul"), TORCH_FN(QMul::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul.out"), TORCH_FN(QMulOut::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar2"), TORCH_FN(QMulScalar2::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar_out"), TORCH_FN(QMulScalarOut::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu"), TORCH_FN(QMul::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.out"), TORCH_FN(QMulOut::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar2"), TORCH_FN(QMulScalar2::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar_out"), TORCH_FN(QMulScalarOut::run)); // deprecated functions, kept for backward compatibility m.impl(TORCH_SELECTIVE_NAME("quantized::mul_out"), TORCH_FN(QMulOut::run)); diff --git 
a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 3b866cf2fd12..3150fd986300 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -23,9 +23,11 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar2(Scalar b, Tensor qa) -> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar2(Scalar b, Tensor qa) -> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); // deprecated functions, kept for backward compatibility @@ -142,10 +144,12 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar2(Scalar b, Tensor qa)-> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar2(Scalar b, Tensor qa)-> Tensor qc")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); // deprecated functions, kept for backward compatibility m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) 
out")); diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index d2b2d6c82ddf..2c6c3221fcfd 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -997,7 +997,10 @@ def __init__(self, is_inplace, is_scalar): def forward(self, x, y): x = self.conv1(x) y = 3 if self.is_scalar else self.conv2(y) + # x = x + y x = self.op(x, y) + # x = y + x + x = self.op(y, x) return x # TODO: decide whether we want to quantize or not @@ -1040,6 +1043,8 @@ def forward(self, x, y): y = 3 if self.is_scalar else self.conv2(y) x = self.op(x, y) x = self.relu(x) + x = self.op(y, x) + x = self.relu(x) return x data = (torch.rand((1, 1, 1, 1), dtype=torch.float), diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 2898a07a17c1..1f2763d6aeb4 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -46,7 +46,8 @@ def __init__(self, quantizer, node): # this is an indicator of whether all the inputs are Node or not # since some op might be quantized differently depending on whether # all inputs are tensors or not, e.g. add/mul - self.all_nodes = True + self.num_node_args = len(node.args) + self.all_node_args = True @abstractmethod def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): @@ -71,18 +72,24 @@ def __init__(self, quantizer, node): node = node.args[0] assert node.op == 'call_function' and node.target in [operator.add, torch.add] self.add_node = node - self.all_nodes = all([isinstance(a, Node) for a in self.add_node.args[:2]]) + self.num_node_args = len([a for a in self.add_node.args[:2] if isinstance(a, Node)]) def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): - if not self.all_nodes: + if self.num_node_args == 1: # add scalar if self.relu_node is not None: op = torch.ops.quantized.add_relu else: op = torch.ops.quantized.add + + if isinstance(self.add_node.args[0], Node): + quantized_index = 0 + else: + quantized_index = 1 + return quantizer.quantized_graph.create_node( 'call_function', op, - load_arg(quantized=[0])(self.add_node.args), self.add_node.kwargs) + load_arg(quantized=[quantized_index])(self.add_node.args), self.add_node.kwargs) else: activation_post_process = quantizer.activation_post_process_map[node.name] scale, zero_point = activation_post_process.calculate_qparams() @@ -96,6 +103,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ return quantizer.quantized_graph.create_node( 'call_function', op, load_arg(quantized=True)(self.add_node.args), kwargs) +# TODO: merge with Add @register_quant_pattern(operator.mul) @register_quant_pattern(torch.mul) @register_quant_pattern((torch.nn.ReLU, operator.mul)) @@ -112,17 +120,23 @@ def __init__(self, quantizer, node): node = node.args[0] assert node.op == 'call_function' and node.target in [operator.mul, torch.mul] self.mul_node = node - self.all_nodes = all([isinstance(a, Node) for a in self.mul_node.args[:2]]) + self.num_node_args = len([a for a in self.mul_node.args[:2] if isinstance(a, Node)]) def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): - if not self.all_nodes: + if self.num_node_args == 1: # mul scalar if self.relu_node is not None: op = torch.ops.quantized.mul_relu else: op = torch.ops.quantized.mul + + if isinstance(self.mul_node.args[0], Node): + quantized_index = 0 + else: + 
quantized_index = 1 + return quantizer.quantized_graph.create_node( - 'call_function', op, load_arg(quantized=[0])(self.mul_node.args), self.mul_node.kwargs) + 'call_function', op, load_arg(quantized=[quantized_index])(self.mul_node.args), self.mul_node.kwargs) else: activation_post_process = quantizer.activation_post_process_map[node.name] scale, zero_point = activation_post_process.calculate_qparams() @@ -138,7 +152,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.cat) class Cat(QuantizeHandler): def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): - if not self.all_nodes: + if not self.all_node_args: return NotImplemented activation_post_process = quantizer.activation_post_process_map[node.name] scale, zero_point = activation_post_process.calculate_qparams() @@ -438,7 +452,7 @@ class DefaultNode(QuantizeHandler): ''' Common quantized op, first input and first output will be quantized ''' def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): - if not self.all_nodes: + if not self.all_node_args: return NotImplemented assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ 'call_function are handled in DefaultNode' @@ -580,7 +594,7 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # of quantizable objects (e.g. modules and functionals) class DefaultQuantizeHandler(QuantizeHandler): def convert(self, quantizer, node): - assert self.all_nodes + assert self.all_node_args root_module = quantizer.modules[''] return quantize_node( root_module, diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 60da8142ee2d..3f7a699d97b3 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -384,7 +384,7 @@ def load_arg(a): continue prefix = node.name + '_activation_post_process_' - root_node, _, pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) + root_node, matched_nodes, pattern, obj, qconfig = matches.get(node.name, (None, None, None, None, None)) if root_node is None: env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: @@ -458,15 +458,23 @@ def is_observed(input_arg): # propagate observed property from input if is_observed(node.args[0]): observed_node_names_set.add(node.name) - elif (isinstance(obj, Add) or isinstance(obj, Mul)) and not obj.all_nodes: - if node.args[0].name in observed_node_names_set: + elif (isinstance(obj, Add) or isinstance(obj, Mul)) and obj.num_node_args == 1: + input_node = matched_nodes[-1] # first node in the sequence + + def input_is_observed(arg): + return isinstance(arg, Node) and arg.name in observed_node_names_set + # This is checking if one of the argument of add/mul + # is an observed node + # If both of the inputs are number, + # we will not consider the output to be observed + if input_is_observed(input_node.args[0]) or input_is_observed(input_node.args[1]): observed_node_names_set.add(node.name) elif isinstance(obj, StandaloneModuleQuantizeHandler): assert node.op == 'call_module' output_is_observed = self.modules[node.target]._output_is_observed if output_is_observed: observed_node_names_set.add(node.name) - elif qconfig is not None and obj.all_nodes: + elif qconfig is not None and obj.all_node_args: # observer for outputs new_observer = qconfig.activation() # respect device affinity when adding observers @@ -741,8 +749,8 @@ def 
is_quantized(node): # the node is quantized in parent module quant_env[node.name] = self.quantized_graph.node_copy(node, load_non_quantized) else: - # dequantize inputs for the node that are not quantized - env[node.name] = self.quantized_graph.node_copy(node, load_non_quantized) + # copy quantized or non-quantized node + env[node.name] = self.quantized_graph.node_copy(node, load_x) # remove activation post process act_post_process_removed_graph = Graph() From 998b9b9e686a9ad7016a62a7fa027968708b7fdc Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 27 Oct 2020 21:39:14 -0700 Subject: [PATCH 058/169] [quant][graphmode][fx] custom_module support static/dynamic/weight_only quant (#46786) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46786 Previously we only support static quant, this PR added support for other types of quantization. Note qat is actually orthogonal to these quant types, this is referring to the convert step where we convert the observed module to a quantized module. for qat, user will provide a CustomModule -> FakeQuantizedCustomModule in prepare_custom_config_dict and FakeQuantizedCustomModule -> static/dynamic/weight_only quantized CustomModule in convert_custom_config_dict. Test Plan: Imported from OSS Reviewed By: raghuramank100 Differential Revision: D24514701 fbshipit-source-id: 2918be422dd76093d67a6df560aaaf949b7f338c --- test/quantization/test_quantize_fx.py | 123 +++++++++++------- torch/quantization/__init__.py | 2 +- .../quantization/fx/quantization_patterns.py | 16 ++- torch/quantization/fx/quantize.py | 22 ++-- torch/quantization/fx/utils.py | 70 ++++++++++ torch/quantization/quant_type.py | 12 +- torch/quantization/quantize_fx.py | 23 +++- 7 files changed, 199 insertions(+), 69 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 2c6c3221fcfd..a645d8e07262 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -9,6 +9,7 @@ # graph mode quantization based on fx from torch.quantization import ( QuantType, + quant_type_to_str, prepare_fx, convert_fx, prepare_qat_fx, @@ -634,104 +635,126 @@ def test_custom_module_class(self): class CustomModule(torch.nn.Module): def __init__(self): super().__init__() - self.conv = torch.nn.Conv2d(1, 1, 1) + self.linear = torch.nn.Linear(3, 3) def forward(self, x): - return self.conv(x) + return self.linear(x) class ObservedCustomModule(torch.nn.Module): - def __init__(self, conv): + def __init__(self, linear): super().__init__() - self.conv = conv + self.linear = linear def forward(self, x): - return self.conv(x) + return self.linear(x) @classmethod def from_float(cls, float_module): assert hasattr(float_module, 'qconfig') - observed = cls(float_module.conv) + observed = cls(float_module.linear) observed.qconfig = float_module.qconfig return observed - class QuantizedCustomModule(torch.nn.Module): - def __init__(self, conv): + class StaticQuantCustomModule(torch.nn.Module): + def __init__(self, linear): super().__init__() - self.conv = conv + self.linear = linear def forward(self, x): - return self.conv(x) + return self.linear(x) @classmethod def from_observed(cls, observed_module): assert hasattr(observed_module, 'qconfig') assert hasattr(observed_module, 'activation_post_process') - observed_module.conv.activation_post_process = \ + observed_module.linear.activation_post_process = \ observed_module.activation_post_process - quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + quantized = 
cls(nnq.Linear.from_float(observed_module.linear)) return quantized - class DynamicallyQuantizedCustomModule(torch.nn.Module): - def __init__(self, conv): + class DynamicQuantCustomModule(torch.nn.Module): + def __init__(self, linear): super().__init__() - self.conv = conv + self.linear = linear def forward(self, x): - return self.conv(x) + return self.linear(x) @classmethod def from_observed(cls, observed_module): assert hasattr(observed_module, 'qconfig') - assert hasattr(observed_module, 'activation_post_process') - quantized = cls(nnqd.Conv2d.from_float(observed_module.conv)) + quantized = cls(nnqd.Linear.from_float(observed_module.linear)) return quantized class M(torch.nn.Module): def __init__(self): super().__init__() - self.conv = torch.nn.Conv2d(1, 1, 1) + self.linear = torch.nn.Linear(3, 3) self.custom = CustomModule() def forward(self, x): - x = self.conv(x) + x = self.linear(x) x = self.custom(x) return x class RefM(torch.nn.Module): def __init__(self): super().__init__() - self.conv1 = torch.nn.Conv2d(1, 1, 1) - self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.linear1 = torch.nn.Linear(3, 3) + self.linear2 = torch.nn.Linear(3, 3) def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) + x = self.linear1(x) + x = self.linear2(x) return x - data = torch.randn(1, 1, 1, 1) + data = torch.randn(3, 3) # instantiate M and RefM and align the parameters original_m = M().eval() original_ref_m = RefM().eval() - original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) - original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) - original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) - original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + original_ref_m.linear1.weight = torch.nn.Parameter(original_m.linear.weight.detach()) + original_ref_m.linear1.bias = torch.nn.Parameter(original_m.linear.bias.detach()) + original_ref_m.linear2.weight = torch.nn.Parameter(original_m.custom.linear.weight.detach()) + original_ref_m.linear2.bias = torch.nn.Parameter(original_m.custom.linear.bias.detach()) + + test_configs = { + "static": (default_qconfig, StaticQuantCustomModule, 3), + "dynamic": (default_dynamic_qconfig, DynamicQuantCustomModule, 0) + } - # TODO: add other quant types after mixed mode support - for quant_type in [QuantType.STATIC]: - qconfig_dict = { - "": default_qconfig, - } - prepare_custom_config_dict = { - "float_to_observed_custom_module_class": { - CustomModule: ObservedCustomModule + for quant_type in [QuantType.DYNAMIC]: + key = quant_type_to_str(quant_type) + qconfig, quantized_module_class, num_observers = test_configs[key] + qconfig_dict = {"": qconfig} + if key == "static": + prepare_custom_config_dict = { + "float_to_observed_custom_module_class": { + "static": { + CustomModule: ObservedCustomModule + } + } } - } - convert_custom_config_dict = { - "observed_to_quantized_custom_module_class": { - ObservedCustomModule: QuantizedCustomModule + convert_custom_config_dict = { + "observed_to_quantized_custom_module_class": { + "static": { + ObservedCustomModule: quantized_module_class + } + } } - } + else: + prepare_custom_config_dict = { + "non_traceable_module_class": [ + CustomModule + ] + } + convert_custom_config_dict = { + "observed_to_quantized_custom_module_class": { + "dynamic": { + CustomModule: quantized_module_class + } + } + } + # check prepared model m = prepare_fx( original_m, @@ -741,7 +764,7 @@ def forward(self, x): m(data) # all activation observers 
are inserted in the top level module count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 3 + ns.call_module(torch.quantization.MinMaxObserver): num_observers } self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) @@ -749,12 +772,14 @@ def forward(self, x): m = convert_fx( m, convert_custom_config_dict=convert_custom_config_dict) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + if quant_type == QuantType.STATIC: + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Linear) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + self.assertEqual(type(m.custom), quantized_module_class) res = m(data) # quantize the reference model diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py index ee506b6fc6a7..d6daa79fae53 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -28,7 +28,7 @@ def default_eval_fn(model, calib_data): # Top level API for graph mode quantization on GraphModule(torch.fx) 'fuse_fx', 'quantize_fx', # TODO: add quantize_dynamic_fx 'prepare_fx', 'prepare_dynamic_fx', 'convert_fx', - 'QuantType', # quantization type + 'QuantType', 'quant_type_to_str', # quantization type # custom module APIs 'get_default_static_quant_module_mappings', 'get_static_quant_module_class', 'get_default_dynamic_quant_module_mappings', diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 1f2763d6aeb4..c588354a3192 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -20,6 +20,7 @@ _parent_name, quantize_node, get_per_tensor_qparams, + get_swapped_custom_module_class, activation_is_statically_quantized, weight_is_quantized, weight_dtype, @@ -194,6 +195,12 @@ def __init__(self, quantizer, node): def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): # TODO: debug option for conv module + qconfig = quantizer.qconfig_map[node.name] + activation_statically_quantized = activation_is_statically_quantized(qconfig) + # only static qunatization (for both ptq and qat) is supported for conv + if not activation_statically_quantized: + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + if self.conv_node.op == 'call_module': # note that relu should already be fused into conv module in the fusion step assert self.relu_node is None, 'conv module and relu fusion is not executed, ' \ @@ -609,13 +616,14 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ assert convert_custom_config_dict is not None custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", None) assert custom_module_class_mapping is not None + qconfig = quantizer.qconfig_map[node.name] observed_custom_module = quantizer.modules[node.target] - if node.name in quantizer.activation_post_process_map: + if activation_is_statically_quantized(qconfig): + assert node.name in quantizer.activation_post_process_map observed_custom_module.activation_post_process = \ quantizer.activation_post_process_map[node.name] - quantized_custom_module_class = custom_module_class_mapping.get(type(observed_custom_module), None) - assert quantized_custom_module_class is not 
None, "did not found quantized custom module for:" + \ - str(type(observed_custom_module)) + quantized_custom_module_class = get_swapped_custom_module_class( + observed_custom_module, custom_module_class_mapping, qconfig) quantized_custom_module = \ quantized_custom_module_class.from_observed(observed_custom_module) parent_name, name = _parent_name(node.target) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 3f7a699d97b3..1ef8174af24a 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -42,6 +42,8 @@ from .utils import ( _parent_name, quantize_node, + get_custom_module_class_keys, + get_swapped_custom_module_class, activation_is_statically_quantized, ) @@ -348,9 +350,9 @@ def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_ # match the patterns that will get quantized standalone_module_names = prepare_custom_config_dict.get("standalone_module_name", None) - custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", None) + custom_module_classes = get_custom_module_class_keys(prepare_custom_config_dict, "float_to_observed_custom_module_class") matches = self._find_matches( - model.graph, self.modules, self.patterns, standalone_module_names, custom_module_class_mapping) + model.graph, self.modules, self.patterns, standalone_module_names, custom_module_classes) # find _inputs_ to matched nodes that are not quantized, these # have to be quantized, which requires measuring stats, @@ -404,8 +406,9 @@ def insert_observer(node, observer, device): if isinstance(obj, CustomModuleQuantizeHandler): custom_module = self.modules[node.target] + custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {}) observed_custom_module_class = \ - custom_module_class_mapping[type(custom_module)] + get_swapped_custom_module_class(custom_module, custom_module_class_mapping, qconfig) observed_custom_module = \ observed_custom_module_class.from_float(custom_module) parent_name, name = _parent_name(node.target) @@ -584,10 +587,11 @@ def _convert(self, model, inplace=False, debug=False, convert_custom_config_dict model.eval().cpu() self.modules = dict(model.named_modules()) - custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", None) + custom_module_classes = get_custom_module_class_keys( + convert_custom_config_dict, "observed_to_quantized_custom_module_class") matches = self._find_matches( model.graph, self.modules, self.patterns, - custom_module_class_mapping=custom_module_class_mapping) + custom_module_classes=custom_module_classes) quants = self._find_quants(model.graph, matches) @@ -830,7 +834,7 @@ def convert(self, model, inplace=False, debug=False, convert_custom_config_dict= def _find_matches( self, graph, modules, patterns, - standalone_module_names=None, custom_module_class_mapping=None): + standalone_module_names=None, custom_module_classes=None): """ Matches the nodes in the input graph to quantization patterns, and outputs the information needed to quantize them in future steps. @@ -851,8 +855,8 @@ def _find_matches( ... 
} """ - if custom_module_class_mapping is None: - custom_module_class_mapping = {} + if custom_module_classes is None: + custom_module_classes = [] match_map = {} all_matched = set() @@ -882,7 +886,7 @@ def record_match(pattern, node, matched): # add custom module instances to the match result for node in graph.nodes: if node.op == 'call_module' and \ - type(self.modules[node.target]) in custom_module_class_mapping: + type(self.modules[node.target]) in custom_module_classes: custom_module_qconfig = self.qconfig_map[node.name] match_map[node.name] = ( node, [node], None, CustomModuleQuantizeHandler(self, node), custom_module_qconfig) diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 98f94a0633a0..366970cec4c0 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -1,5 +1,6 @@ import re import torch +from ..quant_type import QuantType, quant_type_to_str # turn foo.bar -> ['foo', 'bar'] def _parent_name(target): @@ -139,6 +140,55 @@ def get_next_qparams_idx(module, qparams): inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) +def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): + r""" Get all the unique custom module keys in the custom config dict + e.g. + Input: + custom_config_dict = { + "float_to_observed_custom_module_class": { + "static": { + CustomModule1: ObservedCustomModule + }, + "dynamic": { + CustomModule2: DynamicObservedCustomModule + }, + "weight_only": { + CustomModule3: WeightOnlyObservedCustomModule + }, + }, + } + + Output: + # extract all the keys in "static", "dynamic" and "weight_only" dict + [CustomModule1, CustomModule2, CustomModule3] + """ + # using set to dedup + float_custom_module_classes = set() + custom_module_mapping = custom_config_dict.get(custom_config_dict_key, {}) + for quant_mode in ["static", "dynamic", "weight_only"]: + quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {}) + quant_mode_custom_module_classes = set(quant_mode_custom_module_config.keys()) + float_custom_module_classes |= quant_mode_custom_module_classes + return list(float_custom_module_classes) + +def get_swapped_custom_module_class(custom_module, custom_module_class_mapping, qconfig): + """ Get the observed/quantized custom module class that we need + to swap `custom_module` to + Input: + custom_module: input, can be an instance of either a float or observed custom module + custom_module_class_mapping: the float to observed or observed to quantized custom module class mapping + qconfig: qconfig configured for the custom module + + Output: + corresponding observed/quantized custom module class for input custom module instance + """ + quant_type = get_quant_type(qconfig) + quant_type_str = quant_type_to_str(quant_type) + class_mapping = custom_module_class_mapping.get(quant_type_str, {}) + assert type(custom_module) in class_mapping, "did not found corresponding observed " \ + "module class for {} in mapping: {}".format(type(custom_module), class_mapping) + return class_mapping[type(custom_module)] + def activation_is_statically_quantized(qconfig): """ Given a qconfig, decide if the activation needs to be statically quantized or not @@ -158,6 +208,26 @@ def weight_is_quantized(qconfig): """ return weight_dtype(qconfig) in [torch.quint8, torch.qint8] +def get_quant_type(qconfig): + assert qconfig is not None + activation = qconfig.activation() + weight = qconfig.weight() + static_dtypes = 
[torch.quint8, torch.qint8] + if weight.dtype in static_dtypes: + if activation.dtype in static_dtypes: + return QuantType.STATIC + elif hasattr(activation, 'compute_dtype') and activation.compute_dtype in static_dtypes: + return QuantType.DYNAMIC + else: + return QuantType.WEIGHT_ONLY + + if weight.dtype == torch.float16: + if activation.dtype == torch.float: + return QuantType.WEIGHT_ONLY + + raise Exception("Unrecognized dtype combination in get_quant_type: activation({})," + "weight({})".format(activation.dtype, weight.dtype)) + def get_linear_prepack_op_for_dtype(dtype): if dtype == torch.float16: return torch.ops.quantized.linear_prepack_fp16 diff --git a/torch/quantization/quant_type.py b/torch/quantization/quant_type.py index 212dec1fe28c..463d086b39b6 100644 --- a/torch/quantization/quant_type.py +++ b/torch/quantization/quant_type.py @@ -1,4 +1,3 @@ - import enum # Quantization type (dynamic quantization, static quantization). @@ -7,3 +6,14 @@ class QuantType(enum.IntEnum): DYNAMIC = 0 STATIC = 1 QAT = 2 + WEIGHT_ONLY = 3 + + +def quant_type_to_str(quant_type): + m = { + QuantType.STATIC: "static", + QuantType.DYNAMIC: "dynamic", + QuantType.QAT: "qat", + QuantType.WEIGHT_ONLY: "weight_only", + } + return m[quant_type] diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 001c797ad6b1..e3a116be1785 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -4,6 +4,7 @@ from .fx import Fuser # noqa: F401 from .fx import Quantizer # noqa: F401 from .fx.utils import graph_pretty_str # noqa: F401 +from .fx.utils import get_custom_module_class_keys # noqa: F401 def _check_is_graph_module(model): if not isinstance(model, GraphModule): @@ -74,9 +75,9 @@ def _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict=None, i # standalone module and custom module config are applied in top level module standalone_module_names = prepare_custom_config_dict.get('standalone_module_name', []) skipped_module_names += standalone_module_names - custom_module_config = prepare_custom_config_dict.get('float_to_observed_custom_module_class', {}) - custom_module_classes = list(custom_module_config.keys()) - skipped_module_classes += custom_module_classes + float_custom_module_classes = get_custom_module_class_keys( + prepare_custom_config_dict, "float_to_observed_custom_module_class") + skipped_module_classes += float_custom_module_classes tracer = CustomTracer(skipped_module_names, skipped_module_classes) graph_module = GraphModule(model, tracer.trace(model)) graph_module = _fuse_fx(graph_module, inplace, prepare_custom_config_dict) @@ -178,8 +179,11 @@ def prepare_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=No # user will manually define the corresponding observed # module class which has a from_float class method that converts # float custom module to observed custom module + # (only needed for static quantization) "float_to_observed_custom_module_class": { - CustomModule: ObservedCustomModule + "static": { + CustomModule: ObservedCustomModule + } }, # the qualified names for the submodule that are not symbolically traceable @@ -188,6 +192,7 @@ def prepare_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=No ], # the module classes that are not symbolically traceable + # we'll also put dynamic/weight_only custom module here "non_traceable_module_class": [ NonTraceableModule ], @@ -313,7 +318,15 @@ def convert_fx(graph_module, inplace=False, debug=False, convert_custom_config_d # module 
class which has a from_observed class method that converts # observed custom module to quantized custom module "observed_to_quantized_custom_module_class": { - ObservedCustomModule: QuantizedCustomModule + "static": { + ObservedCustomModule: QuantizedCustomModule + }, + "dynamic": { + ObservedCustomModule: QuantizedCustomModule + }, + "weight_only": { + ObservedCustomModule: QuantizedCustomModule + } } } From e077a2a238b1fe62f2d4c1da34df16ec9f8b563b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 27 Oct 2020 21:39:17 -0700 Subject: [PATCH 059/169] [Gradient Compression] Add CppCommHook subclass for supporting the C++ API of communication hook. (#46566) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46566 Only provides an interface. Some built-in implementations will be provided in a follow-up commit. Original PR issue: C++ DDP Communication Hook https://github.com/pytorch/pytorch/issues/46348 ghstack-source-id: 115319038 Test Plan: waitforbuildbot Reviewed By: pritamdamania87 Differential Revision: D24379460 fbshipit-source-id: 8382dc4185c7c01d0ac5b3498e1bead785bccec5 --- torch/csrc/distributed/c10d/comm.cpp | 2 +- torch/csrc/distributed/c10d/comm.h | 33 ++++++++++++++++++++----- torch/csrc/distributed/c10d/reducer.cpp | 3 ++- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp index 6d1181514aeb..f9fa2bbd7e8f 100644 --- a/torch/csrc/distributed/c10d/comm.cpp +++ b/torch/csrc/distributed/c10d/comm.cpp @@ -97,7 +97,7 @@ PythonCommHook::~PythonCommHook() { } c10::intrusive_ptr PythonCommHook::runHook( - const GradBucket& bucket) { + GradBucket& bucket) { py::gil_scoped_acquire acquire; py::object py_fut = hook_(state_, bucket); diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index bb5391a68aaa..0d2978e7221f 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -1,7 +1,5 @@ #pragma once -#include - #include #include #include @@ -47,7 +45,7 @@ class TORCH_PYTHON_API CommHookInterface { // Once the tensors in the bucket are ready, kicks off the hook asynchronously // and returns a future that holds the communication results. virtual c10::intrusive_ptr runHook( - const GradBucket& bucket) = 0; + GradBucket& bucket) = 0; // Returns the resulting tensors once the communication hook result is ready. // The resulting tensors will then be copied to the grads of individual @@ -68,16 +66,39 @@ class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { ~PythonCommHook() override; - c10::intrusive_ptr runHook( - const GradBucket& bucket) override; + c10::intrusive_ptr runHook(GradBucket& bucket) override; std::vector parseHookResult(const c10::IValue& result) override; private: // Only needed for stateful communication. py::object state_; - // Indicates an asynchrounous communication of gradients. py::object hook_; }; +// This CppCommHook interface only requires implementing runHook method that +// potentially uses a state. 
+template +class TORCH_API CppCommHookInterface : public CommHookInterface { + public: + explicit CppCommHookInterface(T& state) : state_(state) {} + + virtual ~CppCommHookInterface() {} + + std::vector parseHookResult(const c10::IValue& result) override { + TORCH_INTERNAL_ASSERT( + result.isTensor() || result.isTensorList(), + "expected the hook result is either a Tensor or a TensorList"); + + if (result.isTensor()) { + return {result.toTensor()}; + } + + return result.toTensorVector(); + } + + protected: + T state_; // Not owned. +}; + } // namespace c10d diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index ea1eef082a52..abb26dfaa10c 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -700,7 +700,8 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { if (comm_hook_ == nullptr) { bucket.work = process_group_->allreduce(tensors); } else { - bucket.future_work = comm_hook_->runHook(GradBucket(tensors)); + GradBucket grad_bucket(tensors); + bucket.future_work = comm_hook_->runHook(grad_bucket); } } } From e299393fd51993bd25f61e31847d0b7b2121c548 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 27 Oct 2020 21:39:17 -0700 Subject: [PATCH 060/169] [Gradient Compression] Provide 2 default C++ comm hooks (#46701) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46701 Provide 2 built-in implementations of C++ comm hook. Original PR issue: C++ DDP Communication Hook https://github.com/pytorch/pytorch/issues/46348 ghstack-source-id: 115319061 Test Plan: waitforbuildbot Reviewed By: pritamdamania87 Differential Revision: D24382504 fbshipit-source-id: 1c1ef56620f91ab37a1707c5589f1d0eb4455bb3 --- tools/build_variables.bzl | 1 + torch/csrc/distributed/c10d/comm.h | 4 ++ .../distributed/c10d/default_comm_hooks.cpp | 41 +++++++++++++++++++ .../distributed/c10d/default_comm_hooks.h | 20 +++++++++ 4 files changed, 66 insertions(+) create mode 100644 torch/csrc/distributed/c10d/default_comm_hooks.cpp create mode 100644 torch/csrc/distributed/c10d/default_comm_hooks.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 63446d3a1316..86fc691c298c 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -544,6 +544,7 @@ libtorch_python_core_sources = [ libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", + "torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", ] diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 0d2978e7221f..20e0f948808d 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -29,6 +29,10 @@ class GradBucket { return tensors_; } + std::vector& getTensorsRef() { + return tensors_; + } + private: std::vector tensors_; }; diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp new file mode 100644 index 000000000000..0f7a24acc40d --- /dev/null +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -0,0 +1,41 @@ +#include + +#include +#include +#include + +namespace c10d { + +c10::intrusive_ptr AllReduceCommHook::runHook( + GradBucket& bucket) { + auto allreduce_work = state_->allreduce(bucket.getTensorsRef()); + + auto div_by_process_group_size = [allreduce_work, this]() { + auto tensor = allreduce_work->result()[0] / state_->getSize(); + return c10::IValue(tensor); + }; + + auto fut 
= allreduce_work->getFuture(); + return fut->then(div_by_process_group_size, fut->elementType()); +} + +c10::intrusive_ptr FP16CompressCommHook::runHook( + GradBucket& bucket) { + auto& tensors = bucket.getTensorsRef(); + for (auto& tensor : tensors) { + tensor.copy_(tensor.to(torch::kFloat16)); + } + auto allreduce_work = state_->allreduce(tensors); + + auto decompress_and_div_by_process_group_size = [allreduce_work, this]() { + auto tensor = allreduce_work->result()[0]; + tensor.copy_(tensor.to(torch::kFloat) / state_->getSize()); + return c10::IValue(tensor); + }; + + auto fut = allreduce_work->getFuture(); + return fut->then( + decompress_and_div_by_process_group_size, fut->elementType()); +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.h b/torch/csrc/distributed/c10d/default_comm_hooks.h new file mode 100644 index 000000000000..5e53e01ac688 --- /dev/null +++ b/torch/csrc/distributed/c10d/default_comm_hooks.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +namespace c10d { + +class AllReduceCommHook : public CppCommHookInterface { + ~AllReduceCommHook() override {} + + c10::intrusive_ptr runHook(GradBucket& bucket) override; +}; + +class FP16CompressCommHook : public CppCommHookInterface { + ~FP16CompressCommHook() override {} + + c10::intrusive_ptr runHook(GradBucket& bucket) override; +}; + +} // namespace c10d From d92bf921db95a7c471040b3a017ece41527cbb7c Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 27 Oct 2020 22:32:31 -0700 Subject: [PATCH 061/169] [quant][graphmode][fx] Remove `inplace` option for fuse_fx (#46953) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46953 Test Plan: Imported from OSS Reviewed By: supriyar Differential Revision: D24579402 fbshipit-source-id: 5e0b8abf682287ab3c7dd54c2fc2cf309295e147 --- torch/quantization/fx/fuse.py | 5 +---- torch/quantization/quantize_fx.py | 11 +++++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 989f8937dd7c..56b375a02c00 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -12,13 +12,10 @@ from .fusion_patterns import * # noqa: F401 -import copy class Fuser: - def fuse(self, model, inplace=False, fuse_custom_config_dict=None): + def fuse(self, model, fuse_custom_config_dict=None): if fuse_custom_config_dict is None: fuse_custom_config_dict = {} - if not inplace: - model = copy.deepcopy(model) input_root = model input_graph = model.graph diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index e3a116be1785..0e52f52a6f33 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -27,7 +27,7 @@ def _swap_ff_with_fxff(model): del model._modules[name] model._modules[name] = torch.nn.quantized.FXFloatFunctional() -def _fuse_fx(graph_module, inplace=False, fuse_custom_config_dict=None): +def _fuse_fx(graph_module, fuse_custom_config_dict=None): r""" Internal helper function to fuse modules in preparation for quantization Args: @@ -35,7 +35,7 @@ def _fuse_fx(graph_module, inplace=False, fuse_custom_config_dict=None): """ _check_is_graph_module(graph_module) fuser = Fuser() - return fuser.fuse(graph_module, inplace, fuse_custom_config_dict) + return fuser.fuse(graph_module, fuse_custom_config_dict) class CustomTracer(Tracer): def __init__(self, skipped_module_names, skipped_module_classes): @@ -80,7 +80,7 @@ def _prepare_fx(model, qconfig_dict, inplace, 
prepare_custom_config_dict=None, i skipped_module_classes += float_custom_module_classes tracer = CustomTracer(skipped_module_names, skipped_module_classes) graph_module = GraphModule(model, tracer.trace(model)) - graph_module = _fuse_fx(graph_module, inplace, prepare_custom_config_dict) + graph_module = _fuse_fx(graph_module, prepare_custom_config_dict) quantizer = Quantizer() return quantizer.prepare( graph_module, @@ -108,12 +108,11 @@ def _prepare_standalone_module_fx(model, qconfig_dict, inplace=False, prepare_cu return _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict, is_standalone_module=True) -def fuse_fx(model, inplace=False, fuse_custom_config_dict=None): +def fuse_fx(model, fuse_custom_config_dict=None): r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. Fusion rules are defined in torch.quantization.fx.fusion_pattern.py Args: `model`: a torch.nn.Module model - `inplace`: flag for whether we fuse modules inplace or out of place `fuse_custom_config_dict`: Dictionary for custom configurations for fuse_fx, e.g. fuse_custom_config_dict = { "additional_fuser_method_mapping": { @@ -131,7 +130,7 @@ def fuse_fx(model, inplace=False, fuse_custom_config_dict=None): torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx") assert not model.training, 'fuse_fx only works on models in eval mode' graph_module = torch.fx.symbolic_trace(model) - return _fuse_fx(graph_module, inplace, fuse_custom_config_dict) + return _fuse_fx(graph_module, fuse_custom_config_dict) def prepare_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=None): r""" Prepare a model for post training static quantization From ddbdbce6232f896f47b81fc13f95b536cf274d20 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 27 Oct 2020 23:54:55 -0700 Subject: [PATCH 062/169] [jit] Prevent caching of `graph` attribute. (#46960) Summary: `graph` is automatically cached even when the underlying graph changes -- this PR hardcodes a fix to that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46960 Reviewed By: mrshenli Differential Revision: D24582185 Pulled By: bwasti fbshipit-source-id: 16aeeba251830886c92751dd5c9bda8699d62803 --- torch/jit/_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 19cce3a86945..d4f6f96c3da2 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -443,7 +443,7 @@ def graph(self): Returns a string representation of the internal graph for the ``forward`` method. See :ref:`interpreting-graphs` for details. 
""" - return self.forward.graph + return self._c._get_method("forward").graph @property def inlined_graph(self): From 46b252b83a97bba0926cead050d76fcef129cb6b Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Wed, 28 Oct 2020 06:46:47 -0700 Subject: [PATCH 063/169] Revert D24262885: [pytorch][PR] Added foreach_zero_ API Test Plan: revert-hammer Differential Revision: D24262885 (https://github.com/pytorch/pytorch/commit/8e37dcb1f3f5806b96aea3ddff54faf7c0a165b9) Original commit changeset: 144c283dd009 fbshipit-source-id: 451b202e23bc1fcb11b20d26c11d9a1329789d22 --- aten/src/ATen/native/ForeachOpsKernels.cpp | 6 +-- aten/src/ATen/native/cuda/ForeachFunctors.cuh | 40 ------------------- aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 19 --------- aten/src/ATen/native/native_functions.yaml | 8 ---- tools/codegen/model.py | 3 +- torch/optim/optimizer.py | 11 +---- 6 files changed, 3 insertions(+), 84 deletions(-) diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 6a567835e0e8..335473f729f1 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -98,8 +98,7 @@ std::vector foreach_tensor_##OP##_slow(TensorList tensors) { \ \ return result; \ } \ - -#define FOREACH_UNARY_OP_(OP) \ + \ void foreach_tensor_##OP##_slow_(TensorList tensors) { \ check_foreach_api_restrictions(tensors); \ \ @@ -162,9 +161,6 @@ FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); FOREACH_UNARY_OP(exp); -FOREACH_UNARY_OP_(sqrt); -FOREACH_UNARY_OP_(exp); -FOREACH_UNARY_OP_(zero); FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); FOREACH_POINTWISE_OP_SCALARLIST(addcdiv); diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index 118b36cbbff3..e58188f846f6 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -245,46 +245,6 @@ struct BinaryOpListAlphaFunctor { // Unary Functors // -template -struct ZeroFunctor { - __device__ __forceinline__ void operator() ( - int chunk_size, - TensorListMetadata<1>& tl) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* args[depth]; - bool all_aligned = init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if(n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = 0; - } - // store - load_store(args[0], r_args[0], i_start, 0); - } - } - else { - for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = 0; - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - template struct UnaryOpFunctor { using opmath_t = typename get_opmath_t::opmath_t; diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index ad45f7aa6971..74c27ed3ca35 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ 
b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -73,23 +73,4 @@ void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ FOREACH_UNARY_OP(exp, Exp); FOREACH_UNARY_OP(sqrt, Sqrt); -void foreach_tensor_zero_cuda_(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_zero_slow_(tensors); - } - - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, - ZeroFunctor()); - }); -} - }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 7df6fd00d61e..b143d39e67e2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7047,14 +7047,6 @@ CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ -- func: _foreach_zero_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_zero_slow_ - CUDA: foreach_tensor_zero_cuda_ - - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/tools/codegen/model.py b/tools/codegen/model.py index ef6fd162938f..56605b7130db 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -378,8 +378,7 @@ def __post_init__(self) -> None: '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', - '_foreach_addcdiv_.ScalarList', - '_foreach_zero_']: + '_foreach_addcdiv_.ScalarList']: assert len(self.returns) == 1 def is_out_fn(self) -> bool: diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index cb3ec4830aa4..7d413b959415 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -179,7 +179,6 @@ def zero_grad(self, set_to_none: bool = False): (in one case it does the step with a gradient of 0 and in the other it skips the step altogether). """ - per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) for group in self.param_groups: for p in group['params']: if p.grad is not None: @@ -190,15 +189,7 @@ def zero_grad(self, set_to_none: bool = False): p.grad.detach_() else: p.grad.requires_grad_(False) - - if p.grad.is_sparse: - p.grad.zero_() - else: - per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(p.grad) - - for _, per_dtype_grads in per_device_and_dtype_grads.items(): - for grads in per_dtype_grads.values(): - torch._foreach_zero_(grads) + p.grad.zero_() def step(self, closure): r"""Performs a single optimization step (parameter update). 
From cd8ed9328719c762b6f0260ca761b7764d660125 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 28 Oct 2020 07:58:21 -0700 Subject: [PATCH 064/169] [quant][graphmode][fx][api] Remove `inplace` option from prepare_fx (#46954) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46954 Test Plan: Imported from OSS Reviewed By: supriyar Differential Revision: D24579401 fbshipit-source-id: adce623ce819fa220f7bb08d1ff3beaa69850621 --- test/quantization/test_quantize_fx.py | 6 ++---- torch/quantization/fx/quantize.py | 8 +++----- torch/quantization/quantize_fx.py | 21 ++++++++------------- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index a645d8e07262..f2911642cf3a 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -280,13 +280,11 @@ def forward(self, x): model = M().eval() qconfig_dict = {'': default_qconfig} - prepared = prepare_fx( - model, qconfig_dict, inplace=False) + prepared = prepare_fx(model, qconfig_dict) test_only_eval_fn(model, self.img_data_2d) non_inplace_model = convert_fx(prepared, inplace=True) - prepared = prepare_fx( - model, qconfig_dict, inplace=True) + prepared = prepare_fx(model, qconfig_dict) test_only_eval_fn(model, self.img_data_2d) inplace_model = convert_fx(prepared, inplace=True) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 1ef8174af24a..5536f5ef58b0 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -312,7 +312,7 @@ def get_qconfig(module_name): self.modules[node.target].qconfig = module_qconfig self.qconfig_map[node.name] = module_qconfig - def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_standalone_module): + def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, is_standalone_module): """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
@@ -328,8 +328,6 @@ def _prepare(self, model, qconfig_dict, inplace, prepare_custom_config_dict, is_ """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} - if not inplace: - model = copy.deepcopy(model) additional_quant_patterns = prepare_custom_config_dict.get("additional_quant_pattern", {}) self.patterns = get_default_quant_patterns().copy() for k, v in additional_quant_patterns.items(): @@ -542,8 +540,8 @@ def restore_state(self, observed): self.patterns = observed._patterns self.qconfig_map = observed._qconfig_map - def prepare(self, model, qconfig_dict, inplace=False, prepare_custom_config_dict=None, is_standalone_module=False): - return self._prepare(model, qconfig_dict, inplace, prepare_custom_config_dict, is_standalone_module) + def prepare(self, model, qconfig_dict, prepare_custom_config_dict=None, is_standalone_module=False): + return self._prepare(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module) def _run_weight_observers(self, observed): r''' Extract the subgraph that produces the weight for dynamic quant diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 0e52f52a6f33..1794c3ac5a2d 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -50,10 +50,10 @@ def is_leaf_module(self, m, module_qualified_name): type(m) in self.skipped_module_classes -def _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict=None, is_standalone_module=False): +def _prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None, is_standalone_module=False): r""" Internal helper function for prepare_fx Args: - `model`, `qconfig_dict`, `inplace` `prepare_custom_config_dict`: see docs for :func:`~torch.quantization.prepare_fx` + `model`, `qconfig_dict`, `prepare_custom_config_dict`: see docs for :func:`~torch.quantization.prepare_fx` `is_standalone_module`: a boolean flag indicates whether we are quantizing a standalone module or not, a standalone module is a submodule of the parent module that is not inlined in the @@ -85,11 +85,10 @@ def _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict=None, i return quantizer.prepare( graph_module, qconfig_dict, - inplace=True, prepare_custom_config_dict=prepare_custom_config_dict, is_standalone_module=is_standalone_module) -def _prepare_standalone_module_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=None): +def _prepare_standalone_module_fx(model, qconfig_dict, prepare_custom_config_dict=None): r""" [Internal use only] Prepare a standalone module, so that it can be used when quantizing the parent module. 
standalone_module means it a submodule that is not inlined in parent module, @@ -105,7 +104,7 @@ def _prepare_standalone_module_fx(model, qconfig_dict, inplace=False, prepare_cu """ torch._C._log_api_usage_once("quantization_api.quantize_fx._prepare_standalone_module_fx") - return _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict, is_standalone_module=True) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) def fuse_fx(model, fuse_custom_config_dict=None): @@ -132,7 +131,7 @@ def fuse_fx(model, fuse_custom_config_dict=None): graph_module = torch.fx.symbolic_trace(model) return _fuse_fx(graph_module, fuse_custom_config_dict) -def prepare_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=None): +def prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None): r""" Prepare a model for post training static quantization Args: @@ -165,8 +164,6 @@ def prepare_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=No # qconfig == None means fusion and quantization should be skipped for anything # matching the rule } - `inplace`: flag for carry out model transformations in-place, - the original module is mutated `prepare_custom_config_dict`: customization configuration dictionary for quantization tool: prepare_custom_config_dict = { @@ -246,15 +243,13 @@ def calibrate(model, data_loader): torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") assert not model.training, 'prepare_fx only works for models in' + \ 'eval mode' - return _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) -def prepare_qat_fx(model, qconfig_dict, inplace=False, prepare_custom_config_dict=None): +def prepare_qat_fx(model, qconfig_dict, prepare_custom_config_dict=None): r""" Prepare a model for quantization aware training Args: `model`: torch.nn.Module model, must be in train mode `qconfig_dict`: see :func:`~torch.quantization.prepare_fx` - `inplace`: flag for carry out model transformations in-place, - the original module is mutated `prepare_custom_config_dict`: see :func:`~torch.quantization.prepare_fx` Return: @@ -283,7 +278,7 @@ def train_loop(model, train_data): torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") assert model.training, 'prepare_qat_fx only works for models in ' + \ 'train mode' - return _prepare_fx(model, qconfig_dict, inplace, prepare_custom_config_dict) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) def _convert_fx(graph_module, inplace, debug, convert_custom_config_dict=None, is_standalone_module=False): """ `is_standalone_module`: see docs in :func:`~torch.quantization.prepare_standalone_module_fx` From c886c7f6dd04303b0dcfb348aa2a5973f9c3efcf Mon Sep 17 00:00:00 2001 From: frgfm Date: Wed, 28 Oct 2020 08:07:07 -0700 Subject: [PATCH 065/169] fix: Fixed typing of bool in _ConvNd (#46828) Summary: Hello there :wave: I do believe there is some typo in the typing of the `bool` argument of `_ConvNd`constructor. The typing of the attribute is correct, but the constructor argument, while being the same way, is not the value that will be assigned to `self.bias`. This PR simply corrects that. Any feedback is welcome! 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46828 Reviewed By: izdeby Differential Revision: D24550435 Pulled By: ezyang fbshipit-source-id: ab10f1a5b29a912cb23fc321a51e78b04a8391e3 --- torch/nn/modules/conv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 7280eab37caa..20a1d49619b0 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -69,7 +69,7 @@ def __init__(self, transposed: bool, output_padding: _size_1_t, groups: int, - bias: Optional[Tensor], + bias: bool, padding_mode: str) -> None: super(_ConvNd, self).__init__() if in_channels % groups != 0: From 50c9581de1238c8dec9c397faa19a1addf6e657a Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 28 Oct 2020 09:04:08 -0700 Subject: [PATCH 066/169] AT_ERROR if mmap allocation has failed (#46934) Summary: All other system call failures in `THMapAllocator` constructor are considered failures, but this one, for some reason was not Fixes https://github.com/pytorch/pytorch/issues/46651 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46934 Reviewed By: walterddr, seemethere Differential Revision: D24572657 Pulled By: malfet fbshipit-source-id: 0a2b6ce78d5484190536bc4949fc4697d6387ab8 --- aten/src/TH/THAllocator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index 6cdc62ab6da6..20bfd490ed9f 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -288,6 +288,7 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, if (base_ptr_ == MAP_FAILED) { base_ptr_ = nullptr; /* let's be sure it is NULL */ + AT_ERROR("unable to mmap ", size_, " bytes from file <", filename_, ">: ", strerror(errno), " (", errno, ")"); } if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { From 353e7f940f548e0a0cb3b420b4190b4624ae9b41 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 28 Oct 2020 09:22:04 -0700 Subject: [PATCH 067/169] Ensure kernel launches are checked (#46474) Summary: Caffe2 and Torch currently does not have a consistent mechanism for determining if a kernel has launched successfully. The result is difficult-to-detect or silent errors. This diff provides functionality to fix that. Subsequent diffs on the stack fix the identified issues. Kernel launch errors may arise if invalid launch parameters (number of blocks, number of threads, shared memory, or stream id) are specified incorrectly for the hardware or for other reasons. Interestingly, unless these launch errors are specifically checked for CUDA will silently fail and return garbage answers which can affect downstream computation. Therefore, catching launch errors is important. Launches are currently checked by placing ``` AT_CUDA_CHECK(cudaGetLastError()); ``` somewhere below the kernel launch. This is bad for two reasons. 1. The check may be performed at a site distant to the kernel launch, making debugging difficult. 2. The separation of the launch from the check means that it is difficult for humans and static analyzers to determine whether the check has taken place. This diff defines a macro: ``` #define TORCH_CUDA_KERNEL_LAUNCH_CHECK() AT_CUDA_CHECK(cudaGetLastError()) ``` which clearly indicates the check. This diff also introduces a new test which analyzes code to identify kernel launches and determines whether the line immediately following the launch contains `TORCH_CUDA_KERNEL_LAUNCH_CHECK();`. 
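As a rough illustration of what the new checker enforces (the kernel name and arguments below are hypothetical; the helper is the `check_code_for_cuda_kernel_launches` function this diff adds to `torch.testing`):

```python
from torch.testing import check_code_for_cuda_kernel_launches

guarded = "my_kernel<<<blocks, threads>>>(arg);\nTORCH_CUDA_KERNEL_LAUNCH_CHECK();"
unguarded = "my_kernel<<<blocks, threads>>>(arg);"

# A launch immediately followed by the guard passes (0 unsafe launches found)...
assert check_code_for_cuda_kernel_launches(guarded) == 0
# ...while an unguarded launch is counted and reported on stderr.
assert check_code_for_cuda_kernel_launches(unguarded) == 1
```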
A search of the Caffe2 codebase identifies 104 instances of `AT_CUDA_CHECK(cudaGetLastError());` while the foregoing test identifies 1,467 launches which are not paired with a check. Visual inspection indicates that few of these are false positives, highlighting the need for some sort of static analysis system. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46474 Test Plan: The new test is run with: ``` buck test //caffe2/test:kernel_launch_checks -- --print-passing-details ``` And should be launched automatically with the other land tests. (TODO: Is it?) The test is currently set up only to provide warnings but can later be adjusted to require checks. Otherwise, I rely on the existing test frameworks to ensure that changes resulting from reorganizing existing launch checks don't cause regressions. Reviewed By: ngimel Differential Revision: D24309971 Pulled By: r-barnes fbshipit-source-id: 0dc97984a408138ad06ff2bca86ad17ef2fdf0b6 --- .github/workflows/lint.yml | 4 + aten/src/ATen/cuda/Exceptions.h | 5 ++ test/test_kernel_launch_checks.py | 42 +++++++++ torch/testing/__init__.py | 1 + torch/testing/check_kernel_launches.py | 118 +++++++++++++++++++++++++ 5 files changed, 170 insertions(+) create mode 100644 test/test_kernel_launch_checks.py create mode 100644 torch/testing/check_kernel_launches.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 18f7f0a1783f..3e6bb17375b9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -42,6 +42,10 @@ jobs: run: | sudo apt-get install -y doxygen && pip install -r requirements.txt cd docs/cpp/source && ./check-doxygen.sh + - name: CUDA kernel launch check + run: | + set -eux + python torch/testing/check_kernel_launches.py |& tee ${GITHUB_WORKSPACE}/cuda_kernel_launch_checks.txt flake8-py3: runs-on: ubuntu-latest diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 80e39c6bc6bc..b82e04fbe1d6 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -79,6 +79,11 @@ const char *cusparseGetErrorString(cusparseStatus_t status); #define AT_CUDA_CHECK(EXPR) C10_CUDA_CHECK(EXPR) +// This should be used directly after every kernel launch to ensure +// the launch happened correctly and provide an early, close-to-source +// diagnostic if it didn't. 
+#define TORCH_CUDA_KERNEL_LAUNCH_CHECK() AT_CUDA_CHECK(cudaGetLastError()) + // For CUDA Driver API // // This is here instead of in c10 because NVRTC is loaded dynamically via a stub diff --git a/test/test_kernel_launch_checks.py b/test/test_kernel_launch_checks.py new file mode 100644 index 000000000000..8796b9913f73 --- /dev/null +++ b/test/test_kernel_launch_checks.py @@ -0,0 +1,42 @@ +from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing import check_cuda_kernel_launches, check_code_for_cuda_kernel_launches + + +class AlwaysCheckCudaLaunchTest(TestCase): + def test_check_code(self): + """Verifies that the regex works for a few different situations""" + + # Try some different spacings + self.assertEqual(2, check_code_for_cuda_kernel_launches(""" +some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); +TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); + +some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); +TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); +some_other_stuff; +some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); +TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +some_function_call<<<1,2,0,stream>>> (arg1,arg2,arg3); +TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +some_function_call<<<1,2,0,stream>>> ( arg1 , arg2 , arg3 ) ; + + TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + """)) + + # Does it work for macros? + self.assertEqual(0, check_code_for_cuda_kernel_launches(""" +#define SOME_MACRO(x) some_function_call<<<1,2>>> ( x ) ; \\ + TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + """)) + + def test_check_cuda_launches(self): + check_cuda_kernel_launches() + # TODO: Enable this after warning messages have been dealt with. + self.assertTrue(True) + # self.assertTrue(check_cuda_kernel_launches() == 0) + + +if __name__ == '__main__': + run_tests() diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py index 78a6c13a6eeb..80120b019a99 100644 --- a/torch/testing/__init__.py +++ b/torch/testing/__init__.py @@ -6,6 +6,7 @@ import random import math from typing import cast, List, Optional, Tuple, Union +from .check_kernel_launches import check_cuda_kernel_launches, check_code_for_cuda_kernel_launches FileCheck = torch._C.FileCheck diff --git a/torch/testing/check_kernel_launches.py b/torch/testing/check_kernel_launches.py new file mode 100644 index 000000000000..3385fcdf9618 --- /dev/null +++ b/torch/testing/check_kernel_launches.py @@ -0,0 +1,118 @@ +import os +import re +import sys + + +# Regular expression identifies a kernel launch indicator by +# finding something approximating the pattern ">>>(arguments);" +# It then requires that `TORCH_CUDA_KERNEL_LAUNCH_CHECK` be +# the next command. +# It allows a single backslash `\` between the end of the launch +# command and the beginning of the kernel check. This handles +# cases where the kernel launch is in a multiline preprocessor +# definition. +# +# There are various ways this can fail: +# * If the semicolon is in a string for some reason +# * If there's a triply-nested template +# But this should be sufficient to detect and fix most problem +# instances and can be refined before the test is made binding +kernel_launch_regex = re.compile(r""" + >>> # Identifies kernel launch + \s* # Maybe some whitespace (includes newlines) + \([^;]+\); # And then arguments in parens and semi-colon + (?! # Negative lookahead: we trigger if we don't find the launch guard + \s* # Maybe some whitespace (includes newlines) + \\? 
# 0 or 1 backslashes (for launches in preprocessor macros) + (?:[0-9]+: )? # Detects and ignores a line numbering, if present + \s* # Maybe some whitespace (includes newlines) + TORCH_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! + ) # End negative lookahead +""", flags=re.MULTILINE | re.VERBOSE) + + +def check_code_for_cuda_kernel_launches(code, filename=None): + """Checks code for CUDA kernel launches without cuda error checks. + + Args: + filename - Filename of file containing the code. Used only for display + purposes, so you can put anything here. + code - The code to check + + Returns: + The number of unsafe kernel launches in the code + """ + if filename is None: + filename = "##Python Function Call##" + + # We break the code apart and put it back together to add + # helpful line numberings for identifying problem areas + code = enumerate(code.split("\n")) # Split by line breaks + code = [f"{lineno}: {linecode}" for lineno, linecode in code] # Number the lines + code = '\n'.join(code) # Put it back together + + results = kernel_launch_regex.findall(code) # Search for bad launches + for r in results: + print(f"Missing TORCH_CUDA_KERNEL_LAUNCH_CHECK in '{filename}'. Context:\n{r}", file=sys.stderr) + return len(results) + + +def check_file(filename): + """Checks a file for CUDA kernel launches without cuda error checks + + Args: + filename - File to check + + Returns: + The number of unsafe kernel launches in the file + """ + if not (filename.endswith(".cu") or filename.endswith(".cuh")): + return 0 + contents = open(filename, "r").read() + return check_code_for_cuda_kernel_launches(contents, filename) + + +def check_cuda_kernel_launches(): + """Checks all pytorch code for CUDA kernel launches without cuda error checks + + Returns: + The number of unsafe kernel launches in the codebase + """ + torch_dir = os.path.dirname(os.path.realpath(__file__)) + torch_dir = os.path.dirname(torch_dir) # Go up to parent torch + torch_dir = os.path.dirname(torch_dir) # Go up to parent caffe2 + + kernels_without_checks = 0 + files_without_checks = [] + for root, dirnames, filenames in os.walk(torch_dir): + # `$BASE/build` and `$BASE/torch/include` are generated + # so we don't want to flag their contents + if root == os.path.join(torch_dir, "build") or root == os.path.join(torch_dir, "torch/include"): + # Curtail search by modifying dirnames and filenames in place + # Yes, this is the way to do this, see `help(os.walk)` + dirnames[:] = [] + continue + + for x in filenames: + filename = os.path.join(root, x) + file_result = check_file(filename) + if file_result > 0: + kernels_without_checks += file_result + files_without_checks.append(filename) + + if kernels_without_checks > 0: + count_str = f"Found {kernels_without_checks} instances in " \ + f"{len(files_without_checks)} files where kernel " \ + "launches didn't have checks." 
+ print(count_str, file=sys.stderr) + print("Files without checks:", file=sys.stderr) + for x in files_without_checks: + print(f"\t{x}", file=sys.stderr) + print(count_str, file=sys.stderr) + + return kernels_without_checks + + +if __name__ == "__main__": + unsafe_launches = check_cuda_kernel_launches() + sys.exit(0) From b75b961934f8daa03c877a380487334e873b1950 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Wed, 28 Oct 2020 09:31:02 -0700 Subject: [PATCH 068/169] Fix `requires_grad` arg for `new_full`, `new_empty`, `new_zeros` (#46486) Summary: Fixes https://github.com/pytorch/pytorch/issues/36455 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46486 Reviewed By: gchanan Differential Revision: D24497034 Pulled By: ezyang fbshipit-source-id: 769a7f00f9a8f7cb77273a1193173a837ae7e32f --- test/test_torch.py | 19 +++++++++++++++++++ tools/autograd/gen_python_functions.py | 9 ++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/test/test_torch.py b/test/test_torch.py index 84a540ae3937..f1b0b23b1518 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3468,6 +3468,25 @@ def test_pin_memory(self): self.assertIs(pinned, pinned.pin_memory()) self.assertEqual(pinned.data_ptr(), pinned.pin_memory().data_ptr()) + def test_new_methods_requires_grad(self): + size = (10,) + test_cases = [ + # method name, args + ('new_full', [size, 1]), + ('new_empty', [size]), + ('new_zeros', [size]), + ] + for method_name, args in test_cases: + x = torch.randn(size) + for requires_grad in [True, False]: + x_new = x.__getattribute__(method_name)(*args, requires_grad=requires_grad) + self.assertEqual(x_new.requires_grad, requires_grad) + x = torch.randint(10, size) + with self.assertRaisesRegex( + RuntimeError, + r'Only Tensors of floating point and complex dtype can require gradients'): + x_new = x.__getattribute__(method_name)(*args, requires_grad=True) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_unresizable(self) -> None: x = np.zeros((2, 2)) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index c82b8b298704..2e822e68a998 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -922,8 +922,15 @@ def go(f: NativeFunction) -> str: lambda_args = ', '.join(lambda_arg_exprs.exprs) # scatter fields + # TODO: Checking `ps.method and ('requires_grad' in parser_outputs)` is a hacky + # solution for enabling the 'requires_grad' argument for tensor methods + # new_full, new_empty, and new_zeros. A much better but more difficult to + # implement solution involves refactoring according to Ed's description here: + # https://github.com/pytorch/pytorch/issues/36455#issuecomment-614767589 + need_set_requires_grad = ps.tensor_options_args and (not has_tensor_options(f) or ( + ps.method and ('requires_grad' in parser_outputs))) set_requires_grad = f'.set_requires_grad({parser_outputs["requires_grad"].expr})' \ - if ps.tensor_options_args and not has_tensor_options(f) else '' + if need_set_requires_grad else '' auto_no_gil = '' if decl['with_gil'] else 'pybind11::gil_scoped_release no_gil;' From c6858fd71a117d47873a2134bf1c8c1d9bb09ed3 Mon Sep 17 00:00:00 2001 From: Sheng Qin Date: Wed, 28 Oct 2020 09:38:05 -0700 Subject: [PATCH 069/169] Set up benchmarks for ClipRanges operator for Caffe2 and PyTorch Summary: As title, adding the benchmark tests for ClipRanges operators. 
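For reference, each row of `attrs` in the new `op_bench.config_list` call pairs positionally with `attr_names` and becomes one benchmark case; the configuration names in the output below (e.g. `clip_ranges_LENGTH6_M1_N2_MAX_LENGTH1_dtypeint32`) are derived from those attribute names and values. A minimal sketch, trimmed from the full definitions in the diffs further down:

```
import operator_benchmark as op_bench

# Two of the five short cases; attrs rows map positionally onto attr_names,
# e.g. the first row is LENGTH=6, M=1, N=2, MAX_LENGTH=1, dtype="int32".
clip_ranges_short_configs = op_bench.config_list(
    attrs=[
        [6, 1, 2, 1, "int32"],
        [10, 1, 2, 5, "int32"],
    ],
    attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
    tags=["short"],
)
```
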
Test Plan: benchmark test for Caffe2 ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking Caffe2: clip_ranges WARNING: Logging before InitGoogleLogging() is written to STDERR W1026 12:30:33.938997 2658759 init.h:137] Caffe2 GlobalInit should be run before any other API calls. # Name: clip_ranges_LENGTH6_M1_N2_MAX_LENGTH1_dtypeint32 # Input: LENGTH: 6, M: 1, N: 2, MAX_LENGTH: 1, dtype: int32 Forward Execution Time (us) : 5.805 # Benchmarking Caffe2: clip_ranges # Name: clip_ranges_LENGTH7_M1_N2_MAX_LENGTH2_dtypeint32 # Input: LENGTH: 7, M: 1, N: 2, MAX_LENGTH: 2, dtype: int32 Forward Execution Time (us) : 5.913 # Benchmarking Caffe2: clip_ranges # Name: clip_ranges_LENGTH8_M1_N2_MAX_LENGTH3_dtypeint32 # Input: LENGTH: 8, M: 1, N: 2, MAX_LENGTH: 3, dtype: int32 Forward Execution Time (us) : 5.941 # Benchmarking Caffe2: clip_ranges # Name: clip_ranges_LENGTH9_M1_N2_MAX_LENGTH4_dtypeint32 # Input: LENGTH: 9, M: 1, N: 2, MAX_LENGTH: 4, dtype: int32 Forward Execution Time (us) : 5.868 # Benchmarking Caffe2: clip_ranges # Name: clip_ranges_LENGTH10_M1_N2_MAX_LENGTH5_dtypeint32 # Input: LENGTH: 10, M: 1, N: 2, MAX_LENGTH: 5, dtype: int32 Forward Execution Time (us) : 6.408 ``` benchmark test for PyTorch ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH6_M1_N2_MAX_LENGTH1_dtypetorch.int32_cpu # Input: LENGTH: 6, M: 1, N: 2, MAX_LENGTH: 1, dtype: torch.int32, device: cpu Forward Execution Time (us) : 443.012 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH7_M1_N2_MAX_LENGTH2_dtypetorch.int32_cpu # Input: LENGTH: 7, M: 1, N: 2, MAX_LENGTH: 2, dtype: torch.int32, device: cpu Forward Execution Time (us) : 446.480 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH8_M1_N2_MAX_LENGTH3_dtypetorch.int32_cpu # Input: LENGTH: 8, M: 1, N: 2, MAX_LENGTH: 3, dtype: torch.int32, device: cpu Forward Execution Time (us) : 444.064 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH9_M1_N2_MAX_LENGTH4_dtypetorch.int32_cpu # Input: LENGTH: 9, M: 1, N: 2, MAX_LENGTH: 4, dtype: torch.int32, device: cpu Forward Execution Time (us) : 445.511 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH10_M1_N2_MAX_LENGTH5_dtypetorch.int32_cpu # Input: LENGTH: 10, M: 1, N: 2, MAX_LENGTH: 5, dtype: torch.int32, device: cpu Forward Execution Time (us) : 450.468 ``` Reviewed By: MarcioPorto Differential Revision: D24500468 fbshipit-source-id: a582090a3982005af272cb10cdd257b2b2e787c4 --- .../operator_benchmark/c2/clip_ranges_test.py | 51 ++++++++++++++++++ .../operator_benchmark/pt/clip_ranges_test.py | 53 +++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 benchmarks/operator_benchmark/c2/clip_ranges_test.py create mode 100644 benchmarks/operator_benchmark/pt/clip_ranges_test.py diff --git a/benchmarks/operator_benchmark/c2/clip_ranges_test.py b/benchmarks/operator_benchmark/c2/clip_ranges_test.py new file mode 100644 index 000000000000..2bb32f062445 --- /dev/null +++ b/benchmarks/operator_benchmark/c2/clip_ranges_test.py @@ -0,0 +1,51 @@ +import benchmark_caffe2 as op_bench_c2 +import operator_benchmark as op_bench +from benchmark_caffe2 import Caffe2BenchmarkBase # noqa +from caffe2.python import core, dyndep + 
+dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op") + +"""Microbenchmarks for ClipRanges operator.""" + +# Configs for C2 ClipRanges operator +clip_ranges_long_configs = op_bench.cross_product_configs( + LENGTH=range(1, 100), + M=[1], + N=[2], + MAX_LENGTH=range(1, 100), + dtype=["int32"], + tags=["long"] +) + + +clip_ranges_short_configs = op_bench.config_list( + attrs=[ + [6, 1, 2, 1, "int32"], + [7, 1, 2, 2, "int32"], + [8, 1, 2, 3, "int32"], + [9, 1, 2, 4, "int32"], + [10, 1, 2, 5, "int32"], + ], + attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"], + tags=["short"], +) + + +class ClipRangesBenchmark(op_bench_c2.Caffe2BenchmarkBase): + def init(self, LENGTH, M, N, MAX_LENGTH, dtype): + self.input = self.tensor([LENGTH, M, N], dtype) + self.max_length = MAX_LENGTH + self.set_module_name("clip_ranges") + + def forward(self): + op = core.CreateOperator("ClipRanges", self.input, self.input, max_length=self.max_length) + return op + + +op_bench_c2.generate_c2_test( + clip_ranges_long_configs + clip_ranges_short_configs, ClipRangesBenchmark +) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/clip_ranges_test.py b/benchmarks/operator_benchmark/pt/clip_ranges_test.py new file mode 100644 index 000000000000..dccbf9250809 --- /dev/null +++ b/benchmarks/operator_benchmark/pt/clip_ranges_test.py @@ -0,0 +1,53 @@ +import operator_benchmark as op_bench +import torch + + +"""Microbenchmarks for ClipRanges operator.""" + +# Configs for C2 ClipRanges operator +clip_ranges_long_configs = op_bench.cross_product_configs( + LENGTH=range(1, 100), + M=[1], + N=[2], + MAX_LENGTH=range(1, 100), + device=['cpu', 'cuda'], + dtype=[torch.int32], + tags=["long"], +) + + +clip_ranges_short_configs = op_bench.config_list( + attrs=[ + [6, 1, 2, 1, torch.int32], + [7, 1, 2, 2, torch.int32], + [8, 1, 2, 3, torch.int32], + [9, 1, 2, 4, torch.int32], + [10, 1, 2, 5, torch.int32], + ], + attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["short"], +) + + +class ClipRangesBenchmark(op_bench.TorchBenchmarkBase): + def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype): + self.input = torch.rand(LENGTH, M, N, device=device).type(dtype) + self.max_length = MAX_LENGTH + self.set_module_name("clip_ranges") + + def forward(self): + output = self.input.clone() + output[:, :, 1].clamp_(0, self.max_length) + return output + + +op_bench.generate_pt_test( + clip_ranges_long_configs + clip_ranges_short_configs, ClipRangesBenchmark +) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() From c9222b7471a5452704c63beba30426e478c6b10e Mon Sep 17 00:00:00 2001 From: Sheng Qin Date: Wed, 28 Oct 2020 09:38:05 -0700 Subject: [PATCH 070/169] Implement clip_ranges operator for PyTorch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test Plan: unit test for correctness ``` buck test caffe2/torch/fb/sparsenn:test -- test_clip_ranges Parsing buck files: finished in 1.6 sec Creating action graph: finished in 18.9 sec Building: finished in 15.0 sec (100%) 9442/9442 jobs, 1 updated Total time: 35.6 sec More details at https://www.internalfb.com/intern/buck/build/66fb17de-859e-4d01-89bf-5c5de2950693 Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details. 
Running with tpx session id: 80f5e0c2-7db2-48a4-b148-25dd34651682 Trace available for this run at /tmp/tpx-20201026-123217.050766/trace.log Started reporting to test run: https://our.intern.facebook.com/intern/testinfra/testrun/4503599665041422 ✓ ListingSuccess: caffe2/torch/fb/sparsenn:test - main (14.912) ✓ Pass: caffe2/torch/fb/sparsenn:test - test_clip_ranges (caffe2.torch.fb.sparsenn.tests.sparsenn_operators_test.SparseNNOperatorsTest) (14.098) Summary Pass: 1 ListingSuccess: 1 Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/4503599665041422 ``` new benchmark perf test ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH6_M1_N2_MAX_LENGTH1_dtypetorch.int32_cpu # Input: LENGTH: 6, M: 1, N: 2, MAX_LENGTH: 1, dtype: torch.int32, device: cpu Forward Execution Time (us) : 155.765 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH7_M1_N2_MAX_LENGTH2_dtypetorch.int32_cpu # Input: LENGTH: 7, M: 1, N: 2, MAX_LENGTH: 2, dtype: torch.int32, device: cpu Forward Execution Time (us) : 156.248 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH8_M1_N2_MAX_LENGTH3_dtypetorch.int32_cpu # Input: LENGTH: 8, M: 1, N: 2, MAX_LENGTH: 3, dtype: torch.int32, device: cpu Forward Execution Time (us) : 156.634 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH9_M1_N2_MAX_LENGTH4_dtypetorch.int32_cpu # Input: LENGTH: 9, M: 1, N: 2, MAX_LENGTH: 4, dtype: torch.int32, device: cpu Forward Execution Time (us) : 155.408 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH10_M1_N2_MAX_LENGTH5_dtypetorch.int32_cpu # Input: LENGTH: 10, M: 1, N: 2, MAX_LENGTH: 5, dtype: torch.int32, device: cpu Forward Execution Time (us) : 165.168 ``` Compare with the old implementation, there are **around 300us gain** ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH6_M1_N2_MAX_LENGTH1_dtypetorch.int32_cpu # Input: LENGTH: 6, M: 1, N: 2, MAX_LENGTH: 1, dtype: torch.int32, device: cpu Forward Execution Time (us) : 443.012 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH7_M1_N2_MAX_LENGTH2_dtypetorch.int32_cpu # Input: LENGTH: 7, M: 1, N: 2, MAX_LENGTH: 2, dtype: torch.int32, device: cpu Forward Execution Time (us) : 446.480 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH8_M1_N2_MAX_LENGTH3_dtypetorch.int32_cpu # Input: LENGTH: 8, M: 1, N: 2, MAX_LENGTH: 3, dtype: torch.int32, device: cpu Forward Execution Time (us) : 444.064 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH9_M1_N2_MAX_LENGTH4_dtypetorch.int32_cpu # Input: LENGTH: 9, M: 1, N: 2, MAX_LENGTH: 4, dtype: torch.int32, device: cpu Forward Execution Time (us) : 445.511 # Benchmarking PyTorch: clip_ranges # Mode: JIT # Name: clip_ranges_LENGTH10_M1_N2_MAX_LENGTH5_dtypetorch.int32_cpu # Input: LENGTH: 10, M: 1, N: 2, MAX_LENGTH: 5, dtype: torch.int32, device: cpu Forward Execution Time (us) : 450.468 ``` Reviewed By: MarcioPorto Differential Revision: D24546110 fbshipit-source-id: e6c9b38e911f177f97961ede5bf375107f240363 --- benchmarks/operator_benchmark/pt/clip_ranges_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/benchmarks/operator_benchmark/pt/clip_ranges_test.py b/benchmarks/operator_benchmark/pt/clip_ranges_test.py index dccbf9250809..d2c0d575647b 100644 --- a/benchmarks/operator_benchmark/pt/clip_ranges_test.py +++ b/benchmarks/operator_benchmark/pt/clip_ranges_test.py @@ -3,6 +3,7 @@ """Microbenchmarks for ClipRanges operator.""" +torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators") # Configs for C2 ClipRanges operator clip_ranges_long_configs = op_bench.cross_product_configs( @@ -39,8 +40,7 @@ def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype): self.set_module_name("clip_ranges") def forward(self): - output = self.input.clone() - output[:, :, 1].clamp_(0, self.max_length) + output = torch.ops.fb.clip_ranges(self.input, self.max_length) return output From c3fc17b48edfdb0c09d9da24cb3bf01fe1c5f1d2 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 28 Oct 2020 09:53:55 -0700 Subject: [PATCH 071/169] Fix bit math (#46837) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46837 Formerly `static_cast(bits)` and `static_cast(bits)` were and-ed against `ull` types resulting in an integer promotion which later raised a warning in downcasting passes to `Stream` and `Device`. Moving the `&` operation inside the cast results in two `uint64_t` being operated on and then cast to the correct type, eliminating the warning. Test Plan: Standard pre-commit test rig. Reviewed By: malfet Differential Revision: D24481292 fbshipit-source-id: a8bcbde631054c26ca8c98fbed275254dd359dd0 --- c10/core/Stream.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/c10/core/Stream.h b/c10/core/Stream.h index 6962be72bf72..5baac5325af7 100644 --- a/c10/core/Stream.h +++ b/c10/core/Stream.h @@ -124,11 +124,11 @@ class Stream final { } static Stream unpack(uint64_t bits) { - auto stream_id = static_cast(bits) & 0xFFFFFFFFull; + const auto stream_id = static_cast(bits & 0xFFFFFFFFull); bits >>= 32; - auto device_index = static_cast(bits) & 0xFFFFull; + const auto device_index = static_cast(bits & 0xFFFFull); bits >>= 16; - auto device_type = static_cast(bits); + const auto device_type = static_cast(bits); TORCH_CHECK(isValidDeviceType(device_type)); // Unfortunately, we can't check if the StreamId is valid here; it // will be checked upon first use. From cbf90dafe1cdb4f3f87b7a0804b601dc41bceb5b Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 28 Oct 2020 10:04:09 -0700 Subject: [PATCH 072/169] Fix CPUCaching allocator guard bug (#46922) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46922 Earlier bug wrongly captures the previous value to be saved. 
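For illustration, a minimal standalone sketch of the intended guard semantics (the type and variable names here are stand-ins, not the actual c10 API): the previously installed thread-local allocator has to be read before the new one is written, otherwise the destructor restores the wrong pointer.

```
struct CachingAllocator {};  // stand-in for the real caching allocator type

thread_local CachingAllocator* tls_caching_allocator = nullptr;

struct WithCachingAllocatorGuard {
  explicit WithCachingAllocatorGuard(CachingAllocator* allocator) {
    prev_ = tls_caching_allocator;      // read the old value first; reversing these two lines is the bug
    tls_caching_allocator = allocator;  // then install the new allocator
  }
  ~WithCachingAllocatorGuard() {
    tls_caching_allocator = prev_;      // restore whatever was active before the guard
  }
  CachingAllocator* prev_;
};

int main() {
  CachingAllocator a;
  {
    WithCachingAllocatorGuard guard(&a);  // thread-local now points at `a`
  }                                       // guard destroyed: thread-local restored to nullptr
  return 0;
}
```
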
Test Plan: cpu_caching_allocator_test Reviewed By: dreiss Differential Revision: D24566514 fbshipit-source-id: 734a4c1f810bbec16fe007f31fffa360898955ac --- c10/mobile/CPUCachingAllocator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c10/mobile/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp index b2f193299089..bde4067d45dc 100644 --- a/c10/mobile/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -95,8 +95,8 @@ CPUCachingAllocator* GetThreadLocalCachingAllocator() { WithCPUCachingAllocatorGuard::WithCPUCachingAllocatorGuard( CPUCachingAllocator* allocator) { - caching_allocator_ptr = allocator; prev_caching_allocator_ptr_ = GetThreadLocalCachingAllocator(); + caching_allocator_ptr = allocator; } WithCPUCachingAllocatorGuard::~WithCPUCachingAllocatorGuard() { From 069232a57412eedeeefbd64bab3e31a6b91c8927 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 28 Oct 2020 10:20:04 -0700 Subject: [PATCH 073/169] [FX] Fix corner case in name sanitization (#46958) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46958 Test Plan: Imported from OSS Reviewed By: dzhulgakov Differential Revision: D24580474 Pulled By: jamesr66a fbshipit-source-id: 2f8d252998c72e1e79d6a5f7766c2d51a271cc83 --- test/test_fx.py | 5 +++++ torch/fx/graph.py | 11 ++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index f470cf417958..4945ea857ede 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -814,6 +814,11 @@ def forward(self, x, w): x, w = torch.rand(3, 4), torch.rand(4, 4) self.assertTrue(any(n.target == torch.relu for n in traced.graph.nodes)) + def test_sequential(self): + m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)) + gm = torch.fx.symbolic_trace(m) + gm_copy = copy.deepcopy(gm) + def test_ctx_mgr(self): @contextlib.contextmanager def do_nothing(): diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 46215ca9357b..a9a7e5dc80c0 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -271,11 +271,12 @@ def node_copy(self, node: Node, arg_transform: Callable[[Node], Argument] = lamb sanitized_name = node.name if '_' in node.name: base, maybe_idx = node.name.rsplit('_', 1) - try: - int(maybe_idx) - sanitized_name = base - except ValueError: - pass + if base != '': + try: + int(maybe_idx) + sanitized_name = base + except ValueError: + pass name = self._name(sanitized_name) return self.create_node(node.op, node.target, args, kwargs, name, node.type) From 61ee0242c08fb16bf368ce22d840acd586d79ced Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Wed, 28 Oct 2020 10:29:34 -0700 Subject: [PATCH 074/169] Fix backcompat in master following revert (#46984) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46984 Reviewed By: mrshenli Differential Revision: D24592404 Pulled By: albanD fbshipit-source-id: d317d934b650f1ac0f91e51ef5cbc14e886aa3fe --- test/backward_compatibility/check_backward_compatibility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 5beee3d1c683..9f1f6cc99fe5 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -128,6 +128,7 @@ ("aten::_foreach_addcdiv_", datetime.date(2020, 10, 15)), ("aten::_foreach_addcdiv", datetime.date(2020, 10, 15)), ("aten::_foreach_addcmul", datetime.date(2020, 10, 15)), + ("aten::_foreach_zero_", 
datetime.date(2020, 11, 5)), ("aten::conj", datetime.date(2020, 11, 10)), ("aten::add_relu", datetime.date(2020, 10, 28)), ("aten::add_relu_", datetime.date(2020, 10, 28)), From 98b3da8b13a7795bc9f0a180b08e671c7b00745c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 28 Oct 2020 10:49:40 -0700 Subject: [PATCH 075/169] Revert D24452660: [pytorch][PR] Add CUDA 11.1 CI Test Plan: revert-hammer Differential Revision: D24452660 (https://github.com/pytorch/pytorch/commit/1479ed91be56a10e0014c8b1646fbcbf8d741dd9) Original commit changeset: 3480a2533214 fbshipit-source-id: 1e720c5d6fe1a377f6decd3ecc4f412c53fb293c --- .circleci/cimodel/data/pytorch_build_data.py | 2 +- .circleci/config.yml | 24 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 12a40a17bed3..5156e69b77af 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -57,7 +57,7 @@ ]), ]), ]), - ("11.1", [ + ("11.0", [ ("3.8", [ X(True), ("libtorch", [ diff --git a/.circleci/config.yml b/.circleci/config.yml index 38ec1ff25a3a..2ad408cf6b37 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6806,37 +6806,37 @@ workflows: build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" - pytorch_linux_build: - name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build + name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build requires: - - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + - "docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" filters: branches: only: - master - /ci-all\/.*/ - /release\/.*/ - build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" - pytorch_linux_test: - name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test + name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test requires: - - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build + - pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build filters: branches: only: - master - /ci-all\/.*/ - /release\/.*/ - build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" use_cuda_docker_runtime: "1" resource_class: gpu.medium - pytorch_linux_build: - name: pytorch_libtorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build + name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build requires: - - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - build_environment: "pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + - "docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" + build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build" + 
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7" - pytorch_linux_build: name: pytorch_linux_bionic_py3_6_clang9_build requires: From 179d2b288c4398d4bd1d9187b36ae978bc308cc9 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 28 Oct 2020 11:19:59 -0700 Subject: [PATCH 076/169] Fix interval midpoint calculation in vulkan (#46839) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46839 Interval midpoint calculations can overflow (integers). This fixes such an instance. Test Plan: Standard test rig. Reviewed By: drdarshan Differential Revision: D24392545 fbshipit-source-id: 84c81802165bb8084e2d54c9f3755f39143a5b00 --- aten/src/ATen/native/vulkan/api/vk_mem_alloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h b/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h index fdeadf9cdbfa..b468a1c05c6d 100644 --- a/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h +++ b/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h @@ -4594,7 +4594,7 @@ static IterT VmaBinaryFindFirstNotLess(IterT beg, IterT end, const KeyT &key, co size_t down = 0, up = (end - beg); while(down < up) { - const size_t mid = (down + up) / 2; + const size_t mid = down + (up - down) / 2; //Overflow-safe midpoint calculation if(cmp(*(beg+mid), key)) { down = mid + 1; From 23bce17baa7f6cfcf299aa692a610bfc60f1d640 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Wed, 28 Oct 2020 11:22:04 -0700 Subject: [PATCH 077/169] Add inputsSize to Python IR, like outputsSize (#46779) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46779 Test Plan: Used it in some notebooks. Reviewed By: suo Differential Revision: D24574005 Pulled By: dreiss fbshipit-source-id: 78ba7a2bdb859fef5633212b73c7a3eb2cfbc380 --- torch/csrc/jit/python/python_ir.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index fc968237e4ba..20d0e6272a19 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -493,6 +493,7 @@ void initPythonIRBindings(PyObject* module_) { }) .def("sourceRange", [](Node& n) { return n.sourceRange().str(); }) .def("hasMultipleOutputs", [](Node& n) { return n.outputs().size() > 1; }) + .def("inputsSize", [](Node& n) { return n.inputs().size(); }) .def("outputsSize", [](Node& n) { return n.outputs().size(); }) .NS(kind) .def("inputsAt", [](Node& n, size_t i) { return n.inputs().at(i); }) From bf08814b73a948c38b4d715e43ce9318f3579385 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 28 Oct 2020 11:49:17 -0700 Subject: [PATCH 078/169] [FX] Kill functional transforms name (#47004) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47004 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D24597581 Pulled By: jamesr66a fbshipit-source-id: 9213d58f4a53ea55e97e6ca0572fdcf5e271bdc3 --- torch/fx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 792a905432a5..c7fbd6fbf0ea 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -2,7 +2,7 @@ r''' **This feature is experimental and its stability is not currently guaranteed. Proceed at your own risk** -FX (Functional Transformations) is a toolkit for capturing and transforming functional PyTorch programs. It +FX is a toolkit for capturing and transforming functional PyTorch programs. 
It consists of GraphModule and a corresponding intermediate representation (IR). When GraphModule is constructed with an `nn.Module` instance as its argument, GraphModule will trace through the computation of that Module's `forward` method symbolically and record those operations in the FX intermediate representation. From ec600bc391320b94f4d000e547ae7ac98ded8b52 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Wed, 28 Oct 2020 12:06:05 -0700 Subject: [PATCH 079/169] Add Vulkan tensor copy. (#46481) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46481 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D24379143 Pulled By: AshkanAliabadi fbshipit-source-id: cf492c5c632bf193c8aff0169d17bbf962e019e1 --- aten/src/ATen/native/Copy.cpp | 5 + aten/src/ATen/native/vulkan/ops/Copy.cpp | 149 +++++++++++++++++++++++ aten/src/ATen/native/vulkan/ops/Copy.h | 19 +++ aten/src/ATen/test/vulkan_api_test.cpp | 32 ++++- 4 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/ops/Copy.cpp create mode 100644 aten/src/ATen/native/vulkan/ops/Copy.h diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 0430de87eb77..360069998f19 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -131,7 +132,11 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } if (self.device().type() == at::kVulkan || src.device().type() == at::kVulkan) { + #ifdef USE_VULKAN_API + return vulkan::ops::copy_(self, src); + #else return at::vulkan::vulkan_copy_(self, src); + #endif } if (self.device().type() == at::kMetal || src.device().type() == at::kMetal) { diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp new file mode 100644 index 000000000000..2f74d1be00ab --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp @@ -0,0 +1,149 @@ +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { + +Tensor& copy_(Tensor& self, const Tensor& src) { + // X -> Vulkan + if (at::kVulkan == self.device().type()) { + vTensor& v_self = convert(self); + + // CPU -> Vulkan + if (at::kCPU == src.device().type()) { + // Requesting write-only host access to the tensor never triggers a sync + // as the contents will be overwritten regardless. Having said that, + // appropriate barriers are inserted automatically if WAR or WAW hazards + // are detected. Examples of such scenario for instance are if any of + // these async operations are on going in the background on 'self': + // - On discrete systems: + // * buffer-to-staging transfers + // * staging-to-buffer transfers + // - On UMA buffer is an alias for staging and accessible both on host + // and device. Consequently: + // * buffer-to-image NHWC -> NC4HW packing + // * image-to-buffer NC4HW -> NHWC unpacking + + using Future = vTensor::Future; + Future v_self_future = v_self.host(); + + // This wait() will be a no-op if no hazards are detected, including the + // obvious, yet important, special case of 'self' being an empty tensor. 
+ + Future::Payload v_self_payload = v_self_future.wait(); + + memcpy( + v_self_payload.get(), + src.contiguous().data_ptr(), + std::min(src.nbytes(), self.nbytes())); + } + // Vulkan -> Vulkan + else if (at::kVulkan == src.device().type()) { + api::Command::Buffer command_buffer = api::context()->command().pool.allocate(); + command_buffer.begin(); + + command_buffer.copy( + // - Read-only access is implied on const tensors. Memory barriers + // are automatically inserted if a RAW hazard is detected. + // - Recording any potential pending sync operations into the same + // command buffer prevents an expensive queue submission. + convert(src).buffer(command_buffer), + // - Write-only access never triggers a sync as the contents will be + // overwritten regardless. Having said that, appropriate barriers + // are inserted automatically if WAR or WAW hazards are detected. + // - Recording pending sync operations into the same command buffer + // prevents an expensive queue submission. + v_self.buffer(command_buffer, vTensor::Access::Write)); + + command_buffer.end(); + command_buffer.submit(api::context()->gpu().queue); + } + else { + TORCH_INTERNAL_ASSERT(false, "Unsupported!"); + } + } + // Vulkan -> X + else if (at::kVulkan == src.device().type()) { + const vTensor& v_src = convert(src); + + { + // Similar notes as above applies, with the additional consideration of + // potential syncs on read accesses. Namely, + // - on discrete systems, if the (staging, buffer, image) trio, or + // - on UMA, if the (buffer, image) duo + // have gone out of sync as a result of one processor writing to one + // resource which is then either accessed as an another resource type on + // the same or another processor. Same considerations regarding hazard + // avoidance as above applies. + + using Future = vTensor::Future; + const Future v_src_future = v_src.host(); + + // Vulkan -> CPU + if (at::kCPU == self.device().type()) { + // This wait() is a no-op if data is not out of sync. More often than + // not though, waits here are expected as the GPU catches up with + // compute submitted from CPU. + + const Future::Payload v_src_payload = v_src_future.wait(); + + memcpy( + self.data_ptr(), + v_src_payload.get(), + std::min(src.nbytes(), self.nbytes())); + } + else { + TORCH_INTERNAL_ASSERT(false, "Unsupported!"); + } + } + + // + // WARNING + // + + // This is not great. We almost never want to flush the GPU pipeline as + // that has far reaching consequences, especially if PyTorch is not the only + // process accessing the GPU. If we have done our job properly, above + // synchronization mechanisms should be enough to ensure correctness at a more + // modest cost, as there is no need to flush the entirety of jobs in flight + // if one is only interested on waiting on computation affecting one single + // tensor to finish. + // + // Having said that, we still do need to release all pool resources at one + // point per inference run or we will run out of memory otherwise. There is + // no perfect answer to this problem that checks all boxes, which leaves us + // with one of several design decisions: + // + // 1) Use graph mode to gain an understanding of the computation graph, + // itself allowing us to place pool purges intelligently. Best option + // for performance and memory consumption. Not without its downsides if + // flexibility is a top priority. + // 2) If on eager mode, and hence are seeing operations one at a time, expose + // this release of resources to the user as a Python / C++ function. 
This + // makes for suboptimal user experience but is efficient in terms of + // performance. + // 3) If on eager mode, and interested in keeping this bookkeeping transparent + // to the user, release all resources somewhere ... like here. This is + // not ideal since it requires a pipeline flush to make sure these objects + // are not already in use by a workload in flight. Cannot do much better + // within the constraints of this approach. Good for user experience, + // suboptimal for performance. + // 4) If on eager mode, and interested in keeping this bookkeeping transparent + // to the user, and performance does not matter, make CPU and GPU run in + // lockstep. Obviously this is just bad. Mentioned for the sake of + // completeness. + + api::context()->flush(); + } + else { + TORCH_INTERNAL_ASSERT(false, "Unsupported!"); + } + + return self; +} + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Copy.h b/aten/src/ATen/native/vulkan/ops/Copy.h new file mode 100644 index 000000000000..e69af06357c5 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Copy.h @@ -0,0 +1,19 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { + +Tensor& copy_(Tensor& self, const Tensor& src); + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index c7ac15544a99..88b9cda507af 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -1,11 +1,39 @@ -#include +#ifdef USE_VULKAN_API +#include #include -#ifdef USE_VULKAN_API +// TODO: These functions should move to a common place. + +namespace { + +bool checkRtol(const at::Tensor& diff, const std::vector& inputs) { + float maxValue = 0.0f; + + for (const auto& tensor : inputs) { + maxValue = fmax(tensor.abs().max().item(), maxValue); + } + + return diff.abs().max().item() < (2e-6 * maxValue); +} + +bool almostEqual(const at::Tensor& a, const at::Tensor& b) { + return checkRtol(a - b, {a, b}); +} + +bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) { + return (a - b).abs().max().item() == 0.0f; +} + +} // namespace namespace { +TEST(VulkanAPITest, copy) { + const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat)); + ASSERT_TRUE(exactlyEqual(cpu, cpu.vulkan().cpu())); +} + TEST(VulkanAPITest, empty) { ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat))); } From 14d87ec5a345891217dc77a88a5ec29272909383 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Wed, 28 Oct 2020 12:06:05 -0700 Subject: [PATCH 080/169] Add Vulkan op Add. 
(#44017) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44017 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820826 Pulled By: AshkanAliabadi fbshipit-source-id: 47db435894696f4eb4277370d4d317d2df9e3b98 --- aten/src/ATen/native/vulkan/VulkanOps.cpp | 5 +- aten/src/ATen/native/vulkan/glsl/add.glsl | 31 ++- aten/src/ATen/native/vulkan/glsl/add_.glsl | 27 ++ .../ATen/native/vulkan/glsl/add_scalar.glsl | 26 +- .../ATen/native/vulkan/glsl/add_scalar_.glsl | 26 ++ aten/src/ATen/native/vulkan/ops/Add.cpp | 257 ++++++++++++++++++ aten/src/ATen/test/CMakeLists.txt | 6 +- aten/src/ATen/test/vulkan_api_test.cpp | 50 ++++ 8 files changed, 399 insertions(+), 29 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/glsl/add_.glsl create mode 100644 aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl create mode 100644 aten/src/ATen/native/vulkan/ops/Add.cpp diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index 302525582c9d..8b040ecbec10 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -561,13 +561,12 @@ void add(VulkanTensor& output, const VulkanTensor& input, const float s) { auto device = context().device(); struct ConstBlock { - int32_t inputSize[4]; + int32_t inputSize[3]; float s; }; ConstBlock cb{{safe_downcast(W), safe_downcast(H), - safe_downcast(C_4), - 0}, + safe_downcast(C_4)}, s}; VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 9b7e992e78c5..27e69152ac1b 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -1,27 +1,28 @@ #version 450 core #define PRECISION $precision + layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; -layout(set = 0, binding = 3) uniform constBlock { - int W; - int H; - int C; +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform restrict Block { + ivec3 WHC; float alpha; -} -uConstBlock; +} uBlock; layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - ivec3 WHC = ivec3(uConstBlock.W, uConstBlock.H, uConstBlock.C); - if (all(lessThan(pos, WHC))) { - vec4 v = texelFetch(uInput0, pos, 0) + - uConstBlock.alpha * texelFetch(uInput1, pos, 0); - imageStore(uOutput, pos, v); + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + texelFetch(uInput0, pos, 0) + uBlock.alpha * texelFetch(uInput1, pos, 0)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl new file mode 100644 index 000000000000..c872a8193ca3 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -0,0 +1,27 @@ +#version 450 core +#define PRECISION $precision + +layout(std430) buffer; +layout(std430) uniform; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, 
binding = 0, rgba16f) uniform PRECISION image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform restrict Block { + ivec3 WHC; + float alpha; +} uBlock; + +layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + uBlock.alpha * texelFetch(uInput0, pos, 0)); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 559cdd7441c3..10a95330a48c 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -1,21 +1,27 @@ #version 450 core +#define PRECISION $precision + layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba16f, binding = 0) writeonly highp uniform image3D uOutput; -layout(set = 0, binding = 1) uniform highp sampler3D uInput; -layout(set = 0, binding = 2) uniform constBlock { - ivec4 sizes; +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform restrict Block { + ivec3 WHC; float other; -} -uConstBlock; +} uBlock; layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(pos, uConstBlock.sizes.xyz))) { - vec4 v = texelFetch(uInput, pos, 0) + uConstBlock.other; - imageStore(uOutput, pos, v); + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + texelFetch(uInput, pos, 0) + uBlock.other); } } diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl new file mode 100644 index 000000000000..8e736e2a6a71 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -0,0 +1,26 @@ +#version 450 core +#define PRECISION $precision + +layout(std430) buffer; +layout(std430) uniform; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform restrict Block { + ivec3 WHC; + float other; +} uBlock; + +layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + uBlock.other); + } +} diff --git a/aten/src/ATen/native/vulkan/ops/Add.cpp b/aten/src/ATen/native/vulkan/ops/Add.cpp new file mode 100644 index 000000000000..1c1fa216d3f3 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Add.cpp @@ -0,0 +1,257 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +Tensor add_scalar( + const Tensor& self_arg, + const Scalar other, + const Scalar alpha) { + api::Context* const context = api::context(); + + const Tensor self = self_arg.is_vulkan() ? 
self_arg : self_arg.vulkan(); + const vTensor& v_self = convert(self); + + vTensor v_output{ + context, + self.sizes(), + self.options(), + }; + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_output.has_image() && v_self.has_image()) { + const struct { + uint32_t width, height, channels; + float other; + } block { + v_output.extents().width, + v_output.extents().height, + v_output.extents().depth, + other.to() * alpha.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(add_scalar), + v_output.extents(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image(command_buffer, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_self.image(command_buffer), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return convert(v_output); +} + +Tensor& add_scalar_( + Tensor& self_arg, + const Scalar other, + const Scalar alpha) { + api::Context* const context = api::context(); + + TORCH_CHECK( + self_arg.is_vulkan(), + "Vulkan: In-place add is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self_arg); + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_self.has_image()) { + const struct { + uint32_t width, height, channels; + float other; + } block { + v_self.extents().width, + v_self.extents().height, + v_self.extents().depth, + other.to() * alpha.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(add_scalar_), + v_self.extents(), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return self_arg; +} + +Tensor add_tensor( + const Tensor& self_arg, + const Tensor& other_arg, + const Scalar alpha) { + api::Context* const context = api::context(); + + const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); + const vTensor& v_self = convert(self); + + const Tensor other = other_arg.is_vulkan() ? 
other_arg : other_arg.vulkan(); + const vTensor& v_other = convert(other); + + vTensor v_output{ + context, + self.sizes(), + self.options(), + }; + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_self.has_image() && v_other.has_image()) { + const struct { + uint32_t width, height, channels; + float alpha; + } block { + v_output.extents().width, + v_output.extents().height, + v_output.extents().depth, + alpha.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(add), + v_output.extents(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image(command_buffer, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_self.image(command_buffer), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_other.image(command_buffer), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return convert(v_output); +} + +Tensor& add_tensor_( + Tensor& self_arg, + const Tensor& other_arg, + const Scalar alpha) { + api::Context* const context = api::context(); + + TORCH_CHECK( + self_arg.is_vulkan(), + "Vulkan: In-place add is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self_arg); + + const Tensor other = other_arg.is_vulkan() ? other_arg : other_arg.vulkan(); + const vTensor& v_other = convert(other); + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_self.has_image() && v_other.has_image()) { + const struct { + uint32_t width, height, channels; + float alpha; + } block { + v_self.extents().width, + v_self.extents().height, + v_self.extents().depth, + alpha.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(add_), + v_self.extents(), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_other.image(command_buffer), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return self_arg; +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl("add.Scalar", TORCH_FN(add_scalar)); + m.impl("add_.Scalar", TORCH_FN(add_scalar_)); + m.impl("add.Tensor", TORCH_FN(add_tensor)); + m.impl("add_.Tensor", TORCH_FN(add_tensor_)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 4268db33fa16..385c87f9e7e7 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -76,9 +76,13 @@ list(APPEND ATen_HIP_TEST_SRCS # ${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_stream_test.cpp list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) +if(USE_VULKAN_API) + list(APPEND ATen_VULKAN_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) +endif() + list(APPEND ATen_MOBILE_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 88b9cda507af..123028fe714c 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -29,6 +29,56 @@ bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) { namespace { +TEST(VulkanAPITest, add) { + const auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto c_cpu = at::add(a_cpu, b_cpu, 2.1f); + const auto c_vulkan = at::add(a_vulkan, b_vulkan, 2.1f); + + ASSERT_TRUE(almostEqual(c_cpu, c_vulkan.cpu())); +} + +TEST(VulkanAPITest, add_) { + auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + a_cpu.add_(b_cpu, 2.1f); + a_vulkan.add_(b_vulkan, 2.1f); + + ASSERT_TRUE(almostEqual(a_cpu, a_vulkan.cpu())); +} + +TEST(VulkanAPITest, add_scalar) { + const auto a_cpu = at::rand({1, 1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const float b_scalar = 3.1415f; + + const auto c_cpu = at::add(a_cpu, b_scalar, 2.1f); + const auto c_vulkan = at::add(a_vulkan, b_scalar, 2.1f); + + ASSERT_TRUE(almostEqual(c_cpu, c_vulkan.cpu())); +} + +TEST(VulkanAPITest, add_scalar_) { + auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const float b_scalar = 3.1415f; + + a_cpu.add_(b_scalar, 2.1f); + a_vulkan.add_(b_scalar, 2.1f); + + ASSERT_TRUE(almostEqual(a_cpu, a_vulkan.cpu())); +} + TEST(VulkanAPITest, copy) { const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat)); ASSERT_TRUE(exactlyEqual(cpu, cpu.vulkan().cpu())); From 058f43fc514f0505ffd78decbd3191edaf676449 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 28 Oct 2020 12:46:45 -0700 Subject: [PATCH 081/169] Fix torch.version.debug generation (#47006) Summary: argparser type bool returns True 
for any argument passed as input Use `distutils.util.strtobool` which returns 0 for input values like "0", "no", "n", "f", "false" and 1 for "1", "yes", "y", "t", "true" Fixes https://github.com/pytorch/pytorch/issues/46973 and https://github.com/pytorch/pytorch/issues/47003 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47006 Reviewed By: samestep Differential Revision: D24598193 Pulled By: malfet fbshipit-source-id: e8f6688d6883011f301b49a0f03c452c611f7001 --- tools/generate_torch_version.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index b5ea62ff29bb..8129f38eb0ef 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -2,6 +2,7 @@ import os import subprocess from pathlib import Path +from distutils.util import strtobool def get_sha(): try: @@ -27,7 +28,7 @@ def get_torch_version(sha=None): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate torch/version.py from build and environment metadata.") - parser.add_argument("--is_debug", type=bool, help="Whether this build is debug mode or not.") + parser.add_argument("--is_debug", type=strtobool, help="Whether this build is debug mode or not.") parser.add_argument("--cuda_version", type=str) parser.add_argument("--hip_version", type=str) @@ -47,7 +48,7 @@ def get_torch_version(sha=None): # NB: This is not 100% accurate, because you could have built the # library code with DEBUG, but csrc without DEBUG (in which case # this would claim to be a release build when it's not.) - f.write("debug = {}\n".format(repr(args.is_debug))) + f.write("debug = {}\n".format(repr(bool(args.is_debug)))) f.write("cuda = {}\n".format(repr(args.cuda_version))) f.write("git_version = {}\n".format(repr(sha))) f.write("hip = {}\n".format(repr(args.hip_version))) From cd26d027b3357cd913412ee13060cdbb28fc178a Mon Sep 17 00:00:00 2001 From: Nikita Vedeneev Date: Wed, 28 Oct 2020 14:54:00 -0700 Subject: [PATCH 082/169] [doc] Fix info on the shape of pivots in `torch.lu` + more info on what and how they encode permutations. (#46844) Summary: As per title. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46844 Reviewed By: VitalyFedyunin Differential Revision: D24595538 Pulled By: ezyang fbshipit-source-id: 1bb9c0310170124c3b6e33bd26ce38c22b36e926 --- torch/functional.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/torch/functional.py b/torch/functional.py index e31ec40d63d7..cb9be8117fa8 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -1449,7 +1449,12 @@ def _lu_impl(A, pivot=True, get_infos=False, out=None): - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` + - **pivots** (*IntTensor*): the pivots of size :math:`(*, \text{min}(m, n))`. + ``pivots`` stores all the intermediate transpositions of rows. + The final permutation ``perm`` could be reconstructed by + applying ``swap(perm[i], perm[pivots[i] - 1])`` for ``i = 0, ..., pivots.size(-1) - 1``, + where ``perm`` is initially the identity permutation of :math:`m` elements + (essentially this is what :func:`torch.lu_unpack` is doing). 
- **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or From fc2bd991cca680fb9579913765f2d7cf82130c75 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 28 Oct 2020 15:46:23 -0700 Subject: [PATCH 083/169] [quant] Fix flaky test test_histogram_observer_against_reference (#46957) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46957 Possibly due to use of large tensor in hypothesis. Reducing the size to see if it helps Test Plan: python test/test_quantization.py TestRecordHistogramObserver.test_histogram_observer_against_reference Imported from OSS Reviewed By: jerryzh168 Differential Revision: D24580137 fbshipit-source-id: f44ab059796fba97cccb12353c13803bf49214a1 --- test/quantization/test_workflow_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index a16b01d36d46..cd722d59d2a2 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -736,7 +736,7 @@ def test_histogram_observer_same_inputs(self): self.assertEqual(myobs.max_val, 8.0) self.assertEqual(myobs.histogram, [2., 3., 3.]) - @given(N=st.sampled_from([10, 1000, 10**6]), + @given(N=st.sampled_from([10, 1000]), bins=st.sampled_from([256, 512, 1024, 2048]), dtype=st.sampled_from([torch.qint8, torch.quint8]), qscheme=st.sampled_from([torch.per_tensor_affine, torch.per_tensor_symmetric]), From dc8176356e059e5b5a40252dcd44fd7b9bb96a80 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 28 Oct 2020 16:23:58 -0700 Subject: [PATCH 084/169] Various cleanups to ir_emitter and friends (#46686) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46686 I was trying to page this code back in after a while and some things stuck out as unnecessarily confusing. 1. Improve documentation of closures and fork stuff to be more accurate to how we use them today. 2. Change `prim::LocalVariableScope` to `prim::ListComprehension`. It is only ever used for a list comprehensions, and in general the nodes emitted by `ir_emitter` should correspond to concrete operations or language features rather than semantic constraints. 3. Change the somewhat mysterious "inputs" and "attributes" argument names throughout the codebase to be the more obvious "args" and "kwargs" that they generally represent (I think "inputs" and "attributes" come from the AST naming). 
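
Editor's note on item 2 above (illustrative, not part of the patch): the rename reflects that the node only ever wraps a list comprehension, whose target variable is already confined to the comprehension itself. A minimal Python sketch of the containment the renamed prim::ListComprehensionScope node models:

def f(xs):
    # The comprehension target `i` lives only inside the comprehension, both in
    # Python 3 and in TorchScript; nothing assigned here leaks into f's scope.
    squares = [i * i for i in xs]
    # print(i)  # NameError in Python 3; an undefined-variable error in TorchScript
    return squares
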
Test Plan: Imported from OSS Reviewed By: navahgar, jamesr66a Differential Revision: D24464197 Pulled By: suo fbshipit-source-id: 1f4b1475b58b5690a0b204e705caceff969533b4 --- aten/src/ATen/core/interned_strings.h | 4 +- test/test_jit.py | 8 +- torch/csrc/jit/frontend/convert_to_ssa.cpp | 23 +++--- torch/csrc/jit/frontend/exit_transforms.cpp | 4 +- torch/csrc/jit/frontend/ir_emitter.cpp | 49 ++++++++---- torch/csrc/jit/frontend/schema_matching.cpp | 6 +- torch/csrc/jit/frontend/schema_matching.h | 8 +- torch/csrc/jit/frontend/sugared_value.cpp | 54 ++++++------- torch/csrc/jit/frontend/sugared_value.h | 75 +++++++++---------- torch/csrc/jit/ir/alias_analysis.cpp | 2 +- .../csrc/jit/passes/constant_propagation.cpp | 2 +- .../jit/passes/inline_forked_closures.cpp | 2 +- torch/csrc/jit/passes/lift_closures.cpp | 4 +- .../csrc/jit/python/python_sugared_value.cpp | 68 ++++++++--------- torch/csrc/jit/python/python_sugared_value.h | 32 ++++---- torch/csrc/jit/runtime/operator.cpp | 2 +- torch/csrc/jit/runtime/symbolic_script.cpp | 4 +- torch/csrc/jit/serialization/python_print.cpp | 2 +- 18 files changed, 183 insertions(+), 166 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index dd099be59dff..5f881b5edccc 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -56,7 +56,7 @@ namespace c10 { _(prim, ReturnStmt) \ _(prim, BreakStmt) \ _(prim, ContinueStmt) \ - _(prim, LocalVariableScope) \ + _(prim, ListComprehensionScope) \ _(prim, Store) \ _(prim, AutogradZero) \ _(prim, AutogradAnyNonZero) \ @@ -129,7 +129,7 @@ namespace c10 { _(prim, fork) \ _(prim, forkClosure) \ _(prim, RaiseException) \ - _(prim, Function) \ + _(prim, Closure) \ _(prim, CreateObject) \ _(prim, SetAttr) \ _(prim, GetAttr) \ diff --git a/test/test_jit.py b/test/test_jit.py index dbe3fff20245..9994754dcc42 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -13297,7 +13297,7 @@ def backward(grad_output): ''') cu = torch.jit.CompilationUnit(code) g = cu.tanh.graph - FileCheck().check_count("prim::Function_0", 2).check("None = prim::Constant") \ + FileCheck().check_count("prim::Closure_0", 2).check("None = prim::Constant") \ .check_next("return").run(g) code = dedent(''' @@ -13314,7 +13314,7 @@ def backward(grad_output): ''') cu = torch.jit.CompilationUnit(code) g = cu.tanh.graph - FileCheck().check_count("prim::Function_0", 2).check("int = prim::If") \ + FileCheck().check_count("prim::Closure_0", 2).check("int = prim::If") \ .run(g) code = dedent(''' @@ -13328,9 +13328,9 @@ def backward(grad_output): ''') cu = torch.jit.CompilationUnit(code) fc = FileCheck() - fc.check("prim::Function").check("(Tensor, None) = prim::TupleConstruct") + fc.check("prim::Closure").check("(Tensor, None) = prim::TupleConstruct") # Loop then two if's added in exit transform - fc.check("prim::Function").check("prim::Loop").check_count("prim::If", 2) + fc.check("prim::Closure").check("prim::Loop").check_count("prim::If", 2) fc.run(cu.loop_in_closure.graph) code = dedent(''' diff --git a/torch/csrc/jit/frontend/convert_to_ssa.cpp b/torch/csrc/jit/frontend/convert_to_ssa.cpp index 10109aa55824..1dd61c260bd6 100644 --- a/torch/csrc/jit/frontend/convert_to_ssa.cpp +++ b/torch/csrc/jit/frontend/convert_to_ssa.cpp @@ -5,19 +5,20 @@ #include #include #include -#include namespace torch { namespace jit { // At the beginning of the pass the Graph has already undergone type checking, // and writes or reads to a variable are emitted as Loads and Stores in the -// 
graph. a = 1 print(a) is represented as: -// -// %a.1 : int = prim::Constant[value=1]() -// prim::Store[name="a"](%a.1) -// %a : int = prim::Load[name="a"]() -// prim::Print(%a) +// graph. +// a = 1 +// print(a) +// is represented as: +// %a.1 : int = prim::Constant[value=1]() +// prim::Store[name="a"](%a.1) +// %a : int = prim::Load[name="a"]() +// prim::Print(%a) // // First, this pass recursively adds the Loads & Stores to control flow nodes // Then the graph is converted to SSA form. @@ -149,7 +150,7 @@ struct ControlFlowLoadStores { case prim::Loop: { addLoopLoadStores(n); } break; - case prim::Function: { + case prim::Closure: { for (auto b : n->blocks()) { addControlFlowLoadStores(b); } @@ -157,7 +158,7 @@ struct ControlFlowLoadStores { case prim::Store: { environment_stack->setVar(n->s(attr::name), n->input()->type()); } break; - case prim::LocalVariableScope: { + case prim::ListComprehensionScope: { addControlFlowLoadStores(n->blocks().at(0)); } break; } @@ -204,7 +205,7 @@ struct EraseLoadStores { n->output()->replaceAllUsesWith(var); n->destroy(); } break; - case prim::LocalVariableScope: { + case prim::ListComprehensionScope: { // writes within a local variable scope do not leak into // the rest of the graph auto body = n->blocks().at(0); @@ -279,7 +280,7 @@ struct LoopContinuations { assignExitContinuations(n->blocks().at(0)); assignExitContinuations(n->blocks().at(1)); } break; - case prim::Function: { + case prim::Closure: { LoopContinuations closure_block; closure_block.run(n->blocks().at(0)); } break; diff --git a/torch/csrc/jit/frontend/exit_transforms.cpp b/torch/csrc/jit/frontend/exit_transforms.cpp index 3126d78c3bd2..e14cb6428890 100644 --- a/torch/csrc/jit/frontend/exit_transforms.cpp +++ b/torch/csrc/jit/frontend/exit_transforms.cpp @@ -119,7 +119,7 @@ struct ExitTransformer { static bool isGraphOrClosureBlock(Block* block) { return block->owningNode() == nullptr || - owningNodeKind(block) == prim::Function; + owningNodeKind(block) == prim::Closure; } static void removeOutputs(Block* b) { @@ -425,7 +425,7 @@ struct ExitTransformer { case prim::With: { exit_pair = transformWith(node); } break; - case prim::Function: { + case prim::Closure: { // exits of closure declaration stay local to the closure transformExits(node->blocks().at(0)); } break; diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 5294e02e739f..201afec1ce4e 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -1,4 +1,5 @@ #include + #include #include #include @@ -859,9 +860,12 @@ struct to_ir { return emitStatements(statements.begin(), statements.end()); } - // XXX - right now closures are used _only_ for defining gradients internally + // XXX: Right now closures are not generically implemented and are only used + // as an intermediate form for special tasks, like defining gradients or + // forked functions. + // // There are several unfinished aspects that make them unusable generally - // 1. We do not have a type, ivalue, operator to represent prim::Function, so + // 1. We do not have a type, ivalue, operator to represent prim::Closure, so // closure_node has type None // 2. There is no export logic for it yet, so it cannot be // exported/python_printed @@ -870,9 +874,19 @@ struct to_ir { // the changes to those variables will just get forgotten. // 4. There is no parsing support in frontend.py, this is intentional since it // prevents people from accidentally using this feature. 
+ // + // This function leaves in the graph something like: + // + // %2 : None = prim::Closure() + // block0(): + // %1 : Tensor = prim::DoSomething(%0) + // -> (%1) + // + // A separate pass is required to erase this closure and replace it with + // something actually executable (see liftClosure and inlineForkedClosure). std::shared_ptr emitClosure( const std::function& emit_body) { - Node* closure_node = graph->insertNode(graph->create(prim::Function, 1)); + Node* closure_node = graph->insertNode(graph->create(prim::Closure, 1)); // it is not a real thing yet, so just say the type is None closure_node->output()->setType(NoneType::get()); Block* block = closure_node->addBlock(); @@ -1262,7 +1276,7 @@ struct to_ir { // comprehension introduces it's own scope. no variable assigned // leaks into the rest of the graph Node* n = - graph->insertNode(create(prim::LocalVariableScope, lc.range(), 0)); + graph->insertNode(create(prim::ListComprehensionScope, lc.range(), 0)); auto* comprehension_block = n->addBlock(); pushFrame(comprehension_block); WithInsertPoint guard(comprehension_block); @@ -2094,8 +2108,8 @@ struct to_ir { stmt.range(), *method.graph(), getAugOp(stmt, lhs->type()), - /*inputs=*/{lhs, rhs}, - /*attributes=*/{}, + /*args=*/{lhs, rhs}, + /*kwargs=*/{}, /*self=*/c10::nullopt); } } @@ -2665,9 +2679,9 @@ struct to_ir { if (auto special_form = dynamic_cast(sv.get())) { return emitApplySpecialForm(special_form->form(), apply, type_hint); } - auto inputs = getNamedValues(apply.inputs(), true); - auto attributes = emitAttributes(apply.attributes()); - return sv->call(loc, method, inputs, attributes, n_binders); + auto args = getNamedValues(apply.inputs(), true); + auto kwargs = emitAttributes(apply.attributes()); + return sv->call(loc, method, args, kwargs, n_binders); } // this function handles expressions that look like apply statements @@ -2688,9 +2702,9 @@ struct to_ir { } auto forked = emitSugaredExpr(Expr(trees[0]), 1); TreeList sliced_trees(trees.begin() + 1, trees.end()); - auto inputs = getNamedValues(sliced_trees, true); - auto attributes = emitAttributes(apply.attributes()); - return emitForkExpr(apply.range(), forked, inputs, attributes); + auto args = getNamedValues(sliced_trees, true); + auto kwargs = emitAttributes(apply.attributes()); + return emitForkExpr(apply.range(), forked, args, kwargs); } case prim::annotate: { checkApplyNumInputs(apply, 2); @@ -2965,11 +2979,15 @@ struct to_ir { return graph->insertConstant(maybe_out_stack->at(0), tree->range()); } + /** + * Emit a fork expression, of the form: + * torch.jit.fork(forked, *args, **kwargs) + */ std::shared_ptr emitForkExpr( SourceRange loc, const std::shared_ptr& forked, - at::ArrayRef inputs, - at::ArrayRef attributes) { + at::ArrayRef args, + at::ArrayRef kwargs) { auto g = method.graph(); Node* fork_node; TypePtr out_type; @@ -2989,8 +3007,7 @@ struct to_ir { fork_node->addInput(closure_output); } else { auto emit_closure_body = [&](Block* closure_block) { - auto fn_sugared_output = - forked->call(loc, method, inputs, attributes, 1); + auto fn_sugared_output = forked->call(loc, method, args, kwargs, 1); auto fn_simple_output = fn_sugared_output->asValue(loc, method); closure_block->registerOutput(fn_simple_output); out_type = fn_simple_output->type(); diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index fb2e0f20f380..9fd3973f9b3d 100644 --- a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -584,8 +584,8 @@ 
Value* emitBuiltinCall( const SourceRange& loc, Graph& graph, Symbol name, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, const c10::optional& self) { const auto& variants = getAllOperatorsFor(name); const auto& builtin_functions = getAllBuiltinFunctionsFor(name); @@ -620,7 +620,7 @@ Value* emitBuiltinCall( throw error; } - auto matched = matchSchemas(schemas, loc, graph, inputs, attributes, self); + auto matched = matchSchemas(schemas, loc, graph, args, kwargs, self); if (matched.first < variants.size()) { return emitBuiltinNode(matched.second, loc, graph, name); diff --git a/torch/csrc/jit/frontend/schema_matching.h b/torch/csrc/jit/frontend/schema_matching.h index 88fe23a9682d..83e34bb33ae5 100644 --- a/torch/csrc/jit/frontend/schema_matching.h +++ b/torch/csrc/jit/frontend/schema_matching.h @@ -23,7 +23,7 @@ TORCH_API MatchedSchema matchSchema( const SourceRange& loc, Graph& graph, at::ArrayRef args, - at::ArrayRef kwarg, + at::ArrayRef kwargs, const c10::optional& self = c10::nullopt); TORCH_API std::pair matchSchemas( @@ -31,7 +31,7 @@ TORCH_API std::pair matchSchemas( const SourceRange& loc, Graph& graph, at::ArrayRef args, - at::ArrayRef kwarg, + at::ArrayRef kwargs, const c10::optional& self = c10::nullopt, bool render_errors = false); @@ -43,8 +43,8 @@ TORCH_API Value* emitBuiltinCall( const SourceRange& loc, Graph& graph, Symbol name, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, const c10::optional& self = c10::nullopt); TORCH_API c10::optional findInputWithName( diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp index 69e86716f72e..57faeeb42265 100644 --- a/torch/csrc/jit/frontend/sugared_value.cpp +++ b/torch/csrc/jit/frontend/sugared_value.cpp @@ -1,4 +1,5 @@ #include + #include #include #include @@ -17,14 +18,14 @@ struct NoneValue : SugaredValue { std::shared_ptr PrintValue::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { auto& g = *m.graph(); - if (!attributes.empty()) + if (!kwargs.empty()) throw ErrorReport(loc) << "print doesn't accept any keyword arguments"; - std::vector lowered_inputs = toValues(*m.graph(), inputs); + std::vector lowered_inputs = toValues(*m.graph(), args); g.insertNode(g.create(prim::Print, lowered_inputs, 0)->setSourceRange(loc)); return std::make_shared(); } @@ -46,11 +47,11 @@ builtin_cast_method_to_scalar_type() { std::shared_ptr BuiltinFunction::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { return std::make_shared( - emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, self)); + emitBuiltinCall(loc, *m.graph(), symbol, args, kwargs, self)); } // older versions of gcc/clang have a bug where enums can't be used as keys @@ -322,14 +323,14 @@ void SimpleValue::setAttr( std::shared_ptr SimpleValue::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { // allow our 'fake' closures to be called, used for fork serialization // at the moment, but can be expanded later Node* self = getValue()->node(); if (self->kind() == prim::TupleConstruct && self->inputs().size() == 2 && - self->inputs().at(0)->node()->kind() == prim::Function) { + self->inputs().at(0)->node()->kind() == prim::Closure) 
{ std::shared_ptr graph = self->inputs().at(0)->node()->g(attr::Subgraph); Value* context = self->inputs().at(1); @@ -348,16 +349,15 @@ std::shared_ptr SimpleValue::call( auto ret = StrongFunctionPtr(std::move(cu), fn); std::vector ctx_inputs = {close_context}; - ctx_inputs.insert(ctx_inputs.end(), inputs.begin(), inputs.end()); - return FunctionValue(ret).call(loc, m, ctx_inputs, attributes, n_binders); + ctx_inputs.insert(ctx_inputs.end(), args.begin(), args.end()); + return FunctionValue(ret).call(loc, m, ctx_inputs, kwargs, n_binders); } if (auto class_type = getValue()->type()->cast()) { - return attr(loc, m, "__call__") - ->call(loc, m, inputs, attributes, n_binders); + return attr(loc, m, "__call__")->call(loc, m, args, kwargs, n_binders); } - return SugaredValue::call(loc, m, inputs, attributes, n_binders); + return SugaredValue::call(loc, m, args, kwargs, n_binders); } Value* SimpleValue::len(const SourceRange& loc, Function& m) { @@ -569,27 +569,27 @@ void IterableTree::addChild( std::shared_ptr MagicMethod::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { - if (inputs.size() > 0) { - Value* self = inputs[0].value(*m.graph()); + if (args.size() > 0) { + Value* self = args[0].value(*m.graph()); if (auto class_ptr = self->type()->cast()) { return SimpleValue(self) .attr(loc, m, desugared_name_) - ->call(loc, m, inputs.slice(1), attributes, n_binders); + ->call(loc, m, args.slice(1), kwargs, n_binders); } } TORCH_INTERNAL_ASSERT(base_value_); - return base_value_->call(loc, m, inputs, attributes, n_binders); + return base_value_->call(loc, m, args, kwargs, n_binders); } std::shared_ptr ClassValue::call( const SourceRange& loc, Function& m, // note: names for args will be 'argument 0', 'argument 1', etc.. - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { AT_ASSERT(n_binders <= 1); @@ -602,7 +602,7 @@ std::shared_ptr ClassValue::call( } // Call the init function - MethodValue(self, "__init__").call(loc, m, inputs, attributes, n_binders); + MethodValue(self, "__init__").call(loc, m, args, kwargs, n_binders); return std::make_shared(self); } @@ -621,15 +621,15 @@ std::shared_ptr ClassValue::attr( std::shared_ptr NamedTupleConstructor::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { auto& g = *m.graph(); auto schema = type_->schema(); TORCH_INTERNAL_ASSERT(schema); auto qualname = type_->name(); - auto matched_schema = matchSchema(*schema, loc, g, inputs, attributes); + auto matched_schema = matchSchema(*schema, loc, g, args, kwargs); auto self = g.insertNode( diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index 3523523f5c23..f85e75509898 100644 --- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -84,8 +84,8 @@ struct TORCH_API SugaredValue const SourceRange& loc, Function& m, // note: names for args will be 'argument 0', 'argument 1', etc.. - at::ArrayRef inputs_, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { // n_binders is always set to the number of variables an expression is // syntactically bound to: @@ -181,8 +181,8 @@ struct TORCH_API SimpleValue : public SugaredValue { const SourceRange& loc, Function& m, // note: names for args will be 'argument 0', 'argument 1', etc.. 
- at::ArrayRef inputs_, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; std::shared_ptr iter(const SourceRange& loc, Function& m) @@ -215,8 +215,8 @@ struct TORCH_API BuiltinFunction : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef attributes, - at::ArrayRef inputs, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; // try to create this builtin but if it doesn't exist or the self argument @@ -332,8 +332,8 @@ struct TORCH_API ClassValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; std::shared_ptr attr( @@ -354,8 +354,8 @@ struct TORCH_API NamedTupleConstructor : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; std::string kind() const override { @@ -384,8 +384,8 @@ struct FunctionValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& f, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { std::vector schemas; for (Function* callee : callees_) { @@ -398,7 +398,7 @@ struct FunctionValue : public SugaredValue { } schemas.push_back(&callee->getSchema()); } - auto match = matchSchemas(schemas, loc, *f.graph(), inputs, attributes); + auto match = matchSchemas(schemas, loc, *f.graph(), args, kwargs); Value* output = f.graph()->insertFunctionCall(callees_[match.first], match.second); output->node()->setSourceRange(loc); @@ -417,7 +417,7 @@ struct FunctionValue : public SugaredValue { struct TORCH_API ClosureValue : public SugaredValue { ClosureValue(Value* value) : value_(value) { - TORCH_INTERNAL_ASSERT(value_->node()->kind() == prim::Function); + TORCH_INTERNAL_ASSERT(value_->node()->kind() == prim::Closure); } std::string kind() const override { return "closure"; @@ -442,11 +442,11 @@ struct MethodValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& f, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { - std::vector inputsWithSelf = {self_}; - inputsWithSelf.insert(inputsWithSelf.end(), inputs.begin(), inputs.end()); + std::vector argsWithSelf = {self_}; + argsWithSelf.insert(argsWithSelf.end(), args.begin(), args.end()); std::vector schemas; for (const std::string& method_name : method_names_) { if (auto class_type = self_->type()->cast()) { @@ -466,8 +466,7 @@ struct MethodValue : public SugaredValue { false, "method constructed that is not a class or interface"); } } - auto match = - matchSchemas(schemas, loc, *f.graph(), inputsWithSelf, attributes); + auto match = matchSchemas(schemas, loc, *f.graph(), argsWithSelf, kwargs); Value* output = f.graph()->insertMethodCall(method_names_[match.first], match.second); output->node()->setSourceRange(loc); @@ -486,8 +485,8 @@ struct TORCH_API PrintValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; }; @@ -500,16 +499,16 @@ struct TORCH_API CastValue : public BuiltinFunction { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - 
at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { - if (inputs.size() == 1 && attributes.size() == 0) { - auto v = inputs[0].value(*m.graph()); + if (args.size() == 1 && kwargs.size() == 0) { + auto v = args[0].value(*m.graph()); if (v->type()->isSubtypeOf(type_)) { return std::make_shared(v); } } - return BuiltinFunction::call(loc, m, inputs, attributes, n_binders); + return BuiltinFunction::call(loc, m, args, kwargs, n_binders); } private: @@ -527,17 +526,17 @@ struct TORCH_API TensorCastValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { - TORCH_INTERNAL_ASSERT(inputs.size() == 0 && attributes.size() == 0); + TORCH_INTERNAL_ASSERT(args.size() == 0 && kwargs.size() == 0); Value* dtype_const = m.graph()->insertConstant(dtype_, loc); - std::vector kwargs{self_, - NamedValue(loc, "dtype", dtype_const)}; + std::vector kwargs_{self_, + NamedValue(loc, "dtype", dtype_const)}; Value* casted_val = m.graph()->insert( /*opname=*/Symbol::fromQualString("aten::to"), - /*args=*/inputs, - /*kwargs=*/kwargs, + /*args=*/args, + /*kwargs=*/kwargs_, /*range=*/loc); return std::make_shared(casted_val); } @@ -560,8 +559,8 @@ struct TORCH_API MagicMethod : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; private: @@ -735,11 +734,11 @@ struct TORCH_API ExceptionValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs, + at::ArrayRef args, at::ArrayRef /*attributes*/, size_t /*n_binders*/) override { auto exception_message = insertConstant(*m.graph(), message_ + ": ", loc); - for (auto& input : inputs) { + for (auto& input : args) { auto input_str = input.value(*m.graph()); if (!input_str->type()->isSubtypeOf(StringType::get())) { input_str = diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index bb5872f35f4f..b055d29164a5 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -494,7 +494,7 @@ void AliasDb::analyzeImpl(Node* node) { case prim::MMBatchSide: case prim::BroadcastSizes: case prim::ChunkSizes: - case prim::Function: + case prim::Closure: case prim::CreateObject: case prim::tolist: return analyzeCreator(node); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index c3285f2e2426..2981daa0006a 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -89,7 +89,7 @@ namespace { std::unordered_set skip_list = { prim::If, prim::Loop, - prim::Function, + prim::Closure, prim::Constant, prim::AutogradZero, prim::Uninitialized, diff --git a/torch/csrc/jit/passes/inline_forked_closures.cpp b/torch/csrc/jit/passes/inline_forked_closures.cpp index ea5a977e4091..e97d71e32249 100644 --- a/torch/csrc/jit/passes/inline_forked_closures.cpp +++ b/torch/csrc/jit/passes/inline_forked_closures.cpp @@ -19,7 +19,7 @@ void inlineForkedClosure(Node* fork_closure) { Node* function_context_node = fork_closure->input()->node(); if (function_context_node->inputs().size() != 2 || - function_context_node->inputs().at(0)->node()->kind() != prim::Function || + function_context_node->inputs().at(0)->node()->kind() != prim::Closure 
|| function_context_node->inputs().at(1)->node()->kind() != prim::TupleConstruct) { throw ErrorReport(fork_closure->sourceRange()) << "Cannot fork this value"; diff --git a/torch/csrc/jit/passes/lift_closures.cpp b/torch/csrc/jit/passes/lift_closures.cpp index 82e6f2216681..4f5941ce8afb 100644 --- a/torch/csrc/jit/passes/lift_closures.cpp +++ b/torch/csrc/jit/passes/lift_closures.cpp @@ -5,7 +5,7 @@ namespace torch { namespace jit { -// Closures are initially emitted as prim::Function nodes with a single block. +// Closures are initially emitted as prim::Closure nodes with a single block. // Here, we convert the block to a subgraph, adding all closed over variables // as a context tuple input to the closure node. // At this point the closure has already undergone conversion to SSA, @@ -58,7 +58,7 @@ void liftClosures(Block* block) { Node* n = *it; it++; switch (n->kind()) { - case prim::Function: { + case prim::Closure: { liftClosure(n); } break; default: { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 15d151b761ef..071347561665 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -1,4 +1,5 @@ #include + #include #include #include @@ -114,21 +115,20 @@ FunctionSchema PythonValue::getSchema( std::shared_ptr PythonValue::call( const SourceRange& loc, Function& m, - at::ArrayRef inputs_, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { - std::vector inputsWithSelf; + std::vector argsWithSelf; if (moduleSelf_) { - inputsWithSelf.emplace_back(NamedValue("self", moduleSelf_)); + argsWithSelf.emplace_back(NamedValue("self", moduleSelf_)); } - inputsWithSelf.insert(inputsWithSelf.end(), inputs_.begin(), inputs_.end()); - inputs_ = inputsWithSelf; + argsWithSelf.insert(argsWithSelf.end(), args.begin(), args.end()); - auto schema = getSchema(inputs_.size(), n_binders, loc); - auto inputs = toValues(*m.graph(), inputs_); + auto schema = getSchema(argsWithSelf.size(), n_binders, loc); + auto inputs = toValues(*m.graph(), argsWithSelf); MatchedSchema matched_schema = - matchSchema(schema, loc, *m.graph(), inputs_, attributes); + matchSchema(schema, loc, *m.graph(), argsWithSelf, kwargs); // If if a function is marked as dropped, // we throw an exception if it is invoked. 
@@ -652,8 +652,8 @@ void ModuleValue::setAttr( std::shared_ptr BooleanDispatchValue::call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) { c10::optional result; Graph& graph = *(caller.graph()); @@ -662,14 +662,14 @@ std::shared_ptr BooleanDispatchValue::call( auto arg_name = py::str(dispatched_fn_["arg_name"]); ErrorReport error(loc); - if (index < inputs.size()) { + if (index < args.size()) { // Dispatch flag is in arg list - result = constant_as(inputs.at(index).value(graph)); + result = constant_as(args.at(index).value(graph)); error << "Argument for boolean dispatch at position " << index << " was not constant"; - } else if (auto i = findInputWithName(arg_name, attributes)) { + } else if (auto i = findInputWithName(arg_name, kwargs)) { // Dispatch flag is in kwargs - result = constant_as(attributes[*i].value(graph)); + result = constant_as(kwargs[*i].value(graph)); error << "Keyword argument '" << arg_name << "' for boolean dispatch at position was not constant"; } else { @@ -688,28 +688,28 @@ std::shared_ptr BooleanDispatchValue::call( } else { value = toSugaredValue(dispatched_fn_["if_false"], caller, loc); } - return value->call(loc, caller, inputs, attributes, n_binders); + return value->call(loc, caller, args, kwargs, n_binders); } std::shared_ptr PythonExceptionValue::call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t /*n_binders*/) { Value* error_message = nullptr; - if (inputs.size() == 0) { + if (args.size() == 0) { error_message = insertConstant(*caller.graph(), "", loc); - } else if (inputs.size() == 1) { - error_message = inputs.at(0).value(*caller.graph()); + } else if (args.size() == 1) { + error_message = args.at(0).value(*caller.graph()); } else { std::vector message_values; - message_values.reserve(inputs.size() + attributes.size()); + message_values.reserve(args.size() + kwargs.size()); - for (auto inp : inputs) { + for (const auto& inp : args) { message_values.push_back(inp.value(*caller.graph())); } - for (auto kwarg_inp : attributes) { + for (const auto& kwarg_inp : kwargs) { message_values.push_back(kwarg_inp.value(*caller.graph())); } error_message = @@ -802,10 +802,10 @@ std::shared_ptr createSimpleEnumValue( std::shared_ptr PythonSliceClass::call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t /*n_binders*/) { - if (!attributes.empty()) { + if (!kwargs.empty()) { throw ErrorReport(loc) << "Slice does not accept any keyword arguments"; } @@ -824,23 +824,23 @@ std::shared_ptr PythonSliceClass::call( Value* start; Value* stop; Value* step; - size_t n = inputs.size(); + size_t n = args.size(); // Slice's constructor signature is Slice(start=None, stop, step=None) if (n == 1) { // Case where only `stop` is specified. start = ValOr(nullptr, default_start); - stop = ValOr(inputs[0].value(graph), default_stop); + stop = ValOr(args[0].value(graph), default_stop); step = ValOr(nullptr, default_step); } else if (n == 2) { // Case where `start` and `stop` are specified. 
- start = ValOr(inputs[0].value(graph), default_start); - stop = ValOr(inputs[1].value(graph), default_stop); + start = ValOr(args[0].value(graph), default_start); + stop = ValOr(args[1].value(graph), default_stop); step = ValOr(nullptr, default_step); } else if (n == 3) { // Case where `start`, `stop` and `step` are all specified. - start = ValOr(inputs[0].value(graph), default_start); - stop = ValOr(inputs[1].value(graph), default_stop); - step = ValOr(inputs[2].value(graph), default_step); + start = ValOr(args[0].value(graph), default_start); + stop = ValOr(args[1].value(graph), default_stop); + step = ValOr(args[2].value(graph), default_step); } else { throw ErrorReport(loc) << "slice accepts exactly 1, 2 or 3 arguments, got: " << n; diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index ecb3c6da4ff4..a399fa543dfb 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -47,8 +47,8 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& m, - at::ArrayRef inputs_, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; std::string kind() const override; @@ -99,8 +99,8 @@ struct VISIBILITY_HIDDEN ConstantParameterList : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { return toSimple(the_list_); } @@ -120,10 +120,10 @@ struct VISIBILITY_HIDDEN ModuleDictMethod : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& f, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { - if (inputs.size() || attributes.size()) { + if (args.size() || kwargs.size()) { throw ErrorReport(loc) << name_ << " method does not accept any arguments"; } @@ -175,11 +175,11 @@ struct VISIBILITY_HIDDEN ModuleValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override { return attr(loc, caller, "forward") - ->call(loc, caller, inputs, attributes, n_binders); + ->call(loc, caller, args, kwargs, n_binders); } std::shared_ptr getSugaredDict( @@ -268,8 +268,8 @@ struct VISIBILITY_HIDDEN BooleanDispatchValue : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; private: @@ -308,8 +308,8 @@ struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { std::shared_ptr call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; }; @@ -324,8 +324,8 @@ struct VISIBILITY_HIDDEN PythonSliceClass : public SugaredValue { std::shared_ptr call( const SourceRange& loc, Function& caller, - at::ArrayRef inputs, - at::ArrayRef attributes, + at::ArrayRef args, + at::ArrayRef kwargs, size_t n_binders) override; }; diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index e36208dfb19f..dc1ff95cf735 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -287,7 +287,7 @@ bool 
aliasAnalysisHasSpecialCaseFor(Symbol symbol) { prim::MMBatchSide, prim::BroadcastSizes, prim::ChunkSizes, - prim::Function, + prim::Closure, prim::TupleUnpack, prim::TupleIndex, prim::TupleSlice, diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index 578586e9e9ff..6887be516e7b 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -1369,8 +1369,8 @@ std::pair, Value*> extractClosure(Value* closure) { Value* context = closure->node()->inputs().at(1); TORCH_CHECK( - fn->node()->kind() == prim::Function, - "closure tuple must contain a prim::Function"); + fn->node()->kind() == prim::Closure, + "closure tuple must contain a prim::Closure"); return std::make_pair(fn->node()->g(attr::Subgraph), context); } diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index e04339dacc22..0f63770aa792 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -801,7 +801,7 @@ struct PythonPrintImpl { } level--; } break; - case prim::Function: { + case prim::Closure: { if (enforce_importable_) { throw ErrorReport(node->sourceRange()) << "closures are not exportable"; From c7183c9878f4f4d5eec2d10b092b71dce9947dca Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 28 Oct 2020 18:11:07 -0700 Subject: [PATCH 085/169] Fix object-based collectives API to use torch.cuda.current_device instead of (#46897) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46897 These APIs implicitly assumed that gpu for rank == rank index, but that is not necessarily true. For example, the first GPU could be used for a different purpose and rank 0 could use GPU 1, rank 1 uses GPU 2, etc. Thus, we mandate that the user specify the device to use via `torch.cuda.set_device()` before making calls to this API. This expectation should be okay since we clearly document it, and we expect the user to set this for DistributedDataParallel as well. Also adds/tidies up some documentation. ghstack-source-id: 115359633 Test Plan: Modified unittests Reviewed By: divchenko Differential Revision: D24556177 fbshipit-source-id: 7e826007241eba0fde3019180066ed56faf3c0ca --- torch/distributed/distributed_c10d.py | 74 +++++++++++++------ .../_internal/distributed/distributed_test.py | 17 +++++ 2 files changed, 68 insertions(+), 23 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 32353c5dc023..3090721c20ff 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1357,7 +1357,8 @@ def all_gather_object(object_list, obj, group=group.WORLD): object_list (list[Any]): Output list. It should be correctly sized as the size of the group for this collective and will contain the output. object (Any): Pickable Python object to be broadcast from current process. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Returns: None. If the calling rank is part of this group, the output of the @@ -1369,6 +1370,13 @@ def all_gather_object(object_list, obj, group=group.WORLD): collective since it does not provide an ``async_op`` handle and thus will be a blocking call. + .. 
note:: For NCCL-based processed groups, internal tensor representations + of objects must be moved to the GPU device before communication takes + place. In this case, the device used is given by + ``torch.cuda.current_device()`` and it is the user's responsiblity to + ensure that this is set so that each rank has an individual GPU, via + ``torch.cuda.set_device()``. + .. warning:: :func:`all_gather_object` uses ``pickle`` module implicitly, which is known to be insecure. It is possible to construct malicious pickle data @@ -1380,16 +1388,19 @@ def all_gather_object(object_list, obj, group=group.WORLD): input_tensor, local_size = _object_to_tensor(obj) group_backend = get_backend(group) - my_rank = get_rank() is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") if is_nccl_backend: - input_tensor, local_size = input_tensor.to(my_rank), local_size.to(my_rank) + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.cuda.current_device() + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) # Gather all local sizes. This is so that we can find the max size, and index # until the correct size when deserializing the tensors. group_size = get_world_size(group=group) - object_sizes_tensor = torch.zeros(group_size, dtype=int).to( - my_rank if is_nccl_backend else "cpu" - ) + object_sizes_tensor = torch.zeros(group_size, dtype=int, device=current_device) object_size_list = [ object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size) ] @@ -1399,8 +1410,8 @@ def all_gather_object(object_list, obj, group=group.WORLD): # Resize tensor to max size across all ranks. input_tensor.resize_(max_object_size) coalesced_output_tensor = torch.empty( - max_object_size * group_size, dtype=torch.uint8 - ).to(my_rank if is_nccl_backend else "cpu") + max_object_size * group_size, dtype=torch.uint8, device=current_device + ) # Output tensors are nonoverlapping views of coalesced_output_tensor output_tensors = [ coalesced_output_tensor[max_object_size * i : max_object_size * (i + 1)] @@ -1427,7 +1438,8 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): collective and will contain the output. Must be ``None`` on non-dst ranks. (default is ``None``) dst (int, optional): Destination rank. (default is 0) - group: (ProcessGroup, optional): The process group to work on. + group: (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Returns: None. On the ``dst`` rank, ``object_gather_list`` will contain the @@ -1453,20 +1465,22 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): _validate_output_list_for_rank(my_rank, dst, object_gather_list) input_tensor, local_size = _object_to_tensor(obj) group_backend = get_backend(group) + current_device = torch.device("cpu") is_nccl_backend = group_backend == Backend.NCCL if is_nccl_backend: - input_tensor, local_size = input_tensor.to(my_rank), local_size.to(my_rank) + current_device = torch.cuda.current_device() + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) # Gather all local sizes. This is so that we can find the max size, and index # until the correct size when deserializing the tensors. 
group_size = get_world_size(group=group) - object_sizes_tensor = torch.zeros(group_size, dtype=int).to( - my_rank if is_nccl_backend else "cpu" - ) + object_sizes_tensor = torch.zeros(group_size, dtype=int, device=current_device) object_size_list = [ object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size) ] - # Allgather tensor sizes. An all-gather is needed here despite this being a gather, - # since each rank needs to broadcast a tensor of the same (maximal) size. + # Allgather tensor sizes. An all-gather is needed here despite this being a + # gather, since each rank needs to broadcast a tensor of the same (maximal) + # size. all_gather(object_size_list, local_size, group=group) max_object_size = max(object_size_list) # Resize tensor to max size across all ranks. @@ -1474,8 +1488,8 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): # Avoid populating output tensors if the result won't be gathered on this rank. if my_rank == dst: coalesced_output_tensor = torch.empty( - max_object_size * group_size, dtype=torch.uint8 - ).to(my_rank if is_nccl_backend else "cpu") + max_object_size * group_size, dtype=torch.uint8, device=current_device + ) # Output tensors are nonoverlapping views of coalesced_output_tensor output_tensors = [ coalesced_output_tensor[max_object_size * i : max_object_size * (i + 1)] @@ -1508,15 +1522,23 @@ def broadcast_object_list(object_list, src, group=group.WORLD): Each object must be picklable. Only objects on the ``src`` rank will be broadcast, but each rank must provide lists of equal sizes. src (int): Source rank from which to broadcast ``object_list``. - group: (ProcessGroup, optional): The process group to work on. + group: (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Returns: ``None``. If rank is part of the group, ``object_list`` will contain the broadcasted objects from ``src`` rank. - .. note:: Note that this API differs slightly from the broadcast collective - since it does not provide an ``async_op`` handle and thus will be a - blocking call. + .. note:: For NCCL-based processed groups, internal tensor representations + of objects must be moved to the GPU device before communication takes + place. In this case, the device used is given by + ``torch.cuda.current_device()`` and it is the user's responsiblity to + ensure that this is set so that each rank has an individual GPU, via + ``torch.cuda.set_device()``. + + .. note:: Note that this API differs slightly from the :func:`all_gather` + collective since it does not provide an ``async_op`` handle and thus + will be a blocking call. .. warning:: :func:`broadcast_object_list` uses ``pickle`` module implicitly, which @@ -1537,8 +1559,14 @@ def broadcast_object_list(object_list, src, group=group.WORLD): group_backend = get_backend(group) is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") if is_nccl_backend: - object_sizes_tensor = object_sizes_tensor.to(my_rank) + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. 
+ current_device = torch.cuda.current_device() + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) # Broadcast object sizes broadcast(object_sizes_tensor, src=src, group=group) @@ -1550,7 +1578,7 @@ def broadcast_object_list(object_list, src, group=group.WORLD): object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) if is_nccl_backend: - object_tensor = object_tensor.to(my_rank) + object_tensor = object_tensor.to(current_device) broadcast(object_tensor, src=src, group=group) # Deserialize objects using their stored sizes. offset = 0 diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f304e37389b5..bf58b89fbb55 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -3140,6 +3140,13 @@ def validate_global_samples(local_num_samples): @require_backend({"nccl", "gloo"}) @require_n_gpus_for_nccl_backend(int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]) def test_allgather_object(self): + # Only set device for NCCL backend since it must use GPUs. + backend = os.environ["BACKEND"] + if backend == "nccl": + # Case where rank != GPU device. + next_rank = (self.rank + 1) % int(self.world_size) + torch.cuda.set_device(next_rank) + gather_objects = collectives_object_test_list output_gathered = [None for _ in range(dist.get_world_size())] dist.all_gather_object( @@ -3194,7 +3201,10 @@ class Bar: def test_nccl_gather_object_err(self): output_gathered = [None for _ in range(dist.get_world_size())] gather_on_rank = 0 + # Case where rank != GPU device. my_rank = dist.get_rank() + next_rank = (my_rank + 1) % dist.get_world_size() + torch.cuda.set_device(next_rank) with self.assertRaisesRegex( RuntimeError, "ProcessGroupNCCL does not support gather" ): @@ -3665,6 +3675,13 @@ def test_ddp_uneven_inputs_replicated_error(self): @require_backend({"nccl", "gloo"}) @require_n_gpus_for_nccl_backend(int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]) def test_broadcast_object_list(self): + # Only set device for NCCL backend since it must use GPUs. + backend = os.environ["BACKEND"] + if backend == "nccl": + # Case where rank != GPU device. + next_rank = (self.rank + 1) % int(self.world_size) + torch.cuda.set_device(next_rank) + src_rank = 0 objects = collectives_object_test_list if self.rank == src_rank else [None for _ in collectives_object_test_list] From 9fefb406283842b4c62228e13a9eb825804fdbe7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 28 Oct 2020 18:15:31 -0700 Subject: [PATCH 086/169] Fix signed-to-unsigned conversion warning (#46834) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46834 `next` is set to `-1`, presumably to avoid an "undefined variable" warning. However, Setting `next=-1` gives a signed-to-unsigned warning. In practice, the `-1` wraps around to `size_t::max`. So we set this to `size_t::max` from the get go to avoid all warnings. Test Plan: Standard pre-commit test rig. 
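
Editor's note (standalone sketch, not part of the patch): the conversion the summary describes, shown outside the whitelist code. The -Wsign-conversion flag name and the exact diagnostic text vary by compiler.

#include <cstddef>
#include <iostream>
#include <limits>

int main() {
  // Initializing an unsigned type from -1 is well defined (the value wraps to
  // the type's maximum), but compilers flag it under -Wsign-conversion.
  std::size_t from_minus_one = -1;

  // The same value spelled without the signed-to-unsigned conversion, which is
  // what the hunk below switches to.
  std::size_t explicit_max = std::numeric_limits<std::size_t>::max();

  std::cout << (from_minus_one == explicit_max) << '\n';  // prints 1
  return 0;
}
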
Reviewed By: xw285cornell Differential Revision: D24481068 fbshipit-source-id: 58b8a1b027a129fc4994c8593838a82b3991be22 --- aten/src/ATen/core/op_registration/op_whitelist.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/op_registration/op_whitelist.h b/aten/src/ATen/core/op_registration/op_whitelist.h index c8437e924a3c..26d5533244d7 100644 --- a/aten/src/ATen/core/op_registration/op_whitelist.h +++ b/aten/src/ATen/core/op_registration/op_whitelist.h @@ -36,7 +36,9 @@ namespace impl { // returns true iff whitelist contains item // op_whitelist_contains("a;bc;d", "bc") == true constexpr bool op_whitelist_contains(string_view whitelist, string_view item) { - size_t next = -1; + //Choose a really big value for next so that if something goes wrong + //this code will blow up in a hopefully detectable way. + size_t next = std::numeric_limits::max(); for (size_t cur = 0; cur <= whitelist.size(); cur = next) { next = whitelist.find(';', cur); if (next != string_view::npos) { From ad260ae7fd8e295015cef3a911431fb55752a63b Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 28 Oct 2020 19:25:43 -0700 Subject: [PATCH 087/169] Disable test_joing_running_workers for TSAN. (#46966) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46966 These tests had false positives in TSAN for modifying thread local variables: ``` WARNING: ThreadSanitizer: data race (pid=5364) Write of size 8 at 0x7b2c0004ff70 by thread T2: #0 free (libtools_build_sanitizers_tsan-py.so+0xde6ad) #1 __GI__dl_deallocate_tls Previous write of size 1 at 0x7b2c0004ff71 by thread T3: #0 at::GradMode::set_enabled(bool) caffe2/aten/src/ATen/core/grad_mode.cpp:20 (libcaffe2_ATen-core.so+0x40e013) #1 torch::autograd::set_grad_enabled(_object*, _object*) caffe2/torch/csrc/autograd/init.cpp:143 (libcaffe2__C_impl_cuda.so+0x115ef0e) #2 _PyMethodDef_RawFastCallKeywords Thread T3 (tid=5385, finished) created by main thread at: #0 pthread_create (libtools_build_sanitizers_tsan-py.so+0xc5a86) #1 PyThread_start_new_thread ``` ghstack-source-id: 115330433 Test Plan: waitforbuildbot Reviewed By: mrshenli Differential Revision: D24584411 fbshipit-source-id: e35f704dfcb7b161a13a4902beaf8b1e41ccd596 --- test/distributed/_pipeline/sync/test_worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/distributed/_pipeline/sync/test_worker.py b/test/distributed/_pipeline/sync/test_worker.py index 0247a71ba4a8..fb306b52fad9 100644 --- a/test/distributed/_pipeline/sync/test_worker.py +++ b/test/distributed/_pipeline/sync/test_worker.py @@ -13,6 +13,7 @@ from torch.distributed._pipeline.sync.microbatch import Batch from torch.distributed._pipeline.sync.stream import CPUStream from torch.distributed._pipeline.sync.worker import Task, spawn_workers +from torch.testing._internal.common_utils import TEST_WITH_TSAN class fake_device: @@ -24,6 +25,7 @@ class fake_device: index = None +@pytest.mark.skipif(TEST_WITH_TSAN, reason="False positive in TSAN") def test_join_running_workers(): count = 0 @@ -47,6 +49,7 @@ def call_in_worker(i, f): assert count == 10 +@pytest.mark.skipif(TEST_WITH_TSAN, reason="False positive in TSAN") def test_join_running_workers_with_exception(): class ExpectedException(Exception): pass From c2a3951352dec671e25526e291c4b44502608d73 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 28 Oct 2020 21:04:34 -0700 Subject: [PATCH 088/169] [quant][graphmode][fx] Remove inplace option for convert_fx (#46955) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/46955 Initially we were thinking of adding a `invalidate_quantized_float_parameters` option to free the memory of quantized floating parameters, but it turns out we will do module swap just like in eager mode for the modules that are quantized, so the old floating point module will not be referenced after quantization. therefore this feature is only needed for functionals, since most people are using quantization with modules we may not need this. we'll revisit after we find there is a need for this. Test Plan: Imported from OSS Reviewed By: supriyar Differential Revision: D24579400 fbshipit-source-id: fbb0e567405dc0604a2089fc001573affdade986 --- test/quantization/test_quantize_fx.py | 27 ---------------- torch/quantization/fx/quantize.py | 9 ++---- torch/quantization/quantize_fx.py | 14 ++++----- .../testing/_internal/common_quantization.py | 31 +++++++++---------- 4 files changed, 23 insertions(+), 58 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index f2911642cf3a..0f9b4b74f869 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -46,9 +46,6 @@ from torch.testing._internal.common_quantization import NodeSpec as ns -from torch.testing._internal.common_quantization import ( - test_only_eval_fn, -) from torch.testing import FileCheck import copy @@ -268,30 +265,6 @@ def forward(self, x): model_device = next(iter(model_devices)) self.assertEqual(model_device, device) - @skipIfNoFBGEMM - def test_inplace_option(self): - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 3, 3) - - def forward(self, x): - return self.conv(x) - - model = M().eval() - qconfig_dict = {'': default_qconfig} - prepared = prepare_fx(model, qconfig_dict) - test_only_eval_fn(model, self.img_data_2d) - non_inplace_model = convert_fx(prepared, inplace=True) - - prepared = prepare_fx(model, qconfig_dict) - test_only_eval_fn(model, self.img_data_2d) - inplace_model = convert_fx(prepared, inplace=True) - - non_inplace_res = non_inplace_model(self.img_data_2d[0][0]) - inplace_res = inplace_model(self.img_data_2d[0][0]) - self.assertEqual(non_inplace_res, inplace_res) - @skipIfNoFBGEMM def test_dict_output(self): """ Make sure quantization runs for models with dictionary output diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 5536f5ef58b0..732f2efdedfe 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -49,7 +49,6 @@ from collections import OrderedDict import warnings -import copy import re from typing import Optional @@ -562,7 +561,7 @@ def _run_weight_observers(self, observed): weight_observer_module() return - def _convert(self, model, inplace=False, debug=False, convert_custom_config_dict=None, is_standalone_module=False): + def _convert(self, model, debug=False, convert_custom_config_dict=None, is_standalone_module=False): """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
For standalone module: the inputs will be quantized by parent module, @@ -575,8 +574,6 @@ def _convert(self, model, inplace=False, debug=False, convert_custom_config_dict if convert_custom_config_dict is None: convert_custom_config_dict = {} self.restore_state(model) - if not inplace: - model = copy.deepcopy(model) # always run weight observers in the top level forward method # for dynamic quant ops or weight only quant ops self._run_weight_observers(model) @@ -824,8 +821,8 @@ def load_arg(a): quantized = GraphModule(quantized_root, folded_graph) return quantized - def convert(self, model, inplace=False, debug=False, convert_custom_config_dict=None, is_standalone_module=False): - quantized = self._convert(model, inplace, debug, convert_custom_config_dict, is_standalone_module) + def convert(self, model, debug=False, convert_custom_config_dict=None, is_standalone_module=False): + quantized = self._convert(model, debug, convert_custom_config_dict, is_standalone_module) if not debug: quantized = self._fold_weight(quantized) return quantized diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 1794c3ac5a2d..5543c99298f5 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -280,19 +280,17 @@ def train_loop(model, train_data): 'train mode' return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) -def _convert_fx(graph_module, inplace, debug, convert_custom_config_dict=None, is_standalone_module=False): +def _convert_fx(graph_module, debug, convert_custom_config_dict=None, is_standalone_module=False): """ `is_standalone_module`: see docs in :func:`~torch.quantization.prepare_standalone_module_fx` """ _check_is_graph_module(graph_module) quantizer = Quantizer() - return quantizer.convert(graph_module, inplace, debug, convert_custom_config_dict, is_standalone_module) + return quantizer.convert(graph_module, debug, convert_custom_config_dict, is_standalone_module) -def convert_fx(graph_module, inplace=False, debug=False, convert_custom_config_dict=None): +def convert_fx(graph_module, debug=False, convert_custom_config_dict=None): r""" Convert a calibrated or trained model to a quantized model Args: `graph_module`: A prepared and calibrated/trained model (GraphModule) - `inplace`: flag for carry out model transformations in-place, - the original module is mutated `debug`: flag for producing a debug friendly model (preserve weight attribute) `convert_custom_config_dict`: dictionary for custom configurations for convert function: convert_custom_config_dict = { @@ -334,9 +332,9 @@ def convert_fx(graph_module, inplace=False, debug=False, convert_custom_config_d ``` """ torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_fx") - return _convert_fx(graph_module, inplace, debug, convert_custom_config_dict) + return _convert_fx(graph_module, debug, convert_custom_config_dict) -def _convert_standalone_module_fx(graph_module, inplace=False, debug=False, convert_custom_config_dict=None): +def _convert_standalone_module_fx(graph_module, debug=False, convert_custom_config_dict=None): r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model @@ -348,4 +346,4 @@ def _convert_standalone_module_fx(graph_module, inplace=False, debug=False, conv and produces quantized output (if needed). 
""" torch._C._log_api_usage_once("quantization_api.quantize_fx._convert_standalone_module_fx") - return _convert_fx(graph_module, inplace, debug, convert_custom_config_dict, is_standalone_module=True) + return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 440e59cf9174..24db8f634e86 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -18,8 +18,6 @@ get_default_qconfig_propagation_list, get_default_qat_module_mappings, ) -# symbolic trace -from torch.fx import symbolic_trace # graph mode quantization based on fx from torch.quantization import ( @@ -628,33 +626,33 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, # TODO: make img_data a single example instead of a list if type(inputs) == list: inputs = inputs[0] - if custom_qconfig is None: - if quant_type == QuantType.QAT: - qconfig = get_default_qat_qconfig(torch.backends.quantized.engine) - elif quant_type == QuantType.STATIC: - qconfig = get_default_qconfig(torch.backends.quantized.engine) - else: - qconfig = default_dynamic_qconfig - else: - qconfig = custom_qconfig if quant_type == QuantType.QAT: + qconfig = get_default_qat_qconfig(torch.backends.quantized.engine) model.train() + elif quant_type == QuantType.STATIC: + qconfig = get_default_qconfig(torch.backends.quantized.engine) + model.eval() else: + qconfig = default_dynamic_qconfig model.eval() - original = symbolic_trace(model) + # overwrite qconfig with custom_qconfig + if custom_qconfig is not None: + qconfig = custom_qconfig + if quant_type == QuantType.QAT: prepare = prepare_qat_fx else: prepare = prepare_fx qconfig_dict = {'': qconfig} - prepared = prepare(original, qconfig_dict) + prepared = prepare(model, qconfig_dict) if not quant_type == QuantType.DYNAMIC: prepared(*inputs) + prepared_copy = copy.deepcopy(prepared) qgraph = convert_fx(prepared) - qgraph_debug = convert_fx(prepared, debug=True) + qgraph_debug = convert_fx(prepared_copy, debug=True) result = qgraph(*inputs) result_debug = qgraph_debug(*inputs) @@ -662,10 +660,9 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, if print_debug_info: print() print('quant type:', quant_type) - print('origianl graph module:', type(model)) - self.printGraphModule(original) + print('original model:', model) print() - print('quantized graph module:', type(qgraph_to_check)) + print('quantized model:', qgraph_to_check) self.printGraphModule(qgraph_to_check) print() self.checkGraphModuleNodes( From a86b3438eb1885d503ce091186f6146b0e098c5c Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Wed, 28 Oct 2020 21:07:45 -0700 Subject: [PATCH 089/169] add support for different memory sizes on size_based_partition (#46919) Summary: WIP: add support for different memory sizes on size_based_partition, so the size_based_partition could support different logical devices with different memory sizes. Compared to the original size_based_partition, the new one also supports partition to logical device mapping. Multiple partitions can be mapped into one device if the memory size is allowed. A test unit test_different_size_partition is also added. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46919 Reviewed By: gcatron, VitalyFedyunin Differential Revision: D24603511 Pulled By: scottxu0730 fbshipit-source-id: 1ba37338ae054ad846b425fbb7e631d3b6c500b6 --- test/test_fx_experimental.py | 31 ++-- torch/fx/experimental/Partitioner.py | 232 +++++++++++++++++---------- 2 files changed, 165 insertions(+), 98 deletions(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 34065987a4b2..434c678c53dc 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -29,6 +29,7 @@ def forward(self, a, b): module_with_submodules = ret.module_with_submodules dag = ret.dag self.assertEqual(traced(a, b), module_with_submodules(a, b)) + assert dag.nodes[0].logical_device_ids == [0] def test_size_based_partition(self): class TestModule(torch.nn.Module): @@ -62,40 +63,42 @@ def forward(self, a, b): module_with_submodules = ret.module_with_submodules dag = ret.dag self.assertEqual(traced(a, b), module_with_submodules(a, b)) - assert len(module_with_submodules.graph.nodes) == 7 + for i, node in enumerate(dag.nodes): + assert node.logical_device_ids == [i] def test_partition_combining(self): class TestModule(torch.nn.Module): def __init__(self): super().__init__() - self.linear_0 = torch.nn.Linear(4, 4) + self.linear = torch.nn.Linear(4, 4) - def forward(self, a, b): + def forward(self, a): + b = torch.rand(4) add_1 = a + b - c = self.linear_0(a) - add_2 = c + add_1 - return add_2 + linear_1 = self.linear(add_1) + add_2 = torch.rand(4) + a + add_3 = add_2 + linear_1 + return add_3 m = TestModule() traced = symbolic_trace(m) a = torch.rand(4) - b = torch.rand(4) GraphManipulation.get_size_of_all_nodes( traced, - [a, b] + [a] ) partitioner = Partitioner() devices = [ - Device('dev_0', 125, 0), - Device('dev_1', 125, 1), - Device('dev_2', 125, 2) + Device('dev_0', 120, 0), + Device('dev_1', 144, 1) ] - partitioner_config = PartitionerConfig(devices) + partitioner_config = PartitionerConfig(devices, is_sparse_nn=False) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules dag = ret.dag - self.assertEqual(traced(a, b), module_with_submodules(a, b)) - assert len(module_with_submodules.graph.nodes) == 5 + self.assertEqual(traced(a), module_with_submodules(a)) + for i, node in enumerate(dag.nodes): + assert node.logical_device_ids == [i] def test_sparse_nn_partition(self): class MyRecommendationModule(torch.nn.Module): diff --git a/torch/fx/experimental/Partitioner.py b/torch/fx/experimental/Partitioner.py index fadbac42cdd0..1a4887b666fb 100644 --- a/torch/fx/experimental/Partitioner.py +++ b/torch/fx/experimental/Partitioner.py @@ -106,6 +106,13 @@ def get_extra_size_of(node: Node, nodes: Set[Node]) -> int: raise RuntimeError('node has no size_bytes attr') return total_size_of_input_nodes +def calculate_mem_bytes_needed(p1, p2): + nodes = p1.nodes.union(p2.nodes) + mem_bytes_needed = 0 + for node in nodes: + mem_bytes_needed += get_extra_size_of(node, nodes) + return mem_bytes_needed + class Partitioner: """A graph module may not fit into one device. 
Partitioner class helps cut one graph into subgraphs (partitions), @@ -147,14 +154,17 @@ def partition_graph( if node.op == 'output': break total_size_of_graph += node.size_bytes.total_size - if total_size_of_graph <= available_mem_bytes: + device_with_max_mem = max(self.devices, key=lambda d: d.available_mem_bytes) + if total_size_of_graph <= device_with_max_mem.available_mem_bytes: self.find_single_partition(total_size_of_graph) - elif total_size_of_graph > len(self.devices) * available_mem_bytes: + elif total_size_of_graph > sum([d.available_mem_bytes for d in self.devices]): raise RuntimeError('Devices have no enough memory for the module') else: - if not all(device.available_mem_bytes == available_mem_bytes for device in self.devices): - raise RuntimeError('All devices must have same memory size!') if partitioner_config.is_sparse_nn: + if not all(device.available_mem_bytes == available_mem_bytes for device in self.devices): + raise RuntimeError('All devices must have same memory size!') + # sparse_nn_partition only support same memory size + # TODO: add different size support for sparse_nn_partition self.sparse_nn_partition(available_mem_bytes) else: self.size_based_partition(available_mem_bytes) @@ -174,57 +184,142 @@ def find_single_partition(self, total_size_of_graph) -> None: self.node_to_partitions[node] = partition_0.partition_id partition_0.nodes.add(node) partition_0.used_mem_bytes = total_size_of_graph - partition_0.logical_device_ids = [self.devices[0].logical_id] + partition_0.logical_device_ids = [0] return def size_based_partition(self, available_mem_bytes: int) -> None: - """This method partitions the graph based on memory size. - We assume all devices have the same memory size. + """This method is to partition the graph based on memory size. + It uses greedy approach. The result may not be the best. The basic idea is: - First, create a new partition. - Then traverse the graph through self.graph_module.graph.nodes - The traversal only focuses on op nodes - (call_function, call_module, call_method). - The placeholder nodes (placeholder) and constant nodes (get_attr) are skipped. - A placeholder (placeholder) or a constant (get_attr) - is added into a partition when it is a input node for a op node. - From one op node to another, check if a op node and its input nodes - can fit into the current partition. - If the current partition is full, create a new one - and continue traversing op nodes. - Then through self.combine_partition_based_on_size(), - partitions will be combined to keep - as less partitions as possible. + Step 1: + Find a device which has enough memory to fit the first node, create a empty partition + with the size of that device. + Then keep adding the following nodes into the partition until the partition is full. + Step 2: + Repeat Step 1 until no device left + Step 3: + If some nodes are left, create a partition for each left node (single node partition). + Try to combine those single node partitions with the non single node partitions + from Step 1 and Step 2. + If two partitions cannot be combined, but could fit into the same logical device, + Two partitions use the same logical device. """ - # Create the first partition + def find_device_based_on_size(node) -> Device: + """Given a node, this function is to find a logical device + that could fit the node. 
+ """ + mem_size_needed = get_extra_size_of(node, set()) + device = Device('', -1, -1) + for d in self.devices: + if d not in occupied_devices and d.available_mem_bytes >= mem_size_needed: + device = d + break + if device.available_mem_bytes < 0: + raise RuntimeError(str(node) + 'is too large to fit any device') + occupied_devices.append(device) + return device + + def create_single_node_partition(node): + """Create a partition for a single node + """ + partition = self.create_partition() + total_size_needed = get_extra_size_of(node, set()) + partition.nodes.add(node) + partition.used_mem_bytes = total_size_needed + single_node_partitions.append(partition) + + # Track all single node partitions in Step 3 + single_node_partitions: List[Partition] = [] + # Track all non single node partitions in Step 1 and Step 2 + non_single_node_partitions: List[Partition] = [] + # Track partition and its left mem size + partition_to_left_mem_bytes: Dict[Partition, int] = {} + # Track all the devices that have been used + occupied_devices: List[Device] = [] partition = self.create_partition() - # Track the used mem for the current partition for node in self.graph_module.graph.nodes: if node.op in {'call_module', 'call_method', 'call_function'}: - total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) - # The current node with its inputs cannot fit into the current partition - if total_size_of_input_nodes + partition.used_mem_bytes > available_mem_bytes: - partition = self.create_partition() + # Check if there are devices left + if len(self.partitions) <= len(self.devices): total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) - # The current node may be too large to fit into a whole new partition - if total_size_of_input_nodes + partition.used_mem_bytes > available_mem_bytes: - raise RuntimeError(node.target + 'is too large to fit into a device') - # Add the current node into the current partition - partition.nodes.add(node) - partition.used_mem_bytes += total_size_of_input_nodes - # Find parent partitions and child partitions for each partition. 
+ # Check if the current partition is the very first partition + if partition.used_mem_bytes == 0: + # Find a device to fit the first node, return available mem size + device = find_device_based_on_size(node) + occupied_devices.append(device) + # Update partition and its left mem size + partition_to_left_mem_bytes[partition] = device.available_mem_bytes + # Update available mem for the current partitio + partition.logical_device_ids.append(device.logical_id) + else: + # The current partition is not the first partition + # Check if the current node can fit into this partition + if partition_to_left_mem_bytes[partition] < total_size_of_input_nodes: + # Check if no device is left + if len(self.partitions) == len(self.devices): + # No device left, all the partitions before are non single node partitions + non_single_node_partitions = self.partitions[:] + # Create the first single node partition for the current node + create_single_node_partition(node) + continue + # Some devices are still left + device = find_device_based_on_size(node) + partition = self.create_partition() + total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) + partition_to_left_mem_bytes[partition] = device.available_mem_bytes + partition.logical_device_ids.append(device.logical_id) + partition.nodes.add(node) + partition_to_left_mem_bytes[partition] -= total_size_of_input_nodes + partition.used_mem_bytes += total_size_of_input_nodes + # No device left, create single node partitions + else: + create_single_node_partition(node) self.set_parents_and_children() - # Combine small partitions - self.combine_partitions_based_on_size(self.partitions[:], available_mem_bytes) - # Reassign partition ids and update self.node_to_partitions. + # Check if having single node partitions + # If not, partition is done + if len(single_node_partitions) != 0: + # Going through all single node partitions, + # see if it can be combined with non single node partitions + # or at least fit into a logical device as a standaline partition + while single_node_partitions: + self.get_bfs_level_partition() + # Pick a single node partition + p1 = single_node_partitions.pop(0) + # Set up a flag + find_device = False + # Going through all non single partitions + # and find a device to fit p1 + for p2 in non_single_node_partitions: + # Calculate how many bytes are needed if combining p1 and p2 + mem_size_needed = calculate_mem_bytes_needed(p1, p2) + # Get the available size of p2 + available_mem_bytes = p2.used_mem_bytes + partition_to_left_mem_bytes[p2] + if mem_size_needed <= available_mem_bytes: + # Two partitions can be fit on the same device, + # check if combining them to be one partition + if abs(p1.bfs_level - p2.bfs_level) <= 1: + # Combining p1 and p2 into p0 + p0 = self.combine_two_partitions(p1, p2) + self.set_parents_and_children() + p0.logical_device_ids = p2.logical_device_ids + # Remove p2 from non_single_node_partitions + non_single_node_partitions.remove(p2) + # Add p0 to non_single_partitions + non_single_node_partitions.append(p0) + # Update partition_to_left_mem_bytes + partition_to_left_mem_bytes[p0] = available_mem_bytes - mem_size_needed + del partition_to_left_mem_bytes[p2] + else: + # Cannot combine two partitions, + # but two partitions can fit into p2's device + p1.logical_device_ids = p2.logical_device_ids + # Update partition_to_left_mem_bytes for p2 + partition_to_left_mem_bytes[p2] = available_mem_bytes - mem_size_needed + find_device = True + break + if not find_device: + raise RuntimeError('Lack of Devices') 
self.reorganize_partitions() - # Check if devices are enough for all partitions - if len(self.partitions) > len(self.devices): - msg = 'Need ' + str(len(self.partitions)) + ' devices, but only ' \ - + str(len(self.devices)) + ' provided' - raise RuntimeError(msg) - for i, partition in enumerate(self.partitions): - partition.logical_device_ids = [self.devices[i].logical_id] return def do_partition(self) -> GraphModule: @@ -310,13 +405,6 @@ def find_partition_to_combine_based_on_size( ) -> Tuple[bool, List[Partition]]: """step 1 in self.combine_partition_based_on_size()""" - def calculate_mem_bytes_needed(p1, p2): - nodes = p1.nodes.union(p2.nodes) - mem_bytes_needed = 0 - for node in nodes: - mem_bytes_needed += get_extra_size_of(node, nodes) - return mem_bytes_needed - find_combination = False smallest_partition = sorted_partitions.pop(0) for p in sorted_partitions[::-1]: @@ -336,42 +424,20 @@ def combine_two_partitions( self, partition_0: Partition, partition_1: Partition, - ) -> None: + ) -> Partition: """Given two partitions, combine them into a new one and remove the previous two partitions from self.partitions """ partition = self.create_partition() partition.nodes = partition_0.nodes.union(partition_1.nodes) - partition.parents = partition_0.parents.union(partition_1.parents) - partition.children = partition_0.children.union(partition_1.children) - partition.recalculate_mem_size() - partition.bfs_level = max(partition_0.bfs_level, partition_1.bfs_level) - if partition_0 in partition.children: - partition.children.remove(partition_0) - if partition_0 in partition.parents: - partition.parents.remove(partition_0) - if partition_1 in partition.children: - partition.children.remove(partition_1) - if partition_1 in partition.parents: - partition.parents.remove(partition_1) - # Replace partition_0 and partition_1 with the new partition in children and parents - for p in partition.parents: - if partition_0 in p.children: - p.children.remove(partition_0) - p.children.add(partition) - if partition_1 in p.children: - p.children.remove(partition_1) - p.children.add(partition) - for p in partition.children: - if partition_0 in p.parents: - p.parents.remove(partition_0) - p.parents.add(partition) - if partition_1 in p.parents: - p.parents.remove(partition_1) - p.parents.add(partition_1) self.partitions.remove(partition_0) self.partitions.remove(partition_1) - return + # reset parents and children for all partitions + for partition in self.partitions: + partition.parents = set() + partition.children = set() + self.set_parents_and_children() + return partition def set_parents_and_children(self) -> None: # Go through all nodes in a partition. 
@@ -388,10 +454,8 @@ def set_parents_and_children(self) -> None: # that partition is not the child of the current partition for p in self.partitions: if p != partition and n in p.nodes and node not in p.nodes: - if p not in partition.children: - partition.children.add(p) - if partition not in p.parents: - p.parents.add(partition) + partition.children.add(p) + p.parents.add(partition) return def reorganize_partitions(self) -> None: From 79474a192873df89883005e50df8a41de18d8594 Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Wed, 28 Oct 2020 21:21:12 -0700 Subject: [PATCH 090/169] [pytorch] simplify tensor options logic in pybinding codegen (#46976) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46976 Technically, it's not semantic preserving, e.g.: emition of 'requires_grad' is no longer gated by 'has_tensor_return' - there is no guarantee that is_like_or_new_function should all have tensor return. But the output is identical so there might be some invariant - could also add assertion to fail loudly when it's broken. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D24589211 Pulled By: ljk53 fbshipit-source-id: 47c7e43b080e4e67a526fde1a8a53aae99df4432 --- tools/codegen/api/python.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index f04a648aade8..dae0a2c000ff 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -515,42 +515,31 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) name: str = cpp.name(f.func) - has_options_arg = has_tensor_options(f) - - is_like_function = name.endswith('_like') or f.category_override == 'like' - is_new_function = name.startswith('new_') or f.category_override == 'new' - is_factory_function = has_tensor_return and not has_tensor_input_arg \ - or f.category_override == 'factory' - is_like_or_new_function_with_options = \ - (is_like_function or is_new_function) and has_options_arg + is_factory_function = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) + is_like_or_new_function = f.category_override in ('new', 'like') or name.startswith('new_') or name.endswith('_like') tensor_options_args: List[PythonArgument] = [] - if is_factory_function or has_options_arg: + if is_factory_function or is_like_or_new_function: tensor_options_args.append(PythonArgument( name='dtype', cpp_type_str='const ScalarType &', type=BaseType(BaseTy.ScalarType), default=_dtype_default_type_hack(name), - default_init='self.scalar_type()' - if is_like_or_new_function_with_options else None, + default_init='self.scalar_type()' if is_like_or_new_function else None, )) - - if is_factory_function or is_like_or_new_function_with_options: tensor_options_args.append(PythonArgument( name='layout', cpp_type_str='c10::optional', type=BaseType(BaseTy.Layout), default='torch.strided', - default_init='layout_from_backend(self.options().backend())' - if is_like_or_new_function_with_options else None, + default_init='layout_from_backend(self.options().backend())' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( name='device', cpp_type_str='const Device &', type=BaseType(BaseTy.Device), default='None', - default_init='self.device()' - if is_like_or_new_function_with_options else None, + default_init='self.device()' if is_like_or_new_function else None, )) 
tensor_options_args.append(PythonArgument( name='pin_memory', @@ -559,8 +548,6 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: default='False', default_init=None, )) - - if has_tensor_return and (is_factory_function or is_like_function or is_new_function): tensor_options_args.append(PythonArgument( name='requires_grad', cpp_type_str='bool', From 9d23fd5c00c7524dcb27a0e285322ea887b0766f Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Wed, 28 Oct 2020 21:21:12 -0700 Subject: [PATCH 091/169] [pytorch] get rid of cpp_type_str from pybind codegen (#46977) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46977 Clean up a few TODOs in the new python binding codegen. Get rid of the _simple_type() hack and the uses of cpp_type_str. Now python argument type strings and PythonArgParser unpacking methods are directly generated from the original Type model. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D24589209 Pulled By: ljk53 fbshipit-source-id: b2a6c3911d58eae49c031d319c8ea6f804e2cfde --- tools/autograd/gen_python_functions.py | 1 - tools/codegen/api/python.py | 304 +++++++++++-------------- 2 files changed, 136 insertions(+), 169 deletions(-) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 2e822e68a998..f3f2eb3033f2 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -871,7 +871,6 @@ def go(f: NativeFunction) -> PythonSignature: src_args: Dict[str, PythonArgument] = {a.name: PythonArgument( name=a.name, type=a.type, - cpp_type_str=a.cpp_type_str, default=None, default_init=None, ) for a in itertools.chain(python_sig.input_args, python_sig.input_kwargs)} diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index dae0a2c000ff..bb02407004ab 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -1,5 +1,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp +import tools.codegen.local as local from tools.codegen.gen import pythonify_default from tools.codegen.model import * @@ -175,11 +176,6 @@ class PythonArgument: name: str type: Type - - # Consistent with 'type' for most cases, except for some TensorOptions fields - # which are hardcoded (see 'signature()' method). - cpp_type_str: str - default: Optional[str] # Used to generate the default init expr for some PythonArgParser outputs, e.g.: @@ -193,29 +189,15 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. def argument_str(self, *, method: bool = False) -> str: - name = self.name - typename = _simple_type(self.cpp_type_str) - - # [old codegen] TODO: remove this and make optional types in simple_type - # to be consistent across tensor and other types after make Tensor? be - # optional instead of undefined - if self.type.is_nullable() and '?' not in typename: - typename = f'{typename}?' + type_str = argument_type_str(self.type) # s/self/input/ outside method bindings # [old codegen] TODO: remove this? doesn't rename in codegen, it's just # for the parse string - if name == 'self' and typename == 'Tensor' and not method: + name = self.name + if name == 'self' and type_str == 'Tensor' and not method: name = 'input' - # add list size annotation - size = self.size - if size is not None: - if typename.endswith('?'): - typename = f'{typename[:-1]}[{size}]?' 
- else: - typename = f'{typename}[{size}]' - # add default if self.default is not None: default = { @@ -223,15 +205,9 @@ def argument_str(self, *, method: bool = False) -> str: 'c10::nullopt': 'None', '{}': 'None', }.get(self.default, self.default) - return f'{typename} {name}={default}' + return f'{type_str} {name}={default}' else: - return f'{typename} {name}' - - @property - def size(self) -> Optional[int]: - l = self.type.is_list_like() - return l.size \ - if l is not None and l.size is not None and str(l.elem) != 'bool' else None + return f'{type_str} {name}' @dataclass(frozen=True) class PythonOutArgument(PythonArgument): @@ -252,7 +228,6 @@ def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgu return PythonOutArgument( name=outputs[0].name, type=outputs[0].type, - cpp_type_str=outputs[0].cpp_type_str, default='None', default_init=None, outputs=outputs, @@ -263,7 +238,6 @@ def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgu return PythonOutArgument( name='out', type=ListType(BaseType(BaseTy.Tensor), size), - cpp_type_str='TensorList', default='None', default_init=None, outputs=outputs, @@ -368,7 +342,6 @@ def signature_str(self, *, skip_outputs: bool = False) -> str: class DispatchLambdaArgument: name: str type_str: str - cpp_type_str: str is_out_arg: bool # To pass PyObjects arguments to C++ function (via the lambda wrapper), @@ -424,28 +397,6 @@ class DispatchLambdaArgumentExprs: # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# The original simple_type is derived from the 'type' field in Declaration.yaml, -# which is generated from the C++ argument type, following some seemingly -# artificial rules: -# -# Concrete C++ types are preferred in most cases, e.g.: -# 'IntArrayRef' instead of 'int[]' -# 'int64_t' instead of 'int' -# -# Constant/Reference annotation and optional field are handled specially, e.g.: -# 'ScalarType?' instead of 'c10::optional' -# 'Tensor' instead of 'const Tensor &' / 'Tensor &' -# -# TODO: This needs to be consistent with python_arg_parser - can we simplify it? -def _simple_type(cpp_type_str: str) -> str: - simple_type = cpp_type_str.replace(' &', '').replace('const ', '') - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - typename = opt_match.group(1) - # HACK: 'Layout?' needs to be hardcoded to 'Layout'! - simple_type = f'{typename}?' if typename != 'Layout' else 'Layout' - return simple_type - def _cpp_signature(f: NativeFunction, *, method: bool = False) -> cpp.CppSignature: return CppSignatureGroup.from_schema(f.func, method=method).signature @@ -459,6 +410,49 @@ def has_tensor_options(f: NativeFunction) -> bool: # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +def argument_type_str(t: Type) -> str: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + return 'Tensor' + elif t.name == BaseTy.int: + return 'int64_t' + elif t.name == BaseTy.float: + return 'double' + elif t.name == BaseTy.str: + return 'std::string' + elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar, + BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage, + BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat, + BaseTy.Dimname, BaseTy.Stream, BaseTy.ConstQuantizerPtr]: + # These python schema type names line up with their function schema names + return t.name.name + + elif isinstance(t, OptionalType): + elem = argument_type_str(t.elem) + if elem == 'Layout': + # TODO: fix this special case in PythonArgParser? 
+ return 'Layout' + else: + return f'{elem}?' + + elif isinstance(t, ListType): + if str(t.elem) == 'bool': + assert t.size is not None + return f'std::array' + elif str(t.elem) == 'int': + return f'IntArrayRef[{t.size}]' if t.size is not None else 'IntArrayRef' + elif str(t.elem) == 'Tensor': + return f'TensorList[{t.size}]' if t.size is not None else 'TensorList' + elif str(t.elem) == 'Tensor?': + # TODO: clone the old codegen behavior but does it make sense? + return 'TensorList?' + elif str(t.elem) == 'Dimname': + return f'DimnameList[{t.size}]' if t.size is not None else 'DimnameList' + elem = argument_type_str(t.elem) + return f'ArrayRef<{elem}>' + + raise RuntimeError(f'unrecognized type {repr(t)}') + def argument(cpp_arg: CppArgument) -> PythonArgument: a = cpp_arg.argument if not isinstance(a, Argument): @@ -468,7 +462,6 @@ def argument(cpp_arg: CppArgument) -> PythonArgument: return PythonArgument( name=a.name, type=a.type, - cpp_type_str=cpp_arg.type, # TODO: directly translate a.default to python default default=str(pythonify_default(cpp.default_expr(a.default, a.type))) if a.default is not None else None, @@ -522,35 +515,30 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: if is_factory_function or is_like_or_new_function: tensor_options_args.append(PythonArgument( name='dtype', - cpp_type_str='const ScalarType &', type=BaseType(BaseTy.ScalarType), default=_dtype_default_type_hack(name), default_init='self.scalar_type()' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( name='layout', - cpp_type_str='c10::optional', - type=BaseType(BaseTy.Layout), + type=OptionalType(BaseType(BaseTy.Layout)), default='torch.strided', default_init='layout_from_backend(self.options().backend())' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( name='device', - cpp_type_str='const Device &', type=BaseType(BaseTy.Device), default='None', default_init='self.device()' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( name='pin_memory', - cpp_type_str='bool', type=BaseType(BaseTy.bool), default='False', default_init=None, )) tensor_options_args.append(PythonArgument( name='requires_grad', - cpp_type_str='bool', type=BaseType(BaseTy.bool), default='False', default_init=None, @@ -647,7 +635,6 @@ def dispatch_lambda_arg(cpp_arg: CppArgument) -> DispatchLambdaArgument: return DispatchLambdaArgument( name=cpp_arg.name, type_str=type_str, - cpp_type_str=cpp_arg.type, is_out_arg=is_out_arg, ) @@ -737,107 +724,91 @@ def cpp_dispatch_exprs(f: NativeFunction, method: bool, *, # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# TODO: should emit these unpack methods directly from Type to avoid -# indirect translation via cpp_type_str. 
-UNPACK_METHODS = { - 'const Tensor &': 'tensor', - 'Tensor &': 'tensor', - 'Stream': 'stream', - 'c10::optional': 'optionalTensor', - 'const c10::optional&': 'optionalTensor', - 'c10::optional': 'generator', - 'Storage': 'storage', - 'Storage &': 'storage', - 'const ScalarType &': 'scalartype', - 'const Device &': 'device', - 'c10::optional': 'toDimnameListOptional', - 'c10::optional': 'scalartypeOptional', - 'c10::optional': 'layoutOptional', - 'c10::optional': 'memoryformatOptional', - 'c10::optional': 'scalarOptional', - 'c10::optional': 'intlistOptional', - 'c10::optional': 'toInt64Optional', - 'c10::optional': 'toBoolOptional', - 'c10::optional': 'toDoubleOptional', - 'c10::optional>': 'doublelistOptional', - 'ArrayRef': 'doublelist', - 'IntArrayRef': 'intlist', - 'Scalar': 'scalar', - 'ScalarType': 'scalartype', - 'Dimname': 'dimname', - 'DimnameList': 'dimnamelist', - 'TensorList': 'tensorlist', - 'int64_t': 'toInt64', - 'bool': 'toBool', - 'double': 'toDouble', - 'std::string': 'string', - 'c10::optional': 'stringOptional', -} - -UNPACK_WITH_SIZE_METHODS = { - 'TensorList': 'tensorlist_n<{}>', - 'DimnameList': 'dimnamelist', - 'IntArrayRef': 'intlist', - 'c10::optional': 'intlistOptional', -} - -UNPACK_WITH_DEFAULT_METHODS = { - 'const ScalarType &': 'scalartypeWithDefault', - 'const Device &': 'deviceWithDefault', - 'c10::optional': 'layoutWithDefault', -} +# We explicitly enumerate the PythonArgParser unpacking methods for all +# supported types. This might be more verbose than necessary, partially +# because of the irregularity of unpacking method naming, partially +# because we want to mimic the old codegen behavior - to reject +# unexpected and/or unsupported cases which the old codegen rejects. +# For certain cases it is intentionally more restrictive than necessary, +# e.g.: it doesn't accepts doublelist with definite size. 
+def arg_parser_unpack_method(t: Type, has_default: bool) -> str: + if has_default and str(t) not in ('ScalarType', 'Device', 'Layout?'): + raise RuntimeError(f'type \'{t}\' does not supported unpacking with default') + + if isinstance(t, BaseType): + if t.name in [BaseTy.Tensor, BaseTy.Stream, BaseTy.Storage, + BaseTy.Scalar, BaseTy.Dimname]: + # These unpack methods line up with their schema names + return t.name.name.lower() + elif t.name == BaseTy.ScalarType: + return 'scalartypeWithDefault' if has_default else 'scalartype' + elif t.name == BaseTy.Device: + return 'deviceWithDefault' if has_default else 'device' + elif t.name == BaseTy.int: + return 'toInt64' + elif t.name == BaseTy.bool: + return 'toBool' + elif t.name == BaseTy.float: + return 'toDouble' + elif t.name == BaseTy.str: + return 'string' + + elif isinstance(t, OptionalType): + if str(t.elem) == 'Tensor': + if local.use_c10_dispatcher().dispatcher_uses_new_style(): + return 'optionalTensor' + else: + return 'tensor' + + elif isinstance(t.elem, BaseType): + if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, + BaseTy.int, BaseTy.bool, + BaseTy.float, BaseTy.str]: + # Regular cases: append 'Optional' to elem's unpacking method + return arg_parser_unpack_method(t.elem, False) + 'Optional' + elif t.elem.name == BaseTy.MemoryFormat: + return 'memoryformatOptional' + elif t.elem.name == BaseTy.Generator: + return 'generator' + elif t.elem.name == BaseTy.Layout: + return 'layoutWithDefault' if has_default else 'layoutOptional' + + elif isinstance(t.elem, ListType): + if str(t.elem.elem) == 'int': + # accept definite size + return 'intlistOptional' + elif str(t.elem) == 'float[]': + return 'doublelistOptional' + elif str(t.elem) == 'Dimname[]': + return 'toDimnameListOptional' + + elif isinstance(t, ListType): + if str(t.elem) == 'Tensor' or str(t.elem) == 'Tensor?': + # accept and use definite size + if t.size is not None: + return f'tensorlist_n<{t.size}>' + else: + return 'tensorlist' + elif str(t.elem) == 'Dimname': + # accept definite size + return 'dimnamelist' + elif str(t.elem) == 'int': + # accept definite size + return 'intlist' + elif str(t) == 'float[]': + return 'doublelist' + + raise RuntimeError(f'type \'{t}\' is not supported by PythonArgParser') # Return RHS expression for python argument using PythonArgParser output. # e.g. for arg name 'foo', arg type 'bool', arg_index = 2, returns '_r.toBool(2)' def arg_parser_output_expr( - arg_index: int, a: PythonArgument, la: Optional[DispatchLambdaArgument] + arg_index: int, a: PythonArgument ) -> PythonArgParserOutputExpr: - # The same python signature (and python schema string) is usually - # associated with two aten C++ functions: the base version and the - # out-place variant. Usually the two functions have the same set of - # arguments - of course, except for the output arguments. But in some - # cases they might have slightly different C++ argument types - - # affected by the 'use_c10_dispatcher' state. - # - # More specially, 'Tensor?' type can be translated into - # either 'const c10::optional&' or 'const Tensor &'. - # Unfortunately, this difference can affect how we should access arg - # parser output. The former expects '_r.optionalTensor(i)' while the - # latter expects '_r.tensor(i)'. - # - # Because of this subtle difference, we cannot solely use the shared - # python signature to determine the RHS expr for both C++ variants. 
- # We could create and use each C++ variant's own python signature, - # but we have to fix the argument index difference between the two - # python signatures like the old codegen does - and it feels wrong as - # technically there is only one shared python signature! - # - # So here we pass in the lambda wrapper's argument and use it to - # decide what PythonArgParser unpack method to use. - # - # TODO: this seems too complicated - maybe we can simplify after full - # c10 dispatch migration? - typename = la.cpp_type_str \ - if a.name != 'out' and la is not None else a.cpp_type_str - - if a.default_init is not None: - # Note: only introduced in tensor_options_args - if typename not in UNPACK_WITH_DEFAULT_METHODS: - raise RuntimeError( - f'type \'{typename}\' is not supported in default_init') - unpack_with_default = UNPACK_WITH_DEFAULT_METHODS[typename] - expr = f'_r.{unpack_with_default}({arg_index}, {a.default_init})' - elif a.size is not None: - if typename not in UNPACK_WITH_SIZE_METHODS: - raise RuntimeError( - f'type \'{typename}\' with definite size ({a.size}) is not supported') - unpack_with_size = UNPACK_WITH_SIZE_METHODS[typename].format(a.size) - expr = f'_r.{unpack_with_size}({arg_index})' - else: - unpack = UNPACK_METHODS.get(typename) - if unpack is None: - raise RuntimeError(f'type \'{typename}\' is not supported') - expr = f'_r.{unpack}({arg_index})' + has_default = a.default_init is not None + unpack_method = arg_parser_unpack_method(a.type, has_default) + default = f', {a.default_init}' if has_default else '' + expr = f'_r.{unpack_method}({arg_index}{default})' return PythonArgParserOutputExpr( name=a.name, @@ -850,17 +821,14 @@ def arg_parser_output_expr( def arg_parser_output_exprs( ps: PythonSignature, f: NativeFunction, *, method: bool ) -> Dict[str, PythonArgParserOutputExpr]: - lambda_args = dispatch_lambda_args(ps, f, method=method) - lambda_args_map = dict(map(lambda a: (a.name, a), lambda_args)) - return {e.name: e for i, a in enumerate(ps.arguments()) - for e in (arg_parser_output_expr(i, a, lambda_args_map.get(a.name)), )} + for e in (arg_parser_output_expr(i, a), )} -# argument name to 'simple_type' for scattered tensor options fields +# argument name to type for scattered tensor options fields TENSOR_OPTIONS_FIELDS = { 'dtype': 'ScalarType', 'device': 'Device', - 'layout': 'Layout', + 'layout': 'Layout?', 'pin_memory': 'bool', 'requires_grad': 'bool', } @@ -896,7 +864,7 @@ def dispatch_lambda_exprs( ]) for i, out_arg in enumerate(a.outputs): lambda_args_exprs[out_arg.name] = f'out[{i}]' - elif a.cpp_type_str == 'c10::optional': + elif str(a.type) == 'Dimname[]?': # [old codegen] # TODO: make this part of something more general, or get rid of it. # optional> are special. 
The PythonArgParser returns an @@ -924,9 +892,9 @@ def dispatch_lambda_exprs( if a.name not in TENSOR_OPTIONS_FIELDS: raise RuntimeError( f'{f.func}: unrecognized tensor options field \'{a.name}\' in python binding arguments') - if _simple_type(a.cpp_type_str) != TENSOR_OPTIONS_FIELDS.get(a.name): + if str(a.type) != TENSOR_OPTIONS_FIELDS.get(a.name): raise RuntimeError( - f'{f.func}: unrecognized type \'{_simple_type(a.cpp_type_str)}\' for tensor options field \'{a.name}\'') + f'{f.func}: unrecognized type \'{str(a.type)}\' for tensor options field \'{a.name}\'') if not all(map(lambda a: a in tensor_options_args_names, TENSOR_OPTIONS_FIELDS.keys())): raise RuntimeError( f'{f.func}: incomplete tensor options args: {tensor_options_args_names}') From 42a51148c102c04c628ddffc017de6c1daa43510 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 28 Oct 2020 21:26:49 -0700 Subject: [PATCH 092/169] Use f-strings in torch.utils.cpp_extension (#47025) Summary: Plus two minor fixes to `torch/csrc/Module.cpp`: - Use iterator of type `Py_ssize_t` for array indexing in `THPModule_initNames` - Fix clang-tidy warning of unneeded defaultGenerator copy by capturing it as `const auto&` Pull Request resolved: https://github.com/pytorch/pytorch/pull/47025 Reviewed By: samestep Differential Revision: D24605907 Pulled By: malfet fbshipit-source-id: c276567d320758fa8b6f4bd64ff46d2ea5d40eff --- torch/csrc/Module.cpp | 6 +-- torch/utils/cpp_extension.py | 94 +++++++++++++++++------------------- 2 files changed, 47 insertions(+), 53 deletions(-) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index d09b3428e4f1..7e4f8bd3c080 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -85,9 +85,9 @@ static PyObject * THPModule_initNames(PyObject *self, PyObject *arg) THPObjectPtr types(PySequence_Fast(arg, "expected a sequence")); if (!types) return nullptr; - int num_classes = PySequence_Fast_GET_SIZE(types.get()); + auto num_classes = PySequence_Fast_GET_SIZE(types.get()); names.reserve(names.size() + num_classes); - for (size_t i = 0; i < num_classes; i++) { + for (Py_ssize_t i = 0; i < num_classes; i++) { PyObject* obj = PySequence_Fast_GET_ITEM(types.get(), i); THPUtils_assert(PyType_Check(obj), "expected a PyTypeObject"); PyTypeObject* type = (PyTypeObject*)obj; @@ -864,7 +864,7 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif - auto defaultGenerator = at::detail::getDefaultCPUGenerator(); + const auto& defaultGenerator = at::detail::getDefaultCPUGenerator(); THPDefaultCPUGenerator = (THPGenerator*)THPGenerator_initDefaultGenerator(defaultGenerator); // This reference is meant to be given away, so no need to incref here. 
ASSERT_TRUE(set_module_attr("default_generator", (PyObject*)THPDefaultCPUGenerator, /* incref= */ false)); diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 4948e6e33099..737a6371b9f7 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -50,7 +50,7 @@ def _find_cuda_home() -> Optional[str]: if not os.path.exists(cuda_home): cuda_home = None if cuda_home and not torch.cuda.is_available(): - print("No CUDA runtime is found, using CUDA_HOME='{}'".format(cuda_home)) + print(f"No CUDA runtime is found, using CUDA_HOME='{cuda_home}'") return cuda_home def _find_rocm_home() -> Optional[str]: @@ -72,7 +72,7 @@ def _find_rocm_home() -> Optional[str]: if not os.path.exists(rocm_home): rocm_home = None if rocm_home and torch.version.hip is None: - print("No ROCm runtime is found, using ROCM_HOME='{}'".format(rocm_home)) + print(f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'") return rocm_home @@ -275,13 +275,13 @@ def check_compiler_abi_compatibility(compiler) -> bool: version = (0, 0, 0) if match is None else match.groups() except Exception: _, error, _ = sys.exc_info() - warnings.warn('Error checking compiler version for {}: {}'.format(compiler, error)) + warnings.warn(f'Error checking compiler version for {compiler}: {error}') return False if tuple(map(int, version)) >= minimum_required_version: return True - compiler = '{} {}'.format(compiler, ".".join(version)) + compiler = f'{compiler} {".".join(version)}' warnings.warn(ABI_INCOMPATIBILITY_WARNING.format(compiler)) return False @@ -715,7 +715,7 @@ def _define_torch_extension_name(self, extension): # as the library name names = extension.name.split('.') name = names[-1] - define = '-DTORCH_EXTENSION_NAME={}'.format(name) + define = f'-DTORCH_EXTENSION_NAME={name}' self._add_compile_flag(extension, define) def _add_gnu_cpp_abi_flag(self, extension): @@ -1102,9 +1102,7 @@ def load_inline(name, # Make the function docstring the same as the function name. functions = dict((f, f) for f in functions) elif not isinstance(functions, dict): - raise ValueError( - "Expected 'functions' to be a list or dict, but was {}".format( - type(functions))) + raise ValueError(f"Expected 'functions' to be a list or dict, but was {type(functions)}") for function_name, docstring in functions.items(): if with_pytorch_error_handling: module_def.append( @@ -1170,9 +1168,9 @@ def _jit_compile(name, ) if version > 0: if version != old_version and verbose: - print('The input conditions for extension module {} have changed. '.format(name) + - 'Bumping to version {0} and re-building as {1}_v{0}...'.format(version, name)) - name = '{}_v{}'.format(name, version) + print(f'The input conditions for extension module {name} have changed. 
' + + f'Bumping to version {version} and re-building as {name}_v{version}...') + name = f'{name}_v{version}' if version != old_version: baton = FileBaton(os.path.join(build_directory, 'lock')) @@ -1205,7 +1203,7 @@ def _jit_compile(name, baton.wait() elif verbose: print('No modifications detected for re-loaded extension ' - 'module {}, skipping build step...'.format(name)) + f'module {name}, skipping build step...') if verbose: print(f'Loading extension module {name}...') @@ -1292,11 +1290,11 @@ def _write_ninja_file_and_build_library( with_cuda=with_cuda) if verbose: - print('Building extension module {}...'.format(name)) + print(f'Building extension module {name}...') _run_ninja_build( build_directory, verbose, - error_prefix="Error building extension '{}'".format(name)) + error_prefix=f"Error building extension '{name}'") def is_ninja_available(): @@ -1342,10 +1340,10 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): extra_ldflags.append('-INCLUDE:?warp_size@cuda@at@@YAHXZ') extra_ldflags.append('torch.lib') extra_ldflags.append('torch_python.lib') - extra_ldflags.append('/LIBPATH:{}'.format(python_lib_path)) - extra_ldflags.append('/LIBPATH:{}'.format(lib_path)) + extra_ldflags.append(f'/LIBPATH:{python_lib_path}') + extra_ldflags.append(f'/LIBPATH:{lib_path}') else: - extra_ldflags.append('-L{}'.format(lib_path)) + extra_ldflags.append(f'-L{lib_path}') extra_ldflags.append('-lc10') if with_cuda: extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda') @@ -1359,19 +1357,18 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): if verbose: print('Detected CUDA files, patching ldflags') if IS_WINDOWS: - extra_ldflags.append('/LIBPATH:{}'.format( - _join_cuda_home('lib/x64'))) + extra_ldflags.append(f'/LIBPATH:{_join_cuda_home("lib/x64")}') extra_ldflags.append('cudart.lib') if CUDNN_HOME is not None: extra_ldflags.append(os.path.join(CUDNN_HOME, 'lib/x64')) elif not IS_HIP_EXTENSION: - extra_ldflags.append('-L{}'.format(_join_cuda_home('lib64'))) + extra_ldflags.append(f'-L{_join_cuda_home("lib64")}') extra_ldflags.append('-lcudart') if CUDNN_HOME is not None: - extra_ldflags.append('-L{}'.format(os.path.join(CUDNN_HOME, 'lib64'))) + extra_ldflags.append(f'-L{os.path.join(CUDNN_HOME, "lib64")}') elif IS_HIP_EXTENSION: assert ROCM_VERSION is not None - extra_ldflags.append('-L{}'.format(_join_rocm_home('lib'))) + extra_ldflags.append(f'-L{_join_rocm_home("lib")}') extra_ldflags.append('-lamdhip64' if ROCM_VERSION >= (3, 5) else '-lhip_hcc') return extra_ldflags @@ -1421,7 +1418,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: # If not given, determine what's needed for the GPU that can be found if not _arch_list: capability = torch.cuda.get_device_capability() - arch_list = ['{}.{}'.format(capability[0], capability[1])] + arch_list = [f'{capability[0]}.{capability[1]}'] else: # Deal with lists that are ' ' separated (only deal with ';' after) _arch_list = _arch_list.replace(' ', ';') @@ -1434,12 +1431,12 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: flags = [] for arch in arch_list: if arch not in valid_arch_strings: - raise ValueError("Unknown CUDA arch ({}) or GPU not supported".format(arch)) + raise ValueError(f"Unknown CUDA arch ({arch}) or GPU not supported") else: num = arch[0] + arch[2] - flags.append('-gencode=arch=compute_{},code=sm_{}'.format(num, num)) + flags.append(f'-gencode=arch=compute_{num},code=sm_{num}') if arch.endswith('+PTX'): - 
flags.append('-gencode=arch=compute_{},code=compute_{}'.format(num, num)) + flags.append(f'-gencode=arch=compute_{num},code=compute_{num}') return list(set(flags)) @@ -1466,8 +1463,7 @@ def _get_build_directory(name: str, verbose: bool) -> str: root_extensions_directory = get_default_build_root() if verbose: - print('Using {} as PyTorch extensions root...'.format( - root_extensions_directory)) + print(f'Using {root_extensions_directory} as PyTorch extensions root...') build_directory = os.path.join(root_extensions_directory, name) if not os.path.exists(build_directory): @@ -1483,7 +1479,7 @@ def _get_num_workers(verbose: bool) -> Optional[int]: max_jobs = os.environ.get('MAX_JOBS') if max_jobs is not None and max_jobs.isdigit(): if verbose: - print('Using envvar MAX_JOBS ({}) as the number of workers...'.format(max_jobs)) + print(f'Using envvar MAX_JOBS ({max_jobs}) as the number of workers...') return int(max_jobs) if verbose: print('Allowing ninja to set a default number of workers... ' @@ -1550,7 +1546,7 @@ def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> # `error` is a CalledProcessError (which has an `ouput`) attribute, but # mypy thinks it's Optional[BaseException] and doesn't narrow if hasattr(error, 'output') and error.output: # type: ignore - message += ": {}".format(error.output.decode()) # type: ignore + message += f": {error.output.decode()}" # type: ignore raise RuntimeError(message) from e @@ -1592,10 +1588,10 @@ def _write_ninja_file_to_build_library(path, user_includes += system_includes system_includes.clear() - common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] + common_cflags = [f'-DTORCH_EXTENSION_NAME={name}'] common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H') - common_cflags += ['-I{}'.format(include) for include in user_includes] - common_cflags += ['-isystem {}'.format(include) for include in system_includes] + common_cflags += [f'-I{include}' for include in user_includes] + common_cflags += [f'-isystem {include}' for include in system_includes] common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))] @@ -1639,9 +1635,9 @@ def object_file_path(source_file: str) -> str: if _is_cuda_file(source_file) and with_cuda: # Use a different object filename in case a C++ and CUDA file have # the same filename but different extension (.cpp vs. .cu). - target = '{}.cuda.o'.format(file_name) + target = f'{file_name}.cuda.o' else: - target = '{}.o'.format(file_name) + target = f'{file_name}.o' return target objects = [object_file_path(src) for src in sources] @@ -1657,7 +1653,7 @@ def object_file_path(source_file: str) -> str: ldflags = _nt_quote_args(ldflags) ext = 'pyd' if IS_WINDOWS else 'so' - library_target = '{}.{}'.format(name, ext) + library_target = f'{name}.{ext}' _write_ninja_file( path=path, @@ -1719,20 +1715,20 @@ def sanitize_flags(flags): # Version 1.3 is required for the `deps` directive. 
config = ['ninja_required_version = 1.3'] - config.append('cxx = {}'.format(compiler)) + config.append(f'cxx = {compiler}') if with_cuda: if IS_HIP_EXTENSION: nvcc = _join_rocm_home('bin', 'hipcc') else: nvcc = _join_cuda_home('bin', 'nvcc') - config.append('nvcc = {}'.format(nvcc)) + config.append(f'nvcc = {nvcc}') - flags = ['cflags = {}'.format(' '.join(cflags))] - flags.append('post_cflags = {}'.format(' '.join(post_cflags))) + flags = [f'cflags = {" ".join(cflags)}'] + flags.append(f'post_cflags = {" ".join(post_cflags)}') if with_cuda: - flags.append('cuda_cflags = {}'.format(' '.join(cuda_cflags))) - flags.append('cuda_post_cflags = {}'.format(' '.join(cuda_post_cflags))) - flags.append('ldflags = {}'.format(' '.join(ldflags))) + flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}') + flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}') + flags.append(f'ldflags = {" ".join(ldflags)}') # Turn into absolute paths so we can emit them into the ninja build # file wherever it is. @@ -1765,7 +1761,7 @@ def sanitize_flags(flags): object_file = object_file.replace(':', '$:') source_file = source_file.replace(" ", "$ ") object_file = object_file.replace(" ", "$ ") - build.append('build {}: {} {}'.format(object_file, rule, source_file)) + build.append(f'build {object_file}: {rule} {source_file}') if library_target is not None: link_rule = ['rule link'] @@ -1776,15 +1772,13 @@ def sanitize_flags(flags): cl_path = os.path.dirname(cl_paths[0]).replace(':', '$:') else: raise RuntimeError("MSVC is required to load C++ extensions") - link_rule.append( - ' command = "{}/link.exe" $in /nologo $ldflags /out:$out'.format( - cl_path)) + link_rule.append(f' command = "{cl_path}/link.exe" $in /nologo $ldflags /out:$out') else: link_rule.append(' command = $cxx $in $ldflags -o $out') - link = ['build {}: link {}'.format(library_target, ' '.join(objects))] + link = [f'build {library_target}: link {" ".join(objects)}'] - default = ['default {}'.format(library_target)] + default = [f'default {library_target}'] else: link_rule, link, default = [], [], [] @@ -1796,7 +1790,7 @@ def sanitize_flags(flags): with open(path, 'w') as build_file: for block in blocks: lines = '\n'.join(block) - build_file.write('{}\n\n'.format(lines)) + build_file.write(f'{lines}\n\n') def _join_cuda_home(*paths) -> str: From 5c8aad1141f91a76544573d1aa79e5610c658526 Mon Sep 17 00:00:00 2001 From: Kshiteej K Date: Wed, 28 Oct 2020 22:01:13 -0700 Subject: [PATCH 093/169] [numpy] `torch.cos`, `torch.tan` : promote integer inputs to float (#46706) Summary: References https://github.com/pytorch/pytorch/issues/42515 cc: mruberry Pull Request resolved: https://github.com/pytorch/pytorch/pull/46706 Reviewed By: izdeby Differential Revision: D24537262 Pulled By: mruberry fbshipit-source-id: e57377a625814a3f34a765ce6bfd63a33c02a5d9 --- aten/src/ATen/native/UnaryOps.cpp | 8 ++++---- aten/src/ATen/native/cuda/UnaryGeometricKernels.cu | 4 ++-- test/test_type_promotion.py | 4 ++-- test/test_unary_ufuncs.py | 2 +- torch/testing/_internal/common_methods_invocations.py | 9 ++++++++- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 518032c81b04..1aebfda85da0 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -339,8 +339,8 @@ Tensor& sin_out(Tensor& result, const Tensor& self) { return unary_op_impl_float Tensor sin(const Tensor& self) { return unary_op_impl_float(self, sin_stub); } Tensor& sin_(Tensor& self) { 
return unary_op_impl_(self, at::sin_out); } -Tensor& cos_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, cos_stub); } -Tensor cos(const Tensor& self) { return unary_op_impl(self, at::cos_out); } +Tensor& cos_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, cos_stub); } +Tensor cos(const Tensor& self) { return unary_op_impl_float(self, cos_stub); } Tensor& cos_(Tensor& self) { return unary_op_impl_(self, at::cos_out); } Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sinh_stub); } @@ -452,8 +452,8 @@ Tensor& tanh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out( Tensor tanh(const Tensor& self) { return unary_op_impl(self, at::tanh_out); } Tensor& tanh_(Tensor& self) { return unary_op_impl_(self, at::tanh_out); } -Tensor& tan_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, tan_stub); } -Tensor tan(const Tensor& self) { return unary_op_impl(self, at::tan_out); } +Tensor& tan_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, tan_stub); } +Tensor tan(const Tensor& self) { return unary_op_impl_float(self, tan_stub); } Tensor& tan_(Tensor& self) { return unary_op_impl_(self, at::tan_out); } Tensor& trunc_out(Tensor& result, const Tensor& self) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu index 465e54db51d6..2f7e92f3fc2e 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu @@ -43,7 +43,7 @@ void sin_kernel_cuda(TensorIterator& iter) { } void cos_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "cos_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "cos_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::cos(a); }); @@ -99,7 +99,7 @@ void atanh_kernel_cuda(TensorIterator& iter) { } void tan_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.dtype(), "tan_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "tan_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::tan(a); }); diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index c6efa8f0d90d..c023553de402 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -919,8 +919,8 @@ def test_unary_op_out_casting(self, device, dtypes): t = torch.tensor((1), dtype=dtypes[0], device=device) out = torch.empty(0, dtype=dtypes[1], device=device) - ops = (torch.neg, torch.floor, torch.ceil, torch.cos, torch.erf) - float_only_ops = {torch.floor, torch.ceil, torch.cos, torch.erf} + ops = (torch.neg, torch.floor, torch.ceil, torch.erf) + float_only_ops = {torch.floor, torch.ceil, torch.erf} real_only_ops = {torch.floor, torch.ceil, torch.erf} for op in ops: if dtypes[0] is not dtypes[1]: diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 6d4dc91ff5bd..9f3353376913 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -288,7 +288,7 @@ def test_reference_numerics(self, device, dtype, op): # NOTE: For these dtypes, PyTorch computes in the default scalar type (float) # while NumPy computes in float16 self.assertEqualHelper(actual, 
expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=1e-4, atol=1e-3) + exact_dtype=exact_dtype, rtol=1e-3, atol=1e-2) continue self.assertEqualHelper(actual, expected, msg, dtype=dtype, exact_dtype=exact_dtype) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 48b24f1ae499..9b67d31e7d16 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -274,8 +274,11 @@ def sample_inputs(self, device, dtype, requires_grad=False): test_inplace_grad=False), UnaryUfuncInfo('cos', ref=np.cos, - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), handles_large_floats=False, + promotes_integers_to_float=True, decorators=(precisionOverride({torch.bfloat16: 1e-2}),), skips=( SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', @@ -380,6 +383,10 @@ def sample_inputs(self, device, dtype, requires_grad=False): )), UnaryUfuncInfo('tan', ref=np.tan, + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + promotes_integers_to_float=True, skips=( SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cuda', dtypes=[torch.cfloat, torch.cdouble]), From 604e1b301a1cafd326dd332fd7d72f6e9bd71054 Mon Sep 17 00:00:00 2001 From: Xiong Wei Date: Wed, 28 Oct 2020 22:27:04 -0700 Subject: [PATCH 094/169] Fix negative column numbers for the torch.eye (#46841) Summary: Fixes https://github.com/pytorch/pytorch/issues/46757 Error out the negative column numbers and add the corresponding tests in the `test/test_tensor_creation_ops.py`. 
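For context, a minimal sketch of the behavior change (hypothetical interpreter session; the error text comes from the TORCH_CHECK added in this patch, and previously a negative `m` was silently remapped to `n`):

    >>> import torch
    >>> torch.eye(3)         # omitting m still defaults it to n
    tensor([[1., 0., 0.],
            [0., 1., 0.],
            [0., 0., 1.]])
    >>> torch.eye(3, -1)     # used to quietly return a 3x3 identity; now errors out
    RuntimeError: m must be greater or equal to 0, got -1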
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46841 Reviewed By: VitalyFedyunin Differential Revision: D24593839 Pulled By: ngimel fbshipit-source-id: b8988207911453de7811cf3ceb43747192cd689d --- aten/src/ATen/native/TensorFactories.cpp | 11 +++++------ aten/src/ATen/native/cuda/TensorFactories.cu | 8 +++----- test/test_tensor_creation_ops.py | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 82d7363a1b32..a6683bcaaf2f 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -381,7 +381,8 @@ Tensor new_empty( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor eye(int64_t n, const TensorOptions& options) { - return native::eye(n, -1, options); + // the default value of `m` equals to `n` + return native::eye(n, n, options); } Tensor eye(int64_t n, int64_t m, const TensorOptions& options) { @@ -390,15 +391,13 @@ Tensor eye(int64_t n, int64_t m, const TensorOptions& options) { } Tensor& eye_out_cpu(Tensor& result, int64_t n) { - return native::eye_out_cpu(result, n, -1); + // the default value of `m` equals to `n` + return native::eye_out_cpu(result, n, n); } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); - - if(m < 0) { - m = n; - } + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); result.resize_({n, m}); result.zero_(); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index fb0eb4ca8b09..13f0b53516de 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -22,15 +22,13 @@ namespace at { namespace native { Tensor& eye_out_cuda(Tensor& result, int64_t n) { - return at::native::eye_out_cuda(result, n, /*m=*/-1); + // the default value of `m` equals to `n` + return at::native::eye_out_cuda(result, n, n); } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); - - if(m < 0) { - m = n; - } + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); result.resize_({n, m}); result.zero_(); diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index b0777c7fa12a..7e25b80851c0 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -702,6 +702,23 @@ def test_eye(self, device): for dtype in torch.testing.get_all_dtypes(): if dtype == torch.bfloat16: continue + # Test the RuntimeError is raised when either m or n is a negative number + for n, m in ((-1, 1), (1, -1), (-1, -1)): + with self.assertRaisesRegex(RuntimeError, 'must be greater or equal to'): + torch.eye(n, m, device=device, dtype=dtype) + + # Test when the `m` parameter is not provided + for n in (3, 5, 7): + res1 = torch.eye(n, device=device, dtype=dtype) + naive_eye = torch.zeros(n, n, dtype=dtype, device=device) + naive_eye.diagonal(dim1=-2, dim2=-1).fill_(1) + self.assertEqual(naive_eye, res1) + + # Check eye_out outputs + res2 = torch.empty(0, device=device, dtype=dtype) + torch.eye(n, out=res2) + self.assertEqual(res1, res2) + for n, m in product([3, 5, 7], repeat=2): # Construct identity using diagonal and fill res1 = torch.eye(n, m, device=device, dtype=dtype) From 9c1a41b7241011267feb5ddd1fb786b5209aae13 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 28 Oct 2020 22:36:13 
-0700 Subject: [PATCH 095/169] [RFC] Add OperatorHandle overload to the RecordFunction::before() method (#46401) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46401 Broader context about selective/custom build available at https://fb.quip.com/2oEzAR5MKqbD and https://fb.workplace.com/groups/pytorch.mobile.team/permalink/735794523641956/ Basically, we want to be able to trace full operator names (with overload name). The current observer infra picks up the operator name from the schema, which doesn't seem to include the overload name. To ensure consistency with the existing uses and to accomodate the new use-case, this diff adds a new overload to accept an `OperatorHandle` object, and the code in `before()` eagerly resolves it to an `OperatorName` object (which can be cached in a member variable) as well as a string (view) operator-name which has the same semantics as before. Why do we pass in an `OperatorHandle` but then resolve it to an `OperatorName`? This might come across as a strange design choice (and it is), but it is grounded in practicality. It is not reasonable to cache an `OperatorHandle` object but caching an `OperatorName` object is reasonable since it holds all the data itself. An initial version of this change was trying to test this change in the `xplat` repo, which didn't work. Thanks to ilia-cher for pointing out that the dispatcher observing mechanism is disabled under a compile time flag (macro) for xplat. ghstack-source-id: 114360747 Test Plan: `buck test fbcode/caffe2/fb/test:record_function_test` succeeds. Also replicated this test in OSS in the file `test_misc.cpp` where the rest of the `RecordFunction` subsystem is being tested. Ran benchmark as reqiested by ilia-cher {P146511280} Reviewed By: ilia-cher Differential Revision: D24315241 fbshipit-source-id: 239f3081e6aa2e26c3021a7dd61f328b723b03d9 --- aten/src/ATen/core/dispatch/Dispatcher.h | 8 +++---- aten/src/ATen/record_function.cpp | 17 ++++++++++++++ aten/src/ATen/record_function.h | 15 ++++++++++++- test/cpp/jit/test_misc.cpp | 28 ++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 3ae57341fcf8..a5f9354d7ca2 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -388,9 +388,9 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle::boxArgs(args...); - guard.before(op.schema().name(), stack, seq_num); + guard.before(op, stack, seq_num); } else { - guard.before(op.schema().name(), seq_num); + guard.before(op, seq_num); } } } @@ -438,9 +438,9 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const seq_num = at::sequence_number::peek(); } if (guard.needs_inputs) { - guard.before(op.schema().name(), *stack, seq_num); + guard.before(op, *stack, seq_num); } else { - guard.before(op.schema().name(), seq_num); + guard.before(op, seq_num); } } } diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 26e9fd9f21fa..41f31968688d 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -376,6 +377,7 @@ void RecordFunction::before(const char* name, int64_t sequence_nr) { name_ = StringView(name); sequence_nr_ = sequence_nr; thread_id_ = currentThreadId(); + operator_name_.reset(); manager().runStartCallbacks(*this); } @@ -387,6 +389,21 @@ void 
RecordFunction::before(std::string name, int64_t sequence_nr) { name_ = StringView(std::move(name)); sequence_nr_ = sequence_nr; thread_id_ = currentThreadId(); + operator_name_.reset(); + + manager().runStartCallbacks(*this); +} + +void RecordFunction::before( + c10::OperatorHandle const& op, + int64_t sequence_nr) { + if (!active) { + return; + } + sequence_nr_ = sequence_nr; + thread_id_ = currentThreadId(); + operator_name_ = op.operator_name(); + name_ = StringView(op.schema().name()); manager().runStartCallbacks(*this); } diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index cf839ad4a188..db2ee221a09a 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -1,12 +1,18 @@ #pragma once #include -#include +#include #include +#include +#include #include #include +namespace c10 { +class CAFFE2_API OperatorHandle; +} + namespace at { // Kind of record function scope; @@ -147,6 +153,7 @@ struct TORCH_API RecordFunction { // start callbacks void before(const char* name, int64_t sequence_nr = -1); void before(std::string name, int64_t sequence_nr = -1); + void before(c10::OperatorHandle const& op, int64_t sequence_nr = -1); // Sets node ID for distributed profiling static void setDefaultNodeId(int64_t defaultNodeId); @@ -178,6 +185,10 @@ struct TORCH_API RecordFunction { return handle_; } + inline c10::optional operator_name() const { + return operator_name_; + } + inline void setHandle(RecordFunctionHandle handle) { handle_ = handle; } @@ -213,6 +224,8 @@ struct TORCH_API RecordFunction { int64_t sequence_nr_ = -1; std::vector inputs_; + c10::optional operator_name_; + // Kind of scope this RecordFunction is observing const RecordScope scope_; diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index ca4fb2e7620d..54265530eb12 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1118,6 +1119,33 @@ TEST(RecordFunctionTest, Basic) { clearCallbacks(); } +TEST(RecordFunctionTest, OperatorNameOverload) { + std::set operator_names; + + at::addGlobalCallback(at::RecordFunctionCallback( + [&operator_names](const at::RecordFunction& fn) { + c10::optional op_name = + fn.operator_name(); + if (op_name.has_value()) { + operator_names.insert(c10::toString(*op_name)); + } else { + operator_names.insert("No Operator Name"); + } + }) + .scopes({at::RecordScope::FUNCTION})); + auto t = torch::randn({1, 2, 3}, at::kCPU); + t.set_requires_grad(false); + auto t2 = t.pow(2); + + at::clearCallbacks(); + EXPECT_TRUE(operator_names.count("No Operator Name") == 0) + << "Expected that all traced operators had an associated OperatorName object"; + EXPECT_TRUE(operator_names.count("aten::randn") == 1) + << "Expected aten::randn to have been called and recorded, but it was not"; + EXPECT_TRUE(operator_names.count("aten::pow.Tensor_Scalar") == 1) + << "Expected aten::pow.Tensor_Scalar to have been called and recorded, but it was not"; +} + class TestThreadLocalDebugInfo : public c10::DebugInfoBase { public: int getModelId() const { From b553c06abbe7133c059e05844033067625382973 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 28 Oct 2020 22:36:23 -0700 Subject: [PATCH 096/169] Throw an exception in the constructor of torchscript serialization to avoid double-exception (#44266) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44266 If PyTorchStreamWriter is writing to a file in a non-existing path, it throws an 
exception. In unwinding the destructor calls writeEndOfFile() and throws again. To avoid this double-exception, a check and throw is added in the constructor. In such case the destructor will not be called and the exception can go through the unwinding. Test Plan: python test/test_jit.py TestSaveLoad.test_save_nonexit_file Reviewed By: dreiss Differential Revision: D23560770 Pulled By: iseeyuan fbshipit-source-id: 51b24403500bdab3578c7fd5e017780467a5d06a --- caffe2/serialize/inline_container.cc | 1 + test/jit/test_save_load.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 63f5a34aa23b..7928d5e3de86 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -306,6 +306,7 @@ void PyTorchStreamWriter::setup(const string& file_name) { file_name, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); valid("opening archive ", file_name.c_str()); + TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened."); writer_func_ = [this](const void* buf, size_t nbytes) -> size_t { file_stream_.write(static_cast(buf), nbytes); return !file_stream_ ? 0 : nbytes; diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py index 178db8357e8f..23751c4fd92b 100644 --- a/test/jit/test_save_load.py +++ b/test/jit/test_save_load.py @@ -938,3 +938,12 @@ def forward(self, a): x = torch.tensor([1., 2., 3., 4.]) self.assertTrue(torch.equal(m(x), m2(x))) + + def test_save_nonexit_file(self): + class Foo(torch.nn.Module): + def forward(self, x): + return 2 * x + + script_module = torch.jit.script(Foo()) + with self.assertRaises(RuntimeError): + script_module.save("NonExist/path/test.pt") From cab32d9cdf3df1d838d41739fc519f589f834f9d Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 29 Oct 2020 00:12:55 -0700 Subject: [PATCH 097/169] [RPC Framework] Support remote device format "/" (#46773) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46773 Changed the constructor of RemoteModule to accept a `remote_device` arg in the following format: "/" (e.g., "trainer0/cpu", "ps0/cuda:0") This arg merges the original `on` and `device` arg. 
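In other words, the intended format is "<workername>/<device>", with the device part optional and defaulting to "cpu". A small usage sketch adapted from the updated docstring below (worker names and layer sizes are illustrative):

    >>> # before: RemoteModule("worker1", "cpu", nn.Linear, args=(20, 30))
    >>> # after: a single remote_device string
    >>> remote_linear_module = RemoteModule(
    >>>     "worker1/cpu", nn.Linear, args=(20, 30),
    >>> )
    >>> input = torch.randn(128, 20)
    >>> ret_fut = remote_linear_module.forward_async(input)
    >>> ret = ret_fut.wait()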
Original PR issue: RemoteDevice Format #46554 ghstack-source-id: 115448051 Test Plan: buck test mode/dev-nosan caffe2/test/distributed/rpc:process_group_agent -- RemoteModule Reviewed By: pritamdamania87 Differential Revision: D24482562 fbshipit-source-id: 5acfc73772576a4b674df27625bf560b8f8e67c1 --- torch/distributed/nn/api/remote_module.py | 41 ++++++----- torch/distributed/rpc/utils.py | 37 ++++++++++ .../ddp_under_dist_autograd_test.py | 65 ++++++++--------- .../distributed/nn/api/remote_module_test.py | 69 ++++++++++++++----- 4 files changed, 147 insertions(+), 65 deletions(-) create mode 100644 torch/distributed/rpc/utils.py diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index 225cb4842bd1..e0aebf87ae84 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -17,6 +17,7 @@ import torch.distributed.rpc as rpc from torch import Tensor, device, dtype, nn from torch.distributed.nn.jit import instantiator +from torch.distributed.rpc.utils import _parse_remote_device from torch.nn.parameter import Parameter from torch.utils.hooks import RemovableHandle @@ -64,8 +65,7 @@ def _raise_not_supported(name): class _RemoteModule(nn.Module): def __init__( self, - on: str, - device: torch.device, + remote_device: str, module_cls: nn.Module, args: Tuple = None, kwargs: Dict[str, Any] = None, @@ -100,8 +100,10 @@ def __init__( ``def forward_async(input: Tensor) -> Future[Tensor]:``. Arguments: - on (str or WorkerInfo): id or name of the destination worker. - device (torch.device): Device on the destination worker where we‘d like to place this module. + remote_device (str): Device on the destination worker where we‘d like to place this module. + The format should be "/", where the device field can be parsed as torch.device type. + E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". + In addition, the device field can be optional and the default value is "cpu". module_cls (nn.Module): For example, >>> class MyModule(nn.Module): >>> def forward(input): @@ -132,7 +134,7 @@ def __init__( >>> >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> remote_linear_module = RemoteModule( - >>> "worker1", "cpu", nn.Linear, args=(20, 30), + >>> "worker1/cpu", nn.Linear, args=(20, 30), >>> ) >>> input = torch.randn(128, 20) >>> ret_fut = remote_linear_module.forward_async(input) @@ -155,18 +157,22 @@ def __init__( args = args if args is not None else () kwargs = kwargs if kwargs is not None else {} - self.on = on + self.on, self.device = _parse_remote_device(remote_device) if _module_interface_cls is not None: # Users reply on this field to know if this generated RemoteModule is TorchScript-able. self.is_scriptable = True # Instantiate template on remote side. - fut = rpc.rpc_async(on, _instantiate_template, (_module_interface_cls,)) + fut = rpc.rpc_async( + self.on, _instantiate_template, (_module_interface_cls,) + ) # Instantiate template on local side. - generated_module = instantiator.instantiate_scriptable_remote_module_template( - _module_interface_cls + generated_module = ( + instantiator.instantiate_scriptable_remote_module_template( + _module_interface_cls + ) ) generated_methods = generated_module._generated_methods @@ -178,9 +184,9 @@ def __init__( # Create the module on the remote side. self.module_rref = rpc.rpc_sync( - on, + self.on, _create_module, - (module_cls, args, kwargs, device, _module_interface_cls), + (module_cls, args, kwargs, self.device, _module_interface_cls), ) # Install generated methods. 
@@ -329,8 +335,10 @@ class RemoteModule(_RemoteModule): ``def forward_async(input: Tensor) -> Future[Tensor]:``. Arguments: - to (str or WorkerInfo): id or name of the destination worker. - device (torch.device): Device on the destination worker where we‘d like to place this module. + remote_device (str): Device on the destination worker where we‘d like to place this module. + The format should be "/", where the device field can be parsed as torch.device type. + E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". + In addition, the device field can be optional and the default value is "cpu". module_cls (nn.Module): For example, >>> class MyModule(nn.Module): >>> def forward(input): @@ -357,7 +365,7 @@ class RemoteModule(_RemoteModule): >>> >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> remote_linear_module = RemoteModule( - >>> "worker1", nn.Linear, args=(20, 30), + >>> "worker1/cpu", nn.Linear, args=(20, 30), >>> ) >>> input = torch.randn(128, 20) >>> ret_fut = remote_linear_module.forward_async(input) @@ -374,10 +382,9 @@ class RemoteModule(_RemoteModule): def __init__( self, - on: str, - device: torch.device, + remote_device: str, module_cls: nn.Module, args: Tuple = None, kwargs: Dict[str, Any] = None, ): - super().__init__(on, device, module_cls, args, kwargs) + super().__init__(remote_device, module_cls, args, kwargs) diff --git a/torch/distributed/rpc/utils.py b/torch/distributed/rpc/utils.py new file mode 100644 index 000000000000..15924c4a72f0 --- /dev/null +++ b/torch/distributed/rpc/utils.py @@ -0,0 +1,37 @@ +def _parse_remote_device(remote_device: str): + r""" + Parses the remote device. + + Arguments: + remote_device (str): Device on the destination worker where we‘d like to place this module. + The format should be "/", where the device field can be parsed as torch.device type. + E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". + In addition, the device field can be optional and the default value is "cpu". + + Returns: + A workername and a device. + """ + fields = remote_device.split("/") + if len(fields) == 2: + [on, device] = fields + elif len(fields) == 1: + on = fields[0] + device = "cpu" + else: + raise RuntimeError( + "Could not parse remote_device: {}. The valid format is '/'".format( + remote_device + ) + ) + + # Since the workername in the input remote device won't be validated until the created remote module is executed, + # only do some very basic sanity check on workername at the module creation time. + # As currently there is no regex to describe the format of workername, just check whether the workername is empty. + if not on: + raise RuntimeError( + "The workername in remote_device '{}' cannot be empty. 
The valid format is '/'".format( + remote_device + ) + ) + + return on, device diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 305b0fcb82bf..84768496b5ff 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE +from torch.testing._internal.dist_utils import INIT_METHOD_TEMPLATE, dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -619,35 +619,38 @@ def test_ddp_dist_autograd_local_vs_remote(self): rank=self.rank, ) - remote_layer1 = RemoteModule( - "worker0", device="cpu", module_cls=nn.Linear, args=(10, 5, False) - ) - layer1 = nn.Linear(10, 5, False) - # Start with the same parameters for remote and local - layer1.weight = remote_layer1.module_rref.to_here().weight - - # Run local case. - layer2 = nn.Linear(5, 1) - inputs = torch.rand((10, 10)) - ddp_model = DistributedDataParallel(layer2) - loss = ddp_model(layer1(inputs)).sum() - loss.backward() - - # Run remote case. - with dist_autograd.context() as context_id: - loss = ddp_model(remote_layer1(inputs)).sum() - dist_autograd.backward(context_id, [loss]) - grads_dict = dist_autograd.get_gradients(context_id) - dist.barrier() - self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight]) - self.assertEqual( - layer1.weight.grad, - rpc.rpc_sync( - "worker0", - DdpComparisonTest.get_remote_grads, - args=(remote_layer1.module_rref, context_id), - ), + # Use two different remote device input string, w/ and w/o the default + # device string "cpu", respectively. + for remote_device in ["worker0/cpu", "worker0"]: + remote_layer1 = RemoteModule( + remote_device=remote_device, module_cls=nn.Linear, args=(10, 5, False) ) + layer1 = nn.Linear(10, 5, False) + # Start with the same parameters for remote and local + layer1.weight = remote_layer1.module_rref.to_here().weight + + # Run local case. + layer2 = nn.Linear(5, 1) + inputs = torch.rand((10, 10)) + ddp_model = DistributedDataParallel(layer2) + loss = ddp_model(layer1(inputs)).sum() + loss.backward() + + # Run remote case. 
+ with dist_autograd.context() as context_id: + loss = ddp_model(remote_layer1(inputs)).sum() + dist_autograd.backward(context_id, [loss]) + grads_dict = dist_autograd.get_gradients(context_id) + dist.barrier() + self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight]) + self.assertEqual( + layer1.weight.grad, + rpc.rpc_sync( + "worker0", + DdpComparisonTest.get_remote_grads, + args=(remote_layer1.module_rref, context_id), + ), + ) @skip_if_lt_x_gpu(NUM_TRAINERS) @requires_nccl() @@ -667,7 +670,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): ) remote_layer1 = RemoteModule( - "worker0", device="cpu", module_cls=nn.Linear, args=(10, 7, False) + remote_device="worker0/cpu", module_cls=nn.Linear, args=(10, 7, False) ) layer1 = nn.Linear(10, 7, False) # Start with the same parameters for remote and local @@ -677,7 +680,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank]) remote_layer3 = RemoteModule( - "worker0", device="cpu", module_cls=nn.Linear, args=(5, 3, False) + remote_device="worker0/cpu", module_cls=nn.Linear, args=(5, 3, False) ) layer3 = nn.Linear(5, 3, False) # Start with the same parameters for remote and local diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py index da81b3b16e53..d6b3d816fe68 100644 --- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py +++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py @@ -78,7 +78,7 @@ def world_size(self): # Override setting in RpcAgentTestFixture return 2 @staticmethod - def _create_remote_module_iter(dst_worker_name, device="cpu", modes=None): + def _create_remote_module_iter(remote_device, modes=None): if modes is None: modes = ModuleCreationMode.__members__.values() @@ -86,15 +86,12 @@ def _create_remote_module_iter(dst_worker_name, device="cpu", modes=None): kwargs = dict(first_kwarg=2) if ModuleCreationMode.MODULE_CTOR in modes: - remote_module = RemoteModule( - dst_worker_name, device, MyModule, args, kwargs - ) + remote_module = RemoteModule(remote_device, MyModule, args, kwargs) yield remote_module if ModuleCreationMode.MODULE_CTOR_WITH_INTERFACE in modes: remote_module = _RemoteModule( - dst_worker_name, - device, + remote_device, create_scripted_module, args, kwargs, @@ -108,6 +105,7 @@ def test_bad_module(self): if self.rank != 0: return dst_worker_name = dist_utils.worker_name((self.rank + 1) % self.world_size) + remote_device = "{}/cpu".format(dst_worker_name) args = (1,) kwargs = dict(first_kwarg=2) @@ -115,13 +113,13 @@ def test_bad_module(self): ValueError, r"Expect `module_cls\(\*args, \*\*kwargs\)` returns an instance of ,", ): - RemoteModule(dst_worker_name, "cpu", BadModule, args, kwargs) + RemoteModule(remote_device, BadModule, args, kwargs) with self.assertRaisesRegex( ValueError, r"Expect `module_cls\(\*args, \*\*kwargs\)` returns an instance of ,", ): - RemoteModule(dst_worker_name, "cpu", BadModule, args, kwargs) + RemoteModule(remote_device, BadModule, args, kwargs) @dist_utils.dist_init def test_forward_async(self): @@ -227,7 +225,7 @@ def test_valid_device(self): dst_worker_name = dist_utils.worker_name((self.rank + 1) % self.world_size) for remote_module in self._create_remote_module_iter( - dst_worker_name, device="cuda:0", modes=[ModuleCreationMode.MODULE_CTOR] + "{}/cuda:0".format(dst_worker_name), modes=[ModuleCreationMode.MODULE_CTOR] ): device = rpc.rpc_sync( dst_worker_name, remote_device, 
(remote_module.module_rref,) @@ -249,8 +247,7 @@ def test_invalid_devices(self): ): list( self._create_remote_module_iter( - dst_worker_name, - device="foo", + "{}/foo".format(dst_worker_name), modes=[ModuleCreationMode.MODULE_CTOR], ) ) @@ -260,8 +257,7 @@ def test_invalid_devices(self): ): list( self._create_remote_module_iter( - dst_worker_name, - device="cuda:100", + "{}/cuda:100".format(dst_worker_name), modes=[ModuleCreationMode.MODULE_CTOR], ) ) @@ -269,9 +265,8 @@ def test_invalid_devices(self): with self.assertRaisesRegex(RuntimeError, r"Invalid device string: 'cpu2'"): list( self._create_remote_module_iter( - dst_worker_name, + "{}/cpu2".format(dst_worker_name), modes=[ModuleCreationMode.MODULE_CTOR], - device="cpu2", ) ) @@ -280,8 +275,48 @@ def test_invalid_devices(self): ): list( self._create_remote_module_iter( - dst_worker_name, - device="cpu:2", + "{}/cpu:2".format(dst_worker_name), + modes=[ModuleCreationMode.MODULE_CTOR], + ) + ) + + with self.assertRaisesRegex(RuntimeError, r"Device string must not be empty"): + list( + self._create_remote_module_iter( + "{}/".format(dst_worker_name), + modes=[ModuleCreationMode.MODULE_CTOR], + ) + ) + + with self.assertRaisesRegex( + RuntimeError, + r"Could not parse remote_device: worker1/cuda:0/cuda:1. The valid format is '/'", + ): + list( + self._create_remote_module_iter( + "{}/cuda:0/cuda:1".format(dst_worker_name), + modes=[ModuleCreationMode.MODULE_CTOR], + ) + ) + + with self.assertRaisesRegex( + RuntimeError, + r"The workername in remote_device '/' cannot be empty. The valid format is '/'", + ): + list( + self._create_remote_module_iter( + "/", + modes=[ModuleCreationMode.MODULE_CTOR], + ) + ) + + with self.assertRaisesRegex( + RuntimeError, + r"The workername in remote_device '/cuda:0' cannot be empty. The valid format is '/'", + ): + list( + self._create_remote_module_iter( + "/cuda:0", modes=[ModuleCreationMode.MODULE_CTOR], ) ) From d0df29ac22789aae9078c767b8296d0f6d98a64b Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 29 Oct 2020 00:33:55 -0700 Subject: [PATCH 098/169] [FX] Put inf and nan in globals instead of with an import string (#47035) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47035 Chillee thought the `from math import inf, nan` string at the top of `.code` was annoying so here's an alternative way to do it by putting those values in `globals` before we `exec` Test Plan: Imported from OSS Reviewed By: dzhulgakov Differential Revision: D24611278 Pulled By: jamesr66a fbshipit-source-id: c25ef89e649bdd3e79fe91aea945a30fa7106961 --- torch/fx/graph.py | 3 +-- torch/fx/graph_module.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index a9a7e5dc80c0..d76dd2e6ba22 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -383,8 +383,7 @@ def type_repr(o : Any): # repr() for inf and nan floating point values aren't parseable by # python as literals. Explicitly import the names from the `math` module. 
- import_strs = ['from math import inf, nan'] - import_strs.extend(f'import {name}' for name in sorted(modules_used)) + import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) code = ''.join(body) diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 6f72a29be184..24dde1ea13d4 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -4,6 +4,7 @@ from typing import Type, Dict, List, Any, Union from .graph import Graph import copy +import math # normal exec loses the source code, however we can patch # the linecache module to still recover it. @@ -28,7 +29,7 @@ def patched_getline(*args, **kwargs): linecache.getlines = patched_getline def _forward_from_src(src : str): - gbls: Dict[str, Any] = {} + gbls: Dict[str, Any] = {'inf': math.inf, 'nan': math.nan} exec_with_source(src, gbls) return gbls['forward'] From 18d273dc0edb830e6de79d67330b346d6eb78e2a Mon Sep 17 00:00:00 2001 From: Kunal Bhalla Date: Thu, 29 Oct 2020 04:10:38 -0700 Subject: [PATCH 099/169] [RFC][LocalSession] Fix workspace type Summary: I was reading/looking into how LocalSession works and realized that the workspace type being passed around was the bound function on TaskGroup instead of the actual type. This meant that all workspaces for localsession would always be global, because they'd never match the private workspace type. Test Plan: Reviewed By: cryptopic Differential Revision: D24458428 fbshipit-source-id: 0f87874babe9c1ddff25b5363b443f9ca37e03c1 --- caffe2/python/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/session.py b/caffe2/python/session.py index de3b09931a30..fb2b57c4f5ee 100644 --- a/caffe2/python/session.py +++ b/caffe2/python/session.py @@ -192,7 +192,7 @@ def _compile_task_group(cls, task_group, setup_net_list=None): task = task_group.to_task() plan = core.Plan('task_group_plan') plan.AddStep(task.get_step()) - return (plan, task.output_list(), task.workspace_type) + return (plan, task.output_list(), task.workspace_type()) def _run_compiled(self, compiled): plan, output_list, workspace_type = compiled From 4a581ba6c2cdcf294aa3aea8c343f4ed66805bd5 Mon Sep 17 00:00:00 2001 From: Brandon Lin Date: Thu, 29 Oct 2020 07:01:18 -0700 Subject: [PATCH 100/169] Implement LengthsToOffsets operator in Caffe2 (#46590) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46590 This operator is very similar to LengthsToRanges but doesn't pack the offsets next to the original lengths. 
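As a reference for the semantics, a minimal NumPy sketch of what the operator computes (mirroring the op_ref added to hypothesis_test.py below):

    import numpy as np

    def lengths_to_offsets(lengths, include_last_offset=False):
        lengths = np.asarray(lengths)
        # exclusive prefix sum of the segment lengths (non-empty input assumed;
        # the hypothesis test below special-cases empty inputs)
        offsets = np.concatenate(([0], np.cumsum(lengths)[:-1]))
        if include_last_offset:
            offsets = np.concatenate((offsets, [lengths.sum()]))
        return offsets

    lengths_to_offsets([1, 3, 0, 2])                            # [0, 1, 4, 4]
    lengths_to_offsets([1, 3, 0, 2], include_last_offset=True)  # [0, 1, 4, 4, 6]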
Reviewed By: yf225 Differential Revision: D24419746 fbshipit-source-id: aa8b014588bb22eced324853c545f8684086c4e4 --- caffe2/operators/utility_ops.cc | 53 +++++++++++++++++++++++--------- caffe2/operators/utility_ops.h | 39 +++++++++++++++++++++++ caffe2/python/hypothesis_test.py | 32 +++++++++++++++++++ 3 files changed, 110 insertions(+), 14 deletions(-) diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index b691c24e984a..9abcf5ab0b86 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -59,6 +59,7 @@ REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp); REGISTER_CPU_OPERATOR(LengthsGather, LengthsGatherOp); REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp); REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp); +REGISTER_CPU_OPERATOR(LengthsToOffsets, LengthsToOffsetsOp); REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp); REGISTER_CPU_OPERATOR(SegmentIdsToRanges, SegmentIdsToRangesOp); REGISTER_CPU_OPERATOR(LengthsToWeights, LengthsToWeightsOp); @@ -522,20 +523,20 @@ Another output LENGTHS represents each example length within OUTPUT "LENGTHS", "1-D tensor of size N with lengths over gathered data" " for each row in a batch. sum(LENGTHS) == OUTPUT.size()") - .TensorInferenceFunction(OpSchema::NeedsAllInputShapes([]( - const OperatorDef& /* unused */, const vector& in) { - std::vector out(2); - - int total = 1; - for (auto d : in[0].dims()) { - total *= d; - } - out[0].add_dims(total); - out[0].set_data_type(in[0].data_type()); - out[1].add_dims(in[1].dims(0)); - out[1].set_data_type(in[1].data_type()); - return out; - })); + .TensorInferenceFunction(OpSchema::NeedsAllInputShapes( + [](const OperatorDef& /* unused */, const vector& in) { + std::vector out(2); + + int total = 1; + for (auto d : in[0].dims()) { + total *= d; + } + out[0].add_dims(total); + out[0].set_data_type(in[0].data_type()); + out[1].add_dims(in[1].dims(0)); + out[1].set_data_type(in[1].data_type()); + return out; + })); OPERATOR_SCHEMA(LengthsGather) .NumInputs(3) @@ -636,6 +637,30 @@ For example, `[1, 3, 0, 2]` transforms into `[[0, 1], [1, 3], [4, 0], [4, 2]]`. "ranges", "2D tensor of shape len(lengths) X 2 and the same type as `lengths`"); +OPERATOR_SCHEMA(LengthsToOffsets) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Given a vector of segment lengths, returns a vector of offsets from these lengths, +which will have the same size as the input vector. Output is going to have +the same type as input. For long tensors explicit casting from int32 to int64 +might be necessary prior to this op. + +For example, `[1, 3, 0, 2]` transforms into `[0, 1, 4, 4]`. +)DOC") + .Input(0, "lengths", "1D tensor of int32 or int64 segment lengths.") + .Output(0, "offsets", "1D tensor of the same shape and type as `lengths`") + .TensorInferenceFunction([](const OperatorDef& def, + const vector& in) { + const ArgumentHelper args(def); + bool include_last_offset = + args.GetSingleArgument("include_last_offset", false); + vector out_shape(in[0].dims().begin(), in[0].dims().end()); + out_shape[0] += include_last_offset ? 
1 : 0; + return vector{ + CreateTensorShape(out_shape, in[0].data_type())}; + }); + OPERATOR_SCHEMA(SegmentIdsToLengths) .NumInputs(1, 2) .NumOutputs(1) diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index a82b5666fb7b..bdc9c0bfbfd9 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -918,6 +918,45 @@ class LengthsToRangesOp : public Operator { } }; +template +class LengthsToOffsetsOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + template + explicit LengthsToOffsetsOp(Args&&... args) + : Operator(std::forward(args)...), + include_last_offset_(this->template GetSingleArgument( + "include_last_offset", + false)) {} + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = Output(0); + auto* input_data = input.template data(); + + CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); + auto size = input.numel(); + + output->Resize(size + (include_last_offset_ ? 1 : 0)); + auto* output_data = output->template mutable_data(); + + int32_t offset = 0; + for (int i = 0; i < size; ++i) { + auto len = input_data[i]; + output_data[i] = offset; + offset += len; + } + if (include_last_offset_) { + output_data[size] = offset; + } + return true; + } + + private: + bool include_last_offset_; +}; + template class SegmentIdsToLengthsOp : public Operator { public: diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 045677f8422a..9298134f651c 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -994,6 +994,38 @@ def op_ref(x): inputs=[np.array(lengths, dtype=np.int32)], reference=op_ref) + @given( + lengths=st.lists( + st.integers(min_value=0, max_value=10), min_size=0, max_size=10 + ), + include_last_offset=st.booleans(), + **hu.gcs_cpu_only + ) + @settings(deadline=None) + def test_lengths_to_offsets(self, lengths, include_last_offset, gc, dc): + op = core.CreateOperator( + "LengthsToOffsets", + ["lengths"], + ["ranges"], + include_last_offset=include_last_offset, + ) + + def op_ref(x): + if not x.size: + arr = [x.reshape(0)] + else: + arr = [np.concatenate(([0], np.cumsum(x)[:-1]))] + if include_last_offset: + arr[0] = np.concatenate((arr[0], np.array([np.sum(x)]))) + return tuple(arr) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[np.array(lengths, dtype=np.int32)], + reference=op_ref, + ) + @given(prediction=hu.arrays(dims=[10, 3], elements=hu.floats(allow_nan=False, allow_infinity=False, From dfdc1dbee4809a7433fab5cb6880eb6396e9e59d Mon Sep 17 00:00:00 2001 From: ashish Date: Thu, 29 Oct 2020 08:03:46 -0700 Subject: [PATCH 101/169] Disable softmax tests on ROCm (#46793) Summary: This PR disables the test_softmax and test_softmax_results in test_nn.py that were enabled in https://github.com/pytorch/pytorch/issues/46363. The softmax tests are causing failure on gfx906 machines. Disabling those until we root cause and fix them on 906. 
cc: jeffdaily ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/46793 Reviewed By: izdeby Differential Revision: D24539211 Pulled By: ezyang fbshipit-source-id: 633cb9dc497ad6359af85b85a711c4549d772b2a --- test/test_nn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index 25960d817ab8..b86e340c14a5 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10834,6 +10834,8 @@ def test_embedding_padding_idx(self, device, dtype): embedding.zero_grad() self.assertEqual(after, pre) + # Test fails on Vg20 + @skipCUDAIfRocm @dtypesIfCUDA(torch.half, torch.float) @dtypes(torch.float) def test_softmax_results(self, device, dtype): @@ -11433,6 +11435,8 @@ def test_embedding_max_norm_device(self, device, dtype): self.assertEqual(output[1], output[2]) self.assertTrue(output.data.norm(p=2, dim=1).le(1).all()) + # Test fails on Vg20 + @skipCUDAIfRocm @onlyCUDA @dtypes(torch.half, torch.float) def test_softmax(self, device, dtype): From 262bd6437a9340809a51e89217075be1848a9862 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 29 Oct 2020 08:29:04 -0700 Subject: [PATCH 102/169] Show old kernel location when there are mismatches (#46850) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46850 So far, in the error messages when kernel signatures mismatched, we showed the location where the second kernel came from, but we didn't show the location of the first kernel. This PR now shows the location of both. ghstack-source-id: 115468616 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24540368 fbshipit-source-id: 3b4474062879d17f9bb7870ad3814343edc1b755 --- aten/src/ATen/BatchingRegistrations.cpp | 2 +- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 19 +++++++++++++------ aten/src/ATen/core/dispatch/OperatorEntry.h | 19 +++++++++++++------ .../op_registration/op_registration_test.cpp | 16 +++++++++------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 029a5be521f7..0b180b5059d1 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -222,7 +222,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) { VmapDimVector all_dims_physical; all_dims_physical.reserve(self_physical.tensor().dim()); for (int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) { - all_dims_physical.push_back(bdim); + all_dims_physical.push_back(bdim); } all_dims_physical.insert( all_dims_physical.end(), diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 97651c9865a1..97ac200d79ef 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -83,13 +83,15 @@ std::list::iterator OperatorEntry::registerKernel( // that would also invalidate the old TypedOperatorHandles. if (cpp_signature.has_value()) { if (cpp_signature_.has_value()) { - TORCH_INTERNAL_ASSERT(*cpp_signature == *cpp_signature_, - "Tried to register a kernel (", debug, ") for operator ", name_," for dispatch key ", toString(dispatch_key), - ", but the C++ function signature ", cpp_signature->name(), " mismatched with a previous kernel that had the signature ", - cpp_signature_->name() + TORCH_INTERNAL_ASSERT(*cpp_signature == cpp_signature_->signature, + "Tried to register a kernel (", debug, ") for operator ", name_," (", + (this->schema_.has_value() ? 
this->schema_->debug : "no debug info"), + ") for dispatch key ", toString(dispatch_key), ", but the C++ function signature ", + cpp_signature->name(), " mismatched with a previous kernel (", cpp_signature_->debug, + ") that had the signature ", cpp_signature_->signature.name() ); } else { - cpp_signature_ = *cpp_signature; + cpp_signature_ = CppSignatureWithDebug { *cpp_signature, debug }; } } @@ -103,7 +105,12 @@ std::list::iterator OperatorEntry::registerKernel( auto& k = dispatch_key.has_value() ? kernels_[*dispatch_key] : kernels_[DispatchKey::Math]; if (k.size() > 0) { - TORCH_WARN("Registering a kernel (", debug, ") for operator ", name_, " for dispatch key ", toString(dispatch_key), " that overwrote a previously registered kernel with the same dispatch key for the same operator."); + TORCH_WARN("Registering a kernel (", debug, ") for operator ", name_, " (", + (this->schema_.has_value() ? this->schema_->debug : "no debug info"), + ") for dispatch key ", toString(dispatch_key), + " that overwrote a previously registered kernel (", + (cpp_signature_.has_value() ? cpp_signature_->debug : "no debug info"), + ") with the same dispatch key for the same operator."); } if (manuallyBoxedKernel_.has_value()) { diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index ed4d5f40b97f..26506cb0f76f 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -157,13 +157,15 @@ class CAFFE2_API OperatorEntry final { // Asserts that the given FuncType is correct for calling this operator in an unboxed way. template void assertSignatureIsCorrect() { - TORCH_INTERNAL_ASSERT(!cpp_signature_.has_value() || (CppSignature::make() == *cpp_signature_), + TORCH_INTERNAL_ASSERT(!cpp_signature_.has_value() || (CppSignature::make() == cpp_signature_->signature), "Tried to access operator ", name_, " with a wrong signature. Accessed with ", CppSignature::make().name(), " but the operator was registered with ", - cpp_signature_->name(), - " (", + cpp_signature_->signature.name(), + " (schema: ", (schema_.has_value() ? schema_->debug : "unknown debug info"), + ", kernel: ", + cpp_signature_->debug, ") This likely happened in a call to OperatorHandle::typed(). Please make sure that the function signature matches the signature in the operator registration call." ); } @@ -230,12 +232,17 @@ class CAFFE2_API OperatorEntry final { AnnotatedKernel missingKernel_; static const AnnotatedKernel ambiguousAutogradOtherKernel_; - // signature_hash_ is set to the hash of the function signature if any of + // cpp_signature_ stores function signature if any of // the kernels was created in a way that allowed us to know the function // signature (i.e. by supplying an unboxed C++ kernel function). - // If this is set, it will be used in unboxed function calls + // If this is set, it will be used to check that future kernel + // registrations match and it will be used in unboxed function calls // to verify their arguments against the known function signature. 
- c10::optional cpp_signature_; + struct CppSignatureWithDebug { + CppSignature signature; + std::string debug; + }; + c10::optional cpp_signature_; // Whether this operator needs to be observed with RecordFunction const bool is_observed_; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 93b0ffc1b88e..3fd8740d1ab1 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -310,7 +310,8 @@ TEST(OperatorRegistrationTest, givenMultipleKernelsWithSameDispatchKey_whenRegis std::string output = testing::internal::GetCapturedStderr(); EXPECT_THAT(output, testing::HasSubstr("_test::dummy")); EXPECT_THAT(output, testing::HasSubstr("CPU")); - EXPECT_THAT(output, testing::HasSubstr("overwrote a previously registered kernel with the same dispatch key for the same operator")); + EXPECT_THAT(output, testing::HasSubstr("overwrote a previously registered kernel ")); + EXPECT_THAT(output, testing::HasSubstr(" with the same dispatch key for the same operator")); } TEST(OperatorRegistrationTest, givenMultipleKernelsWithSameDispatchKey_whenRegisteringInSameOpCall_thenFails) { @@ -348,7 +349,8 @@ TEST(OperatorRegistrationTest, givenMultipleCatchallKernels_whenRegistering_then std::string output = testing::internal::GetCapturedStderr(); EXPECT_THAT(output, testing::HasSubstr("_test::dummy")); EXPECT_THAT(output, testing::HasSubstr("catch all")); - EXPECT_THAT(output, testing::HasSubstr("overwrote a previously registered kernel with the same dispatch key for the same operator")); + EXPECT_THAT(output, testing::HasSubstr("overwrote a previously registered kernel ")); + EXPECT_THAT(output, testing::HasSubstr(" with the same dispatch key for the same operator")); } TEST(OperatorRegistrationTest, givenMultipleCatchallKernels_whenRegisteringInSameOpCall_thenFails) { @@ -701,7 +703,7 @@ TEST(OperatorRegistrationTest, whenRegisteringMismatchingKernelsInSameOpCall_the auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() .kernel(c10::DispatchKey::CPU) .kernel(c10::DispatchKey::CUDA, &called_kernel)); - }, "mismatched with a previous kernel that had the signature"); + }, "mismatched with a previous kernel"); } void backend_fallback_kernel(const c10::OperatorHandle& op, c10::Stack* stack) { @@ -944,7 +946,7 @@ TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingC expectThrows([] { auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() .kernel(DispatchKey::CPU, [] (const int64_t&) {})); - }, "mismatched with a previous kernel that had the signature"); + }, "mismatched with a previous kernel"); } TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringCatchAllAndBackendWithMismatchingCppSignatures_thenFails) { @@ -953,7 +955,7 @@ TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringCatchAllAndBacke expectThrows([] { auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() .kernel(DispatchKey::CPU, [] (const int64_t&) {})); - }, "mismatched with a previous kernel that had the signature"); + }, "mismatched with a previous kernel"); } TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringBackendAndCatchAllWithMismatchingCppSignatures_thenFails) { @@ -962,7 +964,7 @@ TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringBackendAndCatchA expectThrows([] { auto registrar = 
c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() .catchAllKernel([] (const int64_t&) {})); - }, "mismatched with a previous kernel that had the signature"); + }, "mismatched with a previous kernel"); } TEST(OperatorRegistrationTest, givenLambdaKernel_whenAccessingWithMismatchingCppSignatures_thenFails) { @@ -989,7 +991,7 @@ TEST(OperatorRegistrationTest, givenTorchLibrary_whenRegisteringWithMismatchingC m.impl("dummy", DispatchKey::CPU, [] (int64_t) {}); expectThrows([&] { m.impl("dummy", DispatchKey::CUDA, [] (const int64_t&) {}); - }, "mismatched with a previous kernel that had the signature"); + }, "mismatched with a previous kernel"); } TEST(OperatorRegistrationTest, givenTorchLibrary_whenAccessingWithMismatchingCppSignatures_thenFails) { From ecdbea77bca044ee97eb5f17d435d701e044b79f Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 29 Oct 2020 09:10:47 -0700 Subject: [PATCH 103/169] Fix DDP documentation (#46861) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46861 Noticed that in the DDP documentation: https://pytorch.org/docs/master/generated/torch.nn.parallel.DistributedDataParallel.html?highlight=distributeddataparallel there were some examples with `torch.nn.DistributedDataParallel`, fix this to read `torch.nn.parallel.DistributedDataParallel`. ghstack-source-id: 115453703 Test Plan: ci Reviewed By: pritamdamania87, SciPioneer Differential Revision: D24534486 fbshipit-source-id: 64b92dc8a55136c23313f7926251fe825a2cb7d5 --- torch/nn/parallel/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 0fceb2137a3b..7d64ed8a4174 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -329,7 +329,7 @@ class DistributedDataParallel(Module): Example:: >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') - >>> net = torch.nn.DistributedDataParallel(model, pg) + >>> net = torch.nn.parallel.DistributedDataParallel(model, pg) """ def __init__(self, module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, @@ -626,7 +626,7 @@ def no_sync(self): Example:: - >>> ddp = torch.nn.DistributedDataParallel(model, pg) + >>> ddp = torch.nn.parallel.DistributedDataParallel(model, pg) >>> with ddp.no_sync(): >>> for input in inputs: >>> ddp(input).backward() # no synchronization, accumulate grads From 13b4127c95134fb2145e251441bebf63e30876e4 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 29 Oct 2020 10:20:14 -0700 Subject: [PATCH 104/169] Fix implicit conversion (#46833) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46833 Implicit integer conversions are causing compiler warnings. Since in this case the logs make it pretty clear that the `unsigned` types won't overflow despite 64-bit inputs, we fix the issue by making the downconversion explicit. Test Plan: Standard test rig. 
Reviewed By: malfet Differential Revision: D24481377 fbshipit-source-id: 4422538286d8ed2beb65065544016fd430394ff8 --- c10/util/llvmMathExtras.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/c10/util/llvmMathExtras.h b/c10/util/llvmMathExtras.h index 8def126c29aa..2c4fbf8a501b 100644 --- a/c10/util/llvmMathExtras.h +++ b/c10/util/llvmMathExtras.h @@ -14,9 +14,10 @@ #define LLVM_SUPPORT_MATHEXTRAS_H #include - #include #include #include + #include + #include #include #include #include @@ -547,26 +548,26 @@ /// (32 bit edition.) /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 inline unsigned Log2_32(uint32_t Value) { - return 31 - countLeadingZeros(Value); + return static_cast(31 - countLeadingZeros(Value)); } /// Return the floor log base 2 of the specified value, -1 if the value is zero. /// (64 bit edition.) inline unsigned Log2_64(uint64_t Value) { - return 63 - countLeadingZeros(Value); + return static_cast(63 - countLeadingZeros(Value)); } /// Return the ceil log base 2 of the specified value, 32 if the value is zero. /// (32 bit edition). /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 inline unsigned Log2_32_Ceil(uint32_t Value) { - return 32 - countLeadingZeros(Value - 1); + return static_cast(32 - countLeadingZeros(Value - 1)); } /// Return the ceil log base 2 of the specified value, 64 if the value is zero. /// (64 bit edition.) inline unsigned Log2_64_Ceil(uint64_t Value) { - return 64 - countLeadingZeros(Value - 1); + return static_cast(64 - countLeadingZeros(Value - 1)); } /// Return the greatest common divisor of the values using Euclid's algorithm. @@ -589,6 +590,7 @@ /// This function takes a 32-bit integer and returns the bit equivalent float. inline float BitsToFloat(uint32_t Bits) { + //TODO: Use bit_cast once C++20 becomes available. float F; static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); memcpy(&F, &Bits, sizeof(Bits)); From f629fbe2352571d3ec6daf9fba86fe6d095b73ee Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Thu, 29 Oct 2020 10:22:03 -0700 Subject: [PATCH 105/169] Added torch.linalg.tensorsolve (#46142) Summary: This PR adds `torch.linalg.tensorsolve` function that matches `numpy.linalg.tensorsolve`. Ref https://github.com/pytorch/pytorch/issues/42666. 
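Conceptually, `torch.linalg.tensorsolve(a, b)` solves `torch.tensordot(a, x, dims=x.ndim) == b` for `x`, like its NumPy counterpart. A short usage sketch with shapes borrowed from the tests added below:

    >>> a = torch.eye(2 * 3 * 4).reshape((2 * 3, 4, 2, 3, 4))
    >>> b = torch.randn(2 * 3, 4)
    >>> x = torch.linalg.tensorsolve(a, b)
    >>> x.shape
    torch.Size([2, 3, 4])
    >>> torch.allclose(torch.tensordot(a, x, dims=x.ndim), b)
    True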
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46142 Reviewed By: izdeby Differential Revision: D24539400 Pulled By: mruberry fbshipit-source-id: 6e38364fe0bc511e739036deb274d9307df119b2 --- aten/src/ATen/native/BatchLinearAlgebra.cpp | 3 +- aten/src/ATen/native/LinearAlgebra.cpp | 49 +++++++ .../ATen/native/cuda/BatchLinearAlgebra.cu | 9 +- aten/src/ATen/native/native_functions.yaml | 12 ++ docs/source/linalg.rst | 1 + test/test_linalg.py | 124 +++++++++++++++++- torch/csrc/api/include/torch/linalg.h | 26 ++++ torch/linalg/__init__.py | 45 +++++++ torch/overrides.py | 1 + 9 files changed, 264 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index e7e5659babbb..16f706ca0ed5 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -331,6 +331,7 @@ static void apply_solve(Tensor& b, Tensor& A, std::vector& infos) { auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); + auto lda = std::max(int64_t{1}, n); auto ipiv = at::empty({n}, b.options().dtype(kInt)); auto ipiv_data = ipiv.data_ptr(); @@ -339,7 +340,7 @@ static void apply_solve(Tensor& b, Tensor& A, std::vector& infos) { for (int64_t i = 0; i < batch_size; i++) { scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackSolve(n, nrhs, A_working_ptr, n, ipiv_data, b_working_ptr, n, &info); + lapackSolve(n, nrhs, A_working_ptr, lda, ipiv_data, b_working_ptr, lda, &info); infos[i] = info; if (info != 0) { return; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 0dd727fb3197..853d913e9336 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1602,6 +1602,55 @@ Tensor& linalg_norm_out(Tensor& result, const Tensor& self, std::string ord, opt return linalg_norm_out_impl(result, self, c10::nullopt, ord, opt_dim, keepdim, opt_dtype); } +Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, optional dims) { + /* + The idea is to reduce the problem to 2D matrix solve. + Step 1. (optional) `self` is permuted with `dims` such that dimensions from `dims` are moved to the right. + For example, if we have 4D input with the shape (1, 2, 3, 4) and dims=(0, 2), + then the result of permutation would have the shape (2, 4, 1, 3). + Step 2. reshape `self` to 2D matrix. + Step 3. solve the matrix equation self.to_2D() @ result = other.to_1D() + Step 4. reshape the result. 
+ */ + int64_t ndim = self.dim(); + Tensor self_ = self; + + // move dimensions of `self_` from `dims` to the end + if (dims.has_value()) { + DimVector dest_axes(dims.value().size()); + std::iota(dest_axes.begin(), dest_axes.end(), ndim - dest_axes.size()); + self_ = at::movedim(self_, dims.value(), dest_axes); + } + + // result_shape is self_.sizes[-(an-other.dim):] + std::vector result_shape = self_.sizes().slice(other.dim(), ndim - other.dim()).vec(); + + int64_t result_product = std::accumulate(result_shape.begin(), result_shape.end(), int64_t{1}, std::multiplies()); + int64_t other_product = std::accumulate(other.sizes().begin(), other.sizes().end(), int64_t{1}, std::multiplies()); + + // Check whether the self tensor can be reshaped to the 2D square matrix + TORCH_CHECK(result_product == other_product, + "Expected self to satisfy the requirement prod(self.shape[other.ndim:]) == prod(self.shape[:other.ndim]), but got ", + result_product, " != ", other_product); + + self_ = self_.reshape({result_product, result_product}); + + // 0th output of at::solve is the solution + // normally `other` would be flattened by at::solve expects 2D input + Tensor result = std::get<0>(at::solve(other.reshape({other.numel(), 1}), self_)); + return result.reshape(result_shape); +} + +Tensor& linalg_tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, optional dims) { + TORCH_CHECK(result.scalar_type() == self.scalar_type(), + "result dtype ", result.scalar_type(), " does not match self dtype ", self.scalar_type()); + + Tensor result_tmp = at::linalg_tensorsolve(self, other, dims); + at::native::resize_output(result, result_tmp.sizes()); + result.copy_(result_tmp); + return result; +} + static inline Tensor _chain_matmul_general(TensorList matrices, std::vector>& order, int64_t i, int64_t j) { if (i == j) return matrices[i]; diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 88bad0a919b2..318185e43e8a 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -840,12 +840,13 @@ AT_ERROR("solve: MAGMA library not found in " auto b_data = b.data_ptr(); magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); + magma_int_t lda = std::max(magma_int_t{1}, n); if (b.dim() == 2) { auto ipiv = at::empty({n}, at::kInt); magma_int_t info = 0; - magmaSolve(n, nrhs, A_data, n, ipiv.data_ptr(), - b_data, n, &info); + magmaSolve(n, nrhs, A_data, lda, ipiv.data_ptr(), + b_data, lda, &info); infos[0] = info; } else { auto A_mat_stride = matrixStride(A); @@ -885,7 +886,7 @@ AT_ERROR("solve: MAGMA library not found in " magma_int_t* info_array_cur = &info_array[mini_idx]; magmaSolveBatched( - n, nrhs, A_array_cur, n, ipiv_array_cur, b_array_cur, n, + n, nrhs, A_array_cur, lda, ipiv_array_cur, b_array_cur, lda, info_array_cur, batch_limit, magma_queue); } @@ -893,7 +894,7 @@ AT_ERROR("solve: MAGMA library not found in " // which concisely is equal to batch_size % batch_limit if (batch_size % batch_limit != 0) { magmaSolveBatched( - n, nrhs, &A_array[mini_idx], n, &ipiv_array[mini_idx], &b_array[mini_idx], n, + n, nrhs, &A_array[mini_idx], lda, &ipiv_array[mini_idx], &b_array[mini_idx], lda, &info_array[mini_idx], batch_size % batch_limit, magma_queue); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index b143d39e67e2..cf323bcbd365 100644 --- 
a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8875,6 +8875,18 @@ python_module: linalg variants: function +- func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor + python_module: linalg + variants: function + dispatch: + Math: linalg_tensorsolve + +- func: linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + Math: linalg_tensorsolve_out + ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index 834b6a60ac93..14d3ca1767e9 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -14,3 +14,4 @@ Functions .. autofunction:: det .. autofunction:: norm +.. autofunction:: tensorsolve diff --git a/test/test_linalg.py b/test/test_linalg.py index 127c674e5b05..d7de3841ab65 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1,13 +1,15 @@ import torch import unittest import itertools +import warnings from math import inf, nan, isnan from random import randrange from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN, make_tensor) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride) + (instantiate_device_type_tests, dtypes, dtypesIfCUDA, + onlyCUDA, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args from torch.autograd import gradcheck @@ -914,6 +916,126 @@ def test_nuclear_norm_exceptions_old(self, device): self.assertRaisesRegex(RuntimeError, "duplicate or invalid", torch.norm, x, "nuc", (0, 0)) self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2)) + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + @dtypesIfCUDA(torch.float, torch.double) + @precisionOverride({torch.float: 1e-4, torch.cfloat: 1e-4}) + def test_tensorsolve(self, device, dtype): + def run_test(a_shape, dims): + a = torch.randn(a_shape, dtype=dtype, device=device) + b = torch.randn(a_shape[:2], dtype=dtype, device=device) + result = torch.linalg.tensorsolve(a, b, dims=dims) + expected = np.linalg.tensorsolve(a.cpu().numpy(), b.cpu().numpy(), axes=dims) + self.assertEqual(result, expected) + + # check the out= variant + out = torch.empty_like(result) + ans = torch.linalg.tensorsolve(a, b, dims=dims, out=out) + self.assertEqual(ans, out) + self.assertEqual(ans, result) + + a_shapes = [(2, 3, 6), (3, 4, 4, 3)] + dims = [None, (0, 2)] + for a_shape, d in itertools.product(a_shapes, dims): + run_test(a_shape, d) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + @dtypesIfCUDA(torch.float, torch.double) + def test_tensorsolve_empty(self, device, dtype): + # Check for empty inputs. NumPy does not work for these cases. 
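+        # Instead of comparing against NumPy, verify the defining identity
+        # tensordot(a, x, dims=x.ndim) == b directly on the empty result below.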
+ a = torch.empty(0, 0, 1, 2, 3, 0, dtype=dtype, device=device) + b = torch.empty(a.shape[:2], dtype=dtype, device=device) + x = torch.linalg.tensorsolve(a, b) + self.assertEqual(torch.tensordot(a, x, dims=len(x.shape)), b) + + # TODO: once "solve_cuda" supports complex dtypes, they shall be added to above tests + @unittest.expectedFailure + @onlyCUDA + @skipCUDAIfNoMagma + @dtypes(torch.cfloat, torch.cdouble) + def test_tensorsolve_xfailed(self, device, dtype): + a_shape = (2, 3, 6) + a = torch.randn(a_shape, dtype=dtype, device=device) + b = torch.randn(a_shape[:2], dtype=dtype, device=device) + result = torch.linalg.tensorsolve(a, b) + expected = np.linalg.tensorsolve(a.cpu().numpy(), b.cpu().numpy()) + self.assertEqual(result, expected) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + @dtypesIfCUDA(torch.float, torch.double) + @precisionOverride({torch.float: 1e-4, torch.cfloat: 1e-4}) + def test_tensorsolve_non_contiguous(self, device, dtype): + def run_test_permuted(a_shape, dims): + # check for permuted / transposed inputs + a = torch.randn(a_shape, dtype=dtype, device=device) + a = a.movedim((0, 2), (-2, -1)) + self.assertFalse(a.is_contiguous()) + b = torch.randn(a.shape[:2], dtype=dtype, device=device) + b = b.t() + self.assertFalse(b.is_contiguous()) + result = torch.linalg.tensorsolve(a, b, dims=dims) + expected = np.linalg.tensorsolve(a.cpu().numpy(), b.cpu().numpy(), axes=dims) + self.assertEqual(result, expected) + + def run_test_skipped_elements(a_shape, dims): + # check for inputs with skipped elements + a = torch.randn(a_shape, dtype=dtype, device=device) + a = a[::2] + self.assertFalse(a.is_contiguous()) + b = torch.randn(a_shape[:2], dtype=dtype, device=device) + b = b[::2] + self.assertFalse(b.is_contiguous()) + result = torch.linalg.tensorsolve(a, b, dims=dims) + expected = np.linalg.tensorsolve(a.cpu().numpy(), b.cpu().numpy(), axes=dims) + self.assertEqual(result, expected) + + # check non-contiguous out + out = torch.empty(2 * result.shape[0], *result.shape[1:], dtype=dtype, device=device)[::2] + self.assertFalse(out.is_contiguous()) + ans = torch.linalg.tensorsolve(a, b, dims=dims, out=out) + self.assertEqual(ans, out) + self.assertEqual(ans, result) + + a_shapes = [(2, 3, 6), (3, 4, 4, 3)] + dims = [None, (0, 2)] + for a_shape, d in itertools.product(a_shapes, dims): + run_test_permuted(a_shape, d) + + a_shapes = [(4, 3, 6), (6, 4, 4, 3)] + dims = [None, (0, 2)] + for a_shape, d in itertools.product(a_shapes, dims): + run_test_skipped_elements(a_shape, d) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32) + def test_tensorsolve_errors_and_warnings(self, device, dtype): + # tensorsolve expects the input that can be reshaped to a square matrix + a = torch.eye(2 * 3 * 4).reshape((2 * 3, 4, 2, 3, 4)) + b = torch.randn(8, 4) + self.assertTrue(np.prod(a.shape[2:]) != np.prod(b.shape)) + with self.assertRaisesRegex(RuntimeError, r'Expected self to satisfy the requirement'): + torch.linalg.tensorsolve(a, b) + + # if non-empty out tensor with wrong shape is passed a warning is given + out = torch.empty_like(a) + b = torch.randn(6, 4) + with warnings.catch_warnings(record=True) as w: + # Trigger warning + torch.linalg.tensorsolve(a, b, out=out) + # Check warning occurs + self.assertEqual(len(w), 1) + self.assertTrue("An output with one or more elements was resized" in str(w[-1].message)) + + # dtypes should match + out = torch.empty_like(a).to(torch.int) + with 
self.assertRaisesRegex(RuntimeError, "result dtype Int does not match self dtype"): + torch.linalg.tensorsolve(a, b, out=out) instantiate_device_type_tests(TestLinalg, globals()) diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index 5ce90dcc972e..c0bae62510e6 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -28,6 +28,14 @@ inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, opt return torch::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +inline Tensor tensorsolve(const Tensor& self, const Tensor& other, optional dims) { + return torch::linalg_tensorsolve(self, other, dims); +} + +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, optional dims) { + return torch::linalg_tensorsolve_out(result, self, other, dims); +} + } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -53,4 +61,22 @@ inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, std::string o return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +/// Computes a tensor `x` such that `tensordot(input, x, dims=x.dim()) = other`. +/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.tensorsolve +/// +/// Example: +/// ``` +/// auto a = torch::eye(2*3*4).reshape({2*3, 4, 2, 3, 4}); +/// auto b = torch::randn(2*3, 4); +/// auto x = torch::linalg::tensorsolve(a, b); +/// ``` +inline Tensor tensorsolve(const Tensor& input, const Tensor& other, optional dims) { + return detail::tensorsolve(input, other, dims); +} + +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor& other, optional dims) { + return detail::tensorsolve_out(result, input, other, dims); +} + }} // torch::linalg diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index 074bb47faaac..ad0badf5eed9 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -139,3 +139,48 @@ >>> LA.norm(m[0, :, :]), LA.norm(m[1, :, :]) (tensor(3.7417), tensor(11.2250)) """) + +tensorsolve = _add_docstr(_linalg.linalg_tensorsolve, r""" +linalg.tensorsolve(input, other, dims=None, *, out=None) -> Tensor + +Computes a tensor ``x`` such that ``tensordot(input, x, dims=x.ndim) = other``. +The resulting tensor ``x`` has the same shape as ``input[other.ndim:]``. + +Supports real-valued and, only on the CPU, complex-valued inputs. + +.. note:: If :attr:`input` does not satisfy the requirement + ``prod(input.shape[other.ndim:]) == prod(input.shape[:other.ndim])`` + after (optionally) moving the dimensions using :attr:`dims`, then a RuntimeError will be thrown. + +Args: + input (Tensor): "left-hand-side" tensor, it must satisfy the requirement + ``prod(input.shape[other.ndim:]) == prod(input.shape[:other.ndim])``. + other (Tensor): "right-hand-side" tensor of shape ``input.shape[other.ndim]``. + dims (Tuple[int]): dimensions of :attr:`input` to be moved before the computation. + Equivalent to calling ``input = movedim(input, dims, range(len(dims) - input.ndim, 0))``. + If None (default), no dimensions are moved. + +Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. 
Default: ``None`` + +Examples:: + + >>> a = torch.eye(2 * 3 * 4).reshape((2 * 3, 4, 2, 3, 4)) + >>> b = torch.randn(2 * 3, 4) + >>> x = torch.linalg.tensorsolve(a, b) + >>> x.shape + torch.Size([2, 3, 4]) + >>> torch.allclose(torch.tensordot(a, x, dims=x.ndim), b) + True + + >>> a = torch.randn(6, 4, 4, 3, 2) + >>> b = torch.randn(4, 3, 2) + >>> x = torch.linalg.tensorsolve(a, b, dims=(0, 2)) + >>> x.shape + torch.Size([6, 4]) + >>> a = a.permute(1, 3, 4, 0, 2) + >>> a.shape[b.ndim:] + torch.Size([6, 4]) + >>> torch.allclose(torch.tensordot(a, x, dims=x.ndim), b, atol=1e-6) + True +""") diff --git a/torch/overrides.py b/torch/overrides.py index 8224944d9ff6..7f448b334d17 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -738,6 +738,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.tan: lambda input, out=None: -1, torch.tanh: lambda input, out=None: -1, torch.tensordot: lambda a, b, dims=2: -1, + torch.linalg.tensorsolve: lambda a, b, dims=None: -1, torch.tensor_split: lambda input, indices_or_sections, dim=0: -1, torch.threshold: lambda input, threshold, value, inplace=False: -1, torch.topk: lambda input, k, dim=-1, descending=False, out=None: -1, From 2249a293b73e1c5c9408c249d4e256f37b90f4ef Mon Sep 17 00:00:00 2001 From: Sameer Deshmukh Date: Thu, 29 Oct 2020 10:32:08 -0700 Subject: [PATCH 106/169] Fix segfault with torch.orgqr. (#46700) Summary: Fixes https://github.com/pytorch/pytorch/issues/41768 The fault was that a NULL `tau` would get passed to LAPACK function. This PR fixes that by checking whether the `tau` contains 0 elements at the beginning of the function. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46700 Reviewed By: albanD Differential Revision: D24616427 Pulled By: mruberry fbshipit-source-id: 92e8f1489b113c0ceeca6e54dea8b810a51a63c3 --- aten/src/TH/generic/THLapack.cpp | 1 - aten/src/TH/generic/THTensorLapack.cpp | 3 ++- test/test_torch.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp index 6a776f4d0a17..c0fb51f53e45 100644 --- a/aten/src/TH/generic/THLapack.cpp +++ b/aten/src/TH/generic/THLapack.cpp @@ -2,7 +2,6 @@ #define TH_GENERIC_FILE "TH/generic/THLapack.cpp" #else - TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info); TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info); TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index 2494e21791e4..76d7d7bc48d8 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -410,7 +410,8 @@ void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) { if (a == NULL) a = ra_; THArgCheck(THTensor_nDimension(a) == 2, 1, "'input' should be 2 dimensional"); - THArgCheck(!a->is_empty(), 1, "'input' should not be empty"); + THArgCheck(!a->is_empty(), 2, "'input' should not be empty"); + THArgCheck(!tau->is_empty(), 3, "'tau' should not be empty"); THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, a); diff --git a/test/test_torch.py b/test/test_torch.py index f1b0b23b1518..2711edbf3384 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ 
-15539,7 +15539,8 @@ def test_orgqr_errors(self, device): ((10,), (2,), r"'input' should be 2 dimensional"), ((10, 6), (20,), r"input.size\(1\) must be greater than or equal to input2.size\(0\)"), ((6, 10), (5,), r"input.size\(0\) must be greater than or equal to input.size\(1\)"), - ((0, 0), (0,), r"'input' should not be empty") + ((0, 0), (0,), r"'input' should not be empty"), + ((2, 2), (2, 0,), r"'tau' should not be empty") ] for a_size, tau_size, error_regex in test_cases: a = torch.rand(*a_size, device=device) From 2b6a720eb14c69893c748ba2ffd8d0567fb14add Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 29 Oct 2020 10:51:33 -0700 Subject: [PATCH 107/169] Update pybind to 2.6.0 (#46415) Summary: Preserve PYBIND11 (https://github.com/pytorch/pytorch/commit/63ce3fbde85672e112411ad731839e6db3c15c08) configuration options in `torch._C._PYBIND11 (https://github.com/pytorch/pytorch/commit/63ce3fbde85672e112411ad731839e6db3c15c08)_COMPILER_TYPE` and use them when building extensions Also, use f-strings in `torch.utils.cpp_extension` "Fixes" https://github.com/pytorch/pytorch/issues/46367 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46415 Reviewed By: VitalyFedyunin Differential Revision: D24605949 Pulled By: malfet fbshipit-source-id: 87340f2ed5308266a46ef8f0317316227dab9d4d --- .flake8 | 2 +- third_party/pybind11 | 2 +- torch/csrc/Module.cpp | 23 +++++++++++++++++++++++ torch/utils/cpp_extension.py | 23 +++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/.flake8 b/.flake8 index 7ecc6df31754..8be8496e4224 100644 --- a/.flake8 +++ b/.flake8 @@ -12,5 +12,5 @@ ignore = B007,B008, # these ignores are from flake8-comprehensions; please fix! C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 -per-file-ignores = __init__.py: F401 +per-file-ignores = __init__.py: F401 torch/utils/cpp_extension.py: B950 exclude = docs/src,venv,third_party,caffe2,scripts,docs/caffe2,torch/lib/include,torch/lib/tmp_install,build,torch/include,*.pyi,.git,build,build_test_custom_build,build_code_analyzer diff --git a/third_party/pybind11 b/third_party/pybind11 index 25abf7efba0b..59a2ac2745d8 160000 --- a/third_party/pybind11 +++ b/third_party/pybind11 @@ -1 +1 @@ -Subproject commit 25abf7efba0b2990f5a6dfb0a31bc65c0f2f4d17 +Subproject commit 59a2ac2745d8a57ac94c6accced73620d59fb844 diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 7e4f8bd3c080..a5df6329030d 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -864,6 +864,29 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif +// See note [Pybind11 ABI constants] +#define SET_STR_DEFINE(name) \ + ASSERT_TRUE(set_module_attr("_" # name, THPUtils_packString(name))) + +#ifdef PYBIND11_COMPILER_TYPE + SET_STR_DEFINE(PYBIND11_COMPILER_TYPE); +#else + ASSERT_TRUE(set_module_attr("_" C10_STRINGIZE(PYBIND11_COMPILER_TYPE), Py_None)); +#endif + +#ifdef PYBIND11_STDLIB + SET_STR_DEFINE(PYBIND11_STDLIB); +#else + ASSERT_TRUE(set_module_attr("_" C10_STRINGIZE(PYBIND11_STDLIB), Py_None)); +#endif + +#ifdef PYBIND11_BUILD_ABI + SET_STR_DEFINE(PYBIND11_BUILD_ABI); +#else + ASSERT_TRUE(set_module_attr("_" C10_STRINGIZE(PYBIND11_BUILD_ABI), Py_None)); +#endif +#undef SET_STR_DEFINE + const auto& defaultGenerator = at::detail::getDefaultCPUGenerator(); THPDefaultCPUGenerator = (THPGenerator*)THPGenerator_initDefaultGenerator(defaultGenerator); // This reference is meant to be given away, so no need 
to incref here. diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 737a6371b9f7..afd654c6a85b 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -364,6 +364,11 @@ def build_extensions(self) -> None: extension.extra_compile_args[ext] = [] self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H') + # See note [Pybind11 ABI constants] + for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]: + val = getattr(torch._C, f"_PYBIND11_{name}") + if val is not None and not IS_WINDOWS: + self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"') self._define_torch_extension_name(extension) self._add_gnu_cpp_abi_flag(extension) @@ -1590,6 +1595,24 @@ def _write_ninja_file_to_build_library(path, common_cflags = [f'-DTORCH_EXTENSION_NAME={name}'] common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H') + + # Note [Pybind11 ABI constants] + # + # Pybind11 before 2.4 used to build an ABI strings using the following pattern: + # f"__pybind11_internals_v{PYBIND11_INTERNALS_VERSION}{PYBIND11_INTERNALS_KIND}{PYBIND11_BUILD_TYPE}__" + # Since 2.4 compier type, stdlib and build abi parameters are also encoded like this: + # f"__pybind11_internals_v{PYBIND11_INTERNALS_VERSION}{PYBIND11_INTERNALS_KIND}{PYBIND11_COMPILER_TYPE}{PYBIND11_STDLIB}{PYBIND11_BUILD_ABI}{PYBIND11_BUILD_TYPE}__" + # + # This was done in order to further narrow down the chances of compiler ABI incompatibility + # that can cause a hard to debug segfaults. + # For PyTorch extensions we want to relax those restrictions and pass compiler, stdlib and abi properties + # captured during PyTorch native library compilation in torch/csrc/Module.cpp + + for pname in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]: + pval = getattr(torch._C, f"_PYBIND11_{pname}") + if pval is not None and not IS_WINDOWS: + common_cflags.append(f'-DPYBIND11_{pname}=\\"{pval}\\"') + common_cflags += [f'-I{include}' for include in user_includes] common_cflags += [f'-isystem {include}' for include in system_includes] From 38265acfbece4c9c3fce8a8ff031532115343c8a Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 29 Oct 2020 11:12:41 -0700 Subject: [PATCH 108/169] Add Mul op for Vulkan (#47021) Summary: Updates mul_scalar shader to support the new Vulkan API, and adds a new op for it using the new API. Also adds an in-place version for the op. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47021 Test Plan: Unit test included. 
To build & run: ``` BUILD_CUSTOM_PROTOBUF=OFF \ BUILD_TEST=ON \ USE_EIGEN_FOR_BLAS=OFF \ USE_FBGEMM=OFF \ USE_MKLDNN=OFF \ USE_NNPACK=OFF \ USE_NUMPY=OFF \ USE_OBSERVERS=OFF \ USE_PYTORCH_QNNPACK=OFF \ USE_QNNPACK=OFF \ USE_VULKAN=ON \ USE_VULKAN_API=ON \ USE_VULKAN_SHADERC_RUNTIME=ON \ USE_VULKAN_WRAPPER=OFF \ MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python3 setup.py develop --cmake && ./build/bin/vulkan_api_test ``` Reviewed By: AshkanAliabadi Differential Revision: D24624729 Pulled By: SS-JIA fbshipit-source-id: 97e76e4060307a9a24311ac51dca8812e4471249 --- aten/src/ATen/native/vulkan/VulkanOps.cpp | 5 +- .../ATen/native/vulkan/glsl/mul_scalar.glsl | 26 ++-- .../ATen/native/vulkan/glsl/mul_scalar_.glsl | 26 ++++ aten/src/ATen/native/vulkan/ops/Mul.cpp | 130 ++++++++++++++++++ aten/src/ATen/test/vulkan_api_test.cpp | 24 ++++ 5 files changed, 198 insertions(+), 13 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl create mode 100644 aten/src/ATen/native/vulkan/ops/Mul.cpp diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index 8b040ecbec10..2ad3a695d65b 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -613,13 +613,12 @@ void mul(VulkanTensor& output, const VulkanTensor& input, const float s) { auto device = context().device(); struct ConstBlock { - int32_t inputSize[4]; + int32_t inputSize[3]; float s; }; ConstBlock cb{{safe_downcast(W), safe_downcast(H), - safe_downcast(C_4), - 0}, + safe_downcast(C_4)}, s}; VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index d34a99d2c6e8..8d7fc2b93a7f 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -1,21 +1,27 @@ #version 450 core +#define PRECISION $precision + layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba16f, binding = 0) writeonly highp uniform image3D uOutput; -layout(set = 0, binding = 1) uniform highp sampler3D uInput; -layout(set = 0, binding = 2) uniform constBlock { - ivec4 sizes; +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform restrict Block { + ivec3 WHC; float other; -} -uConstBlock; +} uBlock; layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(pos, uConstBlock.sizes.xyz))) { - vec4 v = uConstBlock.other * texelFetch(uInput, pos, 0); - imageStore(uOutput, pos, v); + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + texelFetch(uInput, pos, 0) * uBlock.other); } } diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl new file mode 100644 index 000000000000..9d1626a2ba83 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -0,0 +1,26 @@ +#version 450 core +#define PRECISION $precision + +layout(std430) buffer; +layout(std430) uniform; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform 
restrict Block { + ivec3 WHC; + float other; +} uBlock; + +layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.WHC))) { + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) * uBlock.other); + } +} diff --git a/aten/src/ATen/native/vulkan/ops/Mul.cpp b/aten/src/ATen/native/vulkan/ops/Mul.cpp new file mode 100644 index 000000000000..d7bda8f73016 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Mul.cpp @@ -0,0 +1,130 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +Tensor mul_scalar( + const Tensor& self_arg, + const Scalar other) { + api::Context* const context = api::context(); + + const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); + const vTensor& v_self = convert(self); + + vTensor v_output{ + context, + self.sizes(), + self.options(), + }; + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_output.has_image() && v_self.has_image()) { + const struct { + uint32_t width, height, channels; + float other; + } block { + v_output.extents().width, + v_output.extents().height, + v_output.extents().depth, + other.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(mul_scalar), + v_output.extents(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image(command_buffer, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_self.image(command_buffer), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return convert(v_output); +} + +Tensor& mul_scalar_( + Tensor& self_arg, + const Scalar other) { + api::Context* const context = api::context(); + + TORCH_CHECK( + self_arg.is_vulkan(), + "Vulkan: In-place add is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self_arg); + + api::Command::Buffer command_buffer = context->command().pool.allocate(); + command_buffer.begin(); + { + if (v_self.has_image()) { + const struct { + uint32_t width, height, channels; + float other; + } block { + v_self.extents().width, + v_self.extents().height, + v_self.extents().depth, + other.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(mul_scalar_), + v_self.extents(), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
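+          // Note: the in-place shader both reads and writes uOutput (imageLoad followed
+          // by imageStore), which is why Read | Write access is requested above.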
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_buffer.end(); + command_buffer.submit(context->gpu().queue); + + return self_arg; +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl("mul.Scalar", TORCH_FN(mul_scalar)); + m.impl("mul_.Scalar", TORCH_FN(mul_scalar_)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 123028fe714c..05febc539966 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -79,6 +79,30 @@ TEST(VulkanAPITest, add_scalar_) { ASSERT_TRUE(almostEqual(a_cpu, a_vulkan.cpu())); } +TEST(VulkanAPITest, mul_scalar) { + const auto a_cpu = at::rand({17, 213, 213, 7}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const float b_scalar = 3.1415f; + + const auto c_cpu = at::mul(a_cpu, b_scalar); + const auto c_vulkan = at::mul(a_vulkan, b_scalar); + + ASSERT_TRUE(almostEqual(c_cpu, c_vulkan.cpu())); +} + +TEST(VulkanAPITest, mul_scalar_) { + auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const float b_scalar = 3.1415f; + + a_cpu.mul_(b_scalar); + a_vulkan.mul_(b_scalar); + + ASSERT_TRUE(almostEqual(a_cpu, a_vulkan.cpu())); +} + TEST(VulkanAPITest, copy) { const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat)); ASSERT_TRUE(exactlyEqual(cpu, cpu.vulkan().cpu())); From dd95bf65b6a6c97455efdcfce0aca46e493db916 Mon Sep 17 00:00:00 2001 From: Blaise Sanouillet Date: Thu, 29 Oct 2020 11:16:13 -0700 Subject: [PATCH 109/169] [caffe2/FC DNNLOWP] Shrink Y_int32_ vector capacity when appropriate Summary: The FullyConnectedDNNLowPOp::Y_int32_ vectors consume between 1GB and 2GB on one of FB's larger applications. By adding tracing I noticed that the number of elements in each instance oscillates wildy over time. As the buffer backing a vector can only be extended in a resize operation, this means there is wasted memory space. So as a simple optimization, I added code to right-size the buffer backing the vector when the number of elements is less than half the vector capacity at that point; this doesn't affect the existing elements. There is of course a memory/cpu tradeoff here - with the change we are doing more mallocs and frees. I added tracing to measure how many times we grow or shrink per second: it's about 100 per second on average, which is not a great deal. Test Plan: Memory growth impact: over 24 hours and after the startup period, the memory consumed by this code grows from 0.85GB to 1.20GB vs 0.95GB to 1.75GB in the baseline. 
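To make the capacity heuristic from the summary above concrete, here is a minimal standalone sketch (hypothetical `buffer` and `new_size` names, not the operator's actual members): resize the scratch vector for the next output and hand memory back whenever the new element count drops below half of the currently reserved capacity.

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Resize a reusable scratch buffer, and ask the allocator to release the excess
// reservation when the new element count falls below half of the capacity.
void resize_and_maybe_shrink(std::vector<int32_t>& buffer, std::size_t new_size) {
  buffer.resize(new_size);
  if (buffer.size() < buffer.capacity() / 2) {
    // shrink_to_fit() is a non-binding request, but in practice it reallocates to a
    // tighter buffer, trading occasional extra mallocs for a lower peak footprint.
    buffer.shrink_to_fit();
  }
}
```

This mirrors the guard added around `Y_int32_` in the diff below.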
[ source: https://fburl.com/scuba/heap_profiles/wm47kpfe ] https://pxl.cl/1pHlJ Reviewed By: jspark1105 Differential Revision: D24592098 fbshipit-source-id: 7892b35f24e42403653a74a1a9d06cbc7ee866b9 --- caffe2/quantization/server/fully_connected_dnnlowp_op.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc index c7e6804c1dcf..4a5a6e6b7ad0 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc @@ -190,6 +190,9 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { if (!dequantize_output_) { Y_int32_.resize(Y->size()); + if (Y_int32_.size() < Y_int32_.capacity() / 2) { + Y_int32_.shrink_to_fit(); + } DoNothing<> doNothingObj{}; if (quantize_channelwise_ || filter_qparams_[0].zero_point) { @@ -443,6 +446,9 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { #endif Y_int32_.resize(Y->size()); + if (Y_int32_.size() < Y_int32_.capacity() / 2) { + Y_int32_.shrink_to_fit(); + } for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { int32_t sum = 0; From 6eaa324c9fff2e5cb0ff8ddb71665ef1e594d623 Mon Sep 17 00:00:00 2001 From: mfkasim91 Date: Thu, 29 Oct 2020 11:38:18 -0700 Subject: [PATCH 110/169] Implement torch.igamma (#46183) Summary: Fixes https://github.com/pytorch/pytorch/issues/41637 This is regularized lower incomplete gamma function, equivalent to scipy's `gammainc` and tensorflow `igamma`. cc fritzo mruberry Pull Request resolved: https://github.com/pytorch/pytorch/pull/46183 Reviewed By: gchanan Differential Revision: D24479126 Pulled By: mruberry fbshipit-source-id: fdf8ea289fe4ca1b408810732192411e948fcdfe --- NOTICE | 106 +++ aten/src/ATen/core/NamedRegistrations.cpp | 3 + aten/src/ATen/core/aten_interned_strings.h | 2 + aten/src/ATen/cpu/vec256/vec256_base.h | 7 + aten/src/ATen/cpu/vec256/vec256_bfloat16.h | 19 + .../ATen/cpu/vec256/vec256_complex_double.h | 3 + .../ATen/cpu/vec256/vec256_complex_float.h | 3 + aten/src/ATen/cpu/vec256/vec256_double.h | 10 + aten/src/ATen/cpu/vec256/vec256_float.h | 10 + aten/src/ATen/cpu/vec256/vec256_float_neon.h | 10 + aten/src/ATen/native/BinaryOps.cpp | 18 + aten/src/ATen/native/BinaryOps.h | 3 +- aten/src/ATen/native/Math.h | 710 ++++++++++++++++++ aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 14 + .../ATen/native/cuda/BinaryMiscOpsKernels.cu | 9 + aten/src/ATen/native/cuda/Math.cuh | 507 ++++++++++++- aten/src/ATen/native/native_functions.yaml | 15 + c10/util/math_compat.h | 10 +- docs/source/tensors.rst | 2 + docs/source/torch.rst | 1 + test/test_autograd.py | 8 + test/test_torch.py | 68 +- tools/autograd/derivatives.yaml | 4 + torch/_tensor_docs.py | 14 + torch/_torch_docs.py | 41 + torch/overrides.py | 1 + 26 files changed, 1590 insertions(+), 8 deletions(-) diff --git a/NOTICE b/NOTICE index 020beaea4c46..5abaac479a75 100644 --- a/NOTICE +++ b/NOTICE @@ -284,6 +284,112 @@ Apache License Version 2.0: incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. +======================================================================= +Cephes's 3-Clause BSD License +======================================================================= + +Code derived from implementations in the Cephes Math Library should mention +its derivation and reference the following license: + + 3-Clause BSD License for the Cephes Math Library + Copyright (c) 2018, Steven Moshier + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL Steven Moshier BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +======================================================================= +SciPy's 3-Clause BSD License +======================================================================= + +Code derived from implementations in SciPy should mention its derivation +and reference the following license: + + Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +======================================================================= +Boost's 1.0 Software License +======================================================================= + +Code derived from implementations in Boost 1.0 should mention its +derivation and reference the following license: + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. diff --git a/aten/src/ATen/core/NamedRegistrations.cpp b/aten/src/ATen/core/NamedRegistrations.cpp index 33e4ebcfc7dc..187b217604ba 100644 --- a/aten/src/ATen/core/NamedRegistrations.cpp +++ b/aten/src/ATen/core/NamedRegistrations.cpp @@ -210,6 +210,9 @@ TORCH_LIBRARY_IMPL(aten, Named, m) { m.impl("i0", CppFunction::makeFallthrough()); m.impl("i0.out", CppFunction::makeFallthrough()); m.impl("i0_", CppFunction::makeFallthrough()); + m.impl("igamma", CppFunction::makeFallthrough()); + m.impl("igamma.out", CppFunction::makeFallthrough()); + m.impl("igamma_", CppFunction::makeFallthrough()); m.impl("imag", CppFunction::makeFallthrough()); m.impl("index_fill.Dimname_Scalar", CppFunction::makeFallthrough()); m.impl("index_fill.Dimname_Tensor", CppFunction::makeFallthrough()); diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index da259e82990a..8d41520a7aff 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -371,6 +371,8 @@ _(aten, hstack) \ _(aten, hypot) \ _(aten, i0) \ _(aten, i0_) \ +_(aten, igamma) \ +_(aten, igamma_) \ _(aten, ifft) \ _(aten, index) \ _(aten, index_add) \ diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index edce0e3a2cce..807a9d9780f0 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -394,6 +394,13 @@ struct Vec256 { Vec256 i0() const { return map(calc_i0); } + Vec256 igamma(const Vec256 &x) const { + Vec256 ret; + for (int64_t i = 0; i < size(); i++) { + ret[i] = calc_igamma(values[i], x[i]); + } + return ret; + } Vec256 neg() const { // NB: the trailing return type is needed because we need to coerce the // return value back to T in the 
case of unary operator- incuring a diff --git a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h index 37d41676e53c..10bbe139b63f 100644 --- a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h @@ -290,6 +290,25 @@ template <> class Vec256 { auto o2 = _mm256_loadu_ps(tmp2); return cvtfp32_bf16(o1, o2); } + Vec256 igamma(const Vec256 &x) const { + __m256 lo, hi; + __m256 xlo, xhi; + cvtbf16_fp32(values, lo, hi); + cvtbf16_fp32(x.values, xlo, xhi); + __at_align32__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + __at_align32__ float tmpx1[size() / 2], tmpx2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmpx1), xlo); + _mm256_storeu_ps(reinterpret_cast(tmpx2), xhi); + for (int64_t i = 0; i < size() / 2; ++i) { + tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]); + tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]); + } + auto o1 = _mm256_loadu_ps(tmp1); + auto o2 = _mm256_loadu_ps(tmp2); + return cvtfp32_bf16(o1, o2); + } Vec256 log() const { return map(Sleef_logf8_u10); } diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index d2ae6f46b44e..d7f5afd8b67d 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -252,6 +252,9 @@ template <> class Vec256> { Vec256> hypot(const Vec256> &b) const { AT_ERROR("not supported for complex numbers"); } + Vec256> igamma(const Vec256> &x) const { + AT_ERROR("not supported for complex numbers"); + } Vec256> neg() const { auto zero = _mm256_setzero_pd(); return _mm256_sub_pd(zero, values); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index 8b4eba07f421..4df95dbea926 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -290,6 +290,9 @@ template <> class Vec256> { Vec256> hypot(const Vec256> &b) const { AT_ERROR("not supported for complex numbers"); } + Vec256> igamma(const Vec256> &x) const { + AT_ERROR("not supported for complex numbers"); + } Vec256> neg() const { auto zero = _mm256_setzero_ps(); return _mm256_sub_ps(zero, values); diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index fcad154e68b2..6b611e8d2e7a 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -155,6 +155,16 @@ template <> class Vec256 { Vec256 i0() const { return map(calc_i0); } + Vec256 igamma(const Vec256 &x) const { + __at_align32__ double tmp[size()]; + __at_align32__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } Vec256 log() const { return Vec256(Sleef_logd4_u10(values)); } diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index 1ab11ea81529..d83895fdf854 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -193,6 +193,16 @@ template <> class Vec256 { Vec256 i0() const { return map(calc_i0); } + Vec256 igamma(const Vec256 &x) const { + __at_align32__ float tmp[size()]; + __at_align32__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } Vec256 neg() const { 
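    // Negate by XOR-ing each lane with -0.0f, which flips only the IEEE-754 sign bit.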
return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); } diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h index f98c645a08d6..17be2b255108 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h @@ -362,6 +362,16 @@ template <> class Vec256 { Vec256 i0() const { return map(calc_i0); } + Vec256 igamma(const Vec256 &x) const { + __at_align32__ float tmp[size()]; + __at_align32__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } Vec256 log() const { return map(std::log); } diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index f8af756773c9..b7916ba3f9c8 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -46,6 +46,7 @@ DEFINE_DISPATCH(logaddexp2_stub); DEFINE_DISPATCH(gcd_stub); DEFINE_DISPATCH(lcm_stub); DEFINE_DISPATCH(hypot_stub); +DEFINE_DISPATCH(igamma_stub); DEFINE_DISPATCH(nextafter_stub); DEFINE_DISPATCH(heaviside_stub); @@ -968,6 +969,23 @@ Tensor& hypot_(Tensor& self, const Tensor& other) { return at::hypot_out(self, self, other); } +Tensor& igamma_out(Tensor& result, const Tensor& self, const Tensor& other) { + auto iter = TensorIterator::binary_op(result, self, other); + igamma_stub(iter.device_type(), iter); + return result; +} + +Tensor igamma(const Tensor& self, const Tensor& other) { + Tensor result; + auto iter = TensorIterator::binary_op(result, self, other); + igamma_stub(iter.device_type(), iter); + return iter.output(); +} + +Tensor& igamma_(Tensor& self, const Tensor& other) { + return at::igamma_out(self, self, other); +} + Tensor& nextafter_out(Tensor& result, const Tensor& self, const Tensor& other) { auto iter = TensorIterator::binary_op(result, self, other); nextafter_stub(iter.device_type(), iter); diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index 7640c8bd84ac..ee3f023fedc5 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -10,7 +10,7 @@ namespace at { namespace native { inline void alpha_check(const ScalarType dtype, Scalar alpha) { TORCH_CHECK(! alpha.isBoolean() || dtype == ScalarType::Bool, "Boolean alpha only supported for Boolean results."); - TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype) + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype) || alpha.isIntegral(true), "For integral input tensors, argument alpha must not be a floating point number."); } @@ -68,6 +68,7 @@ DECLARE_DISPATCH(binary_fn, logaddexp2_stub); DECLARE_DISPATCH(binary_fn, gcd_stub); DECLARE_DISPATCH(binary_fn, lcm_stub); DECLARE_DISPATCH(binary_fn, hypot_stub); +DECLARE_DISPATCH(binary_fn, igamma_stub); DECLARE_DISPATCH(binary_fn, nextafter_stub); DECLARE_DISPATCH(binary_fn, heaviside_stub); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index c00ffec94119..dc5530a72813 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -381,6 +381,716 @@ static inline float calc_polygamma(int64_t n, float x) { zeta(double(n + 1), x); } +// regularized lower incomplete gamma +// the regularized lower, upper incomplete gamma, as well as their +// helper functions follow SciPy's implementation + +/* References + * [igam1] "The Digital Library of Mathematical Functions", dlmf.nist.gov + * [igam2] Maddock et. 
al., "Incomplete Gamma Functions", + * https://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html + */ + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +template +static scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, + const scalar_t denom[], int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + int64_t i, dir; + scalar_t y, num_ans, denom_ans; + scalar_t absx = std::fabs(x); + const scalar_t *p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. */ + dir = -1; + p = num + M; + y = 1 / x; + } + else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } + else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return std::pow(x, i) * num_ans / denom_ans; + } + else { + return num_ans / denom_ans; + } +} + +// SciPy's lanczos implementation is taken from Boost +/* (C) Copyright John Maddock 2006. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. See + * https://www.boost.org/LICENSE_1_0.txt or see NOTICE. + */ +template +static scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + static const scalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859 + }; + static const scalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0. + }; + return ratevl(x, lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / sizeof(lanczos_sum_expg_scaled_num[0]) - 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / sizeof(lanczos_sum_expg_scaled_denom[0]) - 1); +} + +template +static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // exp(a - x). + + scalar_t ax, fac, res, num, numfac; + static scalar_t MAXLOG = std::is_same::value ? 
+ 7.09782712893383996843E2 : 88.72283905206835; + static scalar_t EXP1 = 2.718281828459045; + static scalar_t lanczos_g = 6.024680040776729583740234375; + + if (std::fabs(a - x) > 0.4 * std::fabs(a)) { + ax = a * std::log(x) - x - std::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return std::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = std::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= std::exp(a - x) * std::pow(x / fac, a); + } + else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= std::exp(a * (std::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +static scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + static scalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static int MAXITER = 2000; + + int i; + scalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. + + int n; + scalar_t fac = 1; + scalar_t sum = 0; + scalar_t term, logx; + static scalar_t MAXITER = 2000; + static scalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (std::fabs(term) <= MACHEP * std::fabs(sum)) { + break; + } + } + + logx = std::log(x); + term = -std::expm1(a * logx - std::lgamma(1+a)); + return term - std::exp(a * logx - std::lgamma(a)) * sum; +} + +template +static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + static const scalar_t d[25][25] = + {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, + 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, + 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, + -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, + -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + -1.2760635188618728e-5, 
3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4, + -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6, + 8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9, + -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12, + -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4, + 2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6, + 4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8, + -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11, + 2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12, + 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 
8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4, + -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5, + -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7, + 8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10, + -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, + 
8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, + 
-1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + static scalar_t MACHEP = std::is_same::value ? 
+ 1.11022302462515654042E-16 : 5.9604644775390625E-8; + scalar_t lambda = x / a; + scalar_t sigma = (x - a) / a; + scalar_t eta, res, ck, ckterm, term, absterm; + scalar_t absoldterm = INFINITY; + scalar_t etapow[25] = {1}; + scalar_t sum = 0; + scalar_t afac = 1; + + if (igam) { + sgn = -1; + } + else { + sgn = 1; + } + + if (lambda > 1) { + eta = std::sqrt(-2 * (std::log1p(sigma) - sigma)); + } + else if (lambda < 1) { + eta = -std::sqrt(-2 * (std::log1p(sigma) - sigma)); + } + else { + eta = 0; + } + res = 0.5 * std::erfc(sgn * eta * std::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n-1]; + maxpow += 1; + } + ckterm = d[k][n]*etapow[n]; + ck += ckterm; + if (std::fabs(ckterm) < MACHEP * std::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = std::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * std::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * std::exp(-0.5 * a * eta * eta) * sum / std::sqrt(2 * M_PIf * a); + + return res; +} + +template +static scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. [igam1] + int i; + scalar_t ans, ax, c, yc, r, t, y, z; + scalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + int MAXITER = 2000; + static scalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static scalar_t BIG = std::is_same::value ? + 4.503599627370496e15 : 16777216.; + static scalar_t BIGINV = std::is_same::value ? + 2.22044604925031308085e-16 : 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = std::fabs((ans - r) / r); + ans = r; + } + else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (std::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +static inline scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the substraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + scalar_t absxma_a; + + static scalar_t SMALL = 20.0; + static scalar_t LARGE = 200.0; + static scalar_t SMALLRATIO = 0.3; + static scalar_t LARGERATIO = 4.5; + + // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e., + // at most 1 of them can be 0), where igammac(0, x) = 0.0 iff x > 0. 
+ if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 0.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 1.0; + } + else if (std::isinf(a)) { + if (std::isinf(x)) { + return std::numeric_limits::quiet_NaN(); + } + return 1.0; + } + else if (std::isinf(x)) { + return 0.0; + } + + absxma_a = std::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_continued_fraction(a, x); + } + } + else if (x <= 0.5) { + if (-0.4 / std::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } + else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } +} + +template +static inline scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the substraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + scalar_t absxma_a; + static scalar_t SMALL = 20.0; + static scalar_t LARGE = 200.0; + static scalar_t SMALLRATIO = 0.3; + static scalar_t LARGERATIO = 4.5; + + // boundary values following SciPy + // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e., + // at most 1 of them can be 0), where igamma(0, x) = 1.0 iff x > 0. + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 1.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 0.0; // zero integration limit + } + else if (std::isinf(a)) { + if (std::isinf(x)) { + return std::numeric_limits::quiet_NaN(); + } + return 0.0; + } + else if (std::isinf(x)) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. 
See [igam2] */ + absxma_a = std::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +template <> +c10::BFloat16 calc_igamma(c10::BFloat16 a, c10::BFloat16 x) { + return calc_igamma(float(a), float(x)); +} + +template <> +c10::Half calc_igamma(c10::Half a, c10::Half x) { + return calc_igamma(float(a), float(x)); +} + inline c10::BFloat16 calc_erfinv(c10::BFloat16 a) { return calc_erfinv(float(a)); } template diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 4e2d00e8347b..652f3ee063e1 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -766,6 +766,19 @@ void hypot_kernel(TensorIterator& iter) { }); } +void igamma_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "igamma_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { + return calc_igamma(a, b); + }, + [=](Vec256 a, Vec256 b) { + return a.igamma(b); + }); + }); +} + void nextafter_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nextafter_cpu", [&]() { cpu_kernel_vec( @@ -824,6 +837,7 @@ REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_kernel); REGISTER_DISPATCH(gcd_stub, &gcd_kernel); REGISTER_DISPATCH(lcm_stub, &lcm_kernel); REGISTER_DISPATCH(hypot_stub, &hypot_kernel); +REGISTER_DISPATCH(igamma_stub, &igamma_kernel); REGISTER_DISPATCH(nextafter_stub, &nextafter_kernel); REGISTER_DISPATCH(heaviside_stub, &heaviside_kernel); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index 12b1a0dbf305..2f53c2bb08d7 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -92,6 +92,14 @@ void hypot_kernel_cuda(TensorIterator& iter) { }); } +void igamma_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "igamma_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return calc_igamma(a, b); + }); + }); +} + void nextafter_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "nextafter_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { @@ -116,6 +124,7 @@ REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_kernel_cuda); REGISTER_DISPATCH(gcd_stub, &gcd_kernel_cuda); REGISTER_DISPATCH(lcm_stub, &lcm_kernel_cuda); REGISTER_DISPATCH(hypot_stub, &hypot_kernel_cuda); +REGISTER_DISPATCH(igamma_stub, &igamma_kernel_cuda); REGISTER_DISPATCH(nextafter_stub, &nextafter_kernel_cuda); REGISTER_DISPATCH(heaviside_stub, &heaviside_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index eec428ae2a12..ddf3679b4c27 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -54,12 +54,12 @@ static inline __host__ __device__ scalar_t zeta(scalar_t _x, scalar_t _q) { a = q; i = 0; b = 0.0; - while((i < 9) || (a <= 9.0)){ + while ((i < 9) || (a <= 9.0)) { i += 1; a += 1.0; b = ::pow( a, -x ); s += b; - if((-MACHEP < (b / s)) && ((b / s) < MACHEP)) { + if ((-MACHEP < (b / s)) && ((b / 
s) < MACHEP)) { return static_cast(s); } }; @@ -68,16 +68,16 @@ static inline __host__ __device__ scalar_t zeta(scalar_t _x, scalar_t _q) { s -= 0.5 * b; a = 1.0; k = 0.0; - for(int i=0; i < 12; i++) { + for (int i=0; i < 12; i++) { a *= x + k; b /= w; t = a * b / A[i]; s = s + t; t = t / s; - if(t < 0){ + if (t < 0){ t = -t; } - if((-MACHEP (s); } k += 1.0; @@ -174,6 +174,503 @@ static inline __host__ __device__ scalar_t calc_polygamma(int n, scalar_t x) { return ((n % 2) ? 1.0 : -1.0) * ::exp(::lgamma(static_cast(n) + 1.0)) * zeta(static_cast(n + 1), x); } +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +static __host__ __device__ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, + const scalar_t denom[], int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = at::acc_type; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + const accscalar_t *p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. */ + dir = -1; + p = num + M; + y = 1 / x; + } + else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } + else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } + else { + return num_ans / denom_ans; + } +} + +template +static __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = at::acc_type; + + static const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859 + }; + static const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0 + }; + return ratevl(static_cast(x), lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / sizeof(lanczos_sum_expg_scaled_num[0]) - 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / sizeof(lanczos_sum_expg_scaled_denom[0]) - 1); +} + +template +static __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // exp(a - x). 
+ + using accscalar_t = at::acc_type; + accscalar_t ax, fac, res, num, numfac; + static accscalar_t MAXLOG = std::is_same::value ? + 7.09782712893383996843E2 : 88.72283905206835; + static accscalar_t EXP1 = 2.718281828459045; + static accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } + else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +static __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = at::acc_type; + static accscalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +static __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. + + using accscalar_t = at::acc_type; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + static accscalar_t MAXITER = 2000; + static accscalar_t MACHEP = std::is_same::value ? 
+ 1.11022302462515654042E-16 : 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1+a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +static __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = at::acc_type; + static const accscalar_t d[25][25] = + {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4, -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5, 1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6, 8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11, 2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9, -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10, -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12, -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18, 
-5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4, 2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7, 2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6, 4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7, -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8, -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17, -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11, 2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12, 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4, -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7, -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5, -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6, 8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7, 8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14, 3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10, -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11, -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, 9.7480262548731646e-10, 
-2.6405338676507616e-10, 5.794875163403742e-18, 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, 
-2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, 
2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + static accscalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } + else { + sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } + else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } + else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n-1]; + maxpow += 1; + } + ckterm = d[k][n]*etapow[n]; + ck += ckterm; + if (std::fabs(ckterm) < MACHEP * std::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = std::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * std::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +static __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. [igam1] + + using accscalar_t = at::acc_type; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + int MAXITER = 2000; + static accscalar_t MACHEP = std::is_same::value ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static accscalar_t BIG = std::is_same::value ? + 4.503599627370496e15 : 16777216.; + static accscalar_t BIGINV = std::is_same::value ? 
+ 2.22044604925031308085e-16 : 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } + else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +static inline __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the substraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = at::acc_type; + accscalar_t absxma_a; + + static accscalar_t SMALL = 20.0; + static accscalar_t LARGE = 200.0; + static accscalar_t SMALLRATIO = 0.3; + static accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 0.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 1.0; + } + else if (::isinf(static_cast(a))) { + if (::isinf(static_cast(x))) { + return std::numeric_limits::quiet_NaN(); + } + return 1.0; + } + else if (::isinf(static_cast(x))) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_continued_fraction(a, x); + } + } + else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } + else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } +} + +template +static inline __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the substraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + + using accscalar_t = at::acc_type; + accscalar_t absxma_a; + static accscalar_t SMALL = 20.0; + static accscalar_t LARGE = 200.0; + static accscalar_t SMALLRATIO = 0.3; + static accscalar_t LARGERATIO = 4.5; + + // 
boundary values following SciPy + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 1.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 0.0; // zero integration limit + } + else if (::isinf(static_cast(a))) { + if (::isinf(static_cast(x))) { + return std::numeric_limits::quiet_NaN(); + } + return 0.0; + } + else if (::isinf(static_cast(x))) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. */ + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +// end of regularized lower & upper incomplete gamma template static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cf323bcbd365..54836b19a658 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6496,6 +6496,21 @@ dispatch: DefaultBackend: hypot_ +- func: igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: igamma_out + +- func: igamma(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: igamma + +- func: igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: igamma_ + - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: nextafter_out diff --git a/c10/util/math_compat.h b/c10/util/math_compat.h index 7d1a7b643850..b522cd26f989 100644 --- a/c10/util/math_compat.h +++ b/c10/util/math_compat.h @@ -59,6 +59,14 @@ namespace std { throw std::runtime_error("std::hypot is not implemented on older Android"); } + // TODO: this function needs to be implemented and tested. Currently just throw an error. + inline float igamma(float x, float y) { + throw std::runtime_error("igamma is not implemented on older Android"); + } + inline double igamma(double x, double y) { + throw std::runtime_error("igamma is not implemented on older Android"); + } + // TODO: this function needs to be implemented and tested. Currently just throw an error. inline float nextafter(float x, float y) { throw std::runtime_error("std::nextafter is not implemented on older Android"); @@ -66,7 +74,7 @@ namespace std { inline double nextafter(double x, double y) { throw std::runtime_error("std::nextafter is not implemented on older Android"); } - + // TODO: this function needs to be implemented and tested. Currently just throw an error. inline float exp2(float x) { throw std::runtime_error("std::exp2 is not implemented on older Android"); diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 7110631088d7..3bc806067870 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -349,6 +349,8 @@ view of a storage and defines numeric operations on it. .. automethod:: hypot_ .. automethod:: i0 .. automethod:: i0_ + .. automethod:: igamma + .. automethod:: igamma_ .. automethod:: ifft .. automethod:: index_add_ .. 
automethod:: index_add diff --git a/docs/source/torch.rst b/docs/source/torch.rst index b3c8410300c6..884db0ebd331 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -310,6 +310,7 @@ Pointwise Ops logit hypot i0 + igamma mul multiply mvlgamma diff --git a/test/test_autograd.py b/test/test_autograd.py index eec016cc9623..5dc5c94c3e53 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2881,6 +2881,14 @@ def test_pow_scalar_base(self): a = torch.arange(1, 13, dtype=torch.double).view(3, 4).requires_grad_() gradcheck(lambda a: torch.pow(2, a), (a,)) + def test_igamma(self): + # 1e-3 offset to avoid zeros + # NOTE: derivative for s is not implemented + s = (torch.rand(100, dtype=torch.double) + 1e-3) + x = (torch.rand(100, dtype=torch.double) + 1e-3).requires_grad_() + gradcheck(torch.igamma, (s, x)) + gradgradcheck(torch.igamma, (s, x)) + @skipIfNoLapack def test_pinverse(self): # Why is pinverse tested this way, and not ordinarily as other linear algebra methods? diff --git a/test/test_torch.py b/test/test_torch.py index 2711edbf3384..faf4fbb976bf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -13690,6 +13690,8 @@ def test_binary_op_mem_overlap(self, device, dtype): ("atan2", True, True, 'cuda'), ("hypot", True, True, 'cpu'), ("hypot", True, True, 'cuda'), + ("igamma", True, True, 'cpu'), + ("igamma", True, True, 'cuda'), ("nextafter", True, True, 'cpu'), ("nextafter", True, True, 'cuda'), ("le", True, True, 'cpu'), @@ -17473,6 +17475,70 @@ def test_hypot(self, device, dtype): expected = np.hypot(input[0].cpu().numpy(), input[1].cpu().numpy()) self.assertEqual(actual, expected) + def _helper_test_igamma(self, loglo, loghi, device, dtype): + exp1 = 2.71828182846 + vec1 = torch.logspace(loglo, loghi, steps=500, base=exp1, + dtype=torch.float64, device=device).unsqueeze(-1) + vec1 = vec1.to(dtype) + inputs = [ + (vec1, vec1.transpose(0, 1)), + (vec1, vec1), # for large number, it should approach 0.5 + (vec1, 0.5 * vec1), # test for considerable ratio + (vec1, 2.0 * vec1), + (vec1[::2, :], vec1[::2, :]), # contiguous/discontiguous tests + (vec1[::2, :], vec1[:vec1.shape[0] // 2, :]), + (vec1[:vec1.shape[0] // 2, :], vec1[::2, :]), + ] + half_prec = dtype in [torch.bfloat16, torch.float16] + for input0, input1 in inputs: + actual = torch.igamma(input0, input1) + if half_prec: + input0 = input0.to(torch.float) + input1 = input1.to(torch.float) + expected = scipy.special.gammainc(input0.cpu().numpy(), input1.cpu().numpy()) + expected = torch.from_numpy(expected).to(dtype) + self.assertEqual(actual, expected) + + @skipCUDAIfRocm # see issue https://github.com/pytorch/pytorch/issues/46531 + @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) + @dtypes(torch.float32, torch.float64) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + @onlyOnCPUAndCUDA + def test_igamma_common(self, device, dtype): + # test igamma for reasonable range of values + loglo = -4 # approx 0.018 + loghi = 4 # approx 54.6 + self._helper_test_igamma(loglo, loghi, device, dtype) + + @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) + @dtypes(torch.float32, torch.float64) + @onlyOnCPUAndCUDA + def test_igamma_edge_cases(self, device, dtype): + tkwargs = {"dtype": dtype, "device": device} + infs = torch.zeros((3,), **tkwargs) + float("inf") + zeros = torch.zeros((3,), **tkwargs) + ones = torch.ones((3,), **tkwargs) + zero_to_large = torch.tensor([0., 1., 1e3], **tkwargs) + small_to_inf = torch.tensor([1e-3, 1., float("inf")], **tkwargs) + 
nans = torch.zeros((3,), **tkwargs) + float("nan") + inpouts = [ + # (a , x), out + ((zeros, small_to_inf), ones), + ((small_to_inf, zeros), zeros), + ((infs, zero_to_large), zeros), + ((zero_to_large, infs), ones), + ((zeros, zeros), nans), + ((infs, infs), nans), + ((-small_to_inf, small_to_inf), nans), + ] + for inputs, output in inpouts: + input0, input1 = inputs + calc = torch.igamma(input0, input1) + if torch.all(torch.isnan(output)): + self.assertTrue(torch.all(torch.isnan(calc))) + else: + self.assertEqual(calc, output) + @dtypes(torch.int64, torch.float64) def test_remainder_edge_cases(self, device, dtype): # Test variations of negative values used as input @@ -17496,8 +17562,8 @@ def test_remainder_edge_cases(self, device, dtype): r = a.remainder(b) r_expected = torch.tensor([0, 0, 0, 0, -3, 3, -2, 2] * 10000, dtype=dtype, device=device) self.assertEqual(r, r_expected) - # Test nan cases + a = torch.tensor([-34, 0, 34] * 20000, dtype=dtype, device=device) b = torch.zeros(3 * 20000, dtype=dtype, device=device) self.assertTrue(torch.isnan(a.remainder(b)).all()) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index fa03db91694c..146624183161 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -540,6 +540,10 @@ - name: i0(Tensor self) -> Tensor self: not_implemented("i0") +- name: igamma(Tensor self, Tensor other) -> Tensor + self: 'not_implemented("igamma: input")' + other: grad * exp((self - 1) * log(other) - other - lgamma(self)) + - name: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor self: index_backward(zeros_like(self), indices, grad) indices: TensorList() diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 33c9c6a0e307..12dd77497454 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1542,6 +1542,20 @@ def add_docstr_all(method, docstr): In-place version of :meth:`~Tensor.i0` """) +add_docstr_all('igamma', + r""" +igamma(other) -> Tensor + +See :func:`torch.igamma` +""") + +add_docstr_all('igamma_', + r""" +igamma_(other) -> Tensor + +In-place version of :meth:`~Tensor.igamma` +""") + add_docstr_all('indices', r""" indices() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ba02b5fc3110..b3a9d57c99d2 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3316,6 +3316,47 @@ def merge_dicts(*dicts): """.format(**common_args)) +add_docstr(torch.igamma, + r""" +igamma(input, other, *, out=None) -> Tensor + +Computes the regularized lower incomplete gamma function: + +.. math:: + \text{out}_{i} = \frac{1}{\Gamma(\text{input}_i)} \int_0^{\text{other}_i} t^{\text{input}_i-1} e^{-t} dt + +where both :math:`\text{input}_i` and :math:`\text{other}_i` are weakly positive +and at least one is strictly positive. +If both are zero or either is negative then :math:`\text{out}_i=\text{nan}`. +:math:`\Gamma(\cdot)` in the equation above is the gamma function, + +.. math:: + \Gamma(\text{input}_i) = \int_0^\infty t^{(\text{input}_i-1)} e^{-t} dt. + +See :func:`torch.lgamma` for a related function. + +Supports :ref:`broadcasting to a common shape ` +and float inputs. + +.. note:: + The backward pass with respect to :attr:`input` is not yet supported. + Please open an issue on PyTorch's Github to request it. 
+ +""" + r""" +Args: + input (Tensor): the first non-negative input tensor + other (Tensor): the second non-negative input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.igamma(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])) + tensor([0.3528, 0.5665, 0.7350]) + +""".format(**common_args)) + add_docstr(torch.index_select, r""" index_select(input, dim, index, *, out=None) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index 7f448b334d17..3286484e480f 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -388,6 +388,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.hstack: lambda tensors, out=None: -1, torch.hypot: lambda input, other, out=None: -1, torch.ifft: lambda input, signal_ndim, normalized=False: -1, + torch.igamma: lambda input, other, out=None: -1, torch.imag: lambda input, out=None: -1, torch.index_add: lambda input, dim, index, source: -1, torch.index_copy: lambda input, dim, index, source: -1, From 1e275bc1a64919536a7a269fc4de0ea100710da7 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Thu, 29 Oct 2020 11:57:14 -0700 Subject: [PATCH 111/169] Show Flake8 errors in GitHub CI again (#46990) Summary: Fixes https://github.com/pytorch/pytorch/issues/46985. Can someone comment on whether the "Run flake8" step should fail if `flake8` produces errors? This PR makes sure the errors are still shown, but [the job linked from the issue](https://github.com/pytorch/pytorch/runs/1320258832) also shows that the failure of that step seems to have caused the "Add annotations" step not to run. Is this what we want, or should I instead revert back to the `--exit-zero` behavior (in this case by just removing the `-o pipefail` from this PR) that we had before https://github.com/pytorch/pytorch/issues/46740? And if the latter, then (how) should I modify this `flake8-py3` job to make sure it fails when `flake8` fails (assuming it didn't already do that?) 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46990 Reviewed By: VitalyFedyunin Differential Revision: D24593573 Pulled By: samestep fbshipit-source-id: 361392846de9fadda1c87d2046cf8d26861524ca --- .github/workflows/lint.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3e6bb17375b9..04abbc0275af 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -69,11 +69,10 @@ jobs: id: get_pr_tip - name: Run flake8 run: | - set -eux + set -eux -o pipefail pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0 flake8 --version - flake8 > ${GITHUB_WORKSPACE}/flake8-output.txt - cat ${GITHUB_WORKSPACE}/flake8-output.txt + flake8 | tee ${GITHUB_WORKSPACE}/flake8-output.txt - name: Add annotations uses: pytorch/add-annotations-github-action@master with: From fee585b5a3d7a61a5a62df81cae596b486f409f2 Mon Sep 17 00:00:00 2001 From: tmanlaibaatar Date: Thu, 29 Oct 2020 11:58:10 -0700 Subject: [PATCH 112/169] Correctly mark unannotated NamedTuple field to be inferred TensorType (#46969) Summary: If there is no annotation given, we want to show users that the type is inferred Pull Request resolved: https://github.com/pytorch/pytorch/pull/46969 Test Plan: Added a new test case that throws an error with the expected error message Fixes https://github.com/pytorch/pytorch/issues/46326 Reviewed By: ZolotukhinM Differential Revision: D24614450 Pulled By: gmagogsfm fbshipit-source-id: dec555a53bfaa9cdefd3b21b5142f5e522847504 --- test/test_jit.py | 19 ++++++++++++++++++- torch/_jit_internal.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 9994754dcc42..f4c3a8d95b69 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -82,7 +82,7 @@ from itertools import product import itertools from textwrap import dedent -from typing import List, Dict, Optional, Tuple, Union +from typing import List, Dict, NamedTuple, Optional, Tuple, Union import inspect import math import functools @@ -13796,6 +13796,23 @@ def test_non_primitive_types(x): out = test_non_primitive_types(_MyNamedTuple(value=torch.tensor(5.0))) self.assertEqual(out, torch.tensor(6.0)) + def test_namedtuple_type_inference(self): + _AnnotatedNamedTuple = NamedTuple('_NamedTupleAnnotated', [('value', int)]) + _UnannotatedNamedTuple = namedtuple('_NamedTupleUnAnnotated', ['value']) + + def test_check_named_tuple_value(): + named_tuple = _AnnotatedNamedTuple(1) + return named_tuple.value + + self.checkScript(test_check_named_tuple_value, ()) + + def test_error(): + return _UnannotatedNamedTuple(1) + + with self.assertRaisesRegex(RuntimeError, r"Expected a value of type \'Tensor \(inferred\)\' " + r"for argument \'value\' but instead found type \'int\'."): + torch.jit.script(test_error) + def test_isinstance_dynamic(self): @torch.jit.script def foo(a): diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 4396155c73ea..a1c800debb59 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -839,7 +839,7 @@ def _get_named_tuple_properties(obj): the_type = torch.jit.annotations.ann_to_type(obj.__annotations__[field], fake_range()) annotations.append(the_type) else: - annotations.append(torch._C.TensorType.get()) + annotations.append(torch._C.TensorType.getInferred()) return type(obj).__name__, fields, annotations From 74d730c0b5142d2466a3fe011c4298e0b2eb9840 Mon Sep 17 00:00:00 
2001 From: Xiong Wei Date: Thu, 29 Oct 2020 12:06:11 -0700 Subject: [PATCH 113/169] implement NumPy-like functionality column_stack, row_stack (#46313) Summary: Related https://github.com/pytorch/pytorch/issues/38349 This PR implements `column_stack` as the composite ops of `torch.reshape` and `torch.hstack`, and makes `row_stack` as the alias of `torch.vstack`. Todo - [x] docs - [x] alias pattern for `row_stack` Pull Request resolved: https://github.com/pytorch/pytorch/pull/46313 Reviewed By: ngimel Differential Revision: D24585471 Pulled By: mruberry fbshipit-source-id: 62fc0ffd43d051dc3ecf386a3e9c0b89086c1d1c --- aten/src/ATen/core/aten_interned_strings.h | 2 +- aten/src/ATen/core/interned_strings.h | 2 + aten/src/ATen/native/TensorShape.cpp | 41 ++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 19 ++++++++ docs/source/torch.rst | 2 + test/test_op_aliases.py | 39 ++++++++++++----- test/test_torch.py | 50 +++++++++++++++------- torch/_torch_docs.py | 40 +++++++++++++++++ torch/csrc/jit/passes/normalize_ops.cpp | 1 + torch/overrides.py | 2 + 10 files changed, 172 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 8d41520a7aff..e84ad93de37d 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -738,7 +738,6 @@ _(aten, vander) \ _(aten, var) \ _(aten, view) \ _(aten, view_as) \ -_(aten, vstack) \ _(aten, where) \ _(aten, zero) \ _(aten, zeros) \ @@ -783,6 +782,7 @@ _(attr, ceil_mode) \ _(attr, checked_signal_sizes) \ _(attr, chunks) \ _(attr, columns) \ +_(attr, column_stack) \ _(attr, complex_input) \ _(attr, complex_output) \ _(attr, condition) \ diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 5f881b5edccc..7e841b8b5311 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -268,6 +268,8 @@ namespace c10 { _(aten, bin) \ _(aten, pop) \ _(aten, insert) \ + _(aten, vstack) \ + _(aten, row_stack) \ _(prim, unchecked_unwrap_optional) \ _(aten, __contains__) \ _(prim, BailoutTemplate) \ diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index a42e90f399d9..eec16a4cf824 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1262,6 +1262,47 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di return self; } +// torch.row_stack, alias for torch.vstack +Tensor& row_stack_out(Tensor& result, TensorList tensors) { + return at::vstack_out(result, tensors); +} + +Tensor row_stack(TensorList tensors) { + return at::vstack(tensors); +} + +static std::vector reshape_input_for_column_stack(TensorList tensors) { + std::vector result(tensors.size()); + auto transform_lambda = [](const Tensor& input) -> Tensor { + // reshape 0D or 1D tensor t into (t.numel(), 1) + if (input.dim() <= 1) { + return input.reshape({input.numel(), 1}); + } + return input; + }; + std::transform(tensors.cbegin(), + tensors.cend(), + result.begin(), + transform_lambda); + return result; +} + +Tensor& column_stack_out(Tensor& result, TensorList tensors) { + TORCH_CHECK(tensors.size() > 0, + "column_stack expects a non-empty TensorList"); + + auto reshaped_tensors = reshape_input_for_column_stack(tensors); + return at::hstack_out(result, reshaped_tensors); +} + +Tensor column_stack(TensorList tensors) { + TORCH_CHECK(tensors.size() > 0, + "column_stack expects a non-empty 
TensorList"); + + auto reshaped_tensors = reshape_input_for_column_stack(tensors); + return at::hstack(reshaped_tensors); +} + static Tensor& propagate_transposed_names( Tensor& result, const Tensor& other, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 54836b19a658..ccc07438eda9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1546,6 +1546,16 @@ - func: rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) use_c10_dispatcher: full +# row_stack is the alias of vstack +- func: row_stack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + dispatch: + Math: row_stack + +- func: row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + Math: row_stack_out + - func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -8664,6 +8674,15 @@ CPU: col2im_backward_cpu CUDA: col2im_backward_cuda +- func: column_stack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + dispatch: + Math: column_stack + +- func: column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + Math: column_stack_out + - func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 884db0ebd331..e36a3f944a7a 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -85,6 +85,7 @@ Indexing, Slicing, Joining, Mutating Ops cat chunk + column_stack dstack gather hstack @@ -94,6 +95,7 @@ Indexing, Slicing, Joining, Mutating Ops narrow nonzero reshape + row_stack split squeeze stack diff --git a/test/test_op_aliases.py b/test/test_op_aliases.py index 8a106d7860d1..a6d37c9d52e2 100644 --- a/test/test_op_aliases.py +++ b/test/test_op_aliases.py @@ -6,6 +6,7 @@ from torch.testing._internal.jit_utils import JitTestCase from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, skipCPUIfNoLapack, skipCUDAIfNoMagma, onlyCPU) +import collections # Information for generating an alias test # NOTE: ending the alias_name with an underscore will interpret the test @@ -150,6 +151,8 @@ def __init__(self, AliasInfo('true_divide_', torch.Tensor.true_divide_, 'div_', torch.Tensor.div_, lambda d: torch.randn(20, device=d), get_args=lambda d: (torch.rand(20, device=d) + .1,), decorators=(onlyCPU,)), + AliasInfo('row_stack', torch.row_stack, 'vstack', torch.vstack, + lambda d: ((torch.randn(20, device=d), torch.randn(20, device=d)))), ) # Placeholder test class for validating that aliases are correctly @@ -157,6 +160,14 @@ def __init__(self, class TestOpNormalization(JitTestCase): pass + +# Clone input tensor and sequence of Tensors +def clone_inp(inp): + if isinstance(inp, collections.Sequence): + return list(map(torch.clone, inp)) + else: + return inp.clone() + # Generates alias tests and adds them to the specified class (cls) def create_alias_tests(cls): for info in alias_infos: @@ -180,10 +191,18 @@ def _fn(t): arg_string = ', '.join((str(arg) for arg in info.get_args(device))) script = fn_template.format(alias_name=info.alias_name, args=arg_string) else: - fn_template = ''' - def _fn(t): + is_input_tensor_list = 
isinstance(info.get_input(device), collections.Sequence) + # For sequence of Tensors, annotate the type to be List[Tensor] + if is_input_tensor_list: + fn_template = ''' + def _fn(t: List[Tensor]): return op(t{args}) - ''' + ''' + else: + fn_template = ''' + def _fn(t): + return op(t{args}) + ''' arg_string = ", " + ', '.join((str(arg) for arg in info.get_args(device))) script = fn_template.format(args=arg_string) @@ -192,8 +211,8 @@ def _fn(t): # Acquires and checks the graph remaps the alias inp = info.get_input(device) - scripted(inp.clone()) - graph = scripted.graph_for(inp.clone()) + scripted(clone_inp(inp)) + graph = scripted.graph_for(clone_inp(inp)) FileCheck().check(info.original_name).check_not(info.alias_name).run(graph) # Checks that tracing converts aliases @@ -203,9 +222,9 @@ def _fn(t): def _fn(t, info=info, args=args): return info.alias_op(t, *args) - traced = torch.jit.trace(_fn, (inp.clone(),)) - traced(inp.clone()) - graph = traced.graph_for(inp.clone()) + traced = torch.jit.trace(_fn, (clone_inp(inp),)) + traced(clone_inp(inp)) + graph = traced.graph_for(clone_inp(inp)) FileCheck().check(info.original_name).check_not(info.alias_name).run(graph) # Applies decorators @@ -223,10 +242,10 @@ def _test_alias_computation(self, device, info=info): inp = info.get_input(device) args = info.get_args(device) - alias_input = inp.clone() + alias_input = clone_inp(inp) alias_result = alias_op(alias_input, *args) - original_input = inp.clone() + original_input = clone_inp(inp) original_result = alias_op(original_input, *args) self.assertEqual(alias_input, original_input, atol=0, rtol=0) diff --git a/test/test_torch.py b/test/test_torch.py index faf4fbb976bf..48810a2f92e0 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19261,8 +19261,12 @@ def verify_against_numpy(t): def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype): # Test error for non-tuple argument + t = torch.randn(10) with self.assertRaisesRegex(TypeError, "must be tuple of Tensors, not Tensor"): - torch_fn(torch.randn(10)) + torch_fn(t) + # Test error for a single array + with self.assertRaisesRegex(TypeError, "must be tuple of Tensors, not Tensor"): + torch_fn((t)) # Test 0-D num_tensors = random.randint(1, 5) @@ -19312,25 +19316,41 @@ def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + torch.testing.get_all_complex_dtypes())) - def test_hstack(self, device, dtype): - self._test_special_stacks(1, 1, torch.hstack, np.hstack, device, dtype) + def test_hstack_column_stack(self, device, dtype): + ops = ((torch.hstack, np.hstack), (torch.column_stack, np.column_stack)) + for torch_op, np_op in ops: + self._test_special_stacks(1, 1, torch_op, np_op, device, dtype) + + # Test torch.column_stack with combinations of 1D and 2D tensors input + one_dim_tensor = torch.arange(0, 10).to(dtype=dtype, device=device) + two_dim_tensor = torch.arange(0, 100).to(dtype=dtype, device=device).reshape(10, 10) + inputs = two_dim_tensor, one_dim_tensor, two_dim_tensor, one_dim_tensor + torch_result = torch.column_stack(inputs) + + np_inputs = [input.cpu().numpy() for input in inputs] + np_result = np.column_stack(np_inputs) + + self.assertEqual(np_result, + torch_result) @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @dtypes(*(torch.testing.get_all_int_dtypes() + 
torch.testing.get_all_fp_dtypes(include_bfloat16=False) + torch.testing.get_all_complex_dtypes())) - def test_vstack(self, device, dtype): - self._test_special_stacks(0, 2, torch.vstack, np.vstack, device, dtype) - for i in range(5): - # Test dimension change for 1D tensor of size (N) and 2D tensor of size (1, N) - n = random.randint(1, 10) - input_a = self._generate_input((n,), dtype, device, with_extremal=False) - input_b = self._generate_input((1, n), dtype, device, with_extremal=False) - torch_input = [input_a, input_b] - np_input = [input.cpu().numpy() for input in torch_input] - actual = torch.vstack(torch_input) - expected = np.vstack(np_input) - self.assertEqual(actual, expected) + def test_vstack_row_stack(self, device, dtype): + ops = ((torch.vstack, np.vstack), (torch.row_stack, np.row_stack)) + for torch_op, np_op in ops: + self._test_special_stacks(0, 2, torch_op, np_op, device, dtype) + for i in range(5): + # Test dimension change for 1D tensor of size (N) and 2D tensor of size (1, N) + n = random.randint(1, 10) + input_a = self._generate_input((n,), dtype, device, with_extremal=False) + input_b = self._generate_input((1, n), dtype, device, with_extremal=False) + torch_input = [input_a, input_b] + np_input = [input.cpu().numpy() for input in torch_input] + actual = torch_op(torch_input) + expected = np_op(np_input) + self.assertEqual(actual, expected) @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, "NumPy not found") diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index b3a9d57c99d2..4f0de3335414 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1818,6 +1818,40 @@ def merge_dicts(*dicts): Alias for :func:`torch.clamp`. """.format(**common_args)) +add_docstr(torch.column_stack, + r""" +column_stack(tensors, *, out=None) -> Tensor + +Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`. + +Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t`` +in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally. + +Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.column_stack((a, b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + >>> a = torch.arange(5) + >>> b = torch.arange(10).reshape(5, 2) + >>> torch.column_stack((a, b, b)) + tensor([[0, 0, 1, 0, 1], + [1, 2, 3, 2, 3], + [2, 4, 5, 4, 5], + [3, 6, 7, 6, 7], + [4, 8, 9, 8, 9]]) + +""".format(**common_args)) + add_docstr(torch.complex, r""" complex(real, imag, *, out=None) -> Tensor @@ -6720,6 +6754,12 @@ def merge_dicts(*dicts): torch.uint8 """) +add_docstr(torch.row_stack, + r""" +row_stack(tensors, *, out=None) -> Tensor + +Alias of :func:`torch.vstack`. 
+""".format(**common_args)) add_docstr(torch.round, r""" diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp index 9d9ac0203b90..a06a3f94f3b1 100644 --- a/torch/csrc/jit/passes/normalize_ops.cpp +++ b/torch/csrc/jit/passes/normalize_ops.cpp @@ -78,6 +78,7 @@ const std::unordered_map& getOperatorAliasMap() { {aten::divide, aten::div}, {aten::divide_, aten::div_}, {aten::multiply, aten::mul}, {aten::multiply_, aten::mul_}, {aten::true_divide, aten::div}, {aten::true_divide_, aten::div_}, + {aten::row_stack, aten::vstack}, }; return alias_map; } diff --git a/torch/overrides.py b/torch/overrides.py index 3286484e480f..4fd334c911d8 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -279,6 +279,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.clip: lambda input, min=None, max=None, out=None: -1, torch.clamp_min: lambda input, min, out=None: -1, torch.clamp_max: lambda input, max, out=None: -1, + torch.column_stack: lambda tensors, out=None: -1, torch.clone: lambda input: -1, torch.combinations: lambda input, r=2, with_replacement=False: -1, torch.complex: lambda real, imag: -1, @@ -694,6 +695,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.roll: lambda input, shifts, dims=None: -1, torch.rot90: lambda input, k=1, dims=(0, 1): -1, torch.round: lambda input, out=None: -1, + torch.row_stack: lambda tensors, out=None: -1, # alias for torch.vstack torch.rowwise_prune: (lambda weight, mask, compressed_indices_dtype: -1), torch.rrelu: lambda input, lower=1. / 8, upper=1. / 3, training=False, inplace=False: -1, torch.rsqrt: lambda input, out=None: -1, From 680571533b9940acc841ec06c24243bf6eda48e9 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Thu, 29 Oct 2020 12:11:14 -0700 Subject: [PATCH 114/169] [RFC] Decouple fast pass functions (#46469) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46469 There are some "fast_pass" function calls, where the symbols in `ATen/native` are directly referenced from outside of native at linking stage. This PR is to decouple one of the fast pass from native, while keeping the same functionality. `scalar_to_tensor` is included through `ATen/ATen.h`, which could be referenced by any cpp file including this header. 
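A rough sketch of what a caller sees after this change (the caller function below is hypothetical; the `scalar_to_tensor`, `at::detail::scalar_tensor_static`, and `at::detail::scalar_fill` signatures are the ones added in this diff):

```cpp
// Any .cpp that includes ATen/ATen.h (and hence ATen/ScalarOps.h) can hit
// the header-inlined fast path for wrapping a Scalar into a 0-dim tensor.
#include <ATen/ScalarOps.h>

at::Tensor wrap_scalar(c10::Scalar s) {
  // For CPU, non-complex scalars, c10::scalar_to_tensor() now routes to
  // at::detail::scalar_tensor_static() (implemented in ScalarOps.cpp and
  // backed by at::detail::empty_cpu() in Utils.cpp) instead of calling
  // at::native::scalar_tensor() directly, so this header no longer
  // references ATen/native symbols.
  return c10::scalar_to_tensor(s, at::kCPU);
}
```

Complex scalars fall out of the fast path (note the added `!s.isComplex()` guard) and take the regular path below the fast-path branch instead.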
ghstack-source-id: 114485740 Test Plan: CI Reviewed By: ezyang Differential Revision: D24361863 fbshipit-source-id: 28d658688687b6cde286a6e6933ab33a4b3cf9ec --- aten/src/ATen/ScalarOps.cpp | 40 +++++++++++++++++++ aten/src/ATen/ScalarOps.h | 22 ++++++++--- aten/src/ATen/Utils.cpp | 50 ++++++++++++++++++++++++ aten/src/ATen/Utils.h | 14 +++++++ aten/src/ATen/native/Fill.cpp | 19 +-------- aten/src/ATen/native/TensorFactories.cpp | 39 +----------------- aten/src/ATen/native/TensorFactories.h | 6 +-- 7 files changed, 124 insertions(+), 66 deletions(-) create mode 100644 aten/src/ATen/ScalarOps.cpp diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp new file mode 100644 index 000000000000..7a794cb5c312 --- /dev/null +++ b/aten/src/ATen/ScalarOps.cpp @@ -0,0 +1,40 @@ +// FastPass +#ifdef _MSC_VER +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include +#endif + +#include +#include +#include + +namespace at { +namespace { +template +inline void fill_inplace(Tensor& self, Scalar value_scalar) { + auto value = value_scalar.to(); + scalar_t* dptr = static_cast(self.data_ptr()); + *dptr = value; +} +} + +namespace detail { +Tensor& scalar_fill(Tensor& self, Scalar value) { + AT_DISPATCH_ALL_TYPES_AND3( + kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { + fill_inplace(self, value); + }); + return self; +} + +Tensor scalar_tensor_static(Scalar s, const TensorOptions& options) { + at::tracer::impl::NoTracerDispatchMode tracer_guard; + at::AutoNonVariableTypeMode non_var_type_mode(true); + auto result = at::detail::empty_cpu({}, options); + scalar_fill(result, s); + return result; +} +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/ScalarOps.h b/aten/src/ATen/ScalarOps.h index 8c07a9d618bc..60cee3ea284b 100644 --- a/aten/src/ATen/ScalarOps.h +++ b/aten/src/ATen/ScalarOps.h @@ -4,6 +4,18 @@ #include #include +namespace at { +namespace detail { +// When filling a number to 1-element CPU tensor, we want to skip +// everything but manipulate data ptr directly. +// Ideally this fast pass should be implemented in TensorIterator, +// but we also want to skip compute_types which in not avoidable +// in TensorIterator for now. +Tensor& scalar_fill(Tensor& self, Scalar value); +TORCH_API Tensor scalar_tensor_static(Scalar s, const TensorOptions& options); +} // namespace detail +} // namespace at + // This is in the c10 namespace because we use ADL to find the functions in it. namespace c10 { @@ -11,16 +23,14 @@ namespace c10 { // to implement this without going through Derived Types (which are not part of core). inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) { // This is the fast track we have for CPU scalar tensors. 
- if (device == at::kCPU) { + if (device == at::kCPU && !s.isComplex()) { if (s.isFloatingPoint()) { - return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kDouble)); + return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kDouble)); } else if (s.isBoolean()) { - return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kBool)); - } else if (s.isComplex()) { - return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kComplexDouble)); + return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kBool)); } else { AT_ASSERT(s.isIntegral(false)); - return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kLong)); + return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kLong)); } } if (s.isFloatingPoint()) { diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp index ccd4e4ba9f2f..847172b46771 100644 --- a/aten/src/ATen/Utils.cpp +++ b/aten/src/ATen/Utils.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include namespace at { @@ -12,4 +14,52 @@ int _crash_if_asan(int arg) { return x[0]; } +namespace detail { +// empty_cpu is used in ScalarOps.h, which can be referenced by other ATen files. Since we want to decouple direct referencing native symbols and only access native symbols through dispatching, we move its implementation here. +Tensor empty_cpu( + IntArrayRef size, + const TensorOptions& options_, + c10::optional optional_memory_format) { + TORCH_CHECK( + !(options_.has_memory_format() && optional_memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + TensorOptions options = + options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + + AT_ASSERT(options.device().type() == DeviceType::CPU); + check_size_nonnegative(size); + + c10::Allocator* allocator; + if (options.pinned_memory()) { + allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); + } else { + allocator = at::getCPUAllocator(); + } + + int64_t nelements = prod_intlist(size); + auto dtype = options.dtype(); + int64_t size_bytes = nelements * dtype.itemsize(); + auto storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + allocator->allocate(size_bytes), + allocator, + /*resizeable=*/true); + + auto tensor = detail::make_tensor( + std::move(storage_impl), at::DispatchKey::CPU, dtype); + // Default TensorImpl has size [0] + if (size.size() != 1 || size[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); + } + + auto memory_format = + options.memory_format_opt().value_or(MemoryFormat::Contiguous); + tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); + + return tensor; +} +} // namespace detail + } // at diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index df0e49920afa..87446df09487 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -120,4 +120,18 @@ static inline T* get_generator_or_default(const c10::optional& gen, c return gen.has_value() && gen->defined() ? 
check_generator(gen) : check_generator(default_gen); } +inline void check_size_nonnegative(IntArrayRef size) { + for (auto x: size) { + TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); + } +} + +namespace detail { +CAFFE2_API +Tensor empty_cpu( + IntArrayRef size, + const TensorOptions& options = {}, + c10::optional memory_format = c10::nullopt); +} // namespace detail + } // at diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 73f7dcd61926..b466ca26fc0c 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -4,20 +4,12 @@ #include #include #include +#include namespace at { namespace native { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fill ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -namespace { - template - inline void fill_fast(Tensor& self, Scalar value_scalar) { - auto value = value_scalar.to(); - scalar_t * dptr = static_cast(self.data_ptr()); - *dptr = value; - } -} // namspace - Tensor& fill_out(Tensor& self, Scalar value) { if (self.is_quantized()) { at::Tensor out = at::ones(self.sizes()).to(kFloat) * value; @@ -26,15 +18,8 @@ Tensor& fill_out(Tensor& self, Scalar value) { self.copy_(out); return self; } - // When filling a number to 1-element CPU tensor, we want to skip - // everything but manipulate data ptr directly. - // Ideally this fast pass should be implemented in TensorIterator, - // but we also want to skip compute_types which in not avoidable - // in TensorIterator for now. if (self.device() == at::kCPU && self.numel() == 1 && !self.is_complex() && !value.isComplex()) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { - fill_fast(self, value);}); - return self; + return at::detail::scalar_fill(self, value); } auto iter = TensorIteratorConfig() .set_check_mem_overlap(false) // Fill is idempotent, so overlap is okay diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index a6683bcaaf2f..7277934092b4 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -166,44 +166,7 @@ Tensor polar(const Tensor& abs, const Tensor& angle) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor empty_cpu(IntArrayRef size, const TensorOptions& options_, c10::optional optional_memory_format) { - - TORCH_CHECK( - !(options_.has_memory_format() && optional_memory_format.has_value()), - "Cannot set memory_format both in TensorOptions and explicit argument; please delete " - "the redundant setter."); - TensorOptions options = options_.merge_in(TensorOptions().memory_format(optional_memory_format)); - - AT_ASSERT(options.device().type() == DeviceType::CPU); - check_size_nonnegative(size); - - c10::Allocator* allocator; - if (options.pinned_memory()) { - allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); - } else { - allocator = at::getCPUAllocator(); - } - - int64_t nelements = prod_intlist(size); - auto dtype = options.dtype(); - int64_t size_bytes = nelements * dtype.itemsize(); - auto storage_impl = c10::make_intrusive( - c10::StorageImpl::use_byte_size_t(), - size_bytes, - allocator->allocate(size_bytes), - allocator, - /*resizeable=*/true); - - auto tensor = detail::make_tensor( - std::move(storage_impl), at::DispatchKey::CPU, dtype); - // Default TensorImpl has size [0] - if (size.size() != 1 || size[0] != 0) { - tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); - } - - auto memory_format = 
options.memory_format_opt().value_or(MemoryFormat::Contiguous); - tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); - - return tensor; + return at::detail::empty_cpu(size, options_, optional_memory_format); } Tensor empty( diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index f551adcec693..8cae202efe13 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -61,11 +61,7 @@ inline void check_args( } } -inline void check_size_nonnegative(IntArrayRef size) { - for (auto x: size) { - TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); - } -} +using at::check_size_nonnegative; inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) { TORCH_CHECK(at::scalar_tensor(n, tensor.options()).defined(), From d850b5c98ca166e899933b481e622a05e082d0f8 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 29 Oct 2020 12:20:49 -0700 Subject: [PATCH 115/169] Fix DDP issue where parameters share same grad_accumulator (#46755) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46755 As reported in https://github.com/pytorch/pytorch/issues/41324, there is a bug in DDP when `find_unused_parameters=True` and 2 or more parameters share the same gradient accumulator. In the reducer, we currently keep a mapping of grad accumulator to index and populate it with map[accumulator] = index, but this overwrites indices when the accumulator is the same. To fix this, switch the mapping values to a vector of indices to hold all such indices that share the same accumulator. ghstack-source-id: 115453567 Test Plan: Added UT Reviewed By: pritamdamania87 Differential Revision: D24497388 fbshipit-source-id: d32dfa9c5cd0b7a8df13c7873d5d28917b766640 --- torch/csrc/distributed/c10d/reducer.cpp | 24 ++++++++++---- torch/csrc/distributed/c10d/reducer.h | 4 +-- .../_internal/distributed/distributed_test.py | 33 +++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index abb26dfaa10c..6f0b22b73739 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -119,8 +119,19 @@ Reducer::Reducer( // This is used later on when the autograd graph is traversed // to check for parameters for which no gradient is computed, if // find_unused_parameters=True. + // We maintain a mapping of gradient accumulator to vector of variables, + // since multiple parameters may share the same grad accumulator. if (find_unused_parameters_) { - gradAccToVariableMap_[grad_accumulator.get()] = index; + auto gradAcc = gradAccToVariablesMap_.find(grad_accumulator.get()); + if (gradAcc == gradAccToVariablesMap_.end()) { + std::vector indexVec{index}; + gradAccToVariablesMap_[grad_accumulator.get()] = + std::move(indexVec); + } else { + // Scenario where we have indices whose corresponding parameters + // share the same grad accumulator. + gradAcc->second.push_back(index); + } } // The gradient accumulator is stored as weak_ptr in the autograd @@ -997,14 +1008,15 @@ void Reducer::prepare_for_backward( } // Find accumulator functions that don't show up in this graph. - for (const auto& it : gradAccToVariableMap_) { + for (const auto& it : gradAccToVariablesMap_) { // If the accumulator function is present in the graph, we know // a gradient will be computed for the corresponding parameter. 
- if (seen.count(it.first) > 0) { - continue; + if (seen.count(it.first) == 0) { + auto& indices = it.second; + unused_parameters_.reserve(unused_parameters_.size() + indices.size()); + unused_parameters_.insert( + unused_parameters_.end(), indices.begin(), indices.end()); } - - unused_parameters_.push_back(it.second); } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 25f81857d101..53b4ecc4f981 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -122,8 +122,8 @@ class Reducer { std::vector>> grad_accumulators_; - std::unordered_map - gradAccToVariableMap_; + std::unordered_map> + gradAccToVariablesMap_; std::vector>> hooks_; diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index bf58b89fbb55..b12a72e43f35 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -3817,6 +3817,39 @@ def forward(self, x): else: ddp(inp).sum().backward() + @require_backend({"gloo", "nccl"}) + @require_backends_available({"gloo", "nccl"}) + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_shared_grad_acc_unused_params(self): + # When find_unused_parameters=True, ensure we mark unused parameters + # even if they share gradient accumulators. + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + # net1, bias, and net1.bias are all unused params. + self.net1 = nn.Linear(10, 5, bias=False) + self.bias = nn.Parameter(torch.zeros(5)) + # net1.bias and self.bias are names for the same underlying + # parameter, so they share the same grad acc. This caused + # the bug reported in https://github.com/pytorch/pytorch/issues/41324. + self.net1.bias = self.bias + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(x) + + torch.cuda.set_device(self.rank) + model = ToyModel().to(torch.cuda.current_device()) + ddp_model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[self.rank], find_unused_parameters=True + ) + inp = torch.randn(20, 10, device=self.rank) + for i in range(6): + out = ddp_model(inp) + loss = out.sum() + loss.backward() + @require_backend({"gloo", "nccl"}) @require_backends_available({"gloo", "nccl"}) @skip_if_lt_x_gpu(2) From 3e499e490a016622a370df1a49041bc1e2cb03c1 Mon Sep 17 00:00:00 2001 From: Mingzhe Li Date: Thu, 29 Oct 2020 13:16:32 -0700 Subject: [PATCH 116/169] Bump up NCCL to v2.8 (#46742) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46742 Use NCCL v2.8 Test Plan: waitforsandcastle Reviewed By: mrshenli Differential Revision: D24488800 fbshipit-source-id: d39897da1499e63ca783a81aec1ce707606423a3 --- third_party/nccl/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nccl/nccl b/third_party/nccl/nccl index 033d799524fb..cd5a9b73c302 160000 --- a/third_party/nccl/nccl +++ b/third_party/nccl/nccl @@ -1 +1 @@ -Subproject commit 033d799524fb97629af5ac2f609de367472b2696 +Subproject commit cd5a9b73c3028d2496666201588111a8c8d84878 From 1d233d7d1f48ff184a8037d46437d441a47ad0d6 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 29 Oct 2020 13:25:25 -0700 Subject: [PATCH 117/169] [fix] torch.nn.functional.embedding -> padding_idx behavior (#46714) Summary: Reference https://github.com/pytorch/pytorch/issues/46585 Fix for second snippet in the mentioned issue. 
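With this patch, the functional path zero-fills the output rows selected by `padding_idx` (via the new `zerofill_padding` lambda in `Embedding.cpp`), and the double backward applies the same mask. A minimal sketch of the behavior this change is after (illustrative values only):

```python
import torch
import torch.nn.functional as F

weights = torch.rand(10, 3)
out = F.embedding(torch.LongTensor([1, 2, 0]), weights, padding_idx=0)

# Rows looked up at padding_idx (index 0 here, i.e. the last position)
# come back zero-filled under this patch; the other rows are ordinary lookups.
assert torch.equal(out[2], torch.zeros(3))
assert torch.equal(out[0], weights[1])
```

The snippet from the issue that this fixes: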
```python predefined_weights = torch.rand(10, 3) result = torch.nn.functional.embedding(torch.LongTensor([1,2,0]), predefined_weights, padding_idx=0) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/46714 Reviewed By: VitalyFedyunin Differential Revision: D24593352 Pulled By: albanD fbshipit-source-id: 655b69d9ec57891871e26feeda2aa0dcff73beba --- aten/src/ATen/native/Embedding.cpp | 15 +++++++++++++-- test/test_nn.py | 16 ++++++++++++++++ tools/autograd/derivatives.yaml | 2 +- torch/csrc/autograd/FunctionsManual.cpp | 10 ++++++---- torch/csrc/autograd/FunctionsManual.h | 2 +- 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 3f250ae09909..6589a33ed2f4 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -17,16 +17,27 @@ Tensor embedding(const Tensor & weight, const Tensor & indices, auto indices_arg = TensorArg(indices, "indices", 1); checkScalarType("embedding", indices_arg, kLong); + auto zerofill_padding = [&](Tensor& embedding) { + if (padding_idx >= 0) { + embedding.masked_fill_((indices == padding_idx).reshape({-1, 1}), 0); + } + }; + // TODO: use tensor.index() after improving perf if (indices.dim() == 1) { - return weight.index_select(0, indices); + auto out = weight.index_select(0, indices); + zerofill_padding(out); + return out; } auto size = indices.sizes().vec(); for (auto d : weight.sizes().slice(1)) { size.push_back(d); } - return weight.index_select(0, indices.reshape(-1)).view(size); + + auto out = weight.index_select(0, indices.reshape(-1)); + zerofill_padding(out); + return out.view(size); } Tensor embedding_backward( diff --git a/test/test_nn.py b/test/test_nn.py index b86e340c14a5..1c52371640a0 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -3055,6 +3055,13 @@ def test_embedding_functional(self): res_F = F.embedding(a, embeddings) self.assertEqual(res_old, res_F) + embed_old = torch.nn.Embedding(4, 3) + embed_old = embed_old.from_pretrained(embeddings, padding_idx=2) + res_old = embed_old(a) + res_F = F.embedding(a, embeddings, padding_idx=2) + + self.assertEqual(res_old, res_F) + @unittest.skipUnless('fbgemm' in torch.backends.quantized.supported_engines, 'Linear_FP16_weight requires FBGEMM. 
FBGEMM is only optimized for CPUs' ' with instruction set support avx2 or newer.') @@ -10738,6 +10745,15 @@ def fn(weight): fn = fn_wrapper(device) _assertGradAndGradgradChecks(self, fn, (weight, )) + def fn_wrapper(device): + def padding_fn(weight): + inp = torch.tensor([[0, 1, 1, 2], [1, 1, 0, 2]], dtype=torch.long).to(device) + return torch.nn.functional.embedding(inp, weight, padding_idx=1) + return padding_fn + + fn = fn_wrapper(device) + _assertGradAndGradgradChecks(self, fn, (weight, )) + def test_embedding_scalar_weight_error(self, device): indices = torch.rand(2, 2, device=device).long() weight = torch.tensor(1.0, device=device) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 146624183161..8cbcab35685e 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1186,7 +1186,7 @@ weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse) - name: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - grad_output: embedding_dense_double_backward(grad, indices) + grad_output: embedding_dense_double_backward(grad, indices, padding_idx) indices: non_differentiable - name: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 0fb0e44e9450..d3752bce04cc 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -2694,16 +2694,18 @@ Tensor constant_pad_nd_backward(const Tensor& grad, IntArrayRef pad) { return at::constant_pad_nd(grad, negated_pad, 0); } -Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indices) { - // since first backward takes care of padding_idx - // and scaling by frequency, we don't need to worry - // about it here. +Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indices, int64_t padding_idx) { + // since first backward takes care of scaling by frequency, + // we don't need to worry about it here. 
auto gg_weight = grad.index_select(0, indices.reshape(-1)); // reshape gradient as per the shape of indices auto size = indices.sizes().vec(); size.push_back(-1); + if (padding_idx >= 0) { + gg_weight.masked_fill_((indices == padding_idx).reshape({-1, 1}), 0); + } return gg_weight.view(size); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 1f5ba99e83a2..46f26610c127 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -118,7 +118,7 @@ at::Tensor logdet_backward(const at::Tensor & grad, const at::Tensor& self, cons at::Tensor slogdet_backward(const at::Tensor& grad_logabsdet, const at::Tensor& self, const at::Tensor& signdet, const at::Tensor& logabsdet); at::Tensor log1p_backward(const at::Tensor& grad, const at::Tensor& self); at::Tensor sparse_constructor_values_backward(const at::Tensor& sparse_grad_out, const at::Tensor& indices, at::IntArrayRef values_shape); -at::Tensor embedding_dense_double_backward(const at::Tensor & grad, const at::Tensor & indices); +at::Tensor embedding_dense_double_backward(const at::Tensor & grad, const at::Tensor & indices, int64_t padding_idx); at::Tensor index_backward(at::Tensor zeros_like_self, at::TensorList indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward(const at::Tensor& grad_out, const at::Tensor& loss, const at::Tensor& raw_grad, bool zero_infinity); From 129b41226ebef52b336bd47e30971f17b023910c Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 29 Oct 2020 13:29:00 -0700 Subject: [PATCH 118/169] [ONNX] Support nd mask index in opset >= 11 (#45252) Summary: Fixes below pattern for opset >= 11 `return tensor[tensor > 0]` where rank of `tensor` > 1. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45252 Reviewed By: VitalyFedyunin Differential Revision: D24116945 Pulled By: bzinodev fbshipit-source-id: 384026cded1eb831bb5469e31ece4fcfb6ae8f2a --- test/onnx/test_pytorch_onnx_onnxruntime.py | 10 ++++++++++ torch/onnx/symbolic_opset11.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index f08f772aa8e1..fdaee3515ad4 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -430,6 +430,16 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), m1) + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # Need type inference + def test_index_mask_nd(self): + class MyModel(torch.nn.Module): + def forward(self, input): + return input[input > 0] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) + @disableScriptTest() def test_dict(self): class MyModel(torch.nn.Module): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 5a266a429965..f6acc4120dc2 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -581,6 +581,26 @@ def mm(g, self, other): return g.op("Gemm", self, other, beta_f=0.0, alpha_f=1.0) +def index(g, self, index): + if sym_help._operator_export_type == torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: + return g.op("ATen", self, index, operator_s="index") + + if sym_help._is_packed_list(index): + indices = sym_help._unpack_list(index) + else: + indices = [index] + + # Handle single mask index. 
+ if len(indices) == 1: + index = indices[0] + if not sym_help._is_none(index) and (index.type().scalarType() == "Bool" or index.type().scalarType() == "Byte"): + from torch.onnx.symbolic_opset9 import nonzero + index = nonzero(g, index) + return g.op('GatherND', self, index) + from torch.onnx.symbolic_opset9 import index as index_opset9 + return index_opset9(g, self, index) + + def index_fill(g, self, dim, index, value): dim_value = sym_help._parse_arg(dim, 'i') if sym_help._operator_export_type == torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: From c556d4550ccaff56da472c0b78489b383d44e85d Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Thu, 29 Oct 2020 13:34:33 -0700 Subject: [PATCH 119/169] fix_combine_two_partition_size (#47053) Summary: fix combine_two_partitions in Partitioner.py to calculate new partition used memory size after combining two partitions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47053 Reviewed By: gcatron Differential Revision: D24624270 Pulled By: scottxu0730 fbshipit-source-id: a0e2a8486e012d02ea797d6ba36ab304d27cc93f --- torch/fx/experimental/Partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/fx/experimental/Partitioner.py b/torch/fx/experimental/Partitioner.py index 1a4887b666fb..8ad6ea34c828 100644 --- a/torch/fx/experimental/Partitioner.py +++ b/torch/fx/experimental/Partitioner.py @@ -300,7 +300,6 @@ def create_single_node_partition(node): if abs(p1.bfs_level - p2.bfs_level) <= 1: # Combining p1 and p2 into p0 p0 = self.combine_two_partitions(p1, p2) - self.set_parents_and_children() p0.logical_device_ids = p2.logical_device_ids # Remove p2 from non_single_node_partitions non_single_node_partitions.remove(p2) @@ -430,6 +429,7 @@ def combine_two_partitions( """ partition = self.create_partition() partition.nodes = partition_0.nodes.union(partition_1.nodes) + partition.recalculate_mem_size() self.partitions.remove(partition_0) self.partitions.remove(partition_1) # reset parents and children for all partitions From 1ea14e30f5c9902bc277bd7ea2bf6135fbbb3033 Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Thu, 29 Oct 2020 13:54:44 -0700 Subject: [PATCH 120/169] [ONNX] Enable NoneType inputs to export API (#45792) Summary: Enables the use of NoneType arguments to inputs tuple in the export API Pull Request resolved: https://github.com/pytorch/pytorch/pull/45792 Reviewed By: heitorschueroff Differential Revision: D24312784 Pulled By: bzinodev fbshipit-source-id: 1717e856b56062add371af7dc09cdd9c7b5646da --- test/onnx/test_pytorch_onnx_onnxruntime.py | 36 ++++++++++++++++++++ torch/csrc/jit/python/python_arg_flatten.cpp | 5 +++ torch/onnx/__init__.py | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index fdaee3515ad4..38ddb094794e 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -462,6 +462,42 @@ def forward(self, x_in): x = {"test_key_in": torch.randn(1, 2, 3)} self.run_test(MyModel(), (x,)) + def test_none_as_input(self): + class Model(torch.nn.Module): + def forward(self, x, y): + if y is not None: + return x + y + return x + + x = torch.randn(2, 3) + self.run_test(Model(), (x, None)) + + def test_none_as_tuple_input(self): + class Model(torch.nn.Module): + def forward(self, x, y): + if y[0] is not None: + return x + y[0] + if y[1] is not None: + return x + y[1] + return x + + x = torch.randn(2, 3) + y = torch.randn(2, 3) + 
self.run_test(Model(), (x, (None, y))) + + def test_none_as_named_input(self): + class Model(torch.nn.Module): + def forward(self, x, y=None, z=None): + if y is not None: + return x + y + if z is not None: + return x + z + return x + + x = torch.randn(2, 3) + z = torch.randn(2, 3) + self.run_test(Model(), (x, None, z)) + @skipIfUnsupportedMinOpsetVersion(9) def test_cste_script(self): class MyModel(torch.jit.ScriptModule): diff --git a/torch/csrc/jit/python/python_arg_flatten.cpp b/torch/csrc/jit/python/python_arg_flatten.cpp index 41cd3cd2b8af..b854ae14387a 100644 --- a/torch/csrc/jit/python/python_arg_flatten.cpp +++ b/torch/csrc/jit/python/python_arg_flatten.cpp @@ -21,6 +21,7 @@ static constexpr char TupleOpen = '('; static constexpr char TupleClose = ')'; static constexpr char Variable = 'v'; static constexpr char String = 's'; +static constexpr char NoneType = 'n'; } // namespace D namespace { @@ -62,6 +63,8 @@ void flatten_rec(PyObject* obj, ParsedArgs& args) { args.vars.push_back(var); args.desc.metadata.emplace_back(var); args.desc.structure.push_back(D::Variable); + } else if (strcmp(THPUtils_typename(obj), "NoneType") == 0) { + args.desc.structure.push_back(D::NoneType); } else { std::string msg = "Only tuples, lists and Variables supported as JIT inputs/outputs. " @@ -136,6 +139,8 @@ py::object unflatten_rec( throw std::runtime_error("Not enough Variables given to unflatten"); auto str = *str_it++; return py::reinterpret_borrow(THPUtils_packString(str)); + } else if (type == D::NoneType) { + return py::reinterpret_borrow(py::none()); } else { if (var_it == var_it_end) throw std::runtime_error("Not enough Variables given to unflatten"); diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 255c15b9da4a..421c23ebed6e 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -44,7 +44,7 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM model (torch.nn.Module): the model to be exported. args (tuple of arguments or torch.Tensor): the inputs to the model, e.g., such that ``model(*args)`` is a valid - invocation of the model. Any non-Tensor arguments will + invocation of the model. Any non-Tensor arguments (including None) will be hard-coded into the exported model; any Tensor arguments will become inputs of the exported model, in the order they occur in args. 
If args is a Tensor, this is equivalent From 377a09c8e8c9957fa5a49176b534f01b7da0f5ec Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Thu, 29 Oct 2020 14:04:54 -0700 Subject: [PATCH 121/169] reland fast TypeMeta/ScalarType conversion (#45544) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45544 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D24006482 Pulled By: bhosmer fbshipit-source-id: 5da2401ab40bbf58da27a5d969e00bcee7562ed6 --- aten/src/ATen/native/DispatchStub.h | 2 + aten/src/ATen/templates/TensorBody.h | 1 + aten/src/TH/THStorageFunctions.hpp | 1 + c10/core/DefaultDtype.cpp | 4 +- c10/core/ScalarType.h | 67 +------- c10/core/ScalarTypeToTypeMeta.h | 47 ++++++ c10/core/TensorImpl.cpp | 3 +- c10/core/TensorImpl.h | 4 +- c10/core/TensorOptions.h | 1 + c10/core/UndefinedTensorImpl.h | 2 - c10/util/typeid.cpp | 63 ++++---- c10/util/typeid.h | 223 +++++++++++++++++++-------- torch/csrc/DynamicTypes.h | 1 + torch/csrc/jit/tensorexpr/stmt.h | 1 + 14 files changed, 251 insertions(+), 169 deletions(-) create mode 100644 c10/core/ScalarTypeToTypeMeta.h diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index dc21a505e8c1..63e2462489be 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -3,7 +3,9 @@ #include #include #include + #include +#include // Implements instruction set specific function dispatch. // diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 3f6292f41178..c6067a79a388 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index b78f8c7a3035..8d5c28daa796 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -8,6 +8,7 @@ #include #include +#include // Note [Weak references for intrusive refcounting] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/c10/core/DefaultDtype.cpp b/c10/core/DefaultDtype.cpp index c4f420ab6e22..b125c9a56541 100644 --- a/c10/core/DefaultDtype.cpp +++ b/c10/core/DefaultDtype.cpp @@ -3,12 +3,12 @@ namespace c10 { static auto default_dtype = caffe2::TypeMeta::Make(); -static auto default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); +static auto default_dtype_as_scalartype = default_dtype.toScalarType(); static auto default_complex_dtype = caffe2::TypeMeta::Make>(); void set_default_dtype(caffe2::TypeMeta dtype) { default_dtype = std::move(dtype); - default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); + default_dtype_as_scalartype = default_dtype.toScalarType(); if(default_dtype_as_scalartype == ScalarType::Double) { default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); } else { diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 8f2acebd84f0..6903cf9f77ce 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -3,9 +3,12 @@ #include #include #include +#include +#include +#include #include +#include #include -#include #include #include @@ -68,6 +71,8 @@ enum class ScalarType : int8_t { NumOptions }; +constexpr uint16_t NumScalarTypes = static_cast(ScalarType::NumOptions); + namespace impl { // These are used to map ScalarTypes to C++ types. 
@@ -94,7 +99,7 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_ScalarTypeToCPPType) #undef SPECIALIZE_ScalarTypeToCPPType -} +} // namespace impl template struct CppTypeToScalarType; @@ -162,64 +167,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) _(c10::complex, ComplexFloat) \ _(c10::complex, ComplexDouble) -static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { -#define DEFINE_CASE(ctype, name) \ - case ScalarType::name: \ - return caffe2::TypeMeta::Make(); - - switch (scalar_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE) - case ScalarType::Undefined: - return caffe2::TypeMeta(); - default: - AT_ERROR( - "Unrecognized Scalartype ", - scalar_type, - " (please report this error)"); - } -#undef DEFINE_CASE -} - -static inline c10::optional tryTypeMetaToScalarType( - caffe2::TypeMeta dtype) { -#define DEFINE_IF(ctype, name) \ - if (dtype == caffe2::TypeMeta::Make()) { \ - return {ScalarType::name}; \ - } - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_IF) -#undef DEFINE_IF - if (dtype == caffe2::TypeMeta()) { - return {ScalarType::Undefined}; - } - return c10::nullopt; -} - -static inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { - if (auto scalar_type = tryTypeMetaToScalarType(dtype)) { - return *scalar_type; - } - AT_ERROR( - "Unsupported TypeMeta in ATen: ", dtype, " (please report this error)"); -} - -inline optional optTypeMetaToScalarType(optional type_meta) { - if (!type_meta.has_value()) { - return c10::nullopt; - } - return typeMetaToScalarType(*type_meta); -} - -static inline bool operator==(ScalarType t, caffe2::TypeMeta m) { - if (auto mt = tryTypeMetaToScalarType(m)) { - return (*mt) == t; - } - return false; -} - -static inline bool operator==(caffe2::TypeMeta m, ScalarType t) { - return t == m; -} - #define DEFINE_CONSTANT(_, name) \ constexpr ScalarType k##name = ScalarType::name; diff --git a/c10/core/ScalarTypeToTypeMeta.h b/c10/core/ScalarTypeToTypeMeta.h new file mode 100644 index 000000000000..b0ac38309704 --- /dev/null +++ b/c10/core/ScalarTypeToTypeMeta.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include + +// these just expose TypeMeta/ScalarType bridge functions in c10 +// TODO move to typeid.h (or codemod away) when TypeMeta et al +// are moved from caffe2 to c10 (see note at top of typeid.h) + +namespace c10 { + +/** + * convert ScalarType enum values to TypeMeta handles + */ +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { + return caffe2::TypeMeta::fromScalarType(scalar_type); +} + +/** + * convert TypeMeta handles to ScalarType enum values + */ +static inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { + return dtype.toScalarType(); +} + +/** + * typeMetaToScalarType(), lifted to optional + */ +static inline optional optTypeMetaToScalarType(optional type_meta) { + if (!type_meta.has_value()) { + return c10::nullopt; + } + return type_meta->toScalarType(); +} + +/** + * convenience: equality across TypeMeta/ScalarType conversion + */ +static inline bool operator==(ScalarType t, caffe2::TypeMeta m) { + return m.isScalarType(t); +} + +static inline bool operator==(caffe2::TypeMeta m, ScalarType t) { + return t == m; +} + +} // namespace c10 diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 8702ed4fdebf..2c8a87f9c2d1 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -62,8 +62,7 @@ TensorImpl::TensorImpl(Storage&& storage, DispatchKeySet key_set, const caffe2:: 
data_type_(data_type), device_opt_(device_opt) { if (!key_set.empty()) { - AT_ASSERT(data_type.id() == caffe2::TypeIdentifier::uninitialized() || - device_opt_.has_value()); + TORCH_INTERNAL_ASSERT(data_type == ScalarType::Undefined || device_opt_.has_value()); // UndefinedTensorImpl is a singleton, so we skip logging it C10_LOG_API_USAGE_ONCE("tensor.create"); } diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index ad636e51ff12..057bdc8214a8 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1781,13 +1781,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // strides SmallVector (pre-allocated 4) // storage offset // numel -// data type pointer +// data type // (optional) device // tensor type id // miscellaneous bitfield // static_assert(sizeof(void*) != sizeof(int64_t) || // if 64-bit... - sizeof(TensorImpl) == sizeof(int64_t) * 31, + sizeof(TensorImpl) == sizeof(int64_t) * 30, "You changed the size of TensorImpl on 64-bit arch." "See Note [TensorImpl size constraints] on how to proceed."); } // namespace c10 diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 10342908a49b..40d170926141 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/c10/core/UndefinedTensorImpl.h b/c10/core/UndefinedTensorImpl.h index 9f1cb93c10eb..26122ed305e2 100644 --- a/c10/core/UndefinedTensorImpl.h +++ b/c10/core/UndefinedTensorImpl.h @@ -28,8 +28,6 @@ struct C10_API UndefinedTensorImpl final : public TensorImpl { private: UndefinedTensorImpl(); static UndefinedTensorImpl _singleton; -public: - friend struct UndefinedType; }; } // namespace c10 diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index e2070a1584a2..f3fe048b4cca 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -14,42 +14,41 @@ namespace detail { C10_EXPORT void _ThrowRuntimeTypeLogicError(const string& msg) { // In earlier versions it used to be std::abort() but it's a bit hard-core // for a library - AT_ERROR(msg); + TORCH_CHECK(false, msg); } +} // namespace detail +[[noreturn]] void TypeMeta::error_unsupported_typemeta(caffe2::TypeMeta dtype) { + TORCH_CHECK(false, "Unsupported TypeMeta in ATen: ", dtype, " (please report this error)"); +} -} // namespace detail +// see TypeMeta::addTypeMetaData +std::atomic TypeMeta::nextTypeIndex(NumScalarTypes); -template <> -EXPORT_IF_NOT_GCC const detail::TypeMetaData* TypeMeta::_typeMetaDataInstance< - detail::_Uninitialized>() noexcept { - static constexpr detail::TypeMetaData singleton = detail::TypeMetaData( - 0, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - TypeIdentifier::uninitialized(), - "nullptr (uninitialized)"); - return &singleton; +// fixed length array of TypeMetaData instances +detail::TypeMetaData* TypeMeta::typeMetaDatas() { + static detail::TypeMetaData instances[MaxTypeIndex + 1] = { +#define SCALAR_TYPE_META(T, name) \ + /* ScalarType::name */ \ + detail::TypeMetaData( \ + sizeof(T), \ + detail::_PickNew(), \ + detail::_PickPlacementNew(), \ + detail::_PickCopy(), \ + detail::_PickPlacementDelete(), \ + detail::_PickDelete(), \ + TypeIdentifier::Get(), \ + c10::util::get_fully_qualified_type_name()), +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SCALAR_TYPE_META) +#undef SCALAR_TYPE_META + // The remainder of the array is padded with TypeMetaData blanks. + // The first of these is the entry for ScalarType::Undefined. + // The rest are consumed by CAFFE_KNOWN_TYPE entries. 
+ }; + return instances; } -CAFFE_KNOWN_TYPE(uint8_t) -CAFFE_KNOWN_TYPE(int8_t) -CAFFE_KNOWN_TYPE(int16_t) -CAFFE_KNOWN_TYPE(int) -CAFFE_KNOWN_TYPE(int64_t) -CAFFE_KNOWN_TYPE(at::Half) -CAFFE_KNOWN_TYPE(float) -CAFFE_KNOWN_TYPE(double) -CAFFE_KNOWN_TYPE(c10::complex) -CAFFE_KNOWN_TYPE(c10::complex) -CAFFE_KNOWN_TYPE(c10::complex) -// 11 = undefined type id -// 12 = Tensor (defined in tensor.cc) CAFFE_KNOWN_TYPE(std::string) -CAFFE_KNOWN_TYPE(bool) CAFFE_KNOWN_TYPE(uint16_t) CAFFE_KNOWN_TYPE(char) CAFFE_KNOWN_TYPE(std::unique_ptr) @@ -79,15 +78,11 @@ using _guard_long_unique = std::conditional_t< _guard_long_unique_dummy, T>; } // namespace detail + CAFFE_KNOWN_TYPE(detail::_guard_long_unique); CAFFE_KNOWN_TYPE(detail::_guard_long_unique>) CAFFE_KNOWN_TYPE(float*) CAFFE_KNOWN_TYPE(at::Half*) -CAFFE_KNOWN_TYPE(c10::qint8) -CAFFE_KNOWN_TYPE(c10::quint8) -CAFFE_KNOWN_TYPE(c10::qint32) -CAFFE_KNOWN_TYPE(at::BFloat16) -CAFFE_KNOWN_TYPE(c10::quint4x2) } // namespace caffe2 diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 51833fb545ad..7feb335507ca 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -21,18 +21,14 @@ #include #include #include -#include #include #include #include #include -#include -#include -#include -#include -#include #include +#include + /* * TypeIdentifier is a small type containing an id. * Types must be registered using CAFFE_KNOWN_TYPE() for them to have a type id. @@ -67,7 +63,7 @@ namespace caffe2 { */ class C10_API TypeIdentifier final : public at::IdWrapper { - public: +public: friend std::ostream& operator<<(std::ostream& stream, TypeIdentifier typeId); friend constexpr bool operator<(TypeIdentifier lhs, TypeIdentifier rhs); @@ -87,9 +83,8 @@ class C10_API TypeIdentifier final return TypeIdentifier(c10::util::type_index{0}); } - private: +private: constexpr explicit TypeIdentifier(c10::util::type_index id) : IdWrapper(id) {} - friend class TypeMeta; // TODO Is this friend an issue? 
}; // Allow usage in std::map / std::set @@ -126,7 +121,16 @@ struct TypeMetaData final { using PlacementDelete = void(void*, size_t); using Delete = void(void*); - TypeMetaData() = delete; + constexpr TypeMetaData() noexcept + : itemsize_(0), + new_(nullptr), + placementNew_(nullptr), + copy_(nullptr), + placementDelete_(nullptr), + delete_(nullptr), + id_(TypeIdentifier::uninitialized()), + name_("nullptr (uninitialized)") {} + constexpr TypeMetaData( size_t itemsize, New* newFn, @@ -136,14 +140,14 @@ struct TypeMetaData final { Delete* deleteFn, TypeIdentifier id, c10::string_view name) noexcept - : itemsize_(itemsize), - new_(newFn), - placementNew_(placementNew), - copy_(copy), - placementDelete_(placementDelete), - delete_(deleteFn), - id_(id), - name_(name) {} + : itemsize_(itemsize), + new_(newFn), + placementNew_(placementNew), + copy_(copy), + placementDelete_(placementDelete), + delete_(deleteFn), + id_(id), + name_(name) {} size_t itemsize_; New* new_; @@ -294,25 +298,24 @@ inline constexpr TypeMetaData::Delete* _PickDelete() noexcept { return &_Delete; } -template -inline C10_TYPENAME_CONSTEXPR TypeMetaData _makeTypeMetaDataInstance() { - C10_HOST_CONSTEXPR_VAR auto typeId = TypeIdentifier::Get(); - C10_TYPENAME_CONSTEXPR auto typeName = c10::util::get_fully_qualified_type_name(); - - return {sizeof(T), - _PickNew(), - _PickPlacementNew(), - _PickCopy(), - _PickPlacementDelete(), - _PickDelete(), - typeId, - typeName}; -} - class _Uninitialized final {}; } // namespace detail +// +// note: this is outside TypeMeta bc gcc seems to have trouble +// with scalarTypeItemSizes as a constexpr static member used by +// a public inline instance method +// + +// item sizes for TypeMeta::itemsize() fast path +static constexpr size_t scalarTypeItemSizes[NumScalarTypes] = { +#define SCALAR_TYPE_SIZE(T, name) sizeof(T), + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SCALAR_TYPE_SIZE) +#undef SCALAR_TYPE_SIZE + 0, // Undefined +}; + /** * TypeMeta is a thin class that allows us to store the type of a container such * as a blob, or the data type of a tensor, with a unique run-time id. It also @@ -344,11 +347,11 @@ class C10_API TypeMeta final { TypeMeta(TypeMeta&& rhs) noexcept = default; - private: +private: // TypeMeta can only be created by Make, making sure that we do not // create incorrectly mixed up TypeMeta objects. - explicit TypeMeta(const detail::TypeMetaData* data) noexcept - : data_(data) { + explicit TypeMeta(const uint16_t index) noexcept + : index_(index) { } public: @@ -356,48 +359,66 @@ class C10_API TypeMeta final { * Returns the type id. */ TypeIdentifier id() const noexcept { - return data_->id_; + return data().id_; + } + /** + * true if we represent some ScalarType type + */ + inline bool isScalarType() const noexcept { + return index_ < NumScalarTypes; + } + /** + * true if we represent ScalarType scalar_type + */ + inline bool isScalarType(ScalarType scalar_type) const noexcept { + return index_ == static_cast(scalar_type); } /** * Returns the size of the item. */ - size_t itemsize() const noexcept { - return data_->itemsize_; + inline size_t itemsize() const noexcept { + if (C10_LIKELY(isScalarType())) { + return scalarTypeItemSizes[index_]; + } + return data().itemsize_; } + /** + * Returns the new function pointer for individual items. + */ New* newFn() const noexcept { - return data_->new_; + return data().new_; } /** * Returns the placement new function pointer for individual items. 
*/ PlacementNew* placementNew() const noexcept { - return data_->placementNew_; + return data().placementNew_; } /** * Returns the typed copy function pointer for individual iterms. */ Copy* copy() const noexcept { - return data_->copy_; + return data().copy_; } /** * Returns the destructor function pointer for individual items. */ PlacementDelete* placementDelete() const noexcept { - return data_->placementDelete_; + return data().placementDelete_; } Delete* deleteFn() const noexcept { - return data_->delete_; + return data().delete_; } /** * Returns a printable name for the type. */ c10::string_view name() const noexcept { - return data_->name_; + return data().name_; } friend bool operator==( - const TypeMeta& lhs, - const TypeMeta& rhs) noexcept; + const TypeMeta lhs, + const TypeMeta rhs) noexcept; template bool Match() const noexcept { @@ -412,7 +433,7 @@ class C10_API TypeMeta final { } template - static C10_TYPENAME_CONSTEXPR c10::string_view TypeName() noexcept { + static c10::string_view TypeName() noexcept { return c10::util::get_fully_qualified_type_name(); } @@ -437,35 +458,105 @@ class C10_API TypeMeta final { #pragma GCC diagnostic ignored "-Wunknown-warning-option" #pragma GCC diagnostic ignored "-Wundefined-var-template" #endif - return TypeMeta(_typeMetaDataInstance()); + return TypeMeta(_typeMetaData()); #ifndef _MSC_VER #pragma GCC diagnostic pop #endif } - private: - const detail::TypeMetaData* data_; + /** + * convert ScalarType enum values to TypeMeta handles + */ + static inline caffe2::TypeMeta fromScalarType(ScalarType scalar_type) { + const size_t index = static_cast(scalar_type); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + index < NumScalarTypes, + "Unrecognized Scalartype ", scalar_type, " (please report this error)"); + return TypeMeta(index); + } + /** + * convert TypeMeta handles to ScalarType enum values + */ + inline ScalarType toScalarType() { + if (C10_LIKELY(isScalarType())) { + return static_cast(index_); + } + error_unsupported_typemeta(*this); + } + +private: + [[noreturn]] static void error_unsupported_typemeta(caffe2::TypeMeta dtype); + + // hard limit number of registered types + // note: constexpr provokes Windows compilation error "member may not be initialized" + // static constexpr size_t MaxTypeIndex = UINT8_MAX; + #define MaxTypeIndex UINT8_MAX + + static std::atomic nextTypeIndex; + + static detail::TypeMetaData* typeMetaDatas(); + + template + static uint16_t addTypeMetaData() { + const uint16_t index = nextTypeIndex++; + TORCH_CHECK(index <= MaxTypeIndex, + "Maximum number of CAFFE_KNOWN_TYPE declarations has been exceeded. 
", + "Please report this issue."); + typeMetaDatas()[index] = detail::TypeMetaData{ + sizeof(T), + detail::_PickNew(), + detail::_PickPlacementNew(), + detail::_PickCopy(), + detail::_PickPlacementDelete(), + detail::_PickDelete(), + TypeIdentifier::Get(), + c10::util::get_fully_qualified_type_name()}; + return index; + } + + // specializations return indexes into typeMetaDataInstances() template - C10_API static const detail::TypeMetaData* _typeMetaDataInstance() noexcept; + C10_API static uint16_t _typeMetaData() noexcept; + + // + // TypeMeta just wraps this index + // + + uint16_t index_; + + inline const detail::TypeMetaData& data() const { + return typeMetaDatas()[index_]; + } }; +// specializations of TypeMeta::_typeMetaData for ScalarType types + +#define DEFINE_SCALAR_METADATA_INSTANCE(T, name) \ + template <> \ + constexpr uint16_t TypeMeta::_typeMetaData() noexcept { \ + return static_cast(ScalarType::name); \ + } +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_SCALAR_METADATA_INSTANCE) +#undef DEFINE_SCALAR_METADATA_INSTANCE + template <> -C10_EXPORT const detail::TypeMetaData* TypeMeta::_typeMetaDataInstance< - detail::_Uninitialized>() noexcept; +C10_EXPORT constexpr uint16_t TypeMeta::_typeMetaData() noexcept { + return static_cast(ScalarType::Undefined); +} inline TypeMeta::TypeMeta() noexcept - : data_(_typeMetaDataInstance()) { + : index_(_typeMetaData()) { } inline bool operator==( - const TypeMeta& lhs, - const TypeMeta& rhs) noexcept { - return (lhs.data_ == rhs.data_); + const TypeMeta lhs, + const TypeMeta rhs) noexcept { + return (lhs.index_ == rhs.index_); } inline bool operator!=( - const TypeMeta& lhs, - const TypeMeta& rhs) noexcept { + const TypeMeta lhs, + const TypeMeta rhs) noexcept { return !operator==(lhs, rhs); } @@ -500,13 +591,11 @@ inline std::ostream& operator<<( #define EXPORT_IF_NOT_GCC #endif -#define CAFFE_KNOWN_TYPE(T) \ - template <> \ - EXPORT_IF_NOT_GCC const detail::TypeMetaData* \ - TypeMeta::_typeMetaDataInstance() noexcept { \ - static C10_TYPENAME_CONSTEXPR detail::TypeMetaData singleton = \ - detail::_makeTypeMetaDataInstance(); \ - return &singleton; \ +#define CAFFE_KNOWN_TYPE(T) \ + template <> \ + EXPORT_IF_NOT_GCC uint16_t TypeMeta::_typeMetaData() noexcept { \ + static const uint16_t index = addTypeMetaData(); \ + return index; \ } } // namespace caffe2 diff --git a/torch/csrc/DynamicTypes.h b/torch/csrc/DynamicTypes.h index 0877fb317cb3..774c5e99999f 100644 --- a/torch/csrc/DynamicTypes.h +++ b/torch/csrc/DynamicTypes.h @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h index abdaea147c00..55c1926b3541 100644 --- a/torch/csrc/jit/tensorexpr/stmt.h +++ b/torch/csrc/jit/tensorexpr/stmt.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include From 71c0133e23cf961e492710cee124c227494d0c8b Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Thu, 29 Oct 2020 14:20:55 -0700 Subject: [PATCH 122/169] enable PE everywhere but mobile (#47001) Summary: enable PE everywhere but mobile Pull Request resolved: https://github.com/pytorch/pytorch/pull/47001 Reviewed By: eellison Differential Revision: D24596252 Pulled By: Krovatkin fbshipit-source-id: 3e3093a43287e1ff838cb03ec0e53c11c82c8dd2 --- torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp 
index 3d7e8bb6f60e..d62697c1f9b6 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -40,8 +40,7 @@ C10_DEFINE_bool( namespace torch { namespace jit { -// TODO: keep the else clause for trial runs -#if defined(FBCODE_CAFFE2) || defined(C10_MOBILE) +#if defined(C10_MOBILE) static std::atomic executor_mode{true}; static std::atomic profiling_mode{false}; #else From 156c08b0d97184cbe112109e34267a4bf3e1b8c1 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Thu, 29 Oct 2020 14:31:14 -0700 Subject: [PATCH 123/169] view_as_real doesn't work for all backends since it relies on strides. (#47018) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47018 Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D24607340 Pulled By: ailzhang fbshipit-source-id: c7fd85cd636ae9aebb22321f8f1a255af81a473f --- aten/src/ATen/native/native_functions.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ccc07438eda9..c98b9423b25c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -283,13 +283,13 @@ use_c10_dispatcher: full variants: function dispatch: - DefaultBackend: view_as_real + CPU, CUDA: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function dispatch: - DefaultBackend: view_as_complex + CPU, CUDA: view_as_complex - func: sgn(Tensor self) -> Tensor use_c10_dispatcher: full From dc6f723cb41a78055771d243c84995ebae956d58 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 124/169] Delete Vulkan from code generator. (#46938) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46938 It turns out that after https://github.com/pytorch/pytorch/pull/42194 landed we no longer actually generate any registrations into this file. That means it's completely unnecessary. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D24573518 Pulled By: ezyang fbshipit-source-id: b41ada9e394b780f037f5977596a36b896b5648c --- aten/src/ATen/templates/Functions.cpp | 3 --- cmake/Codegen.cmake | 3 --- tools/codegen/gen.py | 21 +++++++++++---------- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/templates/Functions.cpp b/aten/src/ATen/templates/Functions.cpp index 7c9aa96f6e70..5c8ad639c7f0 100644 --- a/aten/src/ATen/templates/Functions.cpp +++ b/aten/src/ATen/templates/Functions.cpp @@ -7,9 +7,6 @@ #include #include #include -#ifdef USE_VULKAN -#include -#endif namespace at { diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index dbfd55e2d0d5..db02f7a8fb16 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -178,9 +178,6 @@ if(INTERN_BUILD_ATEN_OPS) --force_schema_registration --op_registration_whitelist ${OP_REGISTRATION_WHITELIST}) endif() - if(USE_VULKAN) - set(GEN_VULKAN_FLAGS --vulkan) - endif() set(GEN_COMMAND "${PYTHON_EXECUTABLE}" -m tools.codegen.gen diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index af358d9d1b7c..353e7a2648d5 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -236,7 +236,7 @@ def func(f: NativeFunction) -> Optional[str]: return_kw = " return " cuda_guard = "" - if dispatch_to_all_backends or 'CUDA' in dispatch or 'Vulkan' == dispatch: # type: ignore + if dispatch_to_all_backends or 'CUDA' in dispatch: # type: ignore self_args = (a for a in f.func.arguments if a.name == "self") # There is precedence for which argument we use to do @@ -261,7 +261,7 @@ def func(f: NativeFunction) -> Optional[str]: # TODO: There is probably a simpler version of this that # works just as well. - if f.device_guard and (dispatch_to_all_backends or 'Vulkan' == dispatch) and has_tensor_options: + if f.device_guard and dispatch_to_all_backends and has_tensor_options: cuda_guard = cuda_guard_from_tensor_options elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: cuda_guard = f"""\ @@ -931,11 +931,6 @@ def main() -> None: '--rocm', action='store_true', help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') - # TODO: remove this, we should just unconditionally generate Vulkan - parser.add_argument( - '--vulkan', - action='store_true', - help='Generate Vulkan backend functions') # TODO: --op_registration_whitelist will be removed when all call-sites # for gen.py are moved over to using the operator YAML file for mobile # custom build. @@ -1016,9 +1011,15 @@ def make_file_manager(install_dir: str) -> FileManager: #include #include ''' - backends = ["CPU", "SparseCPU", "MkldnnCPU", "CUDA", "SparseCUDA", "QuantizedCPU", "QuantizedCUDA"] - if options.vulkan: - backends.append("Vulkan") + backends = [ + "CPU", + "SparseCPU", + "MkldnnCPU", + "CUDA", + "SparseCUDA", + "QuantizedCPU", + "QuantizedCUDA", + ] if options.backend_whitelist: backends = [b for b in backends if b in options.backend_whitelist] From 87e86fa84cc605bd4e7bfce7d9c451187447a7d5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 125/169] Some miscellaneous cleanup in codegen (#46940) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46940 - Remove inaccurate generated comments - Delete some dead code - Delete some unused headers - Delete unnecessary SparseTypeDerived.cpp template Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24573971 Pulled By: ezyang fbshipit-source-id: 3de05d9cd9bada4c73f01d6cfaf51f16ada66013 --- aten/src/ATen/LegacyTHFunctionsCPU.cpp | 2 - aten/src/ATen/LegacyTHFunctionsCPU.h | 2 - aten/src/ATen/LegacyTHFunctionsCUDA.h | 2 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 2 - aten/src/ATen/templates/SparseTypeDerived.cpp | 43 ------------------- aten/src/ATen/templates/TypeDerived.cpp | 3 +- aten/src/ATen/templates/TypeDerived.h | 3 +- tools/codegen/gen.py | 11 ----- 8 files changed, 2 insertions(+), 66 deletions(-) delete mode 100644 aten/src/ATen/templates/SparseTypeDerived.cpp diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp index f0a55470cc1c..60db3d9a9d4e 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.cpp +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -1,7 +1,5 @@ #include -// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp - #include #include #include diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h index 1bc9b66777bc..b24a83acdd5d 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.h +++ b/aten/src/ATen/LegacyTHFunctionsCPU.h @@ -1,7 +1,5 @@ #pragma once -// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h - #include #include #include diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index 20717ad43e6f..7b3be6db3d77 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -1,7 +1,5 @@ #pragma once -// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h - #include #include #include diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index b2d8df49f51b..45ceddcd94e8 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -1,7 +1,5 @@ #include -// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp - #include #include #include diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp deleted file mode 100644 index b0a4fed24a63..000000000000 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// required for old g++ to compile PRId64 macros, see -// https://github.com/pytorch/pytorch/issues/3571 -// for context -#define __STDC_FORMAT_MACROS - -#include - -// ${generated_comment} - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -$extra_cuda_headers - -namespace at { - -namespace ${Type} { - -${type_derived_method_definitions} - -} // namespace ${Type} - -TORCH_LIBRARY_IMPL(aten, ${Backend}, m) { - ${function_registrations}; -} - -} // namespace at diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index d65c13ae8d97..df4b83c40580 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -9,8 +9,7 @@ // ${generated_comment} -$storage_tensor_headers -#include +#include #include #include #include diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index 4b571f40383f..165787cc2ac5 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -6,13 +6,12 @@ #include #include #include +#include #include #include #include #include 
-$extra_cuda_headers - namespace c10 { struct Storage; } diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 353e7a2648d5..544e4d8aa7e8 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1000,13 +1000,11 @@ def make_file_manager(install_dir: str) -> FileManager: cuda_fm = make_file_manager(options.install_dir) extra_cuda_headers = '''\ -#include #include #include #include ''' if options.rocm: extra_cuda_headers = '''\ -#include #include #include #include ''' @@ -1026,15 +1024,11 @@ def make_file_manager(install_dir: str) -> FileManager: for dispatch in backends: h_template = 'TypeDerived.h' cpp_template = 'TypeDerived.cpp' - # TODO: delete this special case - if 'Sparse' in dispatch: - cpp_template = 'SparseTypeDerived.cpp' fm = cuda_fm if 'CUDA' in dispatch else cpu_fm fm.write_with_template(f'{dispatch}Type.h', h_template, lambda: { 'Type': f'{dispatch}Type', - 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', # TODO: remove this 'type_derived_method_declarations': list(mapMaybe( compute_type_method(dispatch, target=Target.DECLARATION, selector=selector), native_functions @@ -1042,12 +1036,7 @@ def make_file_manager(install_dir: str) -> FileManager: }) fm.write_with_template(f'{dispatch}Type.cpp', cpp_template, lambda: { 'Type': f'{dispatch}Type', - # TODO: remove this 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', - # TODO: remove this - 'storage_tensor_headers': '#include ', - # TODO: remove this - 'Generator': 'CUDAGeneratorImpl' if 'CUDA' in dispatch else 'CPUGeneratorImpl', 'legacy_th_headers': '#include ' if dispatch == "CPU" else '#include ' if dispatch == "CUDA" else From 54d83296a919c23753aa8b9d5ed42d7fec02c153 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 126/169] Desugar missing dispatch field into singleton Math entry (#46970) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46970 Now that catchall declarations are reinterpreted as registrations to dispatch key Math, we can now simplify code generation logic by directly generating to Math, and bypasing logic for catchall. This also helps avoid bugs where we incorrectly classify some kernels as Math and others as not, even though they get registered in the same way. Bill of changes: - Give Math its own unique TORCH_LIBRARY_IMPL - Make it so NativeFunction.dispatch is always non-None. Simplify downstream conditionals accordingly - When parsing NativeFunction, fill in missing dispatch with a singleton Math entry (pointing to the cpp.name!) One thing that is a little big about this change is a lot of kernels which previously didn't report as "math" now report as math. I picked a setting for these booleans that made sense to me, but I'm not sure if e.g. XLA will handle it 100% correctly. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24592391 Pulled By: ezyang fbshipit-source-id: 2e3355f19f9525698864312418df08411f30a85d --- aten/src/ATen/templates/TypeDefault.cpp | 4 ++ tools/codegen/gen.py | 65 ++++++++++--------------- tools/codegen/model.py | 24 +++++---- 3 files changed, 46 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 145a5b421019..4cd1d1586d6a 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -64,6 +64,10 @@ TORCH_LIBRARY(aten, m) { m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); } +TORCH_LIBRARY_IMPL(aten, Math, m) { + ${math_function_registrations}; +} + TORCH_LIBRARY_IMPL(aten, DefaultBackend, m) { ${default_backend_function_registrations}; } diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 544e4d8aa7e8..3b629b98d52e 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -205,10 +205,10 @@ def compute_type_method( @with_native_function def func(f: NativeFunction) -> Optional[str]: if dispatch is not None: - if f.dispatch is None or dispatch not in f.dispatch: + if dispatch not in f.dispatch: return None else: - if f.dispatch is not None and target is not Target.REGISTRATION: + if target is not Target.REGISTRATION: return None op_name = f"aten::{f.func.name}" @@ -219,24 +219,21 @@ def func(f: NativeFunction) -> Optional[str]: returns_type = native.returns_type(f.func.returns) args = native.arguments(f.func) args_str = ', '.join(map(str, args)) + # TODO: don't need dispatch is None shortly dispatch_to_all_backends = dispatch is None or dispatch in KEYWORD_ALL_BACKENDS if target is Target.DECLARATION: return f"{returns_type} {name}({args_str});" elif target is Target.DEFINITION: - if f.dispatch is None: - cpp_name = cpp.name(f.func) - impl_name = f"at::native::{cpp_name}" - else: - assert dispatch is not None - impl_name = f"at::native::{f.dispatch[dispatch]}" + assert dispatch is not None + impl_name = f"at::native::{f.dispatch[dispatch]}" args_exprs_str = ', '.join(a.name for a in args) return_kw = " return " cuda_guard = "" - if dispatch_to_all_backends or 'CUDA' in dispatch: # type: ignore + if dispatch_to_all_backends or 'CUDA' in dispatch: self_args = (a for a in f.func.arguments if a.name == "self") # There is precedence for which argument we use to do @@ -284,20 +281,18 @@ def func(f: NativeFunction) -> Optional[str]: """ elif target is Target.REGISTRATION: - dispatcher_sig = DispatcherSignature.from_schema(f.func) - - if dispatch_to_all_backends: - type_name = f'TypeDefault::{name}' + if dispatch is None: + return f'm.def({cpp_string(str(f.func))});\n' + elif def_only or f.manual_kernel_registration: + return None else: - type_name = f'{dispatch}Type::{name}' + if dispatch_to_all_backends: + type_name = f'TypeDefault::{name}' + else: + type_name = f'{dispatch}Type::{name}' - # def registration only happens in TypeDefault - def_registration = "" - if dispatch is None: - def_registration = f'm.def({cpp_string(str(f.func))});\n' + dispatcher_sig = DispatcherSignature.from_schema(f.func) - impl_registration = "" - if not def_only and not f.manual_kernel_registration and (dispatch is not None or f.dispatch is None): # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({type_name})" @@ -321,9 +316,7 @@ def func(f: NativeFunction) -> Optional[str]: if dispatch is not None: payload = 
f"torch::dispatch(DispatchKey::{dispatch},\n{payload})\n" - impl_registration = f'm.impl("{f.func.name}",\n{payload});\n' - - return f"{def_registration}{impl_registration}" + return f'm.impl("{f.func.name}",\n{payload});\n' else: assert_never(target) @@ -439,10 +432,7 @@ def compute_aten_op(f: NativeFunction) -> str: # actual kernel definitions we keep in aten/src/ATen/native/ @with_native_function def compute_native_function_declaration(f: NativeFunction) -> List[str]: - if f.dispatch is None: - ns = [cpp.name(f.func)] - else: - ns = list(f.dispatch.values()) + ns = list(f.dispatch.values()) rs = [] # Sometimes a function name shows up multiple times; only generate @@ -763,8 +753,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: is_factory_method = any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) \ and Variant.method not in f.variants - # Having only Math in dispatch section is equivalent to no dispatch section. - is_abstract = f.dispatch is not None and set(f.dispatch.keys()) != set({'Math'}) # type ignore + is_abstract = f.dispatch.keys() != {'Math'} return OrderedDict([ ('name', cpp.name(f.func)), @@ -803,7 +792,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('device_guard', f.device_guard), ('with_gil', False), ('deprecated', False), - ('has_math_kernel', f.dispatch is not None and 'Math' in f.dispatch), + ('has_math_kernel', 'Math' in f.dispatch), ]) @with_native_function @@ -814,8 +803,9 @@ def compute_registration_declarations(f: NativeFunction) -> str: args_str = ', '.join(map(str, args)) comment_data : Dict[str, str] = { 'schema': f'aten::{f.func}', - 'dispatch': str(f.dispatch is not None), - 'default': str(f.dispatch is not None and any(k in f.dispatch for k in KEYWORD_ALL_BACKENDS)) + # TODO: What exactly is the semantics of the 'dispatch' field? + 'dispatch': str(f.dispatch.keys() != {'Math'}), + 'default': str(any(k in f.dispatch for k in KEYWORD_ALL_BACKENDS)) } return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)} """ @@ -1068,9 +1058,6 @@ def make_file_manager(install_dir: str) -> FileManager: }) cpu_fm.write('TypeDefault.cpp', lambda: { 'type_method_definitions': - list(mapMaybe( - compute_type_method(None, target=Target.DEFINITION, selector=selector), - native_functions)) + list(mapMaybe( compute_type_method('Math', target=Target.DEFINITION, selector=selector), native_functions)) + @@ -1080,9 +1067,11 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, selector=selector), - native_functions)) + list(mapMaybe( - compute_type_method('Math', target=Target.REGISTRATION, selector=selector), - native_functions)), + native_functions)), + + 'math_function_registrations': list(mapMaybe( + compute_type_method('Math', target=Target.REGISTRATION, selector=selector), + native_functions)), 'default_backend_function_registrations': list(mapMaybe( compute_type_method('DefaultBackend', target=Target.REGISTRATION, selector=selector), diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 56605b7130db..95cb7438a814 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -98,13 +98,15 @@ class NativeFunction: # registrations don't participate in codegen-based selective build! 
manual_kernel_registration: bool - # Distinguish between a missing dispatch dict (historically, this - # means to register a catch-all kernel) and a present but empty - # dispatch dict (this means register nothing; arguably, this should - # subsume manual_kernel_registration). + # A mapping of dispatch keys to names of functions implementing + # them. In native_functions.yaml, the dispatch entry is optional; in that + # case, that is equivalent to having written: + # + # dispatch: + # Math: $operator_name # # TODO: str key could be replaced with more explicit enum - dispatch: Optional[Dict[str, str]] + dispatch: Dict[str, str] # The location in the YAML file were this native function entry was # defined. This is for conveniently reporting error messages! @@ -162,9 +164,8 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': raw_dispatch = e.pop('dispatch', None) assert raw_dispatch is None or isinstance(raw_dispatch, dict), e - dispatch: Optional[Dict[str, str]] = None + dispatch: Dict[str, str] = {} if raw_dispatch is not None: - dispatch = {} for ks, v in raw_dispatch.items(): if ks == '__line__': continue # not worth tracking line numbers for dispatch entries @@ -172,9 +173,14 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(v, str), e for k in ks.split(","): dispatch[k.strip()] = v + else: + from tools.codegen.api import cpp + dispatch['Math'] = cpp.name(func) - # Throws if both DefaultBackend and Math are provided - assert not (dispatch is not None and 'DefaultBackend' in dispatch and 'Math' in dispatch) + assert not ('DefaultBackend' in dispatch and 'Math' in dispatch), \ + "cannot specify both DefaultBackend and Math on a single kernel; each " \ + "strictly subsumes the other. If you wanted to provide an explicit autograd " \ + "implementation, specify DefaultBackend; otherwise specify Math only" e.pop('__line__') assert not e, f"leftover entries: {e}" From 41f8641f1eb43d958202f5763ae1de69ea90ae31 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 127/169] Delete SchemaRegister.cpp, make flag operate on TypeDefault.cpp (#46991) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46991 This change is motivated by a problem bdhirsh observed which is that in internal builds that include both SchemaRegister.cpp and TypeDefault.cpp, some operators have their schemas defined multiple times. Instead of dumping schema registrations in multiple files, it seems better to just toggle how many schemas we write into TypeDefault.cpp. ljk53 observes that technically SchemaRegister.cpp is only needed by full-JIT frontend, and not by light interpreter (to resolve schema lookups). However, in practice, the registration file seems to be unconditionally loaded. This change will make it harder to do the optimization where we drop schemas in the light interpreter, but you probably want to architect this differently (similar to per-op registrations, DON'T do any registrations in ATen, and then write out the schema registrations in a separate library.) I took this opportunity to also simplify the TypeDefault generation logic by reworking things so that we only ever call with None argument when registering. Soon, we should be able to just split these files up entirely. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D24593704 Pulled By: ezyang fbshipit-source-id: f01ea22a3999493da77b6e254d188da0ce9adf2f --- aten/src/ATen/templates/SchemaRegister.cpp | 10 ------ tools/codegen/gen.py | 40 +++++++--------------- 2 files changed, 13 insertions(+), 37 deletions(-) delete mode 100644 aten/src/ATen/templates/SchemaRegister.cpp diff --git a/aten/src/ATen/templates/SchemaRegister.cpp b/aten/src/ATen/templates/SchemaRegister.cpp deleted file mode 100644 index f48e732f4760..000000000000 --- a/aten/src/ATen/templates/SchemaRegister.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// ${generated_comment} - -#include -#include - -using namespace at; - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${schema_registrations} -} diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 3b629b98d52e..d59e16a2aa0e 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -192,24 +192,17 @@ def compute_type_method( target: Target, # Selector object to determine which operators to generate # registration code for. - selector: SelectiveBuilder, - # Only valid for generating registrations. If True, only generate - # def() invocations (for schema registration); do not generate - # any impl() invocations for, e.g., catch-all kernels - def_only: bool = False + selector: SelectiveBuilder ) -> Callable[[NativeFunction], Optional[str]]: - if def_only: - assert target is Target.REGISTRATION and dispatch is None + if dispatch is None: + assert target == Target.REGISTRATION @with_native_function def func(f: NativeFunction) -> Optional[str]: if dispatch is not None: if dispatch not in f.dispatch: return None - else: - if target is not Target.REGISTRATION: - return None op_name = f"aten::{f.func.name}" if target is Target.REGISTRATION and not selector.is_operator_selected(op_name): @@ -219,10 +212,10 @@ def func(f: NativeFunction) -> Optional[str]: returns_type = native.returns_type(f.func.returns) args = native.arguments(f.func) args_str = ', '.join(map(str, args)) - # TODO: don't need dispatch is None shortly - dispatch_to_all_backends = dispatch is None or dispatch in KEYWORD_ALL_BACKENDS + dispatch_to_all_backends = dispatch is not None and dispatch in KEYWORD_ALL_BACKENDS if target is Target.DECLARATION: + assert dispatch is not None return f"{returns_type} {name}({args_str});" elif target is Target.DEFINITION: assert dispatch is not None @@ -283,7 +276,7 @@ def func(f: NativeFunction) -> Optional[str]: elif target is Target.REGISTRATION: if dispatch is None: return f'm.def({cpp_string(str(f.func))});\n' - elif def_only or f.manual_kernel_registration: + elif f.manual_kernel_registration: return None else: if dispatch_to_all_backends: @@ -1046,9 +1039,6 @@ def make_file_manager(install_dir: str) -> FileManager: cpu_fm.write('TypeDefault.h', lambda: { 'type_method_declarations': - list(mapMaybe( - compute_type_method(None, target=Target.DECLARATION, selector=selector), - native_functions)) + list(mapMaybe( compute_type_method('Math', target=Target.DECLARATION, selector=selector), native_functions)) + @@ -1056,6 +1046,12 @@ def make_file_manager(install_dir: str) -> FileManager: compute_type_method('DefaultBackend', target=Target.DECLARATION, selector=selector), native_functions)), }) + + schema_selector = selector + if options.force_schema_registration: + schema_selector = SelectiveBuilder.get_nop_selector() + + # TODO: split this file into separate files cpu_fm.write('TypeDefault.cpp', lambda: { 'type_method_definitions': 
list(mapMaybe( @@ -1066,7 +1062,7 @@ def make_file_manager(install_dir: str) -> FileManager: native_functions)), 'function_registrations': list(mapMaybe( - compute_type_method(None, target=Target.REGISTRATION, selector=selector), + compute_type_method(None, target=Target.REGISTRATION, selector=schema_selector), native_functions)), 'math_function_registrations': list(mapMaybe( @@ -1102,16 +1098,6 @@ def make_file_manager(install_dir: str) -> FileManager: list(mapMaybe(compute_backend_select(target=Target.REGISTRATION), native_functions)), }) - if options.force_schema_registration: - def computeSchemaRegister() -> Dict[str, object]: - schema_registrations = list(mapMaybe( - compute_type_method(None, target=Target.REGISTRATION, selector=SelectiveBuilder.get_nop_selector(), def_only=True), - native_functions)) - return { - 'schema_registrations': schema_registrations, - } - cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - cpu_fm.write('Declarations.yaml', lambda: format_yaml([compute_declaration_yaml(f) for f in native_functions])) cpu_fm.write('RegistrationDeclarations.h', lambda: { 'registration_declarations': [compute_registration_declarations(f) for f in native_functions], From c689b4d491d62b24afd214af646315c6a99ff30a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 128/169] Delete TypeDefault call code generation logic in VariableType (#47000) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47000 There is a new invariant that emit_body is only ever called when strategy is 'use_derived', which means we can delete a bunch of code. This removes the last use of TypeXXX.h headers. Note that this change makes sense, as the TypeDefault entries are registered as Math entries, which means they automatically populate Autograd (and we no longer have to register them ourselves). Ailing did all the hard work, this is just the payoff. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24596584 Pulled By: ezyang fbshipit-source-id: 6fa754b5f16e75cf2dcbf437887c0fdfda5e44b1 --- aten/src/ATen/templates/Functions.cpp | 1 - tools/autograd/gen_variable_type.py | 48 ++++++++--------------- tools/autograd/templates/TraceType.cpp | 1 - tools/autograd/templates/VariableType.cpp | 1 - 4 files changed, 17 insertions(+), 34 deletions(-) diff --git a/aten/src/ATen/templates/Functions.cpp b/aten/src/ATen/templates/Functions.cpp index 5c8ad639c7f0..7d5cdf18cde8 100644 --- a/aten/src/ATen/templates/Functions.cpp +++ b/aten/src/ATen/templates/Functions.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 570943755d2e..04b8d144cd79 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -284,9 +284,6 @@ grad_fn->set_next_edges(collect_next_edges( ${args_with_derivatives} )); """) -CALL_DEFAULT = CodeTemplate("""\ -TypeDefault::${type_wrapper_name}(${args})""") - CALL_DISPATCH_VIA_NAMESPACE = CodeTemplate("""\ at::${api_name}(${unpacked_args})""") @@ -808,7 +805,7 @@ def emit_trace_body(declaration): def emit_body(declaration): - strategy = dispatch_strategy(declaration) + assert dispatch_strategy(declaration) == 'use_derived' arguments = declaration['arguments'] returns = declaration['returns'] @@ -865,8 +862,7 @@ def find_args_with_derivatives(differentiable_inputs): requires_derivative = ( base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE and - len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0 and - strategy == 'use_derived') + len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0) if func is not None and not requires_derivative: raise RuntimeError('ERROR: derivative ignored for {} -- specified an autograd function without derivative' @@ -1150,28 +1146,20 @@ def enforce_same_tensorimpl_and_storage(env, call): def emit_call(env, tie_return_values): combined = nested_dict(env, declaration) - if strategy == 'use_derived': - # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch - # (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure - # the baseType operations still dispatch to non-Variable type, even if the arguments passed - # in are now Variables. - # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. - base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) - if not modifies_arguments and not returns_void: - call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( - base_type_call=base_type_call) - - call += wrap_output(tie_return_values, 'tmp') - else: - call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( - base_type_call=base_type_call) + # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch + # (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure + # the baseType operations still dispatch to non-Variable type, even if the arguments passed + # in are now Variables. + # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. 
+ base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) + if not modifies_arguments and not returns_void: + call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( + base_type_call=base_type_call) + + call += wrap_output(tie_return_values, 'tmp') else: - args = maybe_unwrap_optional_tensors(declaration, declaration['arguments'], declaration['args']) - - call = CALL_DEFAULT.substitute(declaration, args=args) - if not modifies_arguments and not returns_void: - call = '{} = {}'.format(tie_return_values, call) - call = call + ';' + call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( + base_type_call=base_type_call) call = enforce_same_tensorimpl_and_storage(env, call) return call @@ -1211,16 +1199,14 @@ def emit_increment_version(): declare_returned_variables, tie_return_values, get_return_value = format_return_variables(declaration) - if strategy != 'use_type': - body.extend(unpack_args(env, declaration)) + body.extend(unpack_args(env, declaration)) if requires_derivative: body.extend(emit_check_inplace()) body.extend(setup_derivative(differentiable_inputs)) body.append(declare_returned_variables) body.append(emit_call(env, tie_return_values)) - if strategy == 'use_derived': - body.extend(emit_increment_version()) + body.extend(emit_increment_version()) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called diff --git a/tools/autograd/templates/TraceType.cpp b/tools/autograd/templates/TraceType.cpp index d08c1e3cc5aa..3ac52ed08edc 100644 --- a/tools/autograd/templates/TraceType.cpp +++ b/tools/autograd/templates/TraceType.cpp @@ -1,6 +1,5 @@ #include "torch/csrc/autograd/VariableTypeUtils.h" -#include #include #include "torch/csrc/autograd/function.h" diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 079427cd97dc..ba2f99369f8d 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -1,7 +1,6 @@ #include "torch/csrc/autograd/VariableTypeUtils.h" #include "torch/csrc/autograd/FunctionsManual.h" -#include #include // ${generated_comment} From 843cab3f2ee40abe7142789560a8874c7cefbb6e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 29 Oct 2020 14:31:56 -0700 Subject: [PATCH 129/169] Delete TypeDefault.h and TypeDerived.h codegen entirely. (#47002) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47002 There was no good reason for TypeDerived.h (CPUType.h) codegen to exist after static dispatch was deleted, and now that we have Math alias key TypeDefault.h header is not needed either. Sorry to anyone who was using these out of tree. I didn't entirely delete TypeDefault.h as it has a use in a file that I can't conveniently compile test locally. Will kill it entirely in a follow up. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24596583 Pulled By: ezyang fbshipit-source-id: b5095d3509098ff74f836c5d0c272db0b2d226aa --- BUILD.bazel | 5 --- aten/src/ATen/{templates => }/TypeDefault.h | 6 --- aten/src/ATen/templates/Functions.cpp | 2 - aten/src/ATen/templates/TypeDerived.cpp | 2 - aten/src/ATen/templates/TypeDerived.h | 37 ----------------- tools/codegen/api/types.py | 8 +--- tools/codegen/gen.py | 45 ++++++--------------- 7 files changed, 14 insertions(+), 91 deletions(-) rename aten/src/ATen/{templates => }/TypeDefault.h (86%) delete mode 100644 aten/src/ATen/templates/TypeDerived.h diff --git a/BUILD.bazel b/BUILD.bazel index 9eced9b2c563..4ec99d770f70 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -126,18 +126,13 @@ genrule( outs = [ "aten/src/ATen/Declarations.yaml", "aten/src/ATen/BackendSelectRegister.cpp", - "aten/src/ATen/CPUType.h", "aten/src/ATen/CPUType.cpp", "aten/src/ATen/Functions.h", "aten/src/ATen/Functions.cpp", "aten/src/ATen/NativeFunctions.h", - "aten/src/ATen/MkldnnCPUType.h", "aten/src/ATen/MkldnnCPUType.cpp", - "aten/src/ATen/QuantizedCPUType.h", "aten/src/ATen/QuantizedCPUType.cpp", - "aten/src/ATen/SparseCPUType.h", "aten/src/ATen/SparseCPUType.cpp", - "aten/src/ATen/TypeDefault.h", "aten/src/ATen/TypeDefault.cpp", "aten/src/ATen/core/TensorBody.h", "aten/src/ATen/core/TensorMethods.cpp", diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/TypeDefault.h similarity index 86% rename from aten/src/ATen/templates/TypeDefault.h rename to aten/src/ATen/TypeDefault.h index fb62c7ba6354..7b5d77ba4d22 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/TypeDefault.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include #include #include @@ -29,8 +27,4 @@ struct Quantizer; // to frontend using ConstQuantizerPtr = const c10::intrusive_ptr&; -namespace TypeDefault { - ${type_method_declarations} -} // namespace TypeDefault - } // namespace at diff --git a/aten/src/ATen/templates/Functions.cpp b/aten/src/ATen/templates/Functions.cpp index 7d5cdf18cde8..589121af07ef 100644 --- a/aten/src/ATen/templates/Functions.cpp +++ b/aten/src/ATen/templates/Functions.cpp @@ -3,8 +3,6 @@ #include #include -#include -#include #include namespace at { diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index df4b83c40580..3275ab76ef62 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -5,8 +5,6 @@ #define __STDC_FORMAT_MACROS #endif -#include - // ${generated_comment} #include diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h deleted file mode 100644 index 165787cc2ac5..000000000000 --- a/aten/src/ATen/templates/TypeDerived.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -// ${generated_comment} - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace c10 { -struct Storage; -} - -namespace at { - -class Tensor; -using TensorList = ArrayRef; - -class Context; -struct Generator; - -struct Quantizer; -// This is temporary typedef to enable Quantizer in aten native function API -// we'll remove them when we are actually exposing Quantizer class -// to frontend -using ConstQuantizerPtr = const c10::intrusive_ptr&; - -namespace ${Type} { - ${type_derived_method_declarations} -} - -} // namespace at diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index e495244a183e..433f3cf5fd67 100644 --- 
a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -341,12 +341,8 @@ class NativeExpr: class NativeArgument: type: str name: str - # Native function arguments have defaults for some reasons (e.g., - # the function prototypes in CPUType.h are defaulted). There isn't - # really any good reason to do this, as these functions are only - # ever called from a context where all defaulted arguments are - # guaranteed to be given explicitly. - # TODO: Remove this + # Native function arguments have defaults to make it a little + # easier to call them directly to bypass dispatch. default: Optional[str] argument: Union[Argument, TensorOptionsArguments] diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index d59e16a2aa0e..134f7163518b 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -158,11 +158,10 @@ def cpp_string(s: str) -> str: # Dispatch keywords in native_functions.yaml that support all backends. KEYWORD_ALL_BACKENDS = ('DefaultBackend', 'Math') -# Generates {dispatch}Type.cpp and {dispatch}Type.h (e.g., CPUType.cpp -# and CPUType.h). This function is also reused to implement per-operator -# registration. It also generates TypeDefault.cpp and TypeDefault.h when -# dispatch target is for all backends (dispatch is None or dispatch in -# KEYWORD_ALL_BACKENDS). +# Generates {dispatch}Type.cpp (e.g., CPUType.cpp). This function is also +# reused to implement per-operator registration. It also generates +# TypeDefault.cpp when dispatch target is for all backends (dispatch is None or +# dispatch in KEYWORD_ALL_BACKENDS). # # {dispatch}Type.cpp # - The primary function of this file is to register all of the @@ -179,16 +178,13 @@ def cpp_string(s: str) -> str: # (as would be the case if you directly registered native:: # functions). # -# {dispatch}Type.h -# - In principle, this file shouldn't exist at all; historically, -# it existed so that we could directly access these functions -# outside of the registration API for the implementation of -# static dispatch. Should be deleted now! -# # This function is also used for a secondary purpose: the registration # logic is also reused to implement per-operator registration. def compute_type_method( dispatch: Optional[str], *, + # TODO: Give more precise type Union[Literal[Target.DEFINITION, + # Target.REGISTRATION]]; requires Literal from typing_extensions + # which we don't have a dep for yet. target: Target, # Selector object to determine which operators to generate # registration code for. 
@@ -196,10 +192,13 @@ def compute_type_method( ) -> Callable[[NativeFunction], Optional[str]]: if dispatch is None: - assert target == Target.REGISTRATION + assert target is Target.REGISTRATION @with_native_function def func(f: NativeFunction) -> Optional[str]: + # Has to be here as mypy won't transfer asserts into closures + assert target is not Target.DECLARATION + if dispatch is not None: if dispatch not in f.dispatch: return None @@ -214,10 +213,7 @@ def func(f: NativeFunction) -> Optional[str]: args_str = ', '.join(map(str, args)) dispatch_to_all_backends = dispatch is not None and dispatch in KEYWORD_ALL_BACKENDS - if target is Target.DECLARATION: - assert dispatch is not None - return f"{returns_type} {name}({args_str});" - elif target is Target.DEFINITION: + if target is Target.DEFINITION: assert dispatch is not None impl_name = f"at::native::{f.dispatch[dispatch]}" @@ -1010,13 +1006,6 @@ def make_file_manager(install_dir: str) -> FileManager: fm = cuda_fm if 'CUDA' in dispatch else cpu_fm - fm.write_with_template(f'{dispatch}Type.h', h_template, lambda: { - 'Type': f'{dispatch}Type', - 'type_derived_method_declarations': list(mapMaybe( - compute_type_method(dispatch, target=Target.DECLARATION, selector=selector), - native_functions - )), - }) fm.write_with_template(f'{dispatch}Type.cpp', cpp_template, lambda: { 'Type': f'{dispatch}Type', 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', @@ -1037,16 +1026,6 @@ def make_file_manager(install_dir: str) -> FileManager: }) del fm - cpu_fm.write('TypeDefault.h', lambda: { - 'type_method_declarations': - list(mapMaybe( - compute_type_method('Math', target=Target.DECLARATION, selector=selector), - native_functions)) + - list(mapMaybe( - compute_type_method('DefaultBackend', target=Target.DECLARATION, selector=selector), - native_functions)), - }) - schema_selector = selector if options.force_schema_registration: schema_selector = SelectiveBuilder.get_nop_selector() From a4caa3f5967259ecb544a53e33c6e236305ed1ef Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 29 Oct 2020 14:50:11 -0700 Subject: [PATCH 130/169] [ONNX] bump CI ort to 1.5.2 rel for stability (#46595) Summary: Recently the ort-nightly has become unstable and causing issues with CI tests. Switching to release package for now for stability, until the situation is improved. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46595 Reviewed By: houseroad Differential Revision: D24566175 Pulled By: bzinodev fbshipit-source-id: dcf36e976daeeb17465df88f28bc9673eebbb7b7 --- .jenkins/caffe2/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 03583f3805c7..1b26a5980d05 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # default pip version is too old(9.0.2), unable to support tag `manylinux2010`. # Fix the pip error: Couldn't find a version that satisfies the requirement pip install --upgrade pip - pip install -q --user ort-nightly==1.5.0.dev202009182 + pip install -q --user onnxruntime==1.5.2 fi "$ROOT_DIR/scripts/onnx/test.sh" fi From 78de12f58854237dc5987e10bb4ab4fb33b2d01b Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Thu, 29 Oct 2020 15:18:05 -0700 Subject: [PATCH 131/169] Replace -f with -x for pytest tests. 
(#46967) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46967 Tests under `tests/distributed/_pipeline/sync` use pytest and specifying the `-f` option for such tests as follows: `python test/run_test.py -i distributed/_pipeline/sync/skip/test_api -- -f` doesn't work. The equivalent option for pytest is `-x`. To resolve this issue, I've updated `run_test.py` to replace `-f` with `-x` for pytest tests. More details in https://github.com/pytorch/pytorch/issues/46782 #Closes: https://github.com/pytorch/pytorch/issues/46782 ghstack-source-id: 115440558 Test Plan: 1) waitforbuildbot 2) `python test/run_test.py -i distributed/_pipeline/sync/skip/test_api -- -f` Reviewed By: malfet Differential Revision: D24584556 fbshipit-source-id: bd87f5b4953504e5659fe72fc8615e126e5490ff --- test/run_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index ad4603e809f2..0fa84c00044c 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -313,6 +313,11 @@ def run_test(test_module, test_directory, options, launcher_cmd=None, extra_unit if extra_unittest_args: assert isinstance(extra_unittest_args, list) unittest_args.extend(extra_unittest_args) + + # If using pytest, replace -f with equivalent -x + if options.pytest: + unittest_args = [arg if arg != '-f' else '-x' for arg in unittest_args] + # Can't call `python -m unittest test_*` here because it doesn't run code # in `if __name__ == '__main__': `. So call `python test_*.py` instead. argv = [test_module + '.py'] + unittest_args From e17b8dea1dd30bef55b314b0217f79ce22a13cf9 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Thu, 29 Oct 2020 15:22:29 -0700 Subject: [PATCH 132/169] fix calculation of number of elements to not overflow (#46997) Summary: Possibly fixes https://github.com/pytorch/pytorch/issues/46764. Computing number of tensor elements in many cases is written as ``` int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); ``` This computes the product with the type of `1` literal, which is `int`. When there's more than INT_MAX elements, result overflows. In https://github.com/pytorch/pytorch/issues/46746, the tensor that was sent to reshape had 256^4 elements, and that was computed as `0`, so reshape was not done correctly. I've audited usages of std::accumulate and changed them to use int64_t as `init` type. 
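For illustration only — this standalone snippet is not part of the diff below, and the shape values and variable names are made up — here is a minimal sketch of the failure mode and of the `int64_t`-init fix that the new `prod_intlist` helper in `Utils.h` applies:
```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // 256^4 = 2^32 elements, as in the reshape repro above.
  std::vector<int64_t> shape = {256, 256, 256, 256};

  // Problematic pattern: the init literal `1` is an `int`, so std::accumulate
  // keeps its running product in an `int`; the last step truncates and the
  // result typically comes out as 0 instead of 4294967296.
  int64_t bad = std::accumulate(
      shape.begin(), shape.end(), 1, std::multiplies<int64_t>());

  // Fixed pattern: make the init an int64_t so the accumulator stays 64-bit.
  int64_t good = std::accumulate(
      shape.begin(), shape.end(), static_cast<int64_t>(1),
      std::multiplies<int64_t>());

  std::cout << bad << " vs " << good << std::endl;  // typically "0 vs 4294967296"
  return 0;
}
```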
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46997 Reviewed By: albanD Differential Revision: D24624654 Pulled By: ngimel fbshipit-source-id: 3d9c5e6355531a9df6b10500eec140e020aac77e --- aten/src/ATen/BatchedFallback.cpp | 12 +-- aten/src/ATen/TensorUtils.cpp | 3 +- aten/src/ATen/Utils.h | 12 ++- aten/src/ATen/native/Distance.cpp | 10 ++- aten/src/ATen/native/LinearAlgebra.cpp | 16 ++-- .../ATen/native/NaiveDilatedConvolution.cpp | 7 +- aten/src/ATen/native/TensorShape.cpp | 4 +- .../native/cuda/NaiveDilatedConvolution.cu | 6 +- aten/src/ATen/native/cuda/ScanKernels.cu | 80 ++++++++++--------- aten/src/ATen/native/group_norm.cpp | 7 +- aten/src/ATen/native/layer_norm.h | 14 +--- aten/src/ATen/native/metal/MetalTensor.mm | 2 +- .../native/quantized/cpu/qnormalization.cpp | 7 +- aten/src/ATen/native/vulkan/Vulkan.cpp | 10 +-- aten/src/ATen/native/vulkan/VulkanOps.cpp | 7 +- aten/src/ATen/native/vulkan/ops/Tensor.cpp | 6 +- 16 files changed, 94 insertions(+), 109 deletions(-) diff --git a/aten/src/ATen/BatchedFallback.cpp b/aten/src/ATen/BatchedFallback.cpp index 396acc9e0403..9b39006b106e 100644 --- a/aten/src/ATen/BatchedFallback.cpp +++ b/aten/src/ATen/BatchedFallback.cpp @@ -156,11 +156,7 @@ void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, torch::j auto first_physical_view_sizes = input_physical_views.front().tensor().sizes(); auto batch_sizes = ArrayRef( first_physical_view_sizes.begin(), first_physical_view_sizes.begin() + num_batch_dims); - auto num_batches = std::accumulate( - batch_sizes.begin(), - batch_sizes.end(), - 1, - std::multiplies()); + const auto num_batches = prod_intlist(batch_sizes); // Without a shape-checking API, we're unable to compute the correct shape of // the output so we just error out. TORCH_CHECK(num_batches > 0, @@ -293,11 +289,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta auto num_batch_dims = input_physical_views.front().numBatchDims(); auto some_sizes = input_physical_views.front().tensor().sizes(); auto batch_sizes = ArrayRef(some_sizes.begin(), some_sizes.begin() + num_batch_dims); - auto num_batches = std::accumulate( - batch_sizes.begin(), - batch_sizes.end(), - 1, - std::multiplies()); + const auto num_batches = prod_intlist(batch_sizes); // Without a shape-checking API, we're unable to compute the correct shape of // the output so we just error out. TORCH_CHECK(num_batches > 0, diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 626e0c73e45e..08588f6a8cdd 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -335,8 +335,7 @@ c10::optional> computeStride( // we use the stride as if it were computed via resize. // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. 
- int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, - std::multiplies()); + const int64_t numel = prod_intlist(oldshape); if (numel == 0 && oldshape.equals(newshape)) { return oldstride.vec(); } diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 87446df09487..4fe4b632362b 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -93,10 +93,18 @@ inline int64_t sum_intlist(ArrayRef list) { return std::accumulate(list.begin(), list.end(), 0ll); } -inline int64_t prod_intlist(ArrayRef list) { - return std::accumulate(list.begin(), list.end(), 1ll, std::multiplies()); +//std::accumulate infers return type from `init` type, so if `init` type is not enough to hold the result, computation can overflow +//the next 2 functions set `init` type to int64_t to avoid overflow. +template::value, int>::type = 0> +inline int64_t prod_intlist(const C &container){ + return std::accumulate(container.begin(), container.end(), static_cast(1), std::multiplies()); } +template::value_type>::value, int>::type = 0> +inline int64_t prod_intlist(Iter begin, Iter end){ + return std::accumulate(begin, end, static_cast(1), std::multiplies()); +} /** * Utility function to static cast input Generator* to * the backend generator type (CPU/CUDAGeneratorImpl etc.) diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index b2b760513a1d..91d804687290 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -27,7 +27,7 @@ Tensor pdist(const Tensor& self, const double p) { Tensor _euclidean_dist(const Tensor& x1, const Tensor& x2) { /** This function does the fist part of the euclidean distance calculation - * We divide it in two steps to simplify dealing with subgradients in the + * We divide it in two steps to simplify dealing with subgradients in the * backward step */ Tensor x1_norm = x1.pow(2).sum(-1, true); Tensor x1_pad = at::ones_like(x1_norm, LEGACY_CONTIGUOUS_MEMORY_FORMAT); @@ -74,7 +74,7 @@ static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10 std::vector tensor2_expand_size(expand_batch_portion); tensor2_expand_size.insert(tensor2_expand_size.end(), {r2, c2}); - int expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(), 1, std::multiplies()); + const int64_t expand_batch_product = prod_intlist(expand_batch_portion); std::vector tensor1_view{expand_batch_product, r1, c1}; std::vector tensor2_view{expand_batch_product, r2, c2}; @@ -147,8 +147,10 @@ Tensor _cdist_backward(const Tensor& grad, const Tensor& x1, const Tensor& x2, c auto device2 = x2.device().type(); TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2); IntArrayRef batch_tensor1(x1.sizes().data(), std::max(x1.dim() - 2, 0)); - int batch_product = std::accumulate(batch_tensor1.begin(), batch_tensor1.end(), 1, std::multiplies()); - Tensor grad_x1 = at::empty_like(x1, x1.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT).view({batch_product, n, m}); + const int64_t batch_product = prod_intlist(batch_tensor1); + Tensor grad_x1 = + at::empty_like(x1, x1.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT) + .view({batch_product, n, m}); cdist_backward_stub(device1, grad_x1, grad, x1, x2, p, cdist); return grad_x1; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 853d913e9336..8796657dc293 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ 
-675,8 +675,8 @@ Tensor matmul( std::vector tensor2_expand_size(expand_batch_portion); tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); - int expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(), - 1, std::multiplies()); + const int64_t expand_batch_product = + prod_intlist(expand_batch_portion); std::vector tensor1_bmm_view({expand_batch_product}); tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); @@ -742,7 +742,7 @@ Tensor _allocate_buffer(const Tensor& a, int n_copies, bool is_zero = false) { {n_copies, a.size(0), a.size(1), a.size(2)}, a.options().memory_format(at::MemoryFormat::Contiguous) ); - + if (is_zero) { res.zero_(); } @@ -850,7 +850,7 @@ Tensor compute_T4(const Tensor& A) { auto As = _allocate_buffer(A, 4); // 3 for {I, A, A^2} _fill_matrix_powers(As, A, 3); - + at::native::matmul( // output for A^2 * (I / 2 + A / 6 + A^2 / 24) As.select(0, 3), @@ -1101,7 +1101,7 @@ Tensor mexp_impl( if (!compute_highest_degree_approx) { constexpr std::array< Tensor(*)(const Tensor&), - total_n_degs - 1> + total_n_degs - 1> compute_Ts = { compute_T1, compute_T2, compute_T4, compute_T8, compute_T12 @@ -1192,7 +1192,7 @@ Tensor mexp(const Tensor& a, bool compute_highest_degree_approx = false) { // Based on: // -// Mathias, Roy. +// Mathias, Roy. // A Chain Rule for Matrix Functions and Applications. // SIAM J. Matrix Anal. Appl. 17 (1996): 610-620. // @@ -1227,8 +1227,8 @@ Tensor backward_analytic_function_of_a_matrix( // Mathematics 2019, 7, 1174. // Tensor matrix_exp(const Tensor& a) { - TORCH_CHECK(a.dim() >= 2 - && (at::isFloatingType(a.scalar_type()) + TORCH_CHECK(a.dim() >= 2 + && (at::isFloatingType(a.scalar_type()) || at::isComplexType(a.scalar_type())), "matrix_exp(", a.scalar_type(), "{", a.sizes(), "}): expected a tensor " "of floating or complex types with dim at least 2"); diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index 459dd857727f..e80b0c546362 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -181,10 +182,8 @@ void slow_conv_dilated_all_cpu_template( // Temporary buffer: Tensor columns = at::empty({0}, options); if (output.defined() || grad_weight.defined() || grad_input.defined()) { - int64_t m = std::accumulate( - kernel_size.begin(), kernel_size.end(), 1, std::multiplies()); - int64_t n = std::accumulate( - output_size.begin(), output_size.end(), 1, std::multiplies()); + const int64_t m = prod_intlist(kernel_size); + const int64_t n = prod_intlist(output_size); columns.resize_({nInputPlane * m, n}); } // Initialize diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index eec16a4cf824..14fb67e5d4ba 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -368,7 +368,7 @@ static Tensor cat_sparse(TensorList tensors, int64_t dim) { // The dimension in each tensor's values object that corresponds to the overall dimension along which we're catting. int64_t values_dim = wrapped - sparse_dim + 1; // The final size along the catted dimension. 
- int64_t total_size = std::accumulate(tensors.begin(), tensors.end(), 0, [values_dim](int64_t l, Tensor const &r) { + const int64_t total_size = std::accumulate(tensors.begin(), tensors.end(), static_cast(0), [values_dim](int64_t l, Tensor const &r) { return l + r._values().size(values_dim); }); auto zeros_sizes = tensors[0]._values().sizes().vec(); @@ -1675,7 +1675,7 @@ Tensor unflatten(const Tensor& self, int64_t dim, IntArrayRef sizes, c10::option TORCH_CHECK(sizes.size() > 0, "unflatten: sizes must be non-empty"); TORCH_INTERNAL_ASSERT(!names || names->size() == sizes.size()); - auto numel = std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies()); + const int64_t numel = prod_intlist(sizes); if (self.has_names()) { TORCH_CHECK(numel == self.size(dim), "unflatten: Provided sizes ", sizes, " don't multiply up to the size of dim ", diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index 2ad6f0785a17..522e3bbd8760 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -188,10 +188,8 @@ void slow_conv_dilated_all_cuda_template( int64_t nInputPlane = weight.size(1); int64_t nOutputPlane = weight.size(0); // Temporary buffers: - int64_t m = std::accumulate( - kernel_size.begin(), kernel_size.end(), 1, std::multiplies()); - int64_t output_vsize = std::accumulate( - output_size.begin(), output_size.end(), 1, std::multiplies()); + const int64_t m = prod_intlist(kernel_size); + const int64_t output_vsize = prod_intlist(output_size); Tensor columns = at::empty({0}, options); if (output.defined() || grad_weight.defined() || grad_input.defined()) { columns.resize_({nInputPlane * m, output_vsize}); diff --git a/aten/src/ATen/native/cuda/ScanKernels.cu b/aten/src/ATen/native/cuda/ScanKernels.cu index 6bc2c381e1db..b0dc71c568ba 100644 --- a/aten/src/ATen/native/cuda/ScanKernels.cu +++ b/aten/src/ATen/native/cuda/ScanKernels.cu @@ -128,16 +128,16 @@ __global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *se */ template __global__ void tensor_kernel_scan_outer_dim_with_indices(scalar_t *self_, scalar_t *values_, int64_t *indices_, - int num_orows, int num_irows, int row_size, scalar_t init, BinaryFunction binary_op) { - for (int orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { - for (int irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { scalar_t *self = self_ + orow * row_size * num_irows + irow; scalar_t *values = values_ + orow * row_size * num_irows + irow; int64_t *indices = indices_ + orow * row_size * num_irows + irow; scalar_t out = init; int64_t out_idx = 0; - for (int64_t col = 0; col < row_size; ++col) { + for (auto col = decltype(row_size){0}; col < row_size; ++col) { if(THCNumerics::isnan(*self) || (!THCNumerics::isnan(out) && binary_op(*self, out))) { out = *self; out_idx = col; @@ -152,21 +152,34 @@ __global__ void tensor_kernel_scan_outer_dim_with_indices(scalar_t *self_, scala } } +void check_fits_in_unsigned(int64_t val, const char* name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK( + val >= 0 && val <= umax, name, " must fit in a 
32-bit uint32_t value"); +} + + template __host__ void scan_outer_dim_with_indices(const Tensor& self, Tensor& values, Tensor& indices, int dim, scalar_t init, BinaryFunction binary_op) { - int row_size = self.size(dim); + int64_t row_size = self.size(dim); auto sizes = self.sizes(); // Treat all outer dimensions (i.e. dim_ < dim) as one. - int num_orows = std::accumulate(sizes.begin(), sizes.begin() + dim, 1, std::multiplies()); + const int64_t num_orows = prod_intlist(sizes.begin(), sizes.begin() + dim); // Treat all inner dimensions (i.e. dim > dimension) as one. - int num_irows = std::accumulate(sizes.begin() + dim + 1, sizes.end(), 1, std::multiplies()); + const int64_t num_irows = prod_intlist(sizes.begin() + dim + 1, sizes.end()); + //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row, + //make sure that input is not bigger than supported by uint32_t + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + dim3 threads(std::min(512, int(num_irows))); - int maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int(threads.x)))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); tensor_kernel_scan_outer_dim_with_indices<<>>( self.data_ptr(), values.data_ptr(), indices.data_ptr(), num_orows, num_irows, row_size, init, binary_op); @@ -254,16 +267,16 @@ void cummin_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int */ template __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, scalar_t *src_, - unsigned num_orows, unsigned num_irows, unsigned row_size, - scalar_t init, BinaryOp binary_op) + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, + const scalar_t init, BinaryOp binary_op) { - for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { - for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { scalar_t *src = src_ + orow * row_size * num_irows + irow; scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; scalar_t acc = init; - for (unsigned col = 0; col < row_size; ++col) { + for (uint32_t col = 0; col < row_size; ++col) { acc = binary_op(acc, *src); *tgt = acc; @@ -286,12 +299,12 @@ __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, scalar_t *src_, */ template __device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, T *src_, - unsigned num_rows, unsigned row_size, + const uint32_t num_rows, const uint32_t row_size, T init, BinaryFunction binary_op){ - for (unsigned block_row = blockIdx.x * blockDim.y; + for (uint32_t block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) { - unsigned row = block_row + threadIdx.y; + uint32_t row = block_row + threadIdx.y; T block_total = init; T *row_src = src_ + row * row_size; @@ -299,10 +312,10 @@ __device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, T *sr // Perform scan on one block at a time, keeping track of the total value of // all blocks processed so far. 
- for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + for (uint32_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { // Load data into shared memory (two values per thread). - unsigned col1 = block_col + threadIdx.x; - unsigned col2 = block_col + num_threads_x + threadIdx.x; + uint32_t col1 = block_col + threadIdx.x; + uint32_t col2 = block_col + num_threads_x + threadIdx.x; if (row < num_rows) { if (col1 < row_size) { row_buf[threadIdx.x] = row_src[col1]; @@ -324,18 +337,18 @@ __device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, T *sr __syncthreads(); // Parallel reduction (up-sweep). - for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) { + for (uint32_t s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) { if (row < num_rows && threadIdx.x < s) { - unsigned offset = (2 * threadIdx.x + 1) * d - 1; + uint32_t offset = (2 * threadIdx.x + 1) * d - 1; row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); } __syncthreads(); } // Down-sweep. - for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) { + for (uint32_t s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) { if (row < num_rows && threadIdx.x < s - 1) { - unsigned offset = 2 * (threadIdx.x + 1) * d - 1; + uint32_t offset = 2 * (threadIdx.x + 1) * d - 1; row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); } __syncthreads(); @@ -361,8 +374,8 @@ __global__ typename std::enable_if::value, void>::type tensor_kernel_scan_innermost_dim( T* tgt_, T* src_, - unsigned num_rows, - unsigned row_size, + const uint32_t num_rows, + const uint32_t row_size, T init, BinaryFunction binary_op) { __shared__ T sbuf[num_threads_y][2 * num_threads_x]; @@ -381,8 +394,8 @@ __global__ typename std::enable_if::value, void>::type tensor_kernel_scan_innermost_dim( T* tgt_, T* src_, - unsigned num_rows, - unsigned row_size, + const uint32_t num_rows, + const uint32_t row_size, T init, BinaryFunction binary_op) { // As we cannot directly initialize shared array for complex types @@ -399,23 +412,18 @@ tensor_kernel_scan_innermost_dim( row_buf, tgt_, src_, num_rows, row_size, init, binary_op); } -void check_fits_in_unsigned(int64_t val, const char* name) { - constexpr auto umax = std::numeric_limits::max(); - TORCH_CHECK( - val >= 0 && val <= umax, name, " must fit in a 32-bit unsigned value"); -} template __host__ void scan_outer_dim(const Tensor& self, Tensor& result, int dim, scalar_t init, BinaryFunction binary_op) { - int64_t row_size = self.size(dim); + const int64_t row_size = self.size(dim); auto sizes = self.sizes(); // Treat all outer dimensions (i.e. dim_ < dim) as one. - int64_t num_orows = std::accumulate(sizes.begin(), sizes.begin() + dim, 1, std::multiplies()); + const int64_t num_orows = prod_intlist(sizes.begin(), sizes.begin() + dim); // Treat all inner dimensions (i.e. dim > dimension) as one. 
- int64_t num_irows = std::accumulate(sizes.begin() + dim + 1, sizes.end(), 1, std::multiplies()); + const int64_t num_irows = prod_intlist(sizes.begin() + dim + 1, sizes.end()); dim3 threads(std::min(512, int(num_irows))); int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 759167095ae3..3bd4daac917b 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -106,11 +106,8 @@ Tensor group_norm( input.sizes()); const auto input_shape = input.sizes(); - const int64_t HxW = std::accumulate( - input_shape.cbegin() + 2, - input_shape.cend(), - 1LL, - std::multiplies()); + const int64_t HxW = + prod_intlist(input_shape.cbegin() + 2, input_shape.cend()); const Tensor kEmpty; const auto& X = input.is_contiguous() ? input : input.contiguous(); diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index bf931fb26c5f..fa936ab7d4ce 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -52,16 +52,10 @@ std::tuple _prepare_layer_norm_inputs( } const int axis = input_ndim - normalized_ndim; - const int64_t M = std::accumulate( - input_shape.cbegin(), - input_shape.cbegin() + axis, - 1LL, - std::multiplies()); - const int64_t N = std::accumulate( - input_shape.cbegin() + axis, - input_shape.cend(), - 1LL, - std::multiplies()); + const int64_t M = + prod_intlist(input_shape.cbegin(), input_shape.cbegin() + axis); + const int64_t N = + prod_intlist(input_shape.cbegin() + axis, input_shape.cend()); const auto& X = input.is_contiguous() ? input : input.contiguous(); const auto& gamma = weight.is_contiguous() ? weight : weight.contiguous(); diff --git a/aten/src/ATen/native/metal/MetalTensor.mm b/aten/src/ATen/native/metal/MetalTensor.mm index 6dfe3932bf16..b1fc38d92a6b 100644 --- a/aten/src/ATen/native/metal/MetalTensor.mm +++ b/aten/src/ATen/native/metal/MetalTensor.mm @@ -17,7 +17,7 @@ class API_AVAILABLE(ios(10.0), macos(10.13)) MetalTensor::Impl { _numel(std::accumulate( std::begin(_sizes), std::end(_sizes), - 1, + (int64_t)1, std::multiplies())), _textureImpl(std::make_unique(sizes)) {} diff --git a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp index 6ed193cd82c9..c8bbe9d29b24 100644 --- a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp @@ -71,11 +71,8 @@ Tensor quantized_group_norm_impl( const int64_t batches = input_shape[0]; const int64_t num_channels = input_shape[1]; - const int64_t elements_per_batch = std::accumulate( - input_shape.cbegin() + 1, - input_shape.cend(), - 1LL, - std::multiplies()); + const int64_t elements_per_batch = + prod_intlist(input_shape.cbegin() + 1, input_shape.cend()); const int64_t M = batches * num_groups; const int64_t N = elements_per_batch / num_groups; diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp index d6fa6a32291b..d58d7d7dcc09 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.cpp +++ b/aten/src/ATen/native/vulkan/Vulkan.cpp @@ -7,6 +7,7 @@ #include #include +#include #ifdef USE_VULKAN_WRAPPER #include @@ -1182,11 +1183,7 @@ class VulkanTensor::Impl final { explicit Impl(std::vector sizes) : sizes_(std::move(sizes)), strides_(std::vector(sizes_.size())), - numel_(std::accumulate( - std::begin(sizes_), - std::end(sizes_), - 1, - std::multiplies())) { + numel_(prod_intlist(sizes_)) 
{ TORCH_CHECK( initVulkanContextOnce(), "Vulkan Failed to create Vulkan Context"); } @@ -1289,8 +1286,7 @@ class VulkanTensor::Impl final { VkDeviceSize buffer_size_for_sizes(std::vector sizes) const { const auto d = sizes.size(); - const auto numel = std::accumulate( - std::begin(sizes), std::end(sizes), 1, std::multiplies()); + const auto numel = prod_intlist(sizes); VkDeviceSize bufferSize{sizeof(float) * numel}; // alignment to be able to copy between image and buffer if (d == 4) { diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index 2ad3a695d65b..0e13dce41a7c 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -553,8 +554,7 @@ void add( void add(VulkanTensor& output, const VulkanTensor& input, const float s) { const auto sizes = input.sizes(); - const auto C = std::accumulate( - sizes.cbegin(), sizes.cend() - 2, 1, std::multiplies()); + const auto C = prod_intlist(sizes.cbegin(), sizes.cend() - 2); const auto C_4 = UP_DIV(C, 4); const auto H = sizes[2]; const auto W = sizes[3]; @@ -605,8 +605,7 @@ void add(VulkanTensor& output, const VulkanTensor& input, const float s) { void mul(VulkanTensor& output, const VulkanTensor& input, const float s) { const auto sizes = input.sizes(); - const auto C = std::accumulate( - sizes.cbegin(), sizes.cend() - 2, 1, std::multiplies()); + const auto C = prod_intlist(sizes.cbegin(), sizes.cend() - 2); const auto C_4 = UP_DIV(C, 4); const auto H = sizes[2]; const auto W = sizes[3]; diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp index a51fd972d19a..a5baf716069f 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -22,11 +22,7 @@ VkDeviceSize bytes( size *= extents.width * extents.height * (4u * extents.depth); } else { - size = std::accumulate( - sizes.cbegin(), - sizes.cend(), - size, - std::multiplies()); + size *= prod_intlist(sizes); } return size; From 3c643d112edf627a477f565f75cacb6c5db9ab02 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 29 Oct 2020 15:30:46 -0700 Subject: [PATCH 133/169] Pin destination memory for cuda_tensor.to("cpu", non_blocking=True) (#46878) Summary: Fixes https://github.com/pytorch/pytorch/issues/39694. [`torch.cuda._sleep(int(100 * get_cycles_per_ms()))`](https://github.com/pytorch/pytorch/pull/46878/files#diff-893b1eea27352f336f4cd832919e48d721e4e90186e63400b8596db6b82e7450R511-R513) in the test helps avoid flakiness noted by ngimel (https://github.com/pytorch/pytorch/pull/35144#issuecomment-602103631). 
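As a usage illustration at the ATen C++ level (a hypothetical sketch, not taken from this patch; it assumes a CUDA build with at least one visible GPU, and the tensor size is arbitrary), mirroring what the new `test_to_non_blocking` checks from Python:

```
#include <ATen/ATen.h>
#include <c10/cuda/CUDAStream.h>

int main() {
  // Device-to-host copy requested as non-blocking.
  at::Tensor src = at::randn({1000000}, at::kCUDA);
  at::Tensor dst = src.to(src.options().device(at::kCPU),
                          /*non_blocking=*/true);

  // With this change the CPU destination is allocated in pinned memory,
  // so the copy can actually run asynchronously; the caller must still
  // synchronize before reading the result on the host.
  c10::cuda::getCurrentCUDAStream().synchronize();

  return dst.is_pinned() ? 0 : 1;  // expected to be pinned after this change
}
```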
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46878 Reviewed By: izdeby Differential Revision: D24550403 Pulled By: xw285cornell fbshipit-source-id: 1ecc35ef75f9a38ab332aacdf4835955105edafc --- aten/src/ATen/native/TensorConversions.cpp | 11 ++++-- test/test_cuda.py | 39 ++++++++++++++-------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 58df4cf110f7..63e272de57fc 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -29,10 +29,15 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b return self; } + bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() && + (options.layout() == c10::kStrided)); + if (memory_format == MemoryFormat::Preserve) { if (self.is_non_overlapping_and_dense()) { // Copy all strides - auto r = at::empty_strided(self.sizes(), self.strides(), options.memory_format(c10::nullopt)); + auto r = at::empty_strided(self.sizes(), + self.strides(), + options.memory_format(c10::nullopt).pinned_memory(pin_out)); r.copy_(self, non_blocking); return r; } else { @@ -40,7 +45,9 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b } } // See Note [Explicit nullopt MemoryFormat argument] - auto r = at::empty(self.sizes(), options.memory_format(memory_format), c10::nullopt); + auto r = at::empty(self.sizes(), + options.memory_format(memory_format).pinned_memory(pin_out), + c10::nullopt); r.copy_(self, non_blocking); return r; } diff --git a/test/test_cuda.py b/test/test_cuda.py index d20af4082d03..49c7848c9122 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -504,22 +504,35 @@ def _test_copy_non_blocking(a, b): y = torch.ones(10000000, dtype=torch.uint8).cuda() _test_copy_non_blocking(x, y) - @unittest.skip("skipped because test could be flaky, see #35144") def test_to_non_blocking(self): - def _test_to_non_blocking(a, non_blocking): - stream = torch.cuda.current_stream() - with torch.cuda.stream(stream): - b = a.to('cuda', non_blocking=non_blocking) - self.assertEqual(stream.query(), not non_blocking) - stream.synchronize() - self.assertEqual(a, b) + stream = torch.cuda.current_stream() - # 10MB copies - x = torch.ones(10000000, dtype=torch.uint8) - _test_to_non_blocking(x, True) + def _test_to_non_blocking(a, non_blocking, dst): + torch.cuda.synchronize() + # Pushes an 0.1 second spin to stream so if the copy is non blocking, + # stream will almost surely be active when we query(). + torch.cuda._sleep(int(100 * get_cycles_per_ms())) + b = a.to(device=dst, non_blocking=non_blocking) + self.assertEqual(stream.query(), not non_blocking) + stream.synchronize() + self.assertEqual(a, b) + self.assertTrue(b.is_pinned() == (non_blocking and dst == "cpu")) + + for dst, try_non_blocking in product(("cuda", "cpu"), (True, False)): + # Creates source on the opposite device from destination. 
+ src = torch.randn(1000000, + device="cuda" if dst == "cpu" else "cpu", + pin_memory=True if dst == "cuda" else False) + _test_to_non_blocking(src, try_non_blocking, dst) - y = torch.ones(10000000, dtype=torch.uint8) - _test_to_non_blocking(y, False) + def test_to_cpu_blocking_by_default(self): + src = torch.randn(1000000, device="cuda") + torch.cuda.synchronize() + torch.cuda._sleep(int(100 * get_cycles_per_ms())) + dst = src.to(device="cpu") + self.assertEqual(torch.cuda.current_stream().query(), True) + self.assertEqual(src, dst) + self.assertFalse(dst.is_pinned()) def test_serialization_array_with_storage(self): x = torch.randn(5, 5).cuda() From 71901554085e592208554c681df6aedd4432b81c Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Thu, 29 Oct 2020 16:15:56 -0700 Subject: [PATCH 134/169] [Transposed Conv]add ConvTranspose3d with FBGEMM as backend (#46608) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46608 introduce frontend API for quantized transposed convolution with only FBGEMM as backend. ghstack-source-id: 115289210 Test Plan: https://www.internalfb.com/intern/testinfra/testconsole/testrun/7599824394184104/ Reviewed By: z-a-f Differential Revision: D24369831 fbshipit-source-id: b8babd3ddbe0df8f4c8bc652bb745f85e0813797 --- .../native/quantized/cpu/fbgemm_utils.cpp | 98 +++++++++++- .../ATen/native/quantized/cpu/fbgemm_utils.h | 5 + aten/src/ATen/native/quantized/cpu/qconv.cpp | 5 +- .../native/quantized/cpu/qconv_prepack.cpp | 27 +--- .../native/quantized/cpu/qconv_unpack.cpp | 13 +- aten/src/ATen/native/quantized/library.cpp | 10 ++ test/quantization/test_quantized_op.py | 148 ++++++++++++++++-- torch/nn/quantized/modules/__init__.py | 3 +- torch/nn/quantized/modules/conv.py | 89 ++++++++++- 9 files changed, 350 insertions(+), 48 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 4d95ce4ffb4c..91c895685fd3 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -60,6 +60,30 @@ void CopyToChannelsLast3dTensor( } } +template +void CopyICFirst3dTensorToChannelsLast3dTensor( + int64_t G, + int64_t IC_G, + int64_t OC_G, + int64_t D, + int64_t H, + int64_t W, + const T* src, + T* dst) { + // IC OC/G THW -> G OC/G THW IC/G + const int64_t inner_size = D * H * W; + for (int64_t i = 0; i < G * OC_G; ++i) { + for (int64_t j = 0; j < inner_size; ++j) { + for (int64_t ic = 0; ic < IC_G; ++ic) { + int g = i / OC_G; + int oc = i % OC_G; + dst[(i * inner_size + j) * IC_G + ic] = + src[((g * IC_G + ic) * OC_G + oc) * inner_size + j]; + } + } + } +} + } // namespace template @@ -256,6 +280,75 @@ template fbgemm::conv_param_t<3> MakeFbgemmConvParam<3>( const std::vector& dilations, const std::vector& output_padding, bool transposed); +template <> +Tensor TransposeConvTensorUnpackConversion<3>(const Tensor& src, int groups) { + // OC IC/G DHW -> IC OC/G DHW logically + auto oc_g_ic_g_hw_tensors = src.chunk(groups); + auto fused_tensor = + at::cat(oc_g_ic_g_hw_tensors, 1).set_quantizer_(src.quantizer()); + return fused_tensor.permute({1, 0, 2, 3, 4}); +} + +template <> +Tensor ConvertConvWeightsToChannelLastTensor<2>( + const at::Tensor& src, + int groups, + bool transpose) { + return transpose ? 
+ // 2D conv transpose weight transform + // IC OC/G KH KW -> G OC/G KH KW IC/G + [&]() { + auto ic_g_oc_g_hw_tensors = src.chunk(groups); + for (auto& tensor : ic_g_oc_g_hw_tensors) { + tensor = tensor.unsqueeze(0); + } + auto fused_tensor = + at::cat(ic_g_oc_g_hw_tensors).set_quantizer_(src.quantizer()); + return fused_tensor.permute({0, 2, 3, 4, 1}) + .contiguous(c10::MemoryFormat::Contiguous); + }() + // 2d conv weight transform + : src.contiguous(c10::MemoryFormat::ChannelsLast); +} + +template <> +Tensor ConvertConvWeightsToChannelLastTensor<3>( + const at::Tensor& src, + int groups, + bool transpose) { + if (!transpose) { + return ConvertToChannelsLast3dTensor(src); + } else { + TORCH_CHECK(src.dim() == 5); + Tensor dst; + const int64_t N = src.size(0); + const int64_t IC_G = N / groups; + const int64_t OC_G = src.size(1); + const int64_t D = src.size(2); + const int64_t H = src.size(3); + const int64_t W = src.size(4); + dst = MakeStridedQTensorCPU( + {groups * OC_G, IC_G, D, H, W}, + {D * H * W * IC_G, 1, H * W * IC_G, W * IC_G, IC_G}, + src.options(), + src.quantizer()); + AT_DISPATCH_QINT_TYPES( + src.scalar_type(), "CopyICFirst3dTensorToChannelsLast3dTensor", [&]() { + const Tensor src_contig = src.contiguous(); + CopyICFirst3dTensorToChannelsLast3dTensor( + groups, + IC_G, + OC_G, + D, + H, + W, + src_contig.data_ptr(), + dst.data_ptr()); + }); + return dst; + } +} + } // namespace fbgemm_utils } // namespace native } // namespace at @@ -263,8 +356,9 @@ template fbgemm::conv_param_t<3> MakeFbgemmConvParam<3>( #endif // USE_FBGEMM -template -CAFFE2_API torch::class_> register_conv_params() { + template + CAFFE2_API torch::class_> + register_conv_params() { static auto register_conv_params = torch::class_>( "quantized", "Conv" + c10::to_string(kSpatialDim) + "dPackedParamsBase") diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 40ef0feba61e..0cccf81e35d8 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -295,6 +295,11 @@ Tensor TransposeConvTensorUnpackConversion( const Tensor& src, int groups); +template +Tensor ConvertConvWeightsToChannelLastTensor( + const at::Tensor& src, + int groups, + bool transpose); } // namespace fbgemm_utils } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index e96bd26acaba..05762bfb036f 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -411,7 +411,7 @@ at::Tensor PackedConvWeight::apply_impl( output_shape = MakeDeConvOutputShape( N, M, - {H, W}, + kSpatialDim == 2 ? 
std::vector{H, W} : std::vector{D, H, W}, kernel, stride(), padding(), @@ -886,6 +886,9 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // transpose m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d"), QConv1dInt8::run); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d"), QConvInt8<2, false>::run); + m.impl( + TORCH_SELECTIVE_NAME("quantized::conv_transpose3d"), + QConvInt8<3, false>::run); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index af400558b5fe..c3b20163502d 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -39,9 +39,6 @@ c10::intrusive_ptr> PackedConvWeight< padding.size() == kSpatialDim, "Specify front/top/left padding only. " "end/bottom/right padding assumed to be equal to front/top/left"); - TORCH_CHECK( - !(transpose && kSpatialDim == 3), - "Currently no support for 3d conv_transpose in FBGEM. "); TORCH_CHECK( !transpose || output_padding.size() == kSpatialDim, "quantized::conv_prepack: Specify top/left output padding " @@ -104,26 +101,10 @@ c10::intrusive_ptr> PackedConvWeight< // for both conv and conv transpose // but PyTorch lays them out as {out_c, in_c/groups, kH, kW} // (or for ConvTranspose {in_c, out_c/groups, kH, kW}) - const at::Tensor weight_nhwc = transpose - ? - // check transpose - // 2D conv transpose weight transform - // IC OC/G KH KW -> OC KH KW IC/G - // transpose does not support 3d yet. - [&]() { - auto ic_g_oc_g_hw_tensors = weight.chunk(groups); - auto fused_tensor = - at::cat(ic_g_oc_g_hw_tensors, 1).set_quantizer_(weight.quantizer()); - return fused_tensor.permute({1, 2, 3, 0}) - .contiguous(c10::MemoryFormat::Contiguous); - }() - : (kSpatialDim == 2 - // 2d conv weight transform - ? 
weight.contiguous(c10::MemoryFormat::ChannelsLast) - // 3d conv weight transform - : at::native::fbgemm_utils::ConvertToChannelsLast3dTensor(weight)); + const at::Tensor weight_nhwc = + at::native::fbgemm_utils::ConvertConvWeightsToChannelLastTensor(weight, groups, transpose); const int8_t* weight_data_int8 = - reinterpret_cast(weight_nhwc.data_ptr()); + reinterpret_cast(weight_nhwc.data_ptr()); std::vector col_offsets(output_channels); // compute column offsets (Similar to // fbgemm::col_offsets_with_zero_pt_s8acc32_ref) please note that offsets @@ -444,6 +425,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // ConvTranspose m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_prepack"), TORCH_FN(QConvPackWeightInt8<3>::run_deconv)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { @@ -452,6 +434,7 @@ TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { // ConvTranspose m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose3d_prepack"), TORCH_FN(QConvPackWeightInt8<3>::run_deconv)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp index a908a0b77732..484bfe44fc76 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp @@ -13,10 +13,6 @@ template std::tuple> PackedConvWeight< kSpatialDim>::unpack() { auto* packed_weights_p = w.get(); - TORCH_CHECK( - !(kSpatialDim != 2 && transpose()), - "FBGEMM does not support 3d unpack right " - "now."); // output channels const int output_channels = packed_weights_p->outputChannels(); const int input_channels = packed_weights_p->inputChannels(); @@ -91,7 +87,7 @@ std::tuple> PackedConvWeight< if(transpose()){ unpacked_weights = at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< - 2>(unpacked_weights, groups); + kSpatialDim>(unpacked_weights, groups); } return std::tuple>( unpacked_weights, bias); @@ -276,6 +272,7 @@ TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { // ConvTranspose is the same, however, we want to have different name. 
m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); @@ -283,6 +280,12 @@ TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 3150fd986300..c09501deec91 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -97,6 +97,7 @@ TORCH_LIBRARY(quantized, m) { // conv_tranpsose m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_prepack(Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); @@ -107,6 +108,14 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_unpack(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_stride(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_output_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d_transpose(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::elu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_prepack(Tensor weight) -> __torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack")); @@ -182,6 +191,7 @@ TORCH_LIBRARY(_quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv_transpose1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv_transpose2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv_transpose3d_prepack(Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 36f317529285..24649296b03a 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -3611,6 +3611,122 @@ def test_qconv_transpose2d( Y_q = qconv_op(X_q) self.assertEqual(Y_q_ref, Y_q) + """Tests the correctness of quantized convolution op.""" + @given(batch_size=st.integers(1, 3), + input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), + time=st.integers(2, 5), + height=st.integers(10, 16), + width=st.integers(7, 14), + output_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), + groups=st.integers(1, 3), + kernel_t=st.integers(1, 7), + kernel_h=st.integers(1, 7), + kernel_w=st.integers(1, 7), + stride_t=st.integers(1, 2), + stride_h=st.integers(1, 2), + stride_w=st.integers(1, 2), + pad_t=st.integers(0, 2), + pad_h=st.integers(0, 2), + pad_w=st.integers(0, 2), + o_pad_t=st.integers(0, 2), + o_pad_h=st.integers(0, 2), + o_pad_w=st.integers(0, 2), + dilation=st.integers(1, 2), + X_scale=st.floats(1.2, 1.6), + X_zero_point=st.integers(0, 4), + W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2), + W_zero_point=st.lists(st.integers(-5, 5), min_size=1, max_size=2), + Y_scale=st.floats(4.2, 5.6), + Y_zero_point=st.integers(0, 4), + use_bias=st.booleans()) + @override_qengines + def test_qconv_transpose3d( + self, + batch_size, + input_channels_per_group, + time, + height, + width, + output_channels_per_group, + groups, + kernel_t, + kernel_h, + kernel_w, + stride_t, + stride_h, + stride_w, + pad_t, + pad_h, + pad_w, + o_pad_t, + o_pad_h, + o_pad_w, + dilation, + X_scale, + X_zero_point, + W_scale, + W_zero_point, + Y_scale, + Y_zero_point, + use_bias): + if qengine_is_qnnpack(): + return # QNNPACK doesn't support this + assume(o_pad_t < stride_t or o_pad_t < dilation) + assume(o_pad_h < stride_h or o_pad_h < dilation) + assume(o_pad_w < stride_w or o_pad_w < dilation) + + input_channels = input_channels_per_group * groups + output_channels = output_channels_per_group * groups + kernels = (kernel_t, kernel_h, kernel_w) + strides = (stride_t, stride_h, stride_w) + pads = (pad_t, pad_h, pad_w) + o_pads = (o_pad_t, o_pad_h, o_pad_w) + dilations = (dilation, dilation, dilation) + + qconv = torch.ops.quantized.conv_transpose3d + qconv_prepack = torch.ops.quantized.conv_transpose3d_prepack + conv_op = torch.nn.ConvTranspose3d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, + dilation=dilations, + bias=use_bias + ) + X_q, W_q, bias_float = self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (time, height, width), + output_channels_per_group, groups, kernels, strides, pads, o_pads, + dilations, X_scale, X_zero_point, W_scale, 
W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu=False, + use_channelwise=False, use_transpose=True) + + # Test the module implementation + qconv_op = torch.nn.quantized.ConvTranspose3d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, + dilation=dilations, + bias=use_bias + ) + qconv_op.scale = Y_scale + qconv_op.zero_point = Y_zero_point + qconv_op.set_weight_bias(W_q, bias_float) + + Y_dq_ref = conv_op(X_q.dequantize()) + Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, + zero_point=Y_zero_point, + dtype=torch.quint8) + Y_q = qconv_op(X_q) + self.assertEqual(Y_q_ref, Y_q) + @given( inputs=hu.tensor_conv( spatial_dim=1, batch_size_range=(1, 3), @@ -3863,22 +3979,26 @@ def test_qconv3d( stride_w=st.integers(1, 2), pad_d=st.integers(1, 2), pad_h=st.integers(1, 2), pad_w=st.integers(1, 2), - channelwise=st.booleans(), - qengine=st.sampled_from(("fbgemm",))) + o_pad=st.integers(0, 2), + channelwise=st.booleans()) + @override_qengines def test_qconv3d_unpack( - self, inputs, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, - channelwise, qengine + self, inputs, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, o_pad, + channelwise ): - if qengine not in supported_qengines: - return - - with override_quantized_engine(qengine): - qconv3d_prepack = torch.ops.quantized.conv3d_prepack - qconv3d_unpack = torch.ops.quantized.conv3d_unpack - self._test_qconv_unpack_impl( - qconv3d_prepack, qconv3d_unpack, inputs, - (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), None, - channelwise) + if qengine_is_qnnpack(): + return # QNNPACK doesn't support this + transposed = inputs[-1] + if transposed: + qconv_prepack = torch.ops.quantized.conv_transpose3d_prepack + qconv_unpack = torch.ops.quantized.conv_transpose3d_unpack + else: + qconv_prepack = torch.ops.quantized.conv3d_prepack + qconv_unpack = torch.ops.quantized.conv3d_unpack + self._test_qconv_unpack_impl( + qconv_prepack, qconv_unpack, inputs, + (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad), + channelwise) class TestPadding(TestCase): @given(batch_size=st.integers(1, 64), diff --git a/torch/nn/quantized/modules/__init__.py b/torch/nn/quantized/modules/__init__.py index a40a3e3fbcac..a064c72dda98 100644 --- a/torch/nn/quantized/modules/__init__.py +++ b/torch/nn/quantized/modules/__init__.py @@ -7,7 +7,7 @@ from .normalization import LayerNorm, GroupNorm, InstanceNorm1d, \ InstanceNorm2d, InstanceNorm3d from .conv import Conv1d, Conv2d, Conv3d -from .conv import ConvTranspose1d, ConvTranspose2d +from .conv import ConvTranspose1d, ConvTranspose2d, ConvTranspose3d from .linear import Linear from .embedding_ops import Embedding, EmbeddingBag @@ -91,6 +91,7 @@ def from_float(mod): 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', + 'ConvTranspose3d', 'DeQuantize', 'ELU', 'Embedding', diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index 31c914d2bf35..e9f4a4c701eb 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -606,9 +606,6 @@ class ConvTranspose2d(_ConvTransposeNd): For details on input arguments, parameters, and implementation see :class:`~torch.nn.ConvTranspose2d`. - .. note:: Currently only the QNNPACK engine is implemented. 
- Please, set the `torch.backends.quantized.engine = 'qnnpack'` - For special notes, please, see :class:`~torch.nn.quantized.Conv2d` Attributes: @@ -620,6 +617,7 @@ class ConvTranspose2d(_ConvTransposeNd): Examples:: + >>> # QNNPACK or FBGEMM as backend >>> torch.backends.quantized.engine = 'qnnpack' >>> # With square kernels and equal stride >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2) @@ -684,3 +682,88 @@ def forward(self, input): raise ValueError("Input shape must be `(N, C, H, W)`!") return ops.quantized.conv_transpose2d( input, self._packed_params, self.scale, self.zero_point) + +class ConvTranspose3d(_ConvTransposeNd): + r"""Applies a 3D transposed convolution operator over an input image + composed of several input planes. + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose3d`. + + .. note:: Currently only the FBGEMM engine is implemented. + Please, set the `torch.backends.quantized.engine = 'fbgemm'` + + For special notes, please, see :class:`~torch.nn.quantized.Conv3d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose3d` for other attributes. + + Examples:: + + >>> torch.backends.quantized.engine = 'fbgemm' + >>> # With cubic kernels and equal stride + >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2) + >>> # non-cubic kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2)) + >>> input = torch.randn(20, 16, 50, 100, 100) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12, 12, 12) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(q_input) + >>> h.size() + torch.Size([1, 16, 6, 6, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12, 12]) + """ + + _FLOAT_MODULE = nn.ConvTranspose3d + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, output_padding=0, groups=1, bias=True, + dilation=1, padding_mode='zeros'): + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + output_padding = _pair(output_padding) + + super(ConvTranspose3d, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + True, output_padding, groups, bias, padding_mode) + + def _get_name(self): + return 'QuantizedConvTranpose3d' + + def set_weight_bias(self, w, b): + # type: (torch.Tensor, Optional[torch.Tensor]) -> None + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( + w, b, self.stride, self.padding, self.output_padding, self.dilation, + self.groups) + + def _weight_bias(self): + w, b = torch.ops.quantized.conv3d_unpack(self._packed_params) + return w, b + + def weight(self): + (w, _) = self._weight_bias() + return w + + def bias(self): + (_, b) = self._weight_bias() + return b + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + 
raise ValueError("Input shape must be `(N, C, T, H, W)`!") + return ops.quantized.conv_transpose3d( + input, self._packed_params, self.scale, self.zero_point) From 9bc8f071a3e90ae9033c79b8372f9c0ede129e9c Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 29 Oct 2020 17:01:02 -0700 Subject: [PATCH 135/169] [WIP] Move torch.fx into its own target (#46658) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46658 ghstack-source-id: 115213192 Test Plan: waitforsadcastle Reviewed By: zdevito, vkuzo Differential Revision: D24374723 fbshipit-source-id: 2b5708001f5df2ffb21ea5e586e26030653ccdcf --- test/quantization/test_quantize_fx.py | 13 +- test/test_quantization.py | 11 +- torch/quantization/__init__.py | 6 +- .../testing/_internal/common_quantization.py | 157 ++++++++++-------- 4 files changed, 103 insertions(+), 84 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 0f9b4b74f869..7d1588673ca6 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -7,12 +7,15 @@ import torch.multiprocessing as mp # graph mode quantization based on fx -from torch.quantization import ( - QuantType, - quant_type_to_str, +from torch.quantization.quantize_fx import ( prepare_fx, convert_fx, prepare_qat_fx, +) + +from torch.quantization import ( + QuantType, + quant_type_to_str, default_qconfig, default_dynamic_qconfig, default_dynamic_quant_observer, @@ -831,13 +834,13 @@ def forward(self, x): print(m.__dict__.keys()) m.eval() qconfig_dict = {'': torch.quantization.default_qconfig} - prepared = torch.quantization.prepare_fx(m, qconfig_dict) + prepared = prepare_fx(m, qconfig_dict) # calibrate prepared(torch.randn(4, 1, 4, 4)) # copy prepared_copy = copy.deepcopy(prepared) # quantize, should run with no errors - quantized = torch.quantization.convert_fx(prepared_copy) + quantized = convert_fx(prepared_copy) @skipIfNoFBGEMM diff --git a/test/test_quantization.py b/test/test_quantization.py index 26d3ee020983..9eba83b21e4f 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -61,9 +61,14 @@ from quantization.test_quantize_jit import TestQuantizeDynamicJitOps # noqaa: F401 # 3. GraphModule based graph mode quantization -from quantization.test_quantize_fx import TestQuantizeFx # noqa: F401 -from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 -from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 +try: + from quantization.test_quantize_fx import TestQuantizeFx # noqa: F401 + from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 + from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 +except ImportError: + # In FBCode we separate FX out into a separate target for the sake of dev + # velocity. 
These are covered by a separate test target `quantization_fx` + pass # Tooling: numeric_suite from quantization.test_numeric_suite import TestEagerModeNumericSuite # noqa: F401 diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py index d6daa79fae53..24e929b5fc8e 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -6,7 +6,7 @@ from .stubs import * from .quant_type import * from .quantize_jit import * -from .quantize_fx import * +# from .quantize_fx import * from .quantization_mappings import * from .fuser_method_mappings import * @@ -26,8 +26,8 @@ def default_eval_fn(model, calib_data): # Top level API for graph mode quantization on TorchScript 'quantize_jit', 'quantize_dynamic_jit', # Top level API for graph mode quantization on GraphModule(torch.fx) - 'fuse_fx', 'quantize_fx', # TODO: add quantize_dynamic_fx - 'prepare_fx', 'prepare_dynamic_fx', 'convert_fx', + # 'fuse_fx', 'quantize_fx', # TODO: add quantize_dynamic_fx + # 'prepare_fx', 'prepare_dynamic_fx', 'convert_fx', 'QuantType', 'quant_type_to_str', # quantization type # custom module APIs 'get_default_static_quant_module_mappings', 'get_static_quant_module_class', diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 24db8f634e86..2cb29dcba613 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -12,21 +12,31 @@ from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \ default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig, \ - get_default_qat_qconfig, PerChannelMinMaxObserver, default_dynamic_quant_observer, QConfigDynamic + get_default_qat_qconfig, PerChannelMinMaxObserver, default_dynamic_quant_observer, QConfigDynamic, QuantType from torch.quantization.quantization_mappings import ( get_default_dynamic_quant_module_mappings, get_default_qconfig_propagation_list, get_default_qat_module_mappings, ) -# graph mode quantization based on fx from torch.quantization import ( QuantType, - prepare_fx, - prepare_qat_fx, - convert_fx, ) +try: + # symbolic trace + from torch.fx import symbolic_trace + + # graph mode quantization based on fx + from torch.quantization.quantize_fx import ( + prepare_fx, + prepare_qat_fx, + convert_fx, + ) + HAS_FX = True +except ImportError: + HAS_FX = False + import copy import io import functools @@ -597,76 +607,77 @@ def printGraphModule(self, graph_module, print_str=True): print(str_to_print) return str_to_print - def checkGraphModeFxOp(self, model, inputs, quant_type, - expected_node=None, - expected_node_occurrence=None, - expected_node_list=None, - debug=False, - print_debug_info=False, - custom_qconfig=None): - """ Quantizes model with graph mode quantization on fx and check if the - quantized model contains the quantized_node - - Args: - model: floating point torch.nn.Module - inputs: one positional sample input arguments for model - expected_node: NodeSpec - e.g. NodeSpec.call_function(torch.quantize_per_tensor) - expected_node_occurrence: a dict from NodeSpec to - expected number of occurences (int) - e.g. {NodeSpec.call_function(torch.quantize_per_tensor) : 1, - NodeSpec.call_method('dequantize'): 1} - expected_node_list: a list of NodeSpec, used to check the order - of the occurrence of Node - e.g. 
[NodeSpec.call_function(torch.quantize_per_tensor), - NodeSpec.call_module(nnq.Conv2d), - NodeSpec.call_function(F.hardtanh_), - NodeSpec.call_method('dequantize')] - """ - # TODO: make img_data a single example instead of a list - if type(inputs) == list: - inputs = inputs[0] - - if quant_type == QuantType.QAT: - qconfig = get_default_qat_qconfig(torch.backends.quantized.engine) - model.train() - elif quant_type == QuantType.STATIC: - qconfig = get_default_qconfig(torch.backends.quantized.engine) - model.eval() - else: - qconfig = default_dynamic_qconfig - model.eval() + if HAS_FX: + def checkGraphModeFxOp(self, model, inputs, quant_type, + expected_node=None, + expected_node_occurrence=None, + expected_node_list=None, + debug=False, + print_debug_info=False, + custom_qconfig=None): + """ Quantizes model with graph mode quantization on fx and check if the + quantized model contains the quantized_node + + Args: + model: floating point torch.nn.Module + inputs: one positional sample input arguments for model + expected_node: NodeSpec + e.g. NodeSpec.call_function(torch.quantize_per_tensor) + expected_node_occurrence: a dict from NodeSpec to + expected number of occurences (int) + e.g. {NodeSpec.call_function(torch.quantize_per_tensor) : 1, + NodeSpec.call_method('dequantize'): 1} + expected_node_list: a list of NodeSpec, used to check the order + of the occurrence of Node + e.g. [NodeSpec.call_function(torch.quantize_per_tensor), + NodeSpec.call_module(nnq.Conv2d), + NodeSpec.call_function(F.hardtanh_), + NodeSpec.call_method('dequantize')] + """ + # TODO: make img_data a single example instead of a list + if type(inputs) == list: + inputs = inputs[0] + + if quant_type == QuantType.QAT: + qconfig = get_default_qat_qconfig(torch.backends.quantized.engine) + model.train() + elif quant_type == QuantType.STATIC: + qconfig = get_default_qconfig(torch.backends.quantized.engine) + model.eval() + else: + qconfig = default_dynamic_qconfig + model.eval() - # overwrite qconfig with custom_qconfig - if custom_qconfig is not None: - qconfig = custom_qconfig + # overwrite qconfig with custom_qconfig + if custom_qconfig is not None: + qconfig = custom_qconfig - if quant_type == QuantType.QAT: - prepare = prepare_qat_fx - else: - prepare = prepare_fx - - qconfig_dict = {'': qconfig} - prepared = prepare(model, qconfig_dict) - if not quant_type == QuantType.DYNAMIC: - prepared(*inputs) - prepared_copy = copy.deepcopy(prepared) - qgraph = convert_fx(prepared) - qgraph_debug = convert_fx(prepared_copy, debug=True) - result = qgraph(*inputs) - result_debug = qgraph_debug(*inputs) - - qgraph_to_check = qgraph_debug if debug else qgraph - if print_debug_info: - print() - print('quant type:', quant_type) - print('original model:', model) - print() - print('quantized model:', qgraph_to_check) - self.printGraphModule(qgraph_to_check) - print() - self.checkGraphModuleNodes( - qgraph_to_check, expected_node, expected_node_occurrence, expected_node_list) + if quant_type == QuantType.QAT: + prepare = prepare_qat_fx + else: + prepare = prepare_fx + + qconfig_dict = {'': qconfig} + prepared = prepare(model, qconfig_dict) + if not quant_type == QuantType.DYNAMIC: + prepared(*inputs) + prepared_copy = copy.deepcopy(prepared) + qgraph = convert_fx(prepared) + qgraph_debug = convert_fx(prepared_copy, debug=True) + result = qgraph(*inputs) + result_debug = qgraph_debug(*inputs) + + qgraph_to_check = qgraph_debug if debug else qgraph + if print_debug_info: + print() + print('quant type:', quant_type) + print('original model:', 
model) + print() + print('quantized model:', qgraph_to_check) + self.printGraphModule(qgraph_to_check) + print() + self.checkGraphModuleNodes( + qgraph_to_check, expected_node, expected_node_occurrence, expected_node_list) def checkEmbeddingSerialization(self, qemb, num_embeddings, embedding_dim, indices, offsets, From ddeacf1565ca08015c20cb6d17ef0357937fff5c Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Thu, 29 Oct 2020 17:10:31 -0700 Subject: [PATCH 136/169] Fix median bug on discontigous tensors (#46917) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46917 fixes https://github.com/pytorch/pytorch/issues/46814 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24633412 Pulled By: heitorschueroff fbshipit-source-id: 54732671b298bdc2b04b13ab3a373892ee0933c3 --- aten/src/ATen/native/cuda/Sorting.cu | 2 +- aten/src/ATen/native/cuda/SortingCommon.cuh | 1 + test/test_torch.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 0a5760580c06..c6688b286914 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -319,7 +319,7 @@ std::tuple median_with_indices_impl( NoNamesGuard guard; dim = at::maybe_wrap_dim(dim, self.dim()); - Tensor in = self.dim() > 0 ? self : self.unsqueeze(0); + Tensor in = self.dim() > 0 ? self.contiguous() : self.unsqueeze(0); int64_t size = in.size(dim); TORCH_CHECK( diff --git a/aten/src/ATen/native/cuda/SortingCommon.cuh b/aten/src/ATen/native/cuda/SortingCommon.cuh index 54513955e912..0e5cb7371d58 100644 --- a/aten/src/ATen/native/cuda/SortingCommon.cuh +++ b/aten/src/ATen/native/cuda/SortingCommon.cuh @@ -143,6 +143,7 @@ static uint64_t nextHighestPowerOf2(uint64_t n) { } +// WARNING: This function assumes input tensors are contiguous template void run_launcher( Tensor& values, diff --git a/test/test_torch.py b/test/test_torch.py index 48810a2f92e0..23ebf0d964b7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -10536,6 +10536,23 @@ def check(op, a, args, key): check(torch.median, [[nan, nan], [1, 2]], [1], [[nan, 1]]) check(torch.nanmedian, [[nan, nan], [1, 2]], [1], [[nan, 1.]]) + # Discontiguous and strided tensors + a = torch.arange(12, device=device) + self.assertEqual(a[::2].median(), torch.tensor(4, device=device)) + self.assertEqual(a[::2].nanmedian(), torch.tensor(4, device=device)) + + a.resize_(3, 4) + self.assertEqual(a.T.median(), torch.tensor(5, device=device)) + self.assertEqual(a.T.nanmedian(), torch.tensor(5, device=device)) + self.assertEqual(a[::2, ::2].median(-1)[0], torch.tensor([0, 8], device=device)) + self.assertEqual(a[::2, ::2].nanmedian(-1)[0], torch.tensor([0, 8], device=device)) + + a.resize_(2, 3, 2) + self.assertEqual(a.T.median(), torch.tensor(5, device=device)) + self.assertEqual(a.T.nanmedian(), torch.tensor(5, device=device)) + self.assertEqual(a[:, ::2, :].median(-1)[0], torch.tensor([[0, 4], [6, 10]], device=device)) + self.assertEqual(a[:, ::2, :].nanmedian(-1)[0], torch.tensor([[0, 4], [6, 10]], device=device)) + @onlyOnCPUAndCUDA @dtypes(torch.float, torch.double) From f5477e370343caccc181c71a562fb3bd723e2dc4 Mon Sep 17 00:00:00 2001 From: yangu Date: Thu, 29 Oct 2020 17:28:44 -0700 Subject: [PATCH 137/169] Enable python code coverage on windows (#44548) Summary: Fixes https://github.com/pytorch/pytorch/issues/43897 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44548 Reviewed By: walterddr 
Differential Revision: D24582777 Pulled By: malfet fbshipit-source-id: 9b68b72ba356fef61461fc2446c73360f67ce0b4 --- .../win-test-helpers/setup_pytorch_env.bat | 2 +- .jenkins/pytorch/win-test.sh | 18 ++++++++++++++++++ codecov.yml | 5 +++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index 34c3698a1307..1e3cfe090abf 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -39,7 +39,7 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% ) popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 -pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest +pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage if %errorlevel% neq 0 ( exit /b %errorlevel% ) :: No need to install faulthandler since we only test Python >= 3.6 on Windows :: faulthandler is builtin since Python 3.3 diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index abcd5756d747..adf9b4c82620 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -14,6 +14,10 @@ fi export TMP_DIR="${PWD}/build/win_tmp" export TMP_DIR_WIN=$(cygpath -w "${TMP_DIR}") +export PROJECT_DIR="${PWD}" +export PROJECT_DIR_WIN=$(cygpath -w "${PROJECT_DIR}") +export TEST_DIR="${PWD}/test" +export TEST_DIR_WIN=$(cygpath -w "${TEST_DIR}") export PYTORCH_FINAL_PACKAGE_DIR="/c/users/circleci/workspace/build-results" export PYTORCH_FINAL_PACKAGE_DIR_WIN=$(cygpath -w "${PYTORCH_FINAL_PACKAGE_DIR}") @@ -45,6 +49,7 @@ run_tests() { $SCRIPT_HELPERS_DIR/test_libtorch.bat else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + export PYTORCH_COLLECT_COVERAGE=1 $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \ $SCRIPT_HELPERS_DIR/test_libtorch.bat if [[ "${USE_CUDA}" == "1" ]]; then @@ -59,3 +64,16 @@ run_tests() { } run_tests && assert_git_not_dirty && echo "TEST PASSED" + +if [[ "${BUILD_ENVIRONMENT}" == "pytorch-win-vs2019-cuda10-cudnn7-py3" ]] && [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + pushd $TEST_DIR + python -mpip install coverage + echo "Generating XML coverage report" + time python -mcoverage xml + popd + + pushd $PROJECT_DIR + python -mpip install codecov + python -mcodecov + popd +fi diff --git a/codecov.yml b/codecov.yml index 7ed3d662bb39..525f85e01898 100644 --- a/codecov.yml +++ b/codecov.yml @@ -3,13 +3,18 @@ coverage: project: default: threshold: 1% +codecov: + notify: + after_n_builds: 2 comment: layout: "diff" behavior: once require_changes: true require_base: yes require_head: yes + after_n_builds: 2 branches: - "master" fixes: - "/opt/conda/lib/python3.8/site-packages/::project/" + - "C:/Users/circleci/project/build/win_tmp/build/::project/" From 2e2dc5874b0791cfe485685f75231d14af6daa85 Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 29 Oct 2020 18:07:31 -0700 Subject: [PATCH 138/169] Fix lint (#47095) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47095 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D24639056 Pulled By: jamesr66a fbshipit-source-id: e4f7842eb0438675723d1cac78e20d13b96e802c --- .../testing/_internal/common_quantization.py | 53 ++++++++----------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git 
a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 2cb29dcba613..1b2b4165b044 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -19,14 +19,7 @@ get_default_qat_module_mappings, ) -from torch.quantization import ( - QuantType, -) - try: - # symbolic trace - from torch.fx import symbolic_trace - # graph mode quantization based on fx from torch.quantization.quantize_fx import ( prepare_fx, @@ -609,30 +602,30 @@ def printGraphModule(self, graph_module, print_str=True): if HAS_FX: def checkGraphModeFxOp(self, model, inputs, quant_type, - expected_node=None, - expected_node_occurrence=None, - expected_node_list=None, - debug=False, - print_debug_info=False, - custom_qconfig=None): + expected_node=None, + expected_node_occurrence=None, + expected_node_list=None, + debug=False, + print_debug_info=False, + custom_qconfig=None): """ Quantizes model with graph mode quantization on fx and check if the - quantized model contains the quantized_node - - Args: - model: floating point torch.nn.Module - inputs: one positional sample input arguments for model - expected_node: NodeSpec - e.g. NodeSpec.call_function(torch.quantize_per_tensor) - expected_node_occurrence: a dict from NodeSpec to - expected number of occurences (int) - e.g. {NodeSpec.call_function(torch.quantize_per_tensor) : 1, - NodeSpec.call_method('dequantize'): 1} - expected_node_list: a list of NodeSpec, used to check the order - of the occurrence of Node - e.g. [NodeSpec.call_function(torch.quantize_per_tensor), - NodeSpec.call_module(nnq.Conv2d), - NodeSpec.call_function(F.hardtanh_), - NodeSpec.call_method('dequantize')] + quantized model contains the quantized_node + + Args: + model: floating point torch.nn.Module + inputs: one positional sample input arguments for model + expected_node: NodeSpec + e.g. NodeSpec.call_function(torch.quantize_per_tensor) + expected_node_occurrence: a dict from NodeSpec to + expected number of occurences (int) + e.g. {NodeSpec.call_function(torch.quantize_per_tensor) : 1, + NodeSpec.call_method('dequantize'): 1} + expected_node_list: a list of NodeSpec, used to check the order + of the occurrence of Node + e.g. [NodeSpec.call_function(torch.quantize_per_tensor), + NodeSpec.call_module(nnq.Conv2d), + NodeSpec.call_function(F.hardtanh_), + NodeSpec.call_method('dequantize')] """ # TODO: make img_data a single example instead of a list if type(inputs) == list: From e3b55a8a652f82f0f3de9ab22fbddba76766114b Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Thu, 29 Oct 2020 18:12:54 -0700 Subject: [PATCH 139/169] [pytorch/ops] Concat fast path w/ zero tensor (#46805) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46805 The current implementation goes with slow path if there is zero tensor in the list. This is inefficient. Use the fast path for torch.cat even if there are empty tensors. This wastes one thread block for the empty tensor, but still much better than the slow path. 
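For illustration only (not part of this diff): a minimal sketch of the legacy case the summary refers to, in which a 1-D empty tensor appears in the input list of `torch.cat` alongside higher-dimensional tensors. The device selection below is an assumption for demonstration; the patch itself targets the CUDA kernel.

```python
import torch

# Legacy case: a 1-D empty tensor may be concatenated with higher-dimensional
# tensors; it contributes nothing along the concat dimension.
device = "cuda" if torch.cuda.is_available() else "cpu"
a = torch.randn(2, 3, device=device)
b = torch.empty(0, device=device)   # 1-D empty tensor
c = torch.randn(4, 3, device=device)

out = torch.cat([a, b, c], dim=0)
print(out.shape)  # torch.Size([6, 3])
```

With this change, an input list like the one above no longer forces the slow per-tensor copy path on CUDA; the empty tensor merely wastes one thread block in the batched kernel.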
Test Plan: CI + sandcastle Reviewed By: ngimel Differential Revision: D24524441 fbshipit-source-id: 529c8af51ecf8374621deee3a9d16cacbd214741 --- aten/src/ATen/native/cuda/Shape.cu | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 5da007507905..64af6cb268a2 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -237,7 +237,12 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, batchCounter < CAT_ARRAY_BATCH_SIZE && (i+batchCounter) < inputs.size(); ++batchCounter) { - int64_t dimSize = at::native::size(inputs[i+batchCounter], dimension); + int64_t dimSize = 0; + // There is a legacy case where a 1-D empty tensor can be concat with + // high-dimensional tensor + if (inputs[i+batchCounter].numel() > 0) { + dimSize = at::native::size(inputs[i+batchCounter], dimension); + } stackInputs[batchCounter].input = inputs[i+batchCounter].data_ptr(); @@ -338,7 +343,12 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, batchCounter < CAT_ARRAY_BATCH_SIZE && (i+batchCounter) < inputs.size(); ++batchCounter) { - int64_t dimSize = at::native::size(inputs[i+batchCounter], dimension); + int64_t dimSize = 0; + // There is a legacy case where a 1-D empty tensor can be concat with + // high-dimensional tensor + if (inputs[i+batchCounter].numel() > 0) { + dimSize = at::native::size(inputs[i+batchCounter], dimension); + } catMetaData.input[batchCounter] = inputs[i+batchCounter].data_ptr(); catMetaData.offset[batchCounter] = offset; catMetaData.dimSize[batchCounter] = dimSize; @@ -431,7 +441,6 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { auto should_skip = [](const Tensor &t) { return t.dim() == 1 && at::native::size(t, 0) == 0; }; - bool hasSkippedInput = false; const Tensor *notSkippedTensor = NULL; // non-owning reference int nDims = 0; @@ -452,10 +461,8 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { } at::assert_no_internal_overlap(out); - for (int i = 0; i < inputs.size(); i++) - { + for (int i = 0; i < inputs.size(); i++) { if (should_skip(inputs[i])) { - hasSkippedInput = true; continue; } nDims = inputs[i].dim(); @@ -501,11 +508,10 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { // We parallelize the copy if all 6 conditions pass: // // 1. There is more than one input tensor - // 2. No empty inputs - // 3. The out tensor is 32-bit indexable - // 4. The number of dimensions is <= 4 - // 5. All input tensors are contiguous (output tensor may be non-contig) - // 6. All input tensors can use 32-bit indexing + // 2. The out tensor is 32-bit indexable + // 3. The number of dimensions is <= 4 + // 4. All input tensors are contiguous (output tensor may be non-contig) + // 5. 
All input tensors can use 32-bit indexing const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(), [] (const Tensor& t) { @@ -522,7 +528,6 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { }); allSameType = allSameType && (out.scalar_type() == firstType); if (inputs.size() > 1 && - !hasSkippedInput && out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && at::cuda::detail::canUse32BitIndexMath(out) && allContiguous && From 366888a5e2c50d01ac45cba22192f7fde1dff963 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 29 Oct 2020 18:37:48 -0700 Subject: [PATCH 140/169] [quant][graphmode][fx] Remove logging for standalone module api calls (#47032) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47032 these are not top level apis, not supposed to be called directly by user. Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D24610602 fbshipit-source-id: c5510f06b05499387d70f23508470b676aea582c --- torch/quantization/quantize_fx.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 5543c99298f5..93043559bf48 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -103,10 +103,8 @@ def _prepare_standalone_module_fx(model, qconfig_dict, prepare_custom_config_dic custom module is observed or not """ - torch._C._log_api_usage_once("quantization_api.quantize_fx._prepare_standalone_module_fx") return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) - def fuse_fx(model, fuse_custom_config_dict=None): r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. Fusion rules are defined in torch.quantization.fx.fusion_pattern.py @@ -345,5 +343,4 @@ def _convert_standalone_module_fx(graph_module, debug=False, convert_custom_conf A quantized standalone module which accepts quantized input(if needed) and produces quantized output (if needed). """ - torch._C._log_api_usage_once("quantization_api.quantize_fx._convert_standalone_module_fx") return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From 707d271493b6c1074e5aab5874528e859d783b6f Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Thu, 29 Oct 2020 18:42:03 -0700 Subject: [PATCH 141/169] Fix links in tools/build_variables.bzl (#47066) Summary: I know the `aten/src/ATen/core/CMakeLists.txt` link is now correct because that file was deleted in 061ed739c17028fe907737b52f50a495ab5b4617: ``` $ git log --full-history -- aten/src/ATen/core/CMakeLists.txt | head -n 1 commit 061ed739c17028fe907737b52f50a495ab5b4617 $ git show 061ed739c17028fe907737b52f50a495ab5b4617^ | head -n 1 commit f99a693cd9ff7a9b5fdc71357dac66b8192786d3 ``` But I can't tell what the `tools/cpp_build/torch/CMakeLists.txt` link is supposed to be, because that file (indeed, its entire parent directory) doesn't seem to have ever existed: ``` $ git log --full-history -- tools/cpp_build/torch ``` (The output of the above command is empty.) 
So I saw that the grandparent directory was deleted in 130881f0e37cdedc0e90f6c9ed84957aee6c80ef: ``` $ git log --full-history -- tools/cpp_build | head -n 1 commit 130881f0e37cdedc0e90f6c9ed84957aee6c80ef $ git show 130881f0e37cdedc0e90f6c9ed84957aee6c80ef^ | head -n 1 commit c6facc2aaa5a568756e971a9d2b7f2af282dff39 ``` And looking at [the history of that directory](https://github.com/pytorch/pytorch/commits/c6facc2aaa5a568756e971a9d2b7f2af282dff39/tools/cpp_build), I see that some of the last commits touch `torch/CMakeLists.txt`, so I'm just using that here and hoping it's correct. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47066 Reviewed By: albanD, seemethere Differential Revision: D24628980 Pulled By: samestep fbshipit-source-id: 0fe8887e323593ef1676c34d4b920aeeaebd8550 --- tools/build_variables.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 86fc691c298c..89b92d18a5df 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -41,7 +41,7 @@ def libtorch_generated_sources(gencode_pattern): "autograd/generated/TraceType_4.cpp", ]] -# copied from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core/CMakeLists.txt +# copied from https://github.com/pytorch/pytorch/blob/f99a693cd9ff7a9b5fdc71357dac66b8192786d3/aten/src/ATen/core/CMakeLists.txt jit_core_headers = [ "torch/csrc/utils/memory.h", "torch/csrc/WindowsTorchApiMacro.h", @@ -69,7 +69,7 @@ jit_core_sources = [ "torch/csrc/jit/frontend/source_range.cpp", ] -# copied from https://github.com/pytorch/pytorch/blob/master/tools/cpp_build/torch/CMakeLists.txt +# copied from https://github.com/pytorch/pytorch/blob/0bde610c14b92d351b968a0228df29e92442b1cc/torch/CMakeLists.txt # There are some common files used in both internal lite-interpreter and full-jit. Making a separate # list for the shared files. From d95e1afad37ab91a7c8d3fed19d86f2b7f2d218f Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Thu, 29 Oct 2020 22:51:49 -0700 Subject: [PATCH 142/169] [pytorch] add script to run all codegen (#46243) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46243 Add util script to test whether any codegen output changes. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D24388873 Pulled By: ljk53 fbshipit-source-id: ef9ef7fe6067df1e0c53aba725fc13b0dfd7f4c2 --- .jenkins/pytorch/codegen-test.sh | 60 +++++++++++++++++++++++++ tools/autograd/gen_annotated_fn_args.py | 1 + tools/autograd/gen_autograd.py | 3 +- tools/jit/gen_unboxing_wrappers.py | 3 +- 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100755 .jenkins/pytorch/codegen-test.sh diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh new file mode 100755 index 000000000000..3b75999ceb2e --- /dev/null +++ b/.jenkins/pytorch/codegen-test.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# This script can also be used to test whether your diff changes any codegen output. 
+# +# Run it before and after your change: +# .jenkins/pytorch/codegen-test.sh +# .jenkins/pytorch/codegen-test.sh +# +# Then run diff to compare the generated files: +# diff -Naur + +set -eu -o pipefail + +if [ "$#" -eq 0 ]; then + COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}" + source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + OUT="$(dirname "${BASH_SOURCE[0]}")/../../codegen_result" +else + OUT=$1 +fi + +set -x + +rm -rf "$OUT" + +# aten codegen +python -m tools.codegen.gen \ + -d "$OUT"/torch/share/ATen + +# torch codegen +python -m tools.setup_helpers.generate_code \ + --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \ + --install_dir "$OUT" + +# pyi codegen +mkdir -p "$OUT"/pyi/torch/_C +mkdir -p "$OUT"/pyi/torch/nn +python -m tools.pyi.gen_pyi \ + --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \ + --out "$OUT"/pyi + +# autograd codegen (called by torch codegen but can run independently) +python -m tools.autograd.gen_autograd \ + "$OUT"/torch/share/ATen/Declarations.yaml \ + "$OUT"/autograd \ + tools/autograd + +# unboxing_wrappers codegen (called by torch codegen but can run independently) +mkdir -p "$OUT"/unboxing_wrappers +python -m tools.jit.gen_unboxing_wrappers \ + "$OUT"/torch/share/ATen/Declarations.yaml \ + "$OUT"/unboxing_wrappers \ + tools/jit/templates + +# annotated_fn_args codegen (called by torch codegen but can run independently) +mkdir -p "$OUT"/annotated_fn_args +python -m tools.autograd.gen_annotated_fn_args \ + "$OUT"/torch/share/ATen/Declarations.yaml \ + "$OUT"/annotated_fn_args \ + tools/autograd diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index 7b4b0ece8da6..661694f3d6ba 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -20,6 +20,7 @@ get_py_variable_methods, op_name, ) +import argparse import textwrap from .gen_autograd import load_aten_declarations diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index da937a4377fa..2783eb644bc6 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -302,7 +302,8 @@ def main(): parser.add_argument('autograd', metavar='AUTOGRAD', help='path to autograd directory') args = parser.parse_args() - gen_autograd(args.declarations, args.out, args.autograd) + gen_autograd(args.declarations, args.out, args.autograd, + SelectiveBuilder.get_nop_selector()) if __name__ == '__main__': diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index 8d1fb00fc8d2..f2896fac7f22 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -535,7 +535,8 @@ def main(): parser.add_argument('template_path', metavar='TEMPLATE_PATH', help='path to templates directory') args = parser.parse_args() - gen_unboxing_wrappers(args.declarations, args.out, args.template_path) + gen_unboxing_wrappers(args.declarations, args.out, args.template_path, + SelectiveBuilder.get_nop_selector()) if __name__ == '__main__': From 0dbd72935aa1675ea736a5228b9a21a888bbcb6f Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 30 Oct 2020 00:28:55 -0700 Subject: [PATCH 143/169] Split comm hooks into python-dependent hooks and others (#47019) Summary: Stack from [ghstack](https://github.com/ezyang/ghstack): * **https://github.com/pytorch/pytorch/issues/47019 Split comm hooks into python-dependent hooks and others** This is needed because we plan to move most of c10d C++ implementation into `libtorch_*.so`, which can not have Python 
dependencies. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47019 Reviewed By: albanD Differential Revision: D24614129 Pulled By: gmagogsfm fbshipit-source-id: 3f32586b932a2fe6a7b01a3800f000e66e9786bb --- tools/build_variables.bzl | 1 + torch/csrc/distributed/c10d/comm.cpp | 49 --------------- torch/csrc/distributed/c10d/comm.h | 26 +------- .../distributed/c10d/default_comm_hooks.cpp | 4 +- .../distributed/c10d/default_comm_hooks.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 1 + .../distributed/c10d/python_comm_hook.cpp | 60 +++++++++++++++++++ .../csrc/distributed/c10d/python_comm_hook.h | 34 +++++++++++ 8 files changed, 102 insertions(+), 77 deletions(-) create mode 100644 torch/csrc/distributed/c10d/python_comm_hook.cpp create mode 100644 torch/csrc/distributed/c10d/python_comm_hook.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 89b92d18a5df..65f5ec1c6903 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -545,6 +545,7 @@ libtorch_python_core_sources = [ libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp", + "torch/csrc/distributed/c10d/python_comm_hook.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", ] diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp index f9fa2bbd7e8f..4ee19888e4c2 100644 --- a/torch/csrc/distributed/c10d/comm.cpp +++ b/torch/csrc/distributed/c10d/comm.cpp @@ -85,54 +85,5 @@ void broadcast_coalesced( } } -PythonCommHook::~PythonCommHook() { - py::gil_scoped_acquire ag; - state_.dec_ref(); - hook_.dec_ref(); - // Explicitly set state_ and hook_ to nullptr to prevent py::object's dtor - // to decref on the PyObject again. - // See Note [Destructing py::object] in python_ivalue.h - state_.ptr() = nullptr; - hook_.ptr() = nullptr; -} - -c10::intrusive_ptr PythonCommHook::runHook( - GradBucket& bucket) { - py::gil_scoped_acquire acquire; - - py::object py_fut = hook_(state_, bucket); - - try { - return py_fut.cast>()->fut; - } catch (const py::cast_error& e) { - auto type = py_fut.get_type(); - auto errMsg = c10::str( - e.what(), - ". DDP communication hook's callback must return a " - "torch.futures.Future or torch._C.Future object, but got ", - type.attr("__module__").cast(), - ".", - type.attr("__qualname__").cast()); - throw std::runtime_error(errMsg); - } -} - -std::vector PythonCommHook::parseHookResult( - const c10::IValue& result) { - TORCH_INTERNAL_ASSERT( - result.isPyObject() || result.isTensorList(), - "expected the hook result is either a PyObject or TensorList"); - - if (result.isPyObject()) { - py::gil_scoped_acquire ag; - py::object obj = torch::jit::toPyObject(result); - auto value = torch::jit::toIValue( - obj, c10::ListType::create(c10::TensorType::get())); - - return value.toTensorVector(); - } - - return result.toTensorVector(); -} } // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 20e0f948808d..1eb78b201569 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include namespace c10d { @@ -48,7 +48,7 @@ class TORCH_PYTHON_API CommHookInterface { // Passes the input grad bucket to the registered communication hook. // Once the tensors in the bucket are ready, kicks off the hook asynchronously // and returns a future that holds the communication results. 
- virtual c10::intrusive_ptr runHook( + virtual c10::intrusive_ptr runHook( GradBucket& bucket) = 0; // Returns the resulting tensors once the communication hook result is ready. @@ -58,28 +58,6 @@ class TORCH_PYTHON_API CommHookInterface { const c10::IValue& result) = 0; }; -class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { - public: - // Takes a state and a callable hook. The inputs are Python objects. - // The state is passed to the hook in runHook method, and it can be used to - // maintain and update any state information during the execution of the hook. - // The hook performs user-specified processing and returns a future indicating - // asychronous communication of gradients. - PythonCommHook(py::object state, py::object hook) - : state_(std::move(state)), hook_(std::move(hook)) {} - - ~PythonCommHook() override; - - c10::intrusive_ptr runHook(GradBucket& bucket) override; - - std::vector parseHookResult(const c10::IValue& result) override; - - private: - // Only needed for stateful communication. - py::object state_; - py::object hook_; -}; - // This CppCommHook interface only requires implementing runHook method that // potentially uses a state. template diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp index 0f7a24acc40d..10da31bf0b03 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -6,7 +6,7 @@ namespace c10d { -c10::intrusive_ptr AllReduceCommHook::runHook( +c10::intrusive_ptr AllReduceCommHook::runHook( GradBucket& bucket) { auto allreduce_work = state_->allreduce(bucket.getTensorsRef()); @@ -19,7 +19,7 @@ c10::intrusive_ptr AllReduceCommHook::runHook( return fut->then(div_by_process_group_size, fut->elementType()); } -c10::intrusive_ptr FP16CompressCommHook::runHook( +c10::intrusive_ptr FP16CompressCommHook::runHook( GradBucket& bucket) { auto& tensors = bucket.getTensorsRef(); for (auto& tensor : tensors) { diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.h b/torch/csrc/distributed/c10d/default_comm_hooks.h index 5e53e01ac688..dd141fe0eb58 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.h +++ b/torch/csrc/distributed/c10d/default_comm_hooks.h @@ -8,13 +8,13 @@ namespace c10d { class AllReduceCommHook : public CppCommHookInterface { ~AllReduceCommHook() override {} - c10::intrusive_ptr runHook(GradBucket& bucket) override; + c10::intrusive_ptr runHook(GradBucket& bucket) override; }; class FP16CompressCommHook : public CppCommHookInterface { ~FP16CompressCommHook() override {} - c10::intrusive_ptr runHook(GradBucket& bucket) override; + c10::intrusive_ptr runHook(GradBucket& bucket) override; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 47a3ebabe941..27155d81e5c1 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/torch/csrc/distributed/c10d/python_comm_hook.cpp b/torch/csrc/distributed/c10d/python_comm_hook.cpp new file mode 100644 index 000000000000..6b25018d38a3 --- /dev/null +++ b/torch/csrc/distributed/c10d/python_comm_hook.cpp @@ -0,0 +1,60 @@ +#include + +#include +#include +#include +#include + +namespace c10d { + +PythonCommHook::~PythonCommHook() { + py::gil_scoped_acquire ag; + state_.dec_ref(); + hook_.dec_ref(); + // Explicitly set state_ and hook_ to nullptr to prevent py::object's dtor 
+ // to decref on the PyObject again. + // See Note [Destructing py::object] in python_ivalue.h + state_.ptr() = nullptr; + hook_.ptr() = nullptr; +} + +c10::intrusive_ptr PythonCommHook::runHook( + GradBucket& bucket) { + py::gil_scoped_acquire acquire; + + py::object py_fut = hook_(state_, bucket); + + try { + return py_fut.cast>()->fut; + } catch (const py::cast_error& e) { + auto type = py_fut.get_type(); + auto errMsg = c10::str( + e.what(), + ". DDP communication hook's callback must return a " + "torch.futures.Future or torch._C.Future object, but got ", + type.attr("__module__").cast(), + ".", + type.attr("__qualname__").cast()); + throw std::runtime_error(errMsg); + } +} + +std::vector PythonCommHook::parseHookResult( + const c10::IValue& result) { + TORCH_INTERNAL_ASSERT( + result.isPyObject() || result.isTensorList(), + "expected the hook result is either a PyObject or TensorList"); + + if (result.isPyObject()) { + py::gil_scoped_acquire ag; + py::object obj = torch::jit::toPyObject(result); + auto value = torch::jit::toIValue( + obj, c10::ListType::create(c10::TensorType::get())); + + return value.toTensorVector(); + } + + return result.toTensorVector(); +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/python_comm_hook.h b/torch/csrc/distributed/c10d/python_comm_hook.h new file mode 100644 index 000000000000..e38ba096460f --- /dev/null +++ b/torch/csrc/distributed/c10d/python_comm_hook.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace c10d { + +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { + public: + // Takes a state and a callable hook. The inputs are Python objects. + // The state is passed to the hook in runHook method, and it can be used to + // maintain and update any state information during the execution of the hook. + // The hook performs user-specified processing and returns a future indicating + // asychronous communication of gradients. + PythonCommHook(py::object state, py::object hook) + : state_(std::move(state)), hook_(std::move(hook)) {} + + ~PythonCommHook() override; + + c10::intrusive_ptr runHook(GradBucket& bucket) override; + + std::vector parseHookResult(const c10::IValue& result) override; + + private: + // Only needed for stateful communication. + py::object state_; + py::object hook_; +}; + +} // namespace c10d From f9d32c4fa8f5990ac09ba0f9a1298d0348433179 Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Fri, 30 Oct 2020 00:36:13 -0700 Subject: [PATCH 144/169] [JIT] Add selective backend lowering API (#43613) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43613 **Summary** This commit adds a helper/utility to faciliate the selective lowering of specific submodules within a module hierarchy to a JIT backend. The reason that this is needed is that lowering a submodule of a scripted module to a backend after the module has been scripted requires adjusting its JIT type. **Test Plan** This commit refactors `NestedModuleTest` in `jit/test_backends.py` to use this new selective lowering API. **Fixes** This commit fixes ##41432. 
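As a rough usage sketch (mirroring the test added in this patch rather than a documented public API): `Outer`, `Inner`, and the empty compile spec are placeholders, and the toy `test_backend` is only present in builds that register the JIT test backend.

```python
import torch

class Inner(torch.nn.Module):
    def forward(self, x, y):
        return x + y, x - y

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.submodule = Inner()

    def forward(self, x, y):
        return self.submodule(x, y)

def lower_to_backend(mod):
    # A real backend would substitute its own registered name and compile spec.
    return torch._C._jit_to_backend("test_backend", mod, {"forward": {"": ""}})

scripted = torch.jit.script(Outer())
# Lower only Outer.submodule; the rest of the hierarchy keeps running on the
# JIT interpreter. Module types along the lowered path must not be shared.
lowered = torch._C._jit_to_backend_selective(scripted, lower_to_backend, ["submodule"])
```

The callback receives the wrapped submodule and returns its lowered replacement; the helper then remaps types in all ancestor graphs so callers see the lowered module's type.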
Test Plan: Imported from OSS Reviewed By: mortzur Differential Revision: D23339855 Pulled By: SplitInfinity fbshipit-source-id: d9e69aa502febbe04fd41558c70d219729252be9 --- aten/src/ATen/core/jit_type.h | 7 + aten/src/ATen/core/type.cpp | 8 + test/jit/test_backends.py | 210 ++++++++++++++++++++++- torch/csrc/jit/backends/backend_init.cpp | 136 +++++++++++++++ 4 files changed, 353 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 06900064e266..c1da20221e62 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -2102,6 +2102,13 @@ struct CAFFE2_API ClassType : public NamedType { // valid again. void unsafeRemoveAttribute(const std::string& name); + // [Internal Only] Change the type of an attribute of the ClassType, + // The caller is responsible to make sure the modification is safe: + // it is unsafe to maintain uses of the old type of the attribute, + // and any code that works on the attribute is now invalid. + // Only newly created code is valid again. + void unsafeChangeAttributeType(const std::string& name, TypePtr new_ty); + // Add attribute \p NAME if it doesn't exist or verify that it has a // compatible type otherwise. size_t addOrCheckAttribute( diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index f5478d040060..67b7899bb22f 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -1315,6 +1315,14 @@ void ClassType::unsafeRemoveAttribute(const std::string& name) { AT_ASSERT(attributes_.size() == attributeTypes_.size()); } +void ClassType::unsafeChangeAttributeType(const std::string& name, TypePtr new_ty) { + auto slot = getAttributeSlot(name); + auto old_attr_info = attributes_[slot]; + AT_ASSERT(old_attr_info.getKind() == AttributeKind::REGULAR_ATTRIBUTE); + attributes_[slot] = ClassAttribute(old_attr_info.getKind(), new_ty, old_attr_info.getName()); + attributeTypes_[slot] = new_ty; +} + size_t ClassType::addConstant(const std::string& name, const IValue& value) { checkNotExist(name, "constant"); size_t slot = constantNames_.size(); diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py index 89330ddbd2d9..9f61ec77a1f6 100644 --- a/test/jit/test_backends.py +++ b/test/jit/test_backends.py @@ -5,7 +5,9 @@ import torch import torch._C +from torch.testing import FileCheck from pathlib import Path + from torch.testing._internal.common_utils import ( IS_FBCODE, IS_MACOS, @@ -34,6 +36,13 @@ def to_test_backend_multi(module, method_compile_spec): return torch._C._jit_to_backend("test_backend", module, method_compile_spec) +def to_test_backend_selective(module, method_compile_spec, submodules): + def _to_test_backend(module): + return to_test_backend(module, method_compile_spec) + + return torch._C._jit_to_backend_selective(module, _to_test_backend, submodules) + + class BasicModule(torch.nn.Module): """ A simple Module used to test to_backend lowering machinery. @@ -81,9 +90,9 @@ def check_function(self, function_name, input): backend_method = self.lowered_module.__getattr__(function_name) # Run methods. - python_output = python_method(input, input) - jit_output = jit_method(input, input) - backend_output = backend_method(input, input) + python_output = python_method(*input) + jit_output = jit_method(*input) + backend_output = backend_method(*input) # The answers returned by Python, JIT and to_backend should all match. 
self.assertEqual(python_output, backend_output) @@ -95,6 +104,24 @@ def save_load(self): """ self.lowered_module = self.getExportImportCopy(self.lowered_module) + def test_execution(self): + """ + Stub for correctness tests. + """ + pass + + def test_save_load(self): + """ + Stub for serialization tests. + """ + pass + + def test_errors(self): + """ + Stub for testing error checking. + """ + pass + class BasicModuleTest(JitBackendTestCase): """ @@ -116,9 +143,9 @@ def test_execution(self): input = torch.randn(5) # Test all three module methods. - self.check_function("accum", input) - self.check_function("sub_accum", input) - self.check_function("forward", input) + self.check_function("accum", (input, input)) + self.check_function("sub_accum", (input, input)) + self.check_function("forward", (input, input)) @skipIfRocm def test_save_load(self): @@ -166,8 +193,12 @@ def setUp(self): self.module = NestedModuleTest.NestedModule(BasicModule()) # Both modules in self.scripted_module are ScriptModules. self.scripted_module = torch.jit.script(NestedModuleTest.NestedModule(BasicModule())) + + # First, script another instance of NestedModule with share_types=False so that it can be + # selectively lowered without modifying the type of self.scripted_module. lowered_module = to_test_backend_multi( - self.scripted_module, {"forward": {"": ""}} + torch.jit.script(BasicModule()), + {"accum": {"": ""}, "sub_accum": {"": ""}, "forward": {"": ""}}, ) # self.lowered_module is a ScriptModule, but its submodule is a lowered module. self.lowered_module = torch.jit.script(NestedModuleTest.NestedModule(lowered_module)) @@ -177,7 +208,7 @@ def test_execution(self): input = torch.randn(5) # Test forward. - self.check_function("forward", input) + self.check_function("forward", (input, input)) def test_save_load(self): # Lowered module should produce the same outputs. @@ -190,6 +221,161 @@ def test_save_load(self): self.test_execution() +class SelectiveLoweringTest(JitBackendTestCase): + """ + Tests for the selective lowering API. + """ + class OuterModule(torch.nn.Module): + def __init__(self, sub1, sub2, other): + super().__init__() + self.sub1 = sub1 + self.sub2 = sub2 + self.other = other + + def forward(self, x, y): + # Call the module that will be lowered directly to test + # type remapping in modules that are not its parent. + a, b = self.sub1.submodule.forward(x, y) + c, d = self.sub2.forward(x, y) + e, f = self.other.forward(x, y) + return a + c + e, b + d + f + + class MiddleModule(torch.nn.Module): + def __init__(self, submodule): + super().__init__() + self.submodule = submodule + + def forward(self, x, y): + return self.submodule.forward(x, y) + + def setUp(self): + super().setUp() + OuterModule = SelectiveLoweringTest.OuterModule + MiddleModule = SelectiveLoweringTest.MiddleModule + + def script_without_type_sharing(mod): + return torch.jit._recursive.create_script_module(mod, torch.jit._recursive.infer_methods_to_compile, share_types=False) + # Create Python, JIT and backend versions of a hierarchy that looks like this: + # --------- OuterModule -------- + # | | | + # MiddleModule MiddleModule MiddleModule + # | | | + # BasicModule BasicModule BasicModule + # + # Two BasicModules will be lowered and the third will not. 
+ self.module = OuterModule(MiddleModule(BasicModule()), MiddleModule(BasicModule()), MiddleModule(BasicModule())) + self.scripted_module = script_without_type_sharing(OuterModule(MiddleModule( + BasicModule()), MiddleModule(BasicModule()), MiddleModule(BasicModule()))) + self.lowered_module = script_without_type_sharing(OuterModule(MiddleModule( + BasicModule()), MiddleModule(BasicModule()), MiddleModule(BasicModule()))) + self.lowered_module = to_test_backend_selective(self.lowered_module, {"forward": ""}, [ + "sub1.submodule", "sub2.submodule"]) + + def test_execution(self): + input = torch.randn(5) + self.check_function("forward", (input, input)) + + self.test_selective_lowering_type_remap() + + def test_save_load(self): + self.test_execution() + self.save_load() + self.test_execution() + + self.test_selective_lowering_type_remap() + + def test_selective_lowering_type_remap(self): + """ + Check that type remapping and replacement occurred during selective lowering. + """ + # Check that self.lowered_module was not lowered, but that it does contain test_backendLoweredModule due to it + # calling the lowered module directly. + FileCheck() \ + .check("OuterModule") \ + .check("BasicModule") \ + .run(self.scripted_module.graph) + FileCheck() \ + .check("OuterModule") \ + .check_not("__torch__.torch.classes.__backends__.test_backend") \ + .check("test_backendLoweredModule") \ + .run(self.lowered_module.graph) + + # Check that self.lowered_module.sub1/sub2 were not lowered but that BasicModule has been replaced in their graphs. + FileCheck() \ + .check("MiddleModule") \ + .check("BasicModule") \ + .check_not("test_backendLoweredModule") \ + .run(self.scripted_module.sub1.graph) + FileCheck() \ + .check("MiddleModule") \ + .check_not("__torch__.torch.classes.__backends__.test_backend") \ + .check("test_backendLoweredModule") \ + .check_not("BasicModule") \ + .run(self.lowered_module.sub1.graph) + + FileCheck() \ + .check("MiddleModule") \ + .check("BasicModule") \ + .check_not("test_backendLoweredModule") \ + .run(self.scripted_module.sub2.graph) + FileCheck() \ + .check("MiddleModule") \ + .check_not("__torch__.torch.classes.__backends__.test_backend") \ + .check("test_backendLoweredModule") \ + .check_not("BasicModule") \ + .run(self.lowered_module.sub2.graph) + + # Check that self.lowered_module.sub1/sub2.submodule were lowered. Its graph should mention + # __torch__.torch.classes.__backends__.test_backend, the TorchBind class for executing functions + # on the test JIT backend. + FileCheck() \ + .check("test_backendLoweredModule") \ + .check_not("BasicModule") \ + .check("__torch__.torch.classes.__backends__.test_backend") \ + .run(self.lowered_module.sub1.submodule.graph) + + FileCheck() \ + .check("test_backendLoweredModule") \ + .check_not("BasicModule") \ + .check("__torch__.torch.classes.__backends__.test_backend") \ + .run(self.lowered_module.sub2.submodule.graph) + + # Check that self.other and self.other.submodule have been left untouched by the selective lowering process. + FileCheck() \ + .check("MiddleModule") \ + .check("BasicModule") \ + .check_not("__torch__.torch.classes.__backends__.test_backend") \ + .check_not("test_backendLoweredModule") \ + .run(self.scripted_module.other.graph) + FileCheck() \ + .check("BasicModule") \ + .check_not("__torch__.torch.classes.__backends__.test_backend") \ + .check_not("test_backendLoweredModule") \ + .run(self.scripted_module.other.submodule.graph) + + def test_errors(self): + """ + Check errors associated with selective lowering. 
+ """ + # Check error messages thrown when attempting to lower something that is not a ScriptModule. + with self.assertRaisesRegex(RuntimeError, r"Object .* is not a ScriptModule"): + to_test_backend_selective(torch.nn.ReLU(), {"forward": ""}, ["submodule"]) + + MiddleModule = SelectiveLoweringTest.MiddleModule + mod = MiddleModule(BasicModule()) + mod.new_attr = 3 + + with self.assertRaisesRegex(RuntimeError, r"Attribute named new_attr is not a Module"): + to_test_backend_selective(torch.jit.script(mod), {"forward": ""}, ["new_attr"]) + + # Check error message thrown when module hierarchy doesn't have unique types. + OuterModule = SelectiveLoweringTest.OuterModule + mod = OuterModule(MiddleModule(BasicModule()), MiddleModule(BasicModule()), MiddleModule(BasicModule())) + + with self.assertRaisesRegex(RuntimeError, r"Selective lowering is only supported for module hierarchies with unique types"): + to_test_backend_selective(torch.jit.script(mod), {"forward": ""}, ["sub1.submodule"]) + + class TestBackends(JitTestCase): """ This class wraps and invokes all subclasses of JitBackendTestCase so that each one @@ -200,19 +386,27 @@ def __init__(self, name): super().__init__(name) self.basic_module_test = BasicModuleTest(name) self.nested_module_test = NestedModuleTest(name) + self.selective_lowering_test = SelectiveLoweringTest(name) def setUp(self): super().setUp() if not TEST_WITH_ROCM: self.basic_module_test.setUp() self.nested_module_test.setUp() + self.selective_lowering_test.setUp() @skipIfRocm def test_execution(self): self.basic_module_test.test_execution() self.nested_module_test.test_execution() + self.selective_lowering_test.test_execution() @skipIfRocm def test_save_load(self): self.basic_module_test.test_save_load() self.nested_module_test.test_save_load() + self.selective_lowering_test.test_save_load() + + @skipIfRocm + def test_errors(self): + self.selective_lowering_test.test_errors() diff --git a/torch/csrc/jit/backends/backend_init.cpp b/torch/csrc/jit/backends/backend_init.cpp index d23a518d1ca3..596c19e6ba1b 100644 --- a/torch/csrc/jit/backends/backend_init.cpp +++ b/torch/csrc/jit/backends/backend_init.cpp @@ -2,11 +2,121 @@ #include #include #include +#include #include namespace torch { namespace jit { +// Get all types that are shared in the module hierarchy rooted at \p mod. +std::unordered_set getSharedModuleTypes(Module& mod) { + // Maintain a set of all TypePtrs. + std::unordered_set types; + // Maintain another set of TypePtrs that have been encountered more than once. + std::unordered_set duplicate_types; + + // Iterate over all modules in the hierarchy, including the root. + for (auto module : mod.modules()) { + auto module_type = module.type(); + if (types.count(module_type) > 0) { + duplicate_types.insert(module_type); + } + + types.insert(module_type); + } + + return duplicate_types; +} + +// Selectively lower \p mod to a backend. \p to_backend +// is called to lower modules. \p modules_to_lower contains +// qualified names of submodules of \p mod that should be lowered. +void toBackendSelectiveImpl( + Module& mod, + const py::function& to_backend, + const std::vector& modules_to_lower, + const std::unordered_set& duplicate_types) { + // This map will be used later to remap types in ancestor module graphs for + // all lowered submodules. + std::unordered_map type_remap; + + // For each module that should be lowered: + for (const auto& module_to_lower : modules_to_lower) { + // Use QualifiedName to parse the qualified module names. 
+ c10::QualifiedName qual_module_name(module_to_lower); + auto& atoms = qual_module_name.atoms(); + + // Search through the module hierarchy using the atoms of + // qual_module_name until current points to the module to + // be lowered and parent points to its parent. + Module current = mod; + Module parent; + + for (size_t i = 0, e = atoms.size(); i < e; ++i) { + IValue submodule = current.attr(atoms[i]); + if (submodule.isModule()) { + if (i == e - 1) { + parent = current; + } + current = submodule.toModule(); + } else { + std::stringstream err; + err << "Attribute named " << atoms[i] << " is not a Module"; + throw std::runtime_error(err.str()); + } + } + + // Check that the parent type is not shared and therefore can be edited. + if (duplicate_types.count(parent.type()) > 0) { + throw py::cast_error(c10::str( + "Selective lowering is only supported for module hierarchies with unique types for selected modules; ", + parent.type()->repr_str(), + " is shared")); + } + + // Call to_backend on the module that needs to be lowered. It needs to be + // wrapped before doing so because _to_jit_backend accepts wrapped modules. + // The result needs to be unwrapped in order to access its type below. + auto lowered_submodule = + py::cast(to_backend(py::module::import("torch.jit._recursive") + .attr("wrap_cpp_module")(current)) + .attr("_c")); + + // Adjust the parent's type so that the type of the submodule matches + // the type of lowered_submodule. + auto parent_type = parent.type(); + + parent_type->unsafeChangeAttributeType( + atoms.back(), lowered_submodule.type()); + parent.setattr(atoms.back(), lowered_submodule._ivalue()); + + // Record the type mapping from old type -> lowered type. + type_remap[current.type()] = lowered_submodule.type(); + } + + // Having lowered all of the modules that needed to be lowered, remap types in + // all graphs in the hierarchy so that the graphs all use the new lowered + // type. + auto type_remap_fn = [&type_remap](TypePtr in) { + auto it = type_remap.find(in); + if (it == type_remap.end()) + return in; + return it->second; + }; + + // modules() iterates over all modules in the hierarchy including the root. + for (auto module : mod.modules()) { + auto module_type = module.type(); + for (auto& fn : module_type->methods()) { + auto method = module.get_method(fn->name()); + auto graph = method.graph(); + graph->remapTypes(type_remap_fn); + auto new_schema = fn->getSchema().cloneWithRemappedTypes(type_remap_fn); + fn->setSchema(new_schema); + } + } +} + void initJitBackendBindings(PyObject* module) { // Bind a function for lowering to each JIT backend. The name of the backend // must be the first argument. For example, to lower a Module to @@ -244,6 +354,32 @@ void initJitBackendBindings(PyObject* module) { py::cast(orig_module.attr("_c")), method_compile_spec)); }); + + m.def( + "_jit_to_backend_selective", + [=](py::handle orig_module, + const py::function& to_backend, + const std::vector& modules_to_lower) { + if (auto original_module = + as_module(py::cast(orig_module))) { + // Clone the Module to avoid editing types that are shared with + // Modules in other instances outside this hierarchy. + Module& mod = original_module.value(); + auto cloned_mod = mod.clone(); + // Get all shared module types. Type sharing is only a problem if the + // parent modules of the ones to lower are in this set. 
+ auto shared_types = getSharedModuleTypes(cloned_mod); + toBackendSelectiveImpl( + cloned_mod, to_backend, modules_to_lower, shared_types); + // Wrap the result in a RecursiveScriptModule because that's what + // the caller passed in. + return py::module::import("torch.jit._recursive") + .attr("wrap_cpp_module")(cloned_mod); + } + + throw py::cast_error(c10::str( + "Object ", py::str(orig_module), " is not a ScriptModule")); + }); } } // namespace jit } // namespace torch From 6c34aa720c43d2959a1154a076c5ffdc0da07fb6 Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Fri, 30 Oct 2020 01:57:56 -0700 Subject: [PATCH 145/169] add add_node function for partition to fix partition mem size calculation (#47083) Summary: Placeholders and constants in the partition are counted twice when combining two partitions. This PR fixes it by adding add_node function into Partition class. A unit test is also updated to test if the partition size is correct Pull Request resolved: https://github.com/pytorch/pytorch/pull/47083 Reviewed By: gcatron Differential Revision: D24634368 Pulled By: scottxu0730 fbshipit-source-id: ab408f29da4fbf729fd9741dcb3bdb3076dc30c4 --- test/test_fx_experimental.py | 6 ++++-- torch/fx/experimental/Partitioner.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 434c678c53dc..d5d3daef722b 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -97,8 +97,10 @@ def forward(self, a): module_with_submodules = ret.module_with_submodules dag = ret.dag self.assertEqual(traced(a), module_with_submodules(a)) - for i, node in enumerate(dag.nodes): - assert node.logical_device_ids == [i] + assert dag.nodes[0].logical_device_ids == [0] + assert dag.nodes[0].size_bytes == 80 + assert dag.nodes[1].logical_device_ids == [1] + assert dag.nodes[1].size_bytes == 144 def test_sparse_nn_partition(self): class MyRecommendationModule(torch.nn.Module): diff --git a/torch/fx/experimental/Partitioner.py b/torch/fx/experimental/Partitioner.py index 8ad6ea34c828..9c1bdaaa1335 100644 --- a/torch/fx/experimental/Partitioner.py +++ b/torch/fx/experimental/Partitioner.py @@ -64,6 +64,17 @@ def recalculate_mem_size(self): for node in self.nodes: self.used_mem_bytes += get_extra_size_of(node, self.nodes) + def add_node(self, node): + input_nodes: Dict[Node, None] = {} + map_arg(node.args, lambda n: input_nodes.setdefault(n)) + map_arg(node.kwargs, lambda n: input_nodes.setdefault(n)) + # Add current node's input nodes if they are placeholder or constants + for n in input_nodes: + if n.op in {'placeholder', 'get_attr'}: + self.nodes.add(n) + self.nodes.add(node) + + class PartitionResult(NamedTuple): """NameTuple used for returning DAG and a new graph module """ @@ -224,7 +235,7 @@ def create_single_node_partition(node): """ partition = self.create_partition() total_size_needed = get_extra_size_of(node, set()) - partition.nodes.add(node) + partition.add_node(node) partition.used_mem_bytes = total_size_needed single_node_partitions.append(partition) @@ -268,7 +279,7 @@ def create_single_node_partition(node): total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) partition_to_left_mem_bytes[partition] = device.available_mem_bytes partition.logical_device_ids.append(device.logical_id) - partition.nodes.add(node) + partition.add_node(node) partition_to_left_mem_bytes[partition] -= total_size_of_input_nodes partition.used_mem_bytes += total_size_of_input_nodes # No device left, create 
single node partitions @@ -551,7 +562,7 @@ def is_embedding_node(node: Node) -> bool: total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) if total_size_of_input_nodes > available_mem_bytes: raise RuntimeError(node.target + 'is too large to fit into a device') - partition.nodes.add(node) + partition.add_node(node) partition.used_mem_bytes += total_size_of_input_nodes reset_partition_in_sparse_nn(partition, new_partition=False) # Set parents and children for each partition From eec201c1388434a8d1efbfaf3d6ab091247b9aa9 Mon Sep 17 00:00:00 2001 From: Bugra Akyildiz Date: Fri, 30 Oct 2020 02:34:03 -0700 Subject: [PATCH 146/169] Add last_n_window_collector Summary: Add `last_n_window_collector` as C2 supports and PyTorch currently does not have this operator: https://www.internalfb.com/intern/diffusion/FBS/browsefile/master/fbcode/caffe2/caffe2/operators/last_n_window_collector.cc?lines=139 ## Problem that we are solving This operator works on multiple pieces of data and collects last `n` element that has been seen. If you have the following pieces of data that has been passed around: ``` [1, 2, 3, 4] [5, 6, 7] [8, 9, 10, 11] ``` for 3 times and the number of collector is given to be 6. The expected result is: ``` [6, 7, 8, 9, 10, 11] ``` What this means is that, almost like we need a FIFO(First in First Out) mechanism where as we are passing this data through the collector, we will be pushing some other data at the end. In this particular example, in the first pass(the data is `[1, 2, 3, 4]`) , we hold `[1, 2, 3, 4]` in the queue as our queue size is 6. In the second pass(the data is `[5, 6, 7]`), we hold `[2, 3, 4, 5, 6, 7]` in the queue and since 1 is inserted the last, it will drop due to the size limitation of the queue. In the third pass(the data is `[8, 9, 10, 11]`), we hold `[6, 7, 8, 9, 10, 11]` in the queue and `2,3,4,5` are dropped due the the size of the queue. For multidimension case, when we have the following data: ``` [[1, 2], [2, 3], [3, 4], [4, 5]] [[5, 6], [6, 7], [7, 8]] [[8, 9], [9, 10], [10, 11], [11, 12]] ``` and our queue size is 6. In the first pass, we will have ` [[1, 2], [2, 3], [3, 4], [4, 5]]` In the second pass, we will have `[2, 3], [3, 4], [4, 5]] [[5, 6], [6, 7], [7, 8]]` In the third pass, we will have `[6, 7], [7, 8]] [[8, 9], [9, 10], [10, 11], [11, 12]]` ### The implementation I am using FIFO queue in Python which is in the collections library. This accepts `maxlen` argument which can be used to set the size of the queue. I am using last n indices of the tensor through list indices and in this operator, I am not doing copy. In the test plan, I have both single dimension tensors as well as multi-dimension tensors. ### Benchmark I used various different configurations and added a benchmark test. 
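Before the numbers, here is a minimal sketch of the deque-based windowing described under "The implementation" above. It only illustrates the FIFO idea: the `LastNWindowCollector` class name, its `collect` method, and the use of `torch.stack` (which copies the retained rows) are assumptions made for this sketch and do not correspond to the actual `torch/fb/sparsenn` operator or its zero-copy indexing.

```python
# Hypothetical standalone sketch of the last-n-window idea; not the
# actual sparsenn operator added in this diff.
from collections import deque

import torch


class LastNWindowCollector:
    """Keep only the last `num_to_collect` rows seen across calls."""

    def __init__(self, num_to_collect: int):
        # deque(maxlen=n) drops the oldest entries once the window is
        # full, giving the FIFO behaviour described in the summary.
        self._window = deque(maxlen=num_to_collect)

    def collect(self, data: torch.Tensor) -> torch.Tensor:
        # Rows for 2-D input, 0-dim scalars for 1-D input.
        for row in data:
            self._window.append(row)
        return torch.stack(list(self._window))


collector = LastNWindowCollector(6)
collector.collect(torch.tensor([1, 2, 3, 4]))
collector.collect(torch.tensor([5, 6, 7]))
print(collector.collect(torch.tensor([8, 9, 10, 11])))
# tensor([ 6,  7,  8,  9, 10, 11])
```

With a window size of 6 this reproduces the expected result from the example above, and the same loop covers the multi-dimensional case because the deque stores whole rows. The benchmark numbers follow below.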
PyTorch implementation is much master than Caffe2 implementation: #### CPU Benchmark ``` torch_response.median 0.00019254473969340324 caffe_response.median 0.00030233583599794657 ``` #### GPU Benchmark ``` torch_response.mean 0.000081007429903838786 caffe_response.mean 0.00010279081099724863 ``` Test Plan: ### For CPU: ``` buck test //caffe2/torch/fb/sparsenn:test ``` ### For GPU: - Used an on-demand machine and did the following commands: ``` jf get D24435544 buck test mode/opt //caffe2/torch/fb/sparsenn:test ``` https://www.internalfb.com/intern/testinfra/testconsole/testrun/4222124688138052/ Reviewed By: dzhulgakov, radkris-git Differential Revision: D24435544 fbshipit-source-id: 8193b4746b20f2a4920fd4d41271341045cdcee1 --- caffe2/python/layers/last_n_window_collector.py | 4 ---- caffe2/python/workspace.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py index a16b731a2f78..5e6874b4cca0 100644 --- a/caffe2/python/layers/last_n_window_collector.py +++ b/caffe2/python/layers/last_n_window_collector.py @@ -1,10 +1,6 @@ ## @package last_n_window_collector # Module caffe2.python.layers.last_n_window_collector - - - - from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 99983e84f097..0aa46ee2d4b3 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -335,7 +335,7 @@ def StringifyNetName(name): def GetNetName(net): if isinstance(net, basestring): return net - if type(net).__name__ == "Net": + if type(net).__name__ == "Net" or type(net).__name__ == "NetWithShapeInference": return net.Name() if isinstance(net, caffe2_pb2.NetDef): return net.name From 67b7e751e6b5931a9f45274653f4f653a4e6cdf6 Mon Sep 17 00:00:00 2001 From: lixinyu Date: Fri, 30 Oct 2020 07:52:43 -0700 Subject: [PATCH 147/169] add warning if DataLoader is going to create excessive number of thread (#46867) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46867 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24545540 Pulled By: glaringlee fbshipit-source-id: a3bef0d417e535b8ec0bb33f39cfa2308aadfff0 --- test/test_dataloader.py | 8 +++- torch/utils/data/dataloader.py | 83 +++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 67a9c8477e8b..0d6ee2e03bd6 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -1564,7 +1564,7 @@ def test_proper_exit(self): # In all cases, all processes should end properly. 
if use_workers: exit_methods = [None, 'loader_error', 'loader_kill', 'worker_error', 'worker_kill'] - persistent_workers = self.persistent_workers + persistent_workers = self.persistent_workers else: exit_methods = [None, 'loader_error', 'loader_kill'] persistent_workers = False @@ -1840,6 +1840,12 @@ def test_default_collate_shared_tensor(self): finally: _utils.worker._worker_info = old + def test_excessive_thread_creation_warning(self): + with self.assertWarnsRegex( + UserWarning, + r"excessive worker creation might get DataLoader running slow or even freeze"): + dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000) + class StringDataset(Dataset): def __init__(self): diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 59c1827e1842..8d7726ebd129 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -5,6 +5,7 @@ in `./_utils/worker.py`. """ +import os import threading import itertools import warnings @@ -290,10 +291,13 @@ def __init__(self, dataset: Dataset[T_co], batch_size: Optional[int] = 1, self._iterator = None + self.check_worker_number_rationality() + def _get_iterator(self) -> '_BaseDataLoaderIter': if self.num_workers == 0: return _SingleProcessDataLoaderIter(self) else: + self.check_worker_number_rationality() return _MultiProcessingDataLoaderIter(self) @property @@ -399,6 +403,83 @@ def __len__(self) -> int: else: return len(self._index_sampler) + def check_worker_number_rationality(self): + # This function check whether the dataloader's worker number is rational based on + # current system's resource. Current rule is that if the number of workers this + # Dataloader will create is bigger than the number of logical cpus that is allowed to + # use, than we will pop up a warning to let user pay attention. + # + # eg. If current system has 2 physical CPUs with 16 cores each. And each core support 2 + # threads, then the total logical cpus here is 2 * 16 * 2 = 64. Let's say current + # DataLoader process can use half of them which is 32, then the rational max number of + # worker that initiated from this process is 32. + # Now, let's say the created DataLoader has num_works = 40, which is bigger than 32. + # So the warning message is triggered to notify the user to lower the worker number if + # necessary. + # + # + # [Note] Please note that this function repects `cpuset` only when os.sched_getaffinity is + # available (available in most of Linux system, but not OSX and Windows). + # When os.sched_getaffinity is not available, os.cpu_count() is called instead, but + # it doesn't repect cpuset. + # We don't take threading into account since each worker process is single threaded + # at this time. + # + # We don't set any threading flags (eg. OMP_NUM_THREADS, MKL_NUM_THREADS, etc) + # other than `torch.set_num_threads` to 1 in the worker process, if the passing + # in functions use 3rd party modules that rely on those threading flags to determine + # how many thread to create (eg. numpy, etc), then it is caller's responsibility to + # set those flags correctly. 
+ def _create_warning_msg(num_worker_suggest, num_worker_created, cpuset_checked): + + suggested_max_worker_msg = (( + "Our suggested max number of worker in current system is {}{}, which is smaller " + "than what this DataLoader is going to create.").format( + num_worker_suggest, + ("" if cpuset_checked else " (`cpuset` is not taken into account)")) + ) if num_worker_suggest is not None else ( + "DataLoader is not able to compute a suggested max number of worker in current system.") + + warn_msg = ( + "This DataLoader will create {} worker processes in total. {} " + "Please be aware that excessive worker creation might get DataLoader running slow or even freeze, " + "lower the worker number to avoid potential slowness/freeze if necessary.").format( + num_worker_created, + suggested_max_worker_msg) + return warn_msg + + if not self.num_workers or self.num_workers == 0: + return + + # try to compute a suggested max number of worker based on system's resource + max_num_worker_suggest = None + cpuset_checked = False + if hasattr(os, 'sched_getaffinity'): + try: + max_num_worker_suggest = len(os.sched_getaffinity(0)) + cpuset_checked = True + except Exception: + pass + if max_num_worker_suggest is None: + # os.cpu_count() could return Optional[int] + # get cpu count first and check None in order to satify mypy check + cpu_count = os.cpu_count() + if cpu_count is not None: + max_num_worker_suggest = cpu_count + + if max_num_worker_suggest is None: + warnings.warn(_create_warning_msg( + max_num_worker_suggest, + self.num_workers, + cpuset_checked)) + return + + if self.num_workers > max_num_worker_suggest: + warnings.warn(_create_warning_msg( + max_num_worker_suggest, + self.num_workers, + cpuset_checked)) + class _BaseDataLoaderIter(object): def __init__(self, loader: DataLoader) -> None: @@ -843,7 +924,7 @@ def _reset(self, loader, first_iter=False): # contains all `True`s if not using an iterable-style dataset # (i.e., if kind != Iterable). # Not that this indicates that a worker still has work to do *for this epoch*. - # It does not mean that a worker is dead. In case of `_persistent_workers`, + # It does not mean that a worker is dead. In case of `_persistent_workers`, # the worker will be reset to available in the next epoch. self._workers_status = [True for i in range(self._num_workers)] # We resume the prefetching in case it was enabled From 7df0224cba8b0f6aa31a6893f87df6f12a0ec3aa Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Fri, 30 Oct 2020 08:14:05 -0700 Subject: [PATCH 148/169] Automated submodule update: FBGEMM (#47071) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/39d5addbff3c942e3c2c97b30c46eed167737b31 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47071 Test Plan: Ensure that CI jobs succeed on GitHub before landing. 
Reviewed By: albanD Differential Revision: D24628407 fbshipit-source-id: 9b636f66b92e5853cafd521e704996ebc2faa954 --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 23cb1db72b03..39d5addbff3c 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 23cb1db72b03e29984eefe58c5c99d733a85435d +Subproject commit 39d5addbff3c942e3c2c97b30c46eed167737b31 From 2643800881a04f31d8f548a9abbdf1772d899a6a Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Fri, 30 Oct 2020 09:32:09 -0700 Subject: [PATCH 149/169] Fix max_pool2d with ceil_mode bug (#46558) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46558 This PR fixes a bug with how pooling output shape was computed. ## BC Breaking Notes Previously, a bug in the pooling code allowed a sliding window to be entirely off bounds. Now, sliding windows must start inside the input or left padding (not right padding, see https://github.com/pytorch/pytorch/issues/46929) and may only go off-bounds if ceil_mode=True. fixes #45357 TODO - [x] Ensure existing tests are checking for the correct output size Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D24633372 Pulled By: heitorschueroff fbshipit-source-id: 55925243a53df5d6131a1983076f11cab7516d6b --- aten/src/ATen/native/Pool.h | 5 +++-- test/quantization/test_quantized_op.py | 4 ++-- test/test_nn.py | 21 +++++++++++++++++++++ torch/nn/modules/pooling.py | 24 ++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index b89554fd4d48..071460b090cd 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -28,11 +28,12 @@ static inline T pooling_output_shape_pad_lr( T outputSize = div_rtn( inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 + (ceil_mode ? 
stride - 1 : 0), stride) + 1; - if (pad_l) { + if (ceil_mode) { // ensure that the last pooling starts inside the image // needed to avoid problems in ceil mode - if ((outputSize - 1) * stride >= inputSize + pad_l) + if ((outputSize - 1) * stride >= inputSize + pad_l) { --outputSize; + } } return outputSize; } diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 24649296b03a..96568662c052 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -91,9 +91,9 @@ def pool_output_shape(input_size, kernel_size, padding, stride, output_size = ( (input_size + 2 * padding - dilation * (kernel_size - 1) - 1 + (stride - 1 if ceiling_mode else 0)) // stride + 1) - if (padding > 0 and + if (ceiling_mode and ((output_size - 1) * stride >= input_size + padding)): - output_size += 1 + output_size -= 1 return output_size """ diff --git a/test/test_nn.py b/test/test_nn.py index 1c52371640a0..d14bb5aa9963 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10522,6 +10522,27 @@ def test(nonlinearity, *args, **kwargs): test('threshold', 3, 2) test('threshold', 3, 2, inplace=True) + def test_pooling_shape(self, device): + ''' Test the output shape calculation for pooling functions ''' + + # Checks output shape against expected for 1D, 2D and 3D + def check(expected_out_shape, sizes, *args, **kwargs): + for kernel in ['max', 'avg']: + for i in [1, 2, 3]: + if hasattr(torch.nn.functional, f'{kernel}_pool{i}d'): + op = getattr(torch.nn.functional, f'{kernel}_pool{i}d') + t = torch.randn(sizes[:i + 2], device=device) + self.assertEqual(op(t, *args, **kwargs).shape, expected_out_shape[:i + 2]) + + check((1, 1, 3, 3, 4), (1, 1, 5, 6, 7), kernel_size=1, stride=2, padding=0, ceil_mode=True) + check((1, 1, 2, 3, 3), (1, 1, 3, 4, 5), kernel_size=2, stride=2, padding=1, ceil_mode=False) + check((1, 1, 2, 3, 3), (1, 1, 3, 4, 5), kernel_size=2, stride=2, padding=1, ceil_mode=True) + + # Test case from issue https://github.com/pytorch/pytorch/issues/45357 + x = torch.randn(1, 1, 6, 7, device=device) + y = torch.nn.functional.max_pool2d(x, 1, stride=(2, 2), padding=0, ceil_mode=True) + self.assertEqual(y.size(), (1, 1, 3, 4)) + @onlyOnCPUAndCUDA # TODO: fix on XLA def test_adaptive_avg_pool2d_output_size_one(self, device): def helper(size, memory_format): diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 734912684d8f..7a43fcc2ea2d 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -45,6 +45,10 @@ class MaxPool1d(_MaxPoolNd): for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the sliding window. This `link`_ has a nice visualization of the pooling parameters. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + Args: kernel_size: The size of the sliding window, must be > 0. stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`. @@ -104,6 +108,10 @@ class MaxPool2d(_MaxPoolNd): for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. 
Sliding windows that would start in the right padded region are ignored. + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - a single ``int`` -- in which case the same value is used for the height and width dimension @@ -174,6 +182,10 @@ class MaxPool3d(_MaxPoolNd): for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - a single ``int`` -- in which case the same value is used for the depth, height and width dimension @@ -474,6 +486,10 @@ class AvgPool1d(_AvgPoolNd): If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be an ``int`` or a one-element tuple. @@ -537,6 +553,10 @@ class AvgPool2d(_AvgPoolNd): If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be: - a single ``int`` -- in which case the same value is used for the height and width dimension @@ -614,6 +634,10 @@ class AvgPool3d(_AvgPoolNd): If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides for :attr:`padding` number of points. + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. 
+ The parameters :attr:`kernel_size`, :attr:`stride` can either be: - a single ``int`` -- in which case the same value is used for the depth, height and width dimension From f05b66b70d6afc4a0b9a6dbda9552f8173ad6108 Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Fri, 30 Oct 2020 10:11:22 -0700 Subject: [PATCH 150/169] pass TypeMeta by value (#45026) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45026 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23802943 Pulled By: bhosmer fbshipit-source-id: 81b06ef00bf8eb4375c0e0ff2032e03bd1d1188a --- aten/src/ATen/OpaqueTensorImpl.h | 2 +- aten/src/ATen/SparseTensorImpl.cpp | 4 ++-- aten/src/ATen/SparseTensorImpl.h | 4 ++-- aten/src/ATen/core/blob.h | 4 ++-- aten/src/ATen/native/Resize.h | 2 +- .../native/vulkan/VulkanOpaqueTensorImpl.h | 2 +- aten/src/ATen/quantized/QTensorImpl.cpp | 2 +- aten/src/ATen/quantized/QTensorImpl.h | 2 +- c10/core/DefaultDtype.cpp | 4 ++-- c10/core/DefaultDtype.h | 4 ++-- c10/core/ScalarTypeToTypeMeta.h | 8 ++++++++ c10/core/TensorImpl.cpp | 6 +++--- c10/core/TensorImpl.h | 20 +++++++++---------- caffe2/contrib/opencl/context.h | 2 +- caffe2/core/context.h | 2 +- caffe2/core/context_base.h | 6 +++--- caffe2/core/context_gpu.h | 2 +- caffe2/core/operator.h | 6 +++--- caffe2/core/plan_executor.cc | 4 ++-- caffe2/core/tensor.h | 14 ++++++------- caffe2/core/types.cc | 4 ++-- caffe2/core/types.h | 4 ++-- caffe2/ideep/utils/ideep_context.h | 2 +- caffe2/operators/dataset_ops.cc | 16 +++++++-------- caffe2/operators/dataset_ops.h | 2 +- caffe2/operators/index_ops.h | 4 ++-- caffe2/operators/numpy_tile_op.h | 2 +- caffe2/operators/tile_op.cc | 2 +- caffe2/python/pybind_state.cc | 4 ++-- caffe2/python/pybind_state.h | 10 +++++----- caffe2/python/pybind_state_dlpack.cc | 4 ++-- caffe2/python/pybind_state_dlpack.h | 6 +++--- caffe2/python/pybind_state_ideep.cc | 10 +++++----- torch/csrc/DynamicTypes.cpp | 4 ++-- torch/csrc/DynamicTypes.h | 2 +- 35 files changed, 92 insertions(+), 84 deletions(-) diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index a4007c3115dc..b00f80d232db 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -21,7 +21,7 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { // public constructor for now... OpaqueTensorImpl( at::DispatchKeySet key_set, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, c10::Device device, OpaqueHandle opaque_handle, c10::IntArrayRef sizes) diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 3119c81ac8aa..45492d7b212e 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -30,12 +30,12 @@ namespace { // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. 
-SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta& data_type) +SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta data_type) : SparseTensorImpl(key_set, data_type , at::empty({1, 0}, at::initialTensorOptions().device(sparseTensorSetToDeviceType(key_set)).dtype(ScalarType::Long)) , at::empty({0}, at::initialTensorOptions().device(sparseTensorSetToDeviceType(key_set)).dtype(data_type))) {} -SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta& data_type, at::Tensor indices, at::Tensor values) +SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta data_type, at::Tensor indices, at::Tensor values) : TensorImpl(key_set, data_type, values.device()) , sparse_dim_(1) , dense_dim_(0) diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index bdccb540734f..b8e6bb26bf7f 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -31,7 +31,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta&); + explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); int64_t nnz() const { return values_.size(0); } int64_t sparse_dim() const { return sparse_dim_; } @@ -217,7 +217,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { refresh_numel(); } private: - explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta&, at::Tensor indices, at::Tensor values); + explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta, at::Tensor indices, at::Tensor values); /** * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset) diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 988e99b2395e..3b6bafa12e62 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -51,7 +51,7 @@ class CAFFE2_API Blob final : public c10::intrusive_ptr_target { /** * Returns the meta info of the blob. 
*/ - const TypeMeta& meta() const noexcept { + const TypeMeta meta() const noexcept { return meta_; } @@ -155,7 +155,7 @@ class CAFFE2_API Blob final : public c10::intrusive_ptr_target { TypeMeta::Make::type>())); } - void* ShareExternal(void* allocated, const TypeMeta& meta) { + void* ShareExternal(void* allocated, const TypeMeta meta) { free_(); meta_ = meta; pointer_ = allocated; diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 8fdc977092f4..be61ffb8b546 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -73,7 +73,7 @@ static inline void checkInBoundsForStorage( IntArrayRef size, IntArrayRef stride, int64_t storage_offset, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, const Storage& new_storage) { int64_t storage_size_bytes = detail::computeStorageNbytes(size, stride, data_type.itemsize()); diff --git a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h index 6afc28676f2b..fc4f9945fcaf 100644 --- a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h +++ b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h @@ -11,7 +11,7 @@ template struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl { VulkanOpaqueTensorImpl( at::DispatchKeySet key_set, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, c10::Device device, OpaqueHandle opaque_handle, c10::IntArrayRef sizes, diff --git a/aten/src/ATen/quantized/QTensorImpl.cpp b/aten/src/ATen/quantized/QTensorImpl.cpp index 40ecbf6d5720..1c79ac186c1a 100644 --- a/aten/src/ATen/quantized/QTensorImpl.cpp +++ b/aten/src/ATen/quantized/QTensorImpl.cpp @@ -5,7 +5,7 @@ namespace at { QTensorImpl::QTensorImpl( Storage&& storage, DispatchKeySet key_set, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, QuantizerPtr quantizer) : TensorImpl(std::move(storage), key_set, data_type), quantizer_(quantizer) {} diff --git a/aten/src/ATen/quantized/QTensorImpl.h b/aten/src/ATen/quantized/QTensorImpl.h index c2728c7aab46..efce432d5863 100644 --- a/aten/src/ATen/quantized/QTensorImpl.h +++ b/aten/src/ATen/quantized/QTensorImpl.h @@ -18,7 +18,7 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { QTensorImpl( Storage&& storage, DispatchKeySet key_set, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, QuantizerPtr quantizer); // TODO: Expose in PyTorch Frontend diff --git a/c10/core/DefaultDtype.cpp b/c10/core/DefaultDtype.cpp index b125c9a56541..00e4d3f32437 100644 --- a/c10/core/DefaultDtype.cpp +++ b/c10/core/DefaultDtype.cpp @@ -16,13 +16,13 @@ void set_default_dtype(caffe2::TypeMeta dtype) { } } -const caffe2::TypeMeta& get_default_dtype() { +const caffe2::TypeMeta get_default_dtype() { return default_dtype; } ScalarType get_default_dtype_as_scalartype() { return default_dtype_as_scalartype; } -const caffe2::TypeMeta& get_default_complex_dtype() { +const caffe2::TypeMeta get_default_complex_dtype() { return default_complex_dtype; } } // namespace c10 diff --git a/c10/core/DefaultDtype.h b/c10/core/DefaultDtype.h index eda34b217727..d0a17474bda4 100644 --- a/c10/core/DefaultDtype.h +++ b/c10/core/DefaultDtype.h @@ -9,7 +9,7 @@ class TypeMeta; namespace c10 { C10_API void set_default_dtype(caffe2::TypeMeta dtype); -C10_API const caffe2::TypeMeta& get_default_dtype(); +C10_API const caffe2::TypeMeta get_default_dtype(); C10_API ScalarType get_default_dtype_as_scalartype(); -C10_API const caffe2::TypeMeta& get_default_complex_dtype(); +C10_API const 
caffe2::TypeMeta get_default_complex_dtype(); } // namespace c10 diff --git a/c10/core/ScalarTypeToTypeMeta.h b/c10/core/ScalarTypeToTypeMeta.h index b0ac38309704..b6e7f6cf1993 100644 --- a/c10/core/ScalarTypeToTypeMeta.h +++ b/c10/core/ScalarTypeToTypeMeta.h @@ -44,4 +44,12 @@ static inline bool operator==(caffe2::TypeMeta m, ScalarType t) { return t == m; } +static inline bool operator!=(ScalarType t, caffe2::TypeMeta m) { + return !(t == m); +} + +static inline bool operator!=(caffe2::TypeMeta m, ScalarType t) { + return !(t == m); +} + } // namespace c10 diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 2c8a87f9c2d1..e68729bf05f6 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -47,13 +47,13 @@ const at::Tensor& TensorImpl::grad() const { TensorImpl::TensorImpl( Storage&& storage, DispatchKeySet key_set, - const caffe2::TypeMeta& data_type) + const caffe2::TypeMeta data_type) : TensorImpl(std::move(storage), key_set, data_type, storage.device()) {} -TensorImpl::TensorImpl(DispatchKeySet key_set, const caffe2::TypeMeta& data_type, c10::optional device_opt) +TensorImpl::TensorImpl(DispatchKeySet key_set, const caffe2::TypeMeta data_type, c10::optional device_opt) : TensorImpl({}, key_set, data_type, std::move(device_opt)) {} -TensorImpl::TensorImpl(Storage&& storage, DispatchKeySet key_set, const caffe2::TypeMeta& data_type, +TensorImpl::TensorImpl(Storage&& storage, DispatchKeySet key_set, const caffe2::TypeMeta data_type, c10::optional device_opt) : storage_(std::move(storage)), sizes_{0}, diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 057bdc8214a8..0145fd220941 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -322,24 +322,24 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl( Storage&& storage, DispatchKeySet, - const caffe2::TypeMeta& data_type); + const caffe2::TypeMeta data_type); /** * Construct a 1-dim 0 size tensor that doesn't have a storage. */ - TensorImpl(DispatchKeySet, const caffe2::TypeMeta& data_type, c10::optional device_opt); + TensorImpl(DispatchKeySet, const caffe2::TypeMeta data_type, c10::optional device_opt); // Legacy constructors so I don't have to go update call sites. // TODO: When Variable is added, delete these constructors TensorImpl( Storage&& storage, DispatchKey dispatch_key, - const caffe2::TypeMeta& data_type) + const caffe2::TypeMeta data_type) : TensorImpl( std::move(storage), DispatchKeySet(dispatch_key), data_type) {} - TensorImpl(DispatchKey dispatch_key, const caffe2::TypeMeta& data_type, c10::optional device_opt) + TensorImpl(DispatchKey dispatch_key, const caffe2::TypeMeta data_type, c10::optional device_opt) : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {} private: @@ -347,7 +347,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // storage. Still, we pass it in separately because it's easier to write // the initializer list if we're not worried about storage being moved out // from under us. - TensorImpl(Storage&& storage, DispatchKeySet, const caffe2::TypeMeta& data_type, c10::optional); + TensorImpl(Storage&& storage, DispatchKeySet, const caffe2::TypeMeta data_type, c10::optional); public: TensorImpl(const TensorImpl&) = delete; @@ -665,7 +665,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Returns the TypeMeta of a tensor, which describes what data type * it is (e.g., int, float, ...) 
*/ - const caffe2::TypeMeta& dtype() const { + const caffe2::TypeMeta dtype() const { return data_type_; } @@ -1235,10 +1235,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void ShareExternalPointer( DataPtr&& data_ptr, - const caffe2::TypeMeta& data_type, + const caffe2::TypeMeta data_type, size_t size_bytes) { TORCH_CHECK( - data_type.id() != caffe2::TypeIdentifier::uninitialized(), + data_type != ScalarType::Undefined, "To share with a raw external pointer you need to pass in an " "initialized data_type(TypeMeta)."); if (!size_bytes) { @@ -1275,7 +1275,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * If the existing data does not match the desired type, it will be deleted * and a new storage will be created. */ - inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + inline void* raw_mutable_data(const caffe2::TypeMeta meta) { // For 0-size tensors it's fine to return any pointer (including nullptr) if (data_type_ == meta && storage_initialized()) { return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); @@ -1369,7 +1369,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_storage_and_dtype( at::Storage storage, - const caffe2::TypeMeta& data_type) { + const caffe2::TypeMeta data_type) { set_storage_keep_dtype(storage); data_type_ = data_type; } diff --git a/caffe2/contrib/opencl/context.h b/caffe2/contrib/opencl/context.h index ce788a39a7cd..15bfda2203f0 100644 --- a/caffe2/contrib/opencl/context.h +++ b/caffe2/contrib/opencl/context.h @@ -59,7 +59,7 @@ class OpenCLContext final { template inline void - CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) { + CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) { CAFFE_ENFORCE(!meta.copy(), "OpenCLContext requires fundamental types."); CopyBytes(n * meta.itemsize(), src, dst); } diff --git a/caffe2/core/context.h b/caffe2/core/context.h index f3f4a9138ce1..b0e99ef1e59e 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -131,7 +131,7 @@ class CAFFE2_API CPUContext final : public BaseContext { template inline void - CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) { + CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) { if (meta.copy()) { meta.copy()(src, dst, n); } else { diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h index bad6872819de..036ac98fdc91 100644 --- a/caffe2/core/context_base.h +++ b/caffe2/core/context_base.h @@ -104,7 +104,7 @@ class CAFFE2_API BaseContext { } void CopyItemsSameDevice( - const caffe2::TypeMeta& meta, + const caffe2::TypeMeta meta, size_t n, const void* src, void* dst) { @@ -117,7 +117,7 @@ class CAFFE2_API BaseContext { } void CopyItemsFromCPU( - const caffe2::TypeMeta& meta, + const caffe2::TypeMeta meta, size_t n, const void* src, void* dst) { @@ -130,7 +130,7 @@ class CAFFE2_API BaseContext { } void CopyItemsToCPU( - const caffe2::TypeMeta& meta, + const caffe2::TypeMeta meta, size_t n, const void* src, void* dst) { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index c0930b1a0e61..7406132f8788 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -279,7 +279,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { template inline void - CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) { + CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) { CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental 
types."); CopyBytes(n * meta.itemsize(), src, dst); } diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index ea9ae7892a23..a4abc97f73e4 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -1246,7 +1246,7 @@ struct DispatchHelper, ExtraArgs...> { template \ struct DispatchHelper, ExtraArgs...> { \ template \ - static bool call(Op* op, const TypeMeta& meta) { \ + static bool call(Op* op, const TypeMeta meta) { \ static_assert( \ !std::is_same::value, \ "GenericTensorImplementation must be the last in TensorTypes list"); \ @@ -1269,7 +1269,7 @@ struct DispatchHelper, ExtraArgs...> { template \ struct DispatchHelper, ExtraArgs...> { \ template \ - static bool call(Op* /* unused */, const TypeMeta& meta) { \ + static bool call(Op* /* unused */, const TypeMeta meta) { \ CAFFE_THROW("Unsupported type of tensor: ", meta.name()); \ } \ template \ @@ -1287,7 +1287,7 @@ struct DispatchHelper, ExtraArgs...> { TensorTypes, \ ExtraArgs...> { \ template \ - static bool call(Op* op, const TypeMeta&) { \ + static bool call(Op* op, const TypeMeta) { \ return op->template DoRunWithOtherType(); \ } \ template \ diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 97c309c078e4..06e27ef5be54 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -133,8 +133,8 @@ std::function getContinuationTest( // if the blob doesn't exist or is not initialized, return false inline bool getShouldStop(const Blob* b) { if (!b || - b->meta().id() == - TypeIdentifier::uninitialized()) { // not exist or uninitialized + b->meta() == + ScalarType::Undefined) { // not exist or uninitialized return false; } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 27f8b471b71b..83df5306e177 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -299,14 +299,14 @@ class CAFFE2_API Tensor final { void ShareExternalPointer( void* src, - const TypeMeta& data_type, + const TypeMeta data_type, size_t nbytes = 0, MemoryDeleter d = nullptr) const { CAFFE_ENFORCE_WITH_CALLER( impl_->is_contiguous(), "Right now ShareExternalPointer is only supported for contiguous Tensor."); CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != caffe2::TypeIdentifier::uninitialized(), + data_type != ScalarType::Undefined, "To share with a raw external pointer you need to pass in an " "initialized data_type(TypeMeta)."); impl_.get()->ShareExternalPointer( @@ -315,7 +315,7 @@ class CAFFE2_API Tensor final { void ShareExternalPointer( at::DataPtr&& data_ptr, - const TypeMeta& data_type, + const TypeMeta data_type, size_t nbytes) { impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, nbytes); } @@ -342,7 +342,7 @@ class CAFFE2_API Tensor final { return impl_.get()->data(); } - inline void* raw_mutable_data(const TypeMeta& meta) const { + inline void* raw_mutable_data(const TypeMeta meta) const { return impl_.get()->raw_mutable_data(meta); } @@ -358,7 +358,7 @@ class CAFFE2_API Tensor final { inline void* raw_mutable_data() const { const auto& data_type = impl_->dtype(); CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != caffe2::TypeIdentifier::uninitialized(), + data_type != ScalarType::Undefined, "Calling raw_mutable_data() without meta, but the current meta is " "of unknown type."); return raw_mutable_data(data_type); @@ -469,7 +469,7 @@ class CAFFE2_API Tensor final { /** * Returns the TypeMeta object associated with the current data type. 
*/ - inline const TypeMeta& dtype() const { + inline const TypeMeta dtype() const { return impl_->dtype(); } @@ -477,7 +477,7 @@ class CAFFE2_API Tensor final { * (To be deprecated) Returns the TypeMeta object associated with the current * data type. */ - inline const TypeMeta& meta() const { + inline const TypeMeta meta() const { return impl_->dtype(); } diff --git a/caffe2/core/types.cc b/caffe2/core/types.cc index d1007fe76e86..c738fc50a288 100644 --- a/caffe2/core/types.cc +++ b/caffe2/core/types.cc @@ -8,7 +8,7 @@ namespace caffe2 { -TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta) { +TensorProto::DataType TypeMetaToDataType(const TypeMeta meta) { static_assert( sizeof(int) == 4, "int in this compiler does not equal to 4 bytes."); static std::map data_type_map{ @@ -36,7 +36,7 @@ TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta) { it == data_type_map.end() ? TensorProto_DataType_UNDEFINED : it->second); } -const TypeMeta& DataTypeToTypeMeta(const TensorProto::DataType& dt) { +const TypeMeta DataTypeToTypeMeta(const TensorProto::DataType& dt) { static std::map type_meta_map{ {TensorProto_DataType_FLOAT, TypeMeta::Make()}, {TensorProto_DataType_INT32, TypeMeta::Make()}, diff --git a/caffe2/core/types.h b/caffe2/core/types.h index c0e8d7bbfb3d..5dda5a5e0810 100644 --- a/caffe2/core/types.h +++ b/caffe2/core/types.h @@ -47,10 +47,10 @@ inline int32_t GetDimFromOrderString(const std::string& str) { inline constexpr char NameScopeSeparator() { return '/'; } // From TypeMeta to caffe2::DataType protobuffer enum. -CAFFE2_API TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta); +CAFFE2_API TensorProto::DataType TypeMetaToDataType(const TypeMeta meta); // From caffe2::DataType protobuffer enum to TypeMeta -CAFFE2_API const TypeMeta& DataTypeToTypeMeta(const TensorProto::DataType& dt); +CAFFE2_API const TypeMeta DataTypeToTypeMeta(const TensorProto::DataType& dt); } // namespace caffe2 diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 823b4bec16bd..d0f1207a08f6 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -91,7 +91,7 @@ class IDEEPContext final : public BaseContext { template inline void - CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) { + CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) { if (meta.copy()) { meta.copy()(src, dst, n); } else { diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index 95fcfc1ab923..c311ad23e4ed 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -407,7 +407,7 @@ class UnPackRecordsOp : public Operator { // Precomputer the output sizes to avoid resizing std::vector> outputDims(numTensors); - std::vector metas(numTensors); + std::vector metas(numTensors); CAFFE_ENFORCE( numRows > 0 || InputSize() > 1, @@ -428,7 +428,7 @@ class UnPackRecordsOp : public Operator { // Checks to ensure that dimensions/sizes match CAFFE_ENFORCE_EQ(outputDims[j].size(), input.dim()); - CAFFE_ENFORCE(*metas[j] == input.dtype()); + CAFFE_ENFORCE(metas[j] == input.dtype()); // We look from first dimension, because we concat on the first. 
for (int k = 1; k < input.dim(); ++k) { CAFFE_ENFORCE_EQ(input.sizes()[k], outputDims[j][k]); @@ -442,7 +442,7 @@ class UnPackRecordsOp : public Operator { std::vector destinations(numTensors); for (int i = 0; i < numTensors; ++i) { Output(i)->Resize(outputDims[i]); - destinations[i] = Output(i)->raw_mutable_data(*metas[i]); + destinations[i] = Output(i)->raw_mutable_data(metas[i]); } for (int i = 0; i < numRows; ++i) { @@ -450,7 +450,7 @@ class UnPackRecordsOp : public Operator { const auto& input = tensors[i][j]; context_.CopyItemsSameDevice( - *metas[j], + metas[j], input.numel(), input.raw_data() /* src */, destinations[j] /* dst */ @@ -468,7 +468,7 @@ class UnPackRecordsOp : public Operator { void getShapeAndMetaFromInput( const Shared2DTensorVectorPtr& inputs, std::vector>& outputDims, - std::vector& metas) { + std::vector& metas) { const auto& inputZero = inputs->at(0); const auto numTensors = inputZero.size(); @@ -479,13 +479,13 @@ class UnPackRecordsOp : public Operator { for (int i = 0; i < numTensors; ++i) { outputDims[i] = inputZero[i].sizes().vec(); outputDims[i][0] = 0; - metas[i] = &inputZero[i].dtype(); + metas[i] = inputZero[i].dtype(); } } void getShapeAndMetaFromPrototypeBlobs( std::vector>& outputDims, - std::vector& metas) { + std::vector& metas) { const auto numTensors = fields_.size(); CAFFE_ENFORCE_EQ(numTensors, InputSize() - 1); CAFFE_ENFORCE_EQ(numTensors, OutputSize()); @@ -493,7 +493,7 @@ class UnPackRecordsOp : public Operator { const auto& input = Input(i + 1); outputDims[i] = input.sizes().vec(); outputDims[i][0] = 0; - metas[i] = &input.dtype(); + metas[i] = input.dtype(); } } diff --git a/caffe2/operators/dataset_ops.h b/caffe2/operators/dataset_ops.h index 70a294e14136..fc890014dbb2 100644 --- a/caffe2/operators/dataset_ops.h +++ b/caffe2/operators/dataset_ops.h @@ -146,7 +146,7 @@ class TreeWalker { return size; } - inline const TypeMeta& meta() const { + inline const TypeMeta meta() const { return walker_.input(fieldId_).dtype(); } diff --git a/caffe2/operators/index_ops.h b/caffe2/operators/index_ops.h index 890753caf2fe..2f5705cb4c26 100644 --- a/caffe2/operators/index_ops.h +++ b/caffe2/operators/index_ops.h @@ -18,7 +18,7 @@ using int64_tValue = int64_t; struct IndexBase { public: - IndexBase(int64_tValue maxElements, const TypeMeta& type) + IndexBase(int64_tValue maxElements, const TypeMeta type) : maxElements_{maxElements}, meta_(type), frozen_{false} {} void Freeze() { @@ -35,7 +35,7 @@ struct IndexBase { virtual ~IndexBase() {} - const TypeMeta& Type() const { + const TypeMeta Type() const { return meta_; } diff --git a/caffe2/operators/numpy_tile_op.h b/caffe2/operators/numpy_tile_op.h index 8a39b40df0f8..ac9886ec503a 100644 --- a/caffe2/operators/numpy_tile_op.h +++ b/caffe2/operators/numpy_tile_op.h @@ -92,7 +92,7 @@ class NumpyTileOp : public Operator { private: void DoTile( - const TypeMeta& meta, + const TypeMeta meta, int item_size, int outer_dim, int inner_dim, diff --git a/caffe2/operators/tile_op.cc b/caffe2/operators/tile_op.cc index 40684c50575b..b0d797fce7ff 100644 --- a/caffe2/operators/tile_op.cc +++ b/caffe2/operators/tile_op.cc @@ -71,7 +71,7 @@ bool TileOp::DoRunWithType() { // size from axis up const int inner_size = X.size_from_dim(axis); - const TypeMeta& meta = X.dtype(); + const TypeMeta meta = X.dtype(); const int item_size = X.itemsize(); const char* X_ptr = reinterpret_cast(X.raw_data()); char* Y_ptr = reinterpret_cast(Y->raw_mutable_data(meta)); diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc 
index 2923b98c565f..65a246e4a39c 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -118,7 +118,7 @@ static_assert( sizeof(int) == sizeof(int32_t), "We make an assumption that int is always int32 for numpy " "type mapping."); -int CaffeToNumpyType(const TypeMeta& meta) { +int CaffeToNumpyType(const TypeMeta meta) { #ifdef USE_NUMPY static std::map numpy_type_map{ {TypeMeta::Id(), NPY_BOOL}, @@ -143,7 +143,7 @@ int CaffeToNumpyType(const TypeMeta& meta) { #endif // USE_NUMPY } -const TypeMeta& NumpyTypeToCaffe(int numpy_type) { +const TypeMeta NumpyTypeToCaffe(int numpy_type) { #ifdef USE_NUMPY static std::map caffe_type_map{ {NPY_BOOL, TypeMeta::Make()}, diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index b8f9dbaf3719..b3926e941194 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -103,8 +103,8 @@ static_assert( "We make an assumption that int is always int32 for numpy " "type mapping."); -int CaffeToNumpyType(const TypeMeta& dtype); -const TypeMeta& NumpyTypeToCaffe(int numpy_type); +int CaffeToNumpyType(const TypeMeta dtype); +const TypeMeta NumpyTypeToCaffe(int numpy_type); class TensorFetcher : public BlobFetcherBase { public: @@ -114,7 +114,7 @@ class TensorFetcher : public BlobFetcherBase { // Checks whether the data with type `dtype` needs to be copied in the context // of `tensor` - bool NeedsCopy(const Tensor* tensor, const TypeMeta& dtype) const { + bool NeedsCopy(const Tensor* tensor, const TypeMeta dtype) const { #ifdef USE_NUMPY return tensor->GetDeviceType() != CPU || CaffeToNumpyType(dtype) == NPY_OBJECT; @@ -200,9 +200,9 @@ class TensorFeeder : public BlobFeederBase { auto g = MakeGuard([&]() { Py_XDECREF(array); }); const auto npy_type = PyArray_TYPE(array); - const TypeMeta& dtype = NumpyTypeToCaffe(npy_type); + const TypeMeta dtype = NumpyTypeToCaffe(npy_type); CAFFE_ENFORCE( - dtype.id() != TypeIdentifier::uninitialized(), + dtype != ScalarType::Undefined, "This numpy data type is not supported: ", PyArray_TYPE(array), "."); diff --git a/caffe2/python/pybind_state_dlpack.cc b/caffe2/python/pybind_state_dlpack.cc index 7b1ec2b8e141..a7204481224f 100644 --- a/caffe2/python/pybind_state_dlpack.cc +++ b/caffe2/python/pybind_state_dlpack.cc @@ -14,7 +14,7 @@ const DLDeviceType* CaffeToDLDeviceType(int device_type) { return it == dl_device_type_map.end() ? nullptr : &it->second; } -const DLDataType* CaffeToDLType(const TypeMeta& meta) { +const DLDataType* CaffeToDLType(const TypeMeta meta) { static std::map dl_type_map{ {TypeMeta::Id(), DLDataType{0, 8, 1}}, {TypeMeta::Id(), DLDataType{0, 16, 1}}, @@ -30,7 +30,7 @@ const DLDataType* CaffeToDLType(const TypeMeta& meta) { return it == dl_type_map.end() ? 
nullptr : &it->second; } -const TypeMeta& DLTypeToCaffe(const DLDataType& dl_type) { +const TypeMeta DLTypeToCaffe(const DLDataType& dl_type) { try { if (dl_type.lanes != 1) { throw std::invalid_argument("invalid type"); diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 54f3157e7634..bcdbc50a61d4 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -16,9 +16,9 @@ namespace py = pybind11; const DLDeviceType* CaffeToDLDeviceType(int device_type); -const DLDataType* CaffeToDLType(const TypeMeta& meta); +const DLDataType* CaffeToDLType(const TypeMeta meta); -const TypeMeta& DLTypeToCaffe(const DLDataType& dl_type); +const TypeMeta DLTypeToCaffe(const DLDataType& dl_type); // TODO: remove context template @@ -40,7 +40,7 @@ class DLPackWrapper { if (tensor->numel() <= 0) { tensor->Resize(0); } - if (tensor->dtype().id() == TypeIdentifier::uninitialized()) { + if (tensor->dtype() == ScalarType::Undefined) { // treat uninitialized tensor as float tensor tensor->template mutable_data(); } diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 8d09b0aaa326..bbeaf524f055 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -97,7 +97,7 @@ class IDeepFetcher : public BlobFetcherBase { }; class IDeepFeeder : public BlobFeederBase { - itensor::data_type type_transform(const TypeMeta &meta) { + itensor::data_type type_transform(const TypeMeta meta) { if (meta == TypeMeta::Make()) return itensor::data_type::f32; else if (meta == TypeMeta::Make()) @@ -119,10 +119,10 @@ class IDeepFeeder : public BlobFeederBase { PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); auto g = MakeGuard([&]() { Py_XDECREF(array); }); const auto npy_type = PyArray_TYPE(array); - const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + const TypeMeta meta = NumpyTypeToCaffe(npy_type); CAFFE_ENFORCE_NE( - meta.id(), - TypeIdentifier::uninitialized(), + meta, + ScalarType::Undefined, "This numpy data type is not supported: ", PyArray_TYPE(array), "."); @@ -172,7 +172,7 @@ class IDeepFeeder : public BlobFeederBase { auto g = MakeGuard([&]() { Py_XDECREF(array); }); const auto npy_type = PyArray_TYPE(array); - const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + const TypeMeta meta = NumpyTypeToCaffe(npy_type); // TODO: if necessary, use dispatcher. 
if ((in_place && blob->IsType()) diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index f7e48c3b682d..92e8a93c284e 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -59,7 +59,7 @@ at::DeprecatedTypeProperties* get_type(at::Backend backend, at::ScalarType scala PyTypeObject* getPyTypeObject( const at::Storage& storage, - const caffe2::TypeMeta& dtype) { + const caffe2::TypeMeta dtype) { at::ScalarType scalarType = at::typeMetaToScalarType(dtype); auto attype = &at::getDeprecatedTypeProperties( at::dispatchKeyToBackend(c10::computeDispatchKey(scalarType, c10::nullopt, storage.device_type())), @@ -106,7 +106,7 @@ THPLayout* getTHPLayout(at::Layout layout) { PyObject* createPyObject( const at::Storage& storage, - const caffe2::TypeMeta& data_type) { + const caffe2::TypeMeta data_type) { auto type = getPyTypeObject(storage, data_type); auto obj = THPObjectPtr(type->tp_alloc(type, 0)); if (!obj) throw python_error(); diff --git a/torch/csrc/DynamicTypes.h b/torch/csrc/DynamicTypes.h index 774c5e99999f..d93d0e3b5cf5 100644 --- a/torch/csrc/DynamicTypes.h +++ b/torch/csrc/DynamicTypes.h @@ -30,7 +30,7 @@ void registerLayoutObject(THPLayout *thp_layout, at::Layout layout); PyObject* createPyObject( const at::Storage& storage, - const caffe2::TypeMeta& data_type); + const caffe2::TypeMeta data_type); at::Storage createStorage(PyObject* obj); bool isStorage(PyObject* obj); From c7fc8cab3bbd4ec137abb805a71af383f5623947 Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Fri, 30 Oct 2020 10:11:22 -0700 Subject: [PATCH 151/169] track Half/ComplexHalf default dtype (#45043) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45043 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23806097 Pulled By: bhosmer fbshipit-source-id: 1c816b09c1e6b3c7ba85ed43d8e6c2518a768da4 --- c10/core/DefaultDtype.cpp | 16 +++++++++++----- c10/util/typeid.h | 7 ++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/c10/core/DefaultDtype.cpp b/c10/core/DefaultDtype.cpp index 00e4d3f32437..583d4452bfbd 100644 --- a/c10/core/DefaultDtype.cpp +++ b/c10/core/DefaultDtype.cpp @@ -7,12 +7,18 @@ static auto default_dtype_as_scalartype = default_dtype.toScalarType(); static auto default_complex_dtype = caffe2::TypeMeta::Make>(); void set_default_dtype(caffe2::TypeMeta dtype) { - default_dtype = std::move(dtype); + default_dtype = dtype; default_dtype_as_scalartype = default_dtype.toScalarType(); - if(default_dtype_as_scalartype == ScalarType::Double) { - default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); - } else { - default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); + switch (default_dtype_as_scalartype) { + case ScalarType::Half: + default_complex_dtype = ScalarType::ComplexHalf; + break; + case ScalarType::Double: + default_complex_dtype = ScalarType::ComplexDouble; + break; + default: + default_complex_dtype = ScalarType::ComplexFloat; + break; } } diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 7feb335507ca..5bdbdc4271df 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -341,12 +341,17 @@ class C10_API TypeMeta final { TypeMeta(const TypeMeta& src) noexcept = default; /** - * Assignment operator. + * Assignment operators. 
*/ TypeMeta& operator=(const TypeMeta& src) noexcept = default; TypeMeta(TypeMeta&& rhs) noexcept = default; + inline TypeMeta& operator=(ScalarType scalar_type) noexcept { + index_ = static_cast(scalar_type); + return *this; + } + private: // TypeMeta can only be created by Make, making sure that we do not // create incorrectly mixed up TypeMeta objects. From 99fed7bd87badd68870dd622cdbe87521a08e70f Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Fri, 30 Oct 2020 10:11:22 -0700 Subject: [PATCH 152/169] faster TensorOptions merging (#45046) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45046 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23806787 Pulled By: bhosmer fbshipit-source-id: 3c8304f9a4503658081f8805ec06da78a467e125 --- aten/src/ATen/Utils.cpp | 16 +++++------ aten/src/ATen/native/MetaTensor.cpp | 2 +- aten/src/ATen/native/TensorConversions.cpp | 2 +- aten/src/ATen/native/TensorFactories.cpp | 2 +- .../ATen/native/quantized/TensorFactories.cpp | 4 +-- c10/core/TensorOptions.h | 27 ++++++++++++------- 6 files changed, 31 insertions(+), 22 deletions(-) diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp index 847172b46771..8a4fa37e469e 100644 --- a/aten/src/ATen/Utils.cpp +++ b/aten/src/ATen/Utils.cpp @@ -18,14 +18,16 @@ namespace detail { // empty_cpu is used in ScalarOps.h, which can be referenced by other ATen files. Since we want to decouple direct referencing native symbols and only access native symbols through dispatching, we move its implementation here. Tensor empty_cpu( IntArrayRef size, - const TensorOptions& options_, + const TensorOptions& options, c10::optional optional_memory_format) { TORCH_CHECK( - !(options_.has_memory_format() && optional_memory_format.has_value()), + !(options.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " "the redundant setter."); - TensorOptions options = - options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + const MemoryFormat memory_format = + optional_memory_format.value_or( + options.memory_format_opt().value_or( + MemoryFormat::Contiguous)); AT_ASSERT(options.device().type() == DeviceType::CPU); check_size_nonnegative(size); @@ -38,8 +40,8 @@ Tensor empty_cpu( } int64_t nelements = prod_intlist(size); - auto dtype = options.dtype(); - int64_t size_bytes = nelements * dtype.itemsize(); + const caffe2::TypeMeta dtype = options.dtype(); + const int64_t size_bytes = nelements * dtype.itemsize(); auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -54,8 +56,6 @@ Tensor empty_cpu( tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); } - auto memory_format = - options.memory_format_opt().value_or(MemoryFormat::Contiguous); tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); return tensor; diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index f8f0231b181c..a7042b283c4c 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -14,7 +14,7 @@ Tensor empty_meta( !(options_.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " "the redundant setter."); - TensorOptions options = options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + TensorOptions options = options_.merge_memory_format(optional_memory_format); // TODO: 
deduplicate this logic with empty_cpu diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 63e272de57fc..d941f3b8e169 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -63,7 +63,7 @@ Tensor to( !(options_.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " "the redundant setter."); - auto options = options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + auto options = options_.merge_memory_format(optional_memory_format); TORCH_CHECK(options.requires_grad_opt() == c10::nullopt, "to(options) expects unset requires_grad flag, but got " diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 7277934092b4..0cec2dd32b0e 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -240,7 +240,7 @@ Tensor empty_like( TensorOptions options = self.options() .merge_in(options_) - .merge_in(TensorOptions().memory_format(optional_memory_format)); + .merge_memory_format(optional_memory_format); TORCH_CHECK( !(options.layout() != kStrided && diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index 11b84fa4713d..a05026d49f46 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -20,7 +20,7 @@ Tensor empty_affine_quantized( !(options_.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " "the redundant setter."); - auto options = options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + auto options = options_.merge_memory_format(optional_memory_format); TORCH_CHECK( options.has_dtype(), "Must provide data type for Tensor creation functions."); @@ -42,7 +42,7 @@ Tensor empty_per_channel_affine_quantized( !(options_.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " "the redundant setter."); - auto options = options_.merge_in(TensorOptions().memory_format(optional_memory_format)); + auto options = options_.merge_memory_format(optional_memory_format); TORCH_CHECK( options.has_dtype(), "Must provide data type for Tensor creation functions."); diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 40d170926141..c8c7f058513d 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -377,15 +377,24 @@ struct C10_API TensorOptions { /// device guard. /// TensorOptions merge_in(TensorOptions options) const noexcept { - TensorOptions r = options; - if (!r.has_device()) r.set_device(device_opt()); - if (!r.has_dtype()) r.set_dtype(dtype_opt()); - if (!r.has_layout()) r.set_layout(layout_opt()); + TensorOptions merged = *this; + if (options.has_device()) merged.set_device(options.device_opt()); + if (options.has_dtype()) merged.set_dtype(options.dtype_opt()); + if (options.has_layout()) merged.set_layout(options.layout_opt()); // NB: requires grad is right biased; not a logical AND/OR! 
- if (!r.has_requires_grad()) r.set_requires_grad(requires_grad_opt()); - if (!r.has_pinned_memory()) r.set_pinned_memory(pinned_memory_opt()); - if (!r.has_memory_format()) r.set_memory_format(memory_format_opt()); - return r; + if (options.has_requires_grad()) merged.set_requires_grad(options.requires_grad_opt()); + if (options.has_pinned_memory()) merged.set_pinned_memory(options.pinned_memory_opt()); + if (options.has_memory_format()) merged.set_memory_format(options.memory_format_opt()); + return merged; + } + + // TODO remove after TensorOptions rationalization + TensorOptions merge_memory_format(c10::optional optional_memory_format) const noexcept { + TensorOptions merged = *this; + if (optional_memory_format.has_value()) { + merged.set_memory_format(*optional_memory_format); + } + return merged; } // Resolves the tensor type set specified by the current construction axes. @@ -493,8 +502,8 @@ struct C10_API TensorOptions { // NB: We didn't use c10::optional here, because then we can't pack // the has_***_ boolean fields. - caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 64-bit Device device_ = at::kCPU; // 32-bit + caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 16-bit Layout layout_ = at::kStrided; // 8-bit MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit From 69fe10c12788675e8968f9d29530439ac44648bd Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Fri, 30 Oct 2020 10:11:22 -0700 Subject: [PATCH 153/169] use bitfield to shrink TensorImpl (#45263) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45263 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23900587 Pulled By: bhosmer fbshipit-source-id: 9214b887fde010bd7c8be848ee7846329c35752f --- c10/core/TensorImpl.cpp | 3 +++ c10/core/TensorImpl.h | 34 ++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index e68729bf05f6..9f2ca1d2ca07 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -61,6 +61,9 @@ TensorImpl::TensorImpl(Storage&& storage, DispatchKeySet key_set, const caffe2:: numel_(0), data_type_(data_type), device_opt_(device_opt) { + + init_bitfields(); + if (!key_set.empty()) { TORCH_INTERNAL_ASSERT(data_type == ScalarType::Undefined || device_opt_.has_value()); // UndefinedTensorImpl is a singleton, so we skip logging it diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 0145fd220941..da849b049b65 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1675,36 +1675,47 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // INVARIANT: named_tensor_meta_ != nullptr <==> key_set_.has(DispatchKey::Named) DispatchKeySet key_set_; - // You get to have eight byte-size fields here, before you - // should pack this into a bitfield. 
+ // Tensor is contiguous bool is_contiguous_ = true; + // default member initializers for bit-fields only available with -std=c++2a or -std=gnu++2a + inline void init_bitfields() { + is_channels_last_ = false; + is_channels_last_contiguous_ = false; + is_channels_last_3d_ = false; + is_channels_last_3d_contiguous_ = false; + is_non_overlapping_and_dense_ = false; + is_wrapped_number_ = false; + allow_tensor_metadata_change_ = true; + reserved_ = false; + } + // Tensor is stored in the channels last 2d memory format, when dimensions // order is (N)CHW and C-strides < W-strides < H-strides (< N-strides) // (If size of any dimension is equal to 1, this dimension strides value // is not taken into account). - bool is_channels_last_ = false; + bool is_channels_last_ : 1; // Channels last contiguous tensor is channel last tensor which occupies // contiguous memory block. - bool is_channels_last_contiguous_ = false; + bool is_channels_last_contiguous_ : 1; // Tensor is stored in the channels last 3d memory format, when dimensions // order is (N)CDHW and C-strides < W-strides < H-strides < D - strides (< N-strides) // (If size of any dimension is equal to 1, this dimension strides value // is not taken into account). - bool is_channels_last_3d_ = false; + bool is_channels_last_3d_ : 1; // Channels last 3d contiguous tensor is channel last 3d tensor which occupies // contiguous memory block. - bool is_channels_last_3d_contiguous_ = false; + bool is_channels_last_3d_contiguous_ : 1; // Dense tensor is the tensor that store values in a contiguous block of memory. // Non-overlapping tensor is the tensor in which elements occupy individual // non-repetitive memory. - bool is_non_overlapping_and_dense_ = false; + bool is_non_overlapping_and_dense_ : 1; - bool is_wrapped_number_ = false; + bool is_wrapped_number_ : 1; // NOTE [ Metadata Change for a Detached Tensor ] // @@ -1721,14 +1732,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // NOTE: For a full list of tensor metadata fields, please see // `copy_tensor_metadata()` in TensorImpl and its subclasses to find // which fields are copied by value. - bool allow_tensor_metadata_change_ = true; + bool allow_tensor_metadata_change_ : 1; // we decide to keep reserved_ and it will // live in Tensor after the split // The logic is that if Extend() or ReserveSpace() were ever called, // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - + bool reserved_ : 1; }; // Note [TensorImpl size constraints] @@ -1787,7 +1797,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // miscellaneous bitfield // static_assert(sizeof(void*) != sizeof(int64_t) || // if 64-bit... - sizeof(TensorImpl) == sizeof(int64_t) * 30, + sizeof(TensorImpl) == sizeof(int64_t) * 29, "You changed the size of TensorImpl on 64-bit arch." 
"See Note [TensorImpl size constraints] on how to proceed."); } // namespace c10 From edac4060d7e7a0a074deb0612c4c13f262c8614e Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 30 Oct 2020 10:36:53 -0700 Subject: [PATCH 154/169] Fix mul cuda for bool (#47031) Summary: Also, add tests for tensor by scalar multiplication / division Fixes https://github.com/pytorch/pytorch/issues/47007 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47031 Reviewed By: walterddr Differential Revision: D24608874 Pulled By: malfet fbshipit-source-id: 4e15179904814d6e67228276d3d11ff1b5d15d0d --- aten/src/ATen/native/cuda/BinaryMulDivKernel.cu | 2 +- test/test_torch.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index be3f4f0bb01e..f80d0906dfa2 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -65,7 +65,7 @@ void div_kernel_cuda(TensorIterator& iter) { } void mul_kernel_cuda(TensorIterator& iter) { - if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && + if (!isIntegralType(iter.common_dtype(), /*includeBool*/ true) && (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) { //if common dtype is half the scalar constant can overflow in half precision, and yet the result can //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy diff --git a/test/test_torch.py b/test/test_torch.py index 23ebf0d964b7..d927446140cf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -26,7 +26,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, run_additional_tri_tests, \ _compare_trilu_indices from torch.testing._internal.common_utils import \ - (TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, run_tests, + (TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_WITH_ASAN, TEST_WITH_ROCM, run_tests, skipIfNoLapack, suppress_warnings, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, IS_SANDCASTLE, load_tests, slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, BytesIOContext, @@ -19463,6 +19463,17 @@ def compare_helper_(like_fn, t): tp = t.permute(p) compare_helper_(like_fn, tp) + @unittest.skipIf(TEST_WITH_ASAN, "Integer overflows are not allowed under ASAN") + @dtypes(*torch.testing.get_all_dtypes()) + def test_muldiv_scalar(self, device, dtype): + x = make_tensor((10, 3), device, dtype, low=None, high=None) + s = make_tensor((1,), 'cpu', dtype, low=None, high=None).item() + y = torch.full_like(x, s) + self.assertEqual(x * s, x * y) + self.assertEqual(s * x, y * x) + self.assertEqual(x / s, x / y) + self.assertEqual(s / x, y / x) + # Tests that compare a device's computation with the (gold-standard) CPU's. class TestDevicePrecision(TestCase): exact_dtype = True From 1dd220bd8412c359ca5726de7912a578690be84b Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 30 Oct 2020 11:09:33 -0700 Subject: [PATCH 155/169] Add C++ coverage for Ubuntu cpu tests (#46656) Summary: In order to enable C++ code coverage for tests, we need to build pytorch with the correct coverage flags. This PR should introduce a build that allows coverage tests to stem from a specific coverage build. This PR does the following: 1. Adds a new build to `*-coverage_*` build with the correct `--coverage` flag for C++ coverage 2. Calls `lcov` at the end of testing to capture C++ coverage results 3. 
Pushes C++ results along with Python results 4. Shards the coverage test to not take ~4hrs Pull Request resolved: https://github.com/pytorch/pytorch/pull/46656 Reviewed By: walterddr Differential Revision: D24636213 Pulled By: janeyx99 fbshipit-source-id: 362a1a2a20c069ba0a7931669194dac53ac81133 --- .circleci/cimodel/data/pytorch_build_data.py | 6 +++++- .../cimodel/data/pytorch_build_definitions.py | 9 ++++---- .circleci/config.yml | 21 +++++++++++++------ .../job-specs/pytorch-job-specs.yml | 4 +++- .jenkins/pytorch/build.sh | 5 +++++ .jenkins/pytorch/test.sh | 17 +++++++++------ 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 5156e69b77af..39f2a208a5ec 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -84,7 +84,11 @@ ("gcc", [ ("9", [ ("3.8", [ - ("coverage", [XImportant(True)]), + ("coverage", [ + (True, [ + ("shard_test", [XImportant(True)]), + ]), + ]), ]), ]), ]), diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index 0c03fac487d6..75b0e8812e1b 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -272,6 +272,7 @@ def instantiate_configs(): compiler_version = fc.find_prop("compiler_version") is_xla = fc.find_prop("is_xla") or False is_asan = fc.find_prop("is_asan") or False + is_coverage = fc.find_prop("is_coverage") or False is_onnx = fc.find_prop("is_onnx") or False is_pure_torch = fc.find_prop("is_pure_torch") or False is_vulkan = fc.find_prop("is_vulkan") or False @@ -311,6 +312,10 @@ def instantiate_configs(): python_version = fc.find_prop("pyver") parms_list[0] = fc.find_prop("abbreviated_pyver") + if is_coverage: + parms_list_ignored_for_docker_image.append("coverage") + python_version = fc.find_prop("pyver") + if is_onnx: parms_list.append("onnx") python_version = fc.find_prop("pyver") @@ -325,7 +330,6 @@ def instantiate_configs(): is_important = fc.find_prop("is_important") or False parallel_backend = fc.find_prop("parallel_backend") or None build_only = fc.find_prop("build_only") or False - is_coverage = fc.find_prop("is_coverage") or False shard_test = fc.find_prop("shard_test") or False # TODO: fix pure_torch python test packaging issue. if shard_test: @@ -333,9 +337,6 @@ def instantiate_configs(): restrict_phases.extend(["test1", "test2"]) if build_only or is_pure_torch: restrict_phases = ["build"] - if is_coverage and restrict_phases is None: - restrict_phases = ["build", "coverage_test"] - gpu_resource = None if cuda_version and cuda_version != "10": diff --git a/.circleci/config.yml b/.circleci/config.yml index 2ad408cf6b37..2d367af02d3f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -655,9 +655,11 @@ jobs: echo "Retrieving test reports" docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!' 
if [[ ${BUILD_ENVIRONMENT} == *"coverage"* ]]; then - echo "Retrieving coverage report" + echo "Retrieving Python coverage report" docker cp $id:/var/lib/jenkins/workspace/test/.coverage ./test docker cp $id:/var/lib/jenkins/workspace/test/coverage.xml ./test + echo "Retrieving C++ coverage report" + docker cp $id:/var/lib/jenkins/workspace/build/coverage.info ./test python3 -mpip install codecov python3 -mcodecov fi @@ -6877,16 +6879,23 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9" resource_class: large - pytorch_linux_build: - name: pytorch_linux_bionic_py3_8_gcc9_build + name: pytorch_linux_bionic_py3_8_gcc9_coverage_build requires: - "docker-pytorch-linux-bionic-py3.8-gcc9" - build_environment: "pytorch-linux-bionic-py3.8-gcc9-build" + build_environment: "pytorch-linux-bionic-py3.8-gcc9-coverage-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9" - pytorch_linux_test: - name: pytorch_linux_bionic_py3_8_gcc9_coverage_test + name: pytorch_linux_bionic_py3_8_gcc9_coverage_test1 requires: - - pytorch_linux_bionic_py3_8_gcc9_build - build_environment: "pytorch-linux-bionic-py3.8-gcc9-coverage_test" + - pytorch_linux_bionic_py3_8_gcc9_coverage_build + build_environment: "pytorch-linux-bionic-py3.8-gcc9-coverage-test1" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9" + resource_class: large + - pytorch_linux_test: + name: pytorch_linux_bionic_py3_8_gcc9_coverage_test2 + requires: + - pytorch_linux_bionic_py3_8_gcc9_coverage_build + build_environment: "pytorch-linux-bionic-py3.8-gcc9-coverage-test2" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9" resource_class: large - pytorch_linux_build: diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 868f32fd49fa..f6f37dbb0470 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -217,9 +217,11 @@ jobs: echo "Retrieving test reports" docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!' if [[ ${BUILD_ENVIRONMENT} == *"coverage"* ]]; then - echo "Retrieving coverage report" + echo "Retrieving Python coverage report" docker cp $id:/var/lib/jenkins/workspace/test/.coverage ./test docker cp $id:/var/lib/jenkins/workspace/test/coverage.xml ./test + echo "Retrieving C++ coverage report" + docker cp $id:/var/lib/jenkins/workspace/build/coverage.info ./test python3 -mpip install codecov python3 -mcodecov fi diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3e197d867b6e..b94e797e7010 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -42,6 +42,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then nvcc --version fi +if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then + # enable build option in CMake + export USE_CPP_CODE_COVERAGE=ON +fi + # TODO: Don't run this... 
pip_install -r requirements.txt || true diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 1af459ab8cc8..2a784ed1690a 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -401,10 +401,15 @@ else test_distributed test_benchmarks test_rpc - if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then - pushd test - echo "Generating XML coverage report" - time python -mcoverage xml - popd - fi +fi + +if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then + pushd test + echo "Generating XML coverage report" + time python -mcoverage xml + popd + pushd build + echo "Generating lcov coverage report for C++ sources" + time lcov --capture --directory . --output-file coverage.info + popd fi From 085193c291204eafa4ac6f3647226c8189de36a3 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 30 Oct 2020 12:23:10 -0700 Subject: [PATCH 156/169] [quant][graphmode][fx][fusion] Add test for fuse_fx (#47085) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47085 Both in train and eval mode Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D24632457 fbshipit-source-id: 486aee4e073fb87e9da46a344e8dc77e848a60cf --- test/quantization/test_quantize_fx.py | 119 +++++++++++++++++++++++ test/test_quantization.py | 1 + torch/quantization/fx/fusion_patterns.py | 10 +- 3 files changed, 126 insertions(+), 4 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 7d1588673ca6..34bb78743bc8 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -3,6 +3,7 @@ import torch.nn as nn import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd +import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq import torch.multiprocessing as mp @@ -57,6 +58,124 @@ import unittest import io +class TestFuseFx(QuantizationTestCase): + def test_fuse_conv_bn_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1d = nn.Conv1d(1, 1, 1) + self.conv2d = nn.Conv2d(1, 1, 1) + self.conv3d = nn.Conv3d(1, 1, 1) + self.bn1d = nn.BatchNorm1d(1) + self.bn2d = nn.BatchNorm2d(1) + self.bn3d = nn.BatchNorm3d(1) + self.conv1d2 = nn.Conv1d(1, 1, 1) + self.conv2d2 = nn.Conv2d(1, 1, 1) + self.conv3d2 = nn.Conv3d(1, 1, 1) + self.bn1d2 = nn.BatchNorm1d(1) + self.bn2d2 = nn.BatchNorm2d(1) + self.bn3d2 = nn.BatchNorm3d(1) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv1d(x) + x = self.bn1d(x) + x = self.conv2d(x) + x = self.bn2d(x) + x = self.conv3d(x) + x = self.bn3d(x) + x = self.conv1d2(x) + x = self.bn1d2(x) + x = self.relu(x) + x = self.conv2d2(x) + x = self.bn2d2(x) + x = self.relu(x) + x = self.conv3d2(x) + x = self.bn3d2(x) + x = self.relu(x) + return x + + # test train mode + m = M().train() + # currently we don't check if the module are configured with qconfig before fusion + # TODO: if we decide to do that in the future, this test needs to + # be updated + # train mode fuse_fx is called in prepare_qat_fx + m = prepare_qat_fx(m, {}) + expected_nodes = [ + ns.call_module(nni.ConvBn2d), + ns.call_module(nni.ConvBn3d), + ns.call_module(nni.ConvBnReLU2d), + ns.call_module(nni.ConvBnReLU3d), + ] + # ConvBnRelu1d is not fused + expected_occurrence = { + ns.call_module(nn.ReLU): 1 + } + self.checkGraphModuleNodes( + m, + expected_node_list=expected_nodes, + expected_node_occurrence=expected_occurrence) + + # test eval mode + m = M().eval() + from torch.quantization.quantize_fx import fuse_fx + # fuse_fx is a 
top level api and only supports eval mode + m = fuse_fx(m) + expected_nodes = [ + ns.call_module(nn.Conv2d), + ns.call_module(nn.Conv3d), + ns.call_module(nni.ConvReLU2d), + ns.call_module(nni.ConvReLU3d), + ] + # ConvBnRelu1d is not fused + expected_occurrence = { + ns.call_module(nn.ReLU): 1 + } + self.checkGraphModuleNodes( + m, + expected_node_list=expected_nodes, + expected_node_occurrence=expected_occurrence) + + def test_fuse_module_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1d = nn.Conv1d(1, 1, 1) + self.conv2d = nn.Conv2d(1, 1, 1) + self.conv3d = nn.Conv3d(1, 1, 1) + self.bn1d = nn.BatchNorm1d(1) + self.bn2d = nn.BatchNorm2d(1) + self.bn3d = nn.BatchNorm3d(1) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv1d(x) + x = self.relu(x) + x = self.conv2d(x) + x = self.relu(x) + x = self.conv3d(x) + x = self.relu(x) + x = self.bn1d(x) + x = self.relu(x) + x = self.bn2d(x) + x = self.relu(x) + x = self.bn3d(x) + x = self.relu(x) + return x + + m = M().eval() + from torch.quantization.quantize_fx import fuse_fx + m = fuse_fx(m) + expected_nodes = [ + ns.call_module(nni.ConvReLU1d), + ns.call_module(nni.ConvReLU2d), + ns.call_module(nni.ConvReLU3d), + ns.call_module(nni.BNReLU2d), + ns.call_module(nni.BNReLU3d), + ] + self.checkGraphModuleNodes(m, expected_node_list=expected_nodes) + @skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def _get_conv_linear_test_cases(self): diff --git a/test/test_quantization.py b/test/test_quantization.py index 9eba83b21e4f..682d9baff68e 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -62,6 +62,7 @@ # 3. GraphModule based graph mode quantization try: + from quantization.test_quantize_fx import TestFuseFx # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFx # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFxOps # noqa: F401 from quantization.test_quantize_fx import TestQuantizeFxModels # noqa: F401 diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py index d537630c2406..4c92192dc5be 100644 --- a/torch/quantization/fx/fusion_patterns.py +++ b/torch/quantization/fx/fusion_patterns.py @@ -9,15 +9,18 @@ # Fusion Patterns # --------------------- -@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv3d)) +@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.Conv2d)) +@register_fusion_pattern((torch.nn.BatchNorm3d, torch.nn.Conv3d)) @register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) +@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) +@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) class ConvBNReLUFusion(): def __init__(self, quantizer, node): super().__init__() @@ -66,7 +69,8 @@ def fuse(self, quantizer, load_arg, fuse_custom_config_dict=None): fuser_method = get_fuser_method(op_type_list, additional_fuser_method_mapping) if fuser_method is None: raise NotImplementedError("Cannot fuse modules: 
{}".format(types)) - setattr(quantizer.modules[conv_parent_name], conv_name, fuser_method(*op_list)) + fused = fuser_method(*op_list) + setattr(quantizer.modules[conv_parent_name], conv_name, fused) # TODO: do we need to make sure bn is only used once? if self.bn_node is not None: @@ -77,8 +81,6 @@ def fuse(self, quantizer, load_arg, fuse_custom_config_dict=None): @register_fusion_pattern((torch.nn.functional.relu, torch.nn.Linear)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm1d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm1d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) From c86af4aa55ab3e63aba7568692b678ce662f27e2 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 30 Oct 2020 13:31:10 -0700 Subject: [PATCH 157/169] Disable NEON acceleration on older compilers (#47099) Summary: Optimized build compiled by gcc-7.5.0 generates numerically incorrect code Works around https://github.com/pytorch/pytorch/issues/47098 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47099 Reviewed By: walterddr Differential Revision: D24642272 Pulled By: malfet fbshipit-source-id: 2cfb43e950c0d1c92cfcee13749f1ad13248c39b --- aten/src/ATen/cpu/vec256/vec256_float_neon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h index 17be2b255108..f410e415277f 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h @@ -25,6 +25,8 @@ namespace { // https://bugs.llvm.org/show_bug.cgi?id=45824 // Most likely we will do aarch32 support with inline asm. #if defined(__aarch64__) +// See https://github.com/pytorch/pytorch/issues/47098 +#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) #ifdef __BIG_ENDIAN__ #error "Big endian is not supported." @@ -675,6 +677,7 @@ Vec256 inline fmadd(const Vec256& a, const Vec256& b, const return Vec256(r0, r1); } -#endif +#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */ +#endif /* defined(aarch64) */ }}} From a81572cdc54b3238cbe5902a3991484e9d092df7 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Fri, 30 Oct 2020 15:16:19 -0700 Subject: [PATCH 158/169] Add anomaly mode for C++ (#46981) Summary: This adds anomaly mode for C++. The backtrace isn't perfect yet, but it's a start. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46981 Reviewed By: IvanKobzarev Differential Revision: D24631957 Pulled By: albanD fbshipit-source-id: 4b91e205e7e51f4cf0fbc651da5013a00a3b2497 --- test/cpp/api/autograd.cpp | 35 ++++++++++++++++++++++ torch/csrc/autograd/anomaly_mode.cpp | 43 ++++++++++++++++++++++++++-- torch/csrc/autograd/anomaly_mode.h | 10 +++++-- torch/csrc/autograd/engine.h | 2 +- 4 files changed, 84 insertions(+), 6 deletions(-) diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index 8a8aa75541ac..502db310de4e 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -7,6 +7,7 @@ #include using namespace torch::autograd; +using namespace torch::test; #define ASSERT_VARIABLE_EQ(a,b) ASSERT_TRUE(torch::allclose((a),(b))) #define EXPECT_VARIABLE_EQ(a,b) EXPECT_TRUE(torch::allclose((a),(b))) @@ -154,6 +155,40 @@ TEST(AutogradAPITests, RetainGrad) { ASSERT_VARIABLE_EQ(input * 18, input.grad()); } +TEST(AutogradAPITests, AnomalyMode) { + // Needs to have backtrace as warning and then throw an error + torch::autograd::AnomalyMode::set_enabled(true); + { + WarningCapture warnings; + auto x = torch::tensor({5.0}, torch::requires_grad()); + auto y = x * x; + auto z = y * y; + y += 1; + ASSERT_THROWS_WITH(z.backward(), "inplace"); + ASSERT_TRUE( + warnings.str().find("Traceback of forward") != std::string::npos); + } + { + WarningCapture warnings; + // Double backward + auto x = torch::tensor({0.0}, torch::requires_grad()); + auto y = x.pow(1.5); + auto gr = + grad({y}, {x}, {}, /*retain_graph=*/true, /*create_backward=*/true); + ASSERT_THROWS_WITH(grad({gr[0]}, {x});, "returned nan"); + auto msgs = warnings.messages(); + ASSERT_EQ(msgs.size(), 2); + ASSERT_TRUE( + msgs[0].find("Traceback of forward call that caused the error") != + std::string::npos); + ASSERT_TRUE( + msgs[1].find( + "Traceback of forward call that induced the previous calculation") != + std::string::npos); + } + torch::autograd::AnomalyMode::set_enabled(false); +} + TEST(CustomAutogradTest, CustomFunction) { struct MyFunction : public Function { static Variable forward(AutogradContext *ctx, Variable var1, int mul, Variable var2) { diff --git a/torch/csrc/autograd/anomaly_mode.cpp b/torch/csrc/autograd/anomaly_mode.cpp index bbb76fba656f..16783fdbef15 100644 --- a/torch/csrc/autograd/anomaly_mode.cpp +++ b/torch/csrc/autograd/anomaly_mode.cpp @@ -1,9 +1,48 @@ +#include +#include #include +#include -namespace torch { namespace autograd { +namespace torch { +namespace autograd { bool AnomalyMode::_enabled = false; AnomalyMetadata::~AnomalyMetadata() = default; -}} +void AnomalyMetadata::store_stack() { + traceback_ = c10::get_backtrace(/* frames_to_skip */ 1); +} + +void AnomalyMetadata::print_stack(const std::string& current_node_name) { + TORCH_WARN( + "Error detected in ", + current_node_name, + ". ", + "Traceback of forward call that caused the error:\n", + traceback_); + + auto& cur_parent = parent_; + // if there is no "parent_" in metadata, then it means this metadata's node + // is the root and stop printing the traceback + while (cur_parent) { + auto parent_metadata = cur_parent->metadata(); + TORCH_WARN( + "\n\n", + "Previous calculation was induced by ", + cur_parent->name(), + ". 
" + "Traceback of forward call that induced the previous calculation:\n", + parent_metadata->traceback_); + // get the parent of this node, if this node is a root, pyparent is simply + // null + cur_parent = parent_metadata->parent_; + } +} + +void AnomalyMetadata::assign_parent(const std::shared_ptr& parent_node) { + parent_ = parent_node; +} + +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 013600b230fc..d7384003bc8e 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -24,9 +24,13 @@ struct TORCH_API AnomalyMode { struct TORCH_API AnomalyMetadata { virtual ~AnomalyMetadata(); - virtual void store_stack() = 0; - virtual void print_stack(const std::string& current_node_name) = 0; - virtual void assign_parent(const std::shared_ptr& parent_node) = 0; + virtual void store_stack(); + virtual void print_stack(const std::string& current_node_name); + virtual void assign_parent(const std::shared_ptr& parent_node); + + private: + std::string traceback_; + std::shared_ptr parent_; }; }} diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 0dde6e735d10..33623b7f4b20 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -284,7 +284,7 @@ struct TORCH_API Engine { std::shared_ptr graph_root); virtual std::unique_ptr make_anomaly_metadata() { - return nullptr; + return std::make_unique(); } // We pass cpu_ready_queue to evaluate_function, so that it knows From d1d6dc2e3c697d310267ce32f32b2ed545bab3a3 Mon Sep 17 00:00:00 2001 From: "Mohamad H. Danesh" Date: Fri, 30 Oct 2020 16:01:58 -0700 Subject: [PATCH 159/169] Add more specific error message (#46905) Summary: While using `torch.utils.data.TensorDataset`, if sizes of tensors mismatch, there's now a proper error message. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46905 Reviewed By: ngimel Differential Revision: D24565712 Pulled By: mrshenli fbshipit-source-id: 98cdf189591c2a7a1b693627cc8464e8f553d9ee --- torch/utils/data/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index c910cab9aef8..7c45c10dd812 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -164,7 +164,7 @@ class TensorDataset(Dataset[Tuple[Tensor, ...]]): tensors: Tuple[Tensor, ...] def __init__(self, *tensors: Tensor) -> None: - assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors" self.tensors = tensors def __getitem__(self, index): From 7eb427e931bec9359cc579934885e1f3a4dc4472 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Fri, 30 Oct 2020 16:21:08 -0700 Subject: [PATCH 160/169] Providing more information while crashing process in async error handling (#46274) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46274 We crash the process in NCCL Async Error Handling if the collective has been running for greater than some set timeout. This PR logs more information about the rank and duration the collective ran before throwing an exception. ghstack-source-id: 115614622 Test Plan: Run desync tests and flow. 
Here are the Flow runs showing the right messages: f225031389 f225032004 Reviewed By: jiayisuse Differential Revision: D24200144 fbshipit-source-id: 02d48f13352aed40a4476768c123d5cebbedc8e0 --- torch/lib/c10d/ProcessGroupNCCL.cpp | 30 ++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 36880f22b36d..218ed460535c 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -392,10 +392,16 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal( LOG(INFO) << "[Rank " << rank_ << "] Wrote aborted communicator id to store: " << storeKey; } - LOG(INFO) << "[Rank " << rank_ - << "] Caught collective operation timeout for work: " - << (*this); - throw std::runtime_error("Operation timed out!"); + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + std::string exceptionMsg = c10::str("[Rank ", rank_, "] ", + "Caught collective operation timeout: ", + (*this), + " ran for ", + timeElapsed.count(), + " milliseconds before timing out."); + throw std::runtime_error(exceptionMsg); } // Check for errors and throw appropriate exception. checkAndThrowException(); @@ -504,12 +510,18 @@ void ProcessGroupNCCL::abortTimedOutCollectives(std::unordered_set& // Check for Timeouts in the WorkNCCL Operations, and abort all // communicators accordingly. if (work.timedOut()) { - LOG(INFO) - << "[Rank " << rank_ - << "] Watchdog caught collective operation timeout for work: " - << work; + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - work.workStartTime_); + std::string exceptionMsg = c10::str("[Rank ", rank_, "] ", + "Watchdog caught collective operation timeout: ", + work, + " ran for ", + timeElapsed.count(), + " milliseconds before timing out."); + LOG(INFO) << exceptionMsg; std::exception_ptr exception_ptr = std::make_exception_ptr( - std::runtime_error("NCCL Operation Timed Out")); + std::runtime_error(exceptionMsg)); work.setException(exception_ptr); for (const auto& ncclComm : work.ncclComms_) { ncclComm->ncclCommAbort(); From cb4b6336baac14173ec5659896fe3e7f06c2f5b2 Mon Sep 17 00:00:00 2001 From: Horace He Date: Fri, 30 Oct 2020 17:06:55 -0700 Subject: [PATCH 161/169] [FX] Fix handling of attributes (#47030) Summary: Probably works :) Fixes https://github.com/pytorch/pytorch/issues/46872 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47030 Reviewed By: ngimel Differential Revision: D24652600 Pulled By: Chillee fbshipit-source-id: 3fe7099ad02d1b5c23a7335b855d36d373603d18 --- test/test_fx.py | 20 ++++++++++++++++++++ torch/fx/symbolic_trace.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/test/test_fx.py b/test/test_fx.py index 4945ea857ede..05e5a821f4ef 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1079,6 +1079,26 @@ def foo(x, y): x, y = torch.randn(3, 4), torch.randn(3, 4) self.checkGraphModule(foo, (x, y)) + def test_direct_param_use(self): + class TransposeTest(torch.nn.Module): + def __init__(self): + super().__init__() + self.b = torch.nn.Parameter(torch.rand(4, 3)) + + def forward(self, x): + return self.b + + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = TransposeTest() + + def forward(self, x): + return self.a.b, self.a.b.t(), self.a.b.view(12) + + traced = torch.fx.symbolic_trace(Foo()) + 
assert(all('constant' not in node.target for node in traced.graph.nodes)) + if __name__ == '__main__': run_tests() diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index ab8a871adcf1..2865c97da22b 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -173,6 +173,21 @@ def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: fn, args = self.create_args_for_root(fn, isinstance(root, torch.nn.Module)) orig_call = torch.nn.Module.__call__ + orig_getattr = torch.nn.Module.__getattr__ + + parameter_proxy_cache = {} # Reduce number of get_attr calls + + # Method dispatch on parameters is not recorded unless it's directly used. + # Thus, we need to insert a proxy when __getattr__ requests a parameter. + def module_getattr_wrapper(mod, attr): + attr_val = orig_getattr(mod, attr) + if isinstance(attr_val, torch.nn.Parameter): + for n, p in self.root.named_parameters(): + if attr_val is p: + if n not in parameter_proxy_cache: + parameter_proxy_cache[n] = self.create_proxy('get_attr', n, (), {}) + return parameter_proxy_cache[n] + return attr_val def module_call_wrapper(mod, *args, **kwargs): def forward(*args, **kwargs): @@ -181,11 +196,14 @@ def forward(*args, **kwargs): return self.call_module(mod, forward, args, kwargs) try: + # Seems to be a mypy limitation: https://github.com/python/mypy/issues/2427 + torch.nn.Module.__getattr__ = module_getattr_wrapper # type: ignore torch.nn.Module.__call__ = module_call_wrapper self.create_node('output', 'output', (self.create_arg(fn(*args)),), {}, type_expr=fn.__annotations__.get('return', None)) finally: torch.nn.Module.__call__ = orig_call + torch.nn.Module.__getattr__ = orig_getattr # type: ignore return self.graph # Symbolic tracing API From 1aa57bb7619747c18f4053dbc1e8a7e5a6768f05 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 30 Oct 2020 18:32:45 -0700 Subject: [PATCH 162/169] Moving coverage, xunit, pytest installation to Docker (#47082) Summary: Fixes a TODO. This PR moves `pip_install unittest-xml-reporting coverage pytest` to the base Ubuntu docker instead of running that installation during every test. This should save some time. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47082 Reviewed By: samestep Differential Revision: D24634393 Pulled By: janeyx99 fbshipit-source-id: 3b980890409eafef9b006b9e03ad7f3e9017529e --- .circleci/docker/centos-rocm/Dockerfile | 2 +- .circleci/docker/common/install_conda.sh | 4 ++-- .circleci/docker/ubuntu-cuda/Dockerfile | 2 +- .circleci/docker/ubuntu-rocm/Dockerfile | 2 +- .circleci/docker/ubuntu/Dockerfile | 2 +- .jenkins/pytorch/test.sh | 16 ++++++---------- 6 files changed, 12 insertions(+), 16 deletions(-) diff --git a/.circleci/docker/centos-rocm/Dockerfile b/.circleci/docker/centos-rocm/Dockerfile index 1bc7b0deea32..a94a7167a7f4 100644 --- a/.circleci/docker/centos-rocm/Dockerfile +++ b/.circleci/docker/centos-rocm/Dockerfile @@ -27,7 +27,7 @@ RUN rm install_glibc.sh ADD ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh -# Install conda +# Install conda and other packages (e.g., numpy, coverage, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION ADD ./common/install_conda.sh install_conda.sh diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index db8f1a457ecf..c63e28029f07 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -96,13 +96,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # TODO: This isn't working atm conda_install nnpack -c killeent - # Install some other packages + # Install some other packages, including those needed for Python test reporting # TODO: Why is scipy pinned # numba & llvmlite is pinned because of https://github.com/numba/numba/issues/4368 # scikit-learn is pinned because of # https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 # only) - as_jenkins pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0 + as_jenkins pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0 unittest-xml-reporting coverage popd fi diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index d3a9027d5f06..b767440066e7 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -24,7 +24,7 @@ ARG KATEX ADD ./common/install_katex.sh install_katex.sh RUN bash ./install_katex.sh && rm install_katex.sh -# Install conda +# Install conda and other packages (e.g., numpy, coverage, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION ADD ./common/install_conda.sh install_conda.sh diff --git a/.circleci/docker/ubuntu-rocm/Dockerfile b/.circleci/docker/ubuntu-rocm/Dockerfile index 5fd133d08245..761bf0438d7f 100644 --- a/.circleci/docker/ubuntu-rocm/Dockerfile +++ b/.circleci/docker/ubuntu-rocm/Dockerfile @@ -21,7 +21,7 @@ RUN bash ./install_clang.sh && rm install_clang.sh ADD ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh -# Install conda +# Install conda and other packages (e.g., numpy, coverage, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION ADD ./common/install_conda.sh install_conda.sh diff --git a/.circleci/docker/ubuntu/Dockerfile b/.circleci/docker/ubuntu/Dockerfile index ca4d3c58dbc6..3938d99ade0b 100644 --- a/.circleci/docker/ubuntu/Dockerfile +++ b/.circleci/docker/ubuntu/Dockerfile @@ -33,7 +33,7 @@ ARG KATEX ADD ./common/install_katex.sh install_katex.sh RUN bash 
./install_katex.sh && rm install_katex.sh -# Install conda +# Install conda and other packages (e.g., numpy, coverage, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION ADD ./common/install_conda.sh install_conda.sh diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 2a784ed1690a..88bcfc93e19d 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -11,17 +11,13 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch" -if [ -n "${IN_CI}" ]; then - # TODO move this to docker - pip_install unittest-xml-reporting coverage pytest +if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_TEST_SKIP_FAST=1 +fi - if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_TEST_SKIP_FAST=1 - fi - if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then - export PYTORCH_COLLECT_COVERAGE=1 - fi +if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then + export PYTORCH_COLLECT_COVERAGE=1 fi if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then From 7f056e99dd10cff3dab0a6e1931335a3bb2a1ce4 Mon Sep 17 00:00:00 2001 From: Xingying Cheng Date: Fri, 30 Oct 2020 19:02:16 -0700 Subject: [PATCH 163/169] [fbcode] Make model reader utilities. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: For some of the end to end flow projects, we will need the capabilities to read module information during model validation or model publishing. Creating this model_reader.py for utilities for model content reading, this diff we included the following functionalities: 1. read the model bytecode version; 2. check if a model is lite PyTorch script module; 3. check if a model is PyTorch script module. Test Plan: ``` [xcheng16@devvm1099]/data/users/xcheng16/fbsource/fbcode% buck test pytorch_mobile/utils/tests:mobile_model_reader_tests Processing filesystem changes: finished in 1.5 sec Parsing buck files: finished in 1.6 sec Building: finished in 4.9 sec (100%) 9249/43504 jobs, 2 updated Total time: 6.5 sec More details at https://www.internalfb.com/intern/buck/build/6d0e2c23-d86d-4248-811f-31cb1aa7eab3 Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details. 
Running with tpx session id: 2ffccd62-ece5-44b5-8350-3a292243fad9 Trace available for this run at /tmp/tpx-20201030-122220.664763/trace.log Started reporting to test run: https://our.intern.facebook.com/intern/testinfra/testrun/3940649711969390 ✓ ListingSuccess: pytorch_mobile/utils/tests:mobile_model_reader_tests - main (10.234) ✓ Pass: pytorch_mobile/utils/tests:mobile_model_reader_tests - test_is_pytorch_lite_module (pytorch_mobile.utils.tests.test_model_reader.TestModelLoader) (7.039) ✓ Pass: pytorch_mobile/utils/tests:mobile_model_reader_tests - test_is_pytorch_script_module (pytorch_mobile.utils.tests.test_model_reader.TestModelLoader) (7.205) ✓ Pass: pytorch_mobile/utils/tests:mobile_model_reader_tests - test_read_module_bytecode_version (pytorch_mobile.utils.tests.test_model_reader.TestModelLoader) (7.223) Summary Pass: 3 ListingSuccess: 1 Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/3940649711969390 Reviewed By: husthyc Differential Revision: D24655999 fbshipit-source-id: 5095ca158d89231fb17285d445548f91ddb89bab --- torch/utils/show_pickle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/utils/show_pickle.py b/torch/utils/show_pickle.py index 0e2498d64c56..9e55ebff48b9 100644 --- a/torch/utils/show_pickle.py +++ b/torch/utils/show_pickle.py @@ -68,6 +68,7 @@ def persistent_load(self, pid): def dump(cls, in_stream, out_stream): value = cls(in_stream).load() pprint.pprint(value, stream=out_stream) + return value def main(argv, output_stream=None): From e3f912e8b797fe70062b4c6899e19f28fa590d87 Mon Sep 17 00:00:00 2001 From: Ricardo Juan Palma Duran Date: Fri, 30 Oct 2020 20:58:20 -0700 Subject: [PATCH 164/169] Revert D24655999: [fbcode] Make model reader utilities. Test Plan: revert-hammer Differential Revision: D24655999 (https://github.com/pytorch/pytorch/commit/7f056e99dd10cff3dab0a6e1931335a3bb2a1ce4) Original commit changeset: 5095ca158d89 fbshipit-source-id: c43f672def7331667421e01b90f979940366e3c9 --- torch/utils/show_pickle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/utils/show_pickle.py b/torch/utils/show_pickle.py index 9e55ebff48b9..0e2498d64c56 100644 --- a/torch/utils/show_pickle.py +++ b/torch/utils/show_pickle.py @@ -68,7 +68,6 @@ def persistent_load(self, pid): def dump(cls, in_stream, out_stream): value = cls(in_stream).load() pprint.pprint(value, stream=out_stream) - return value def main(argv, output_stream=None): From ee0033af9b7531ba0a453844dc621f8cfa140c20 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 30 Oct 2020 23:17:56 -0700 Subject: [PATCH 165/169] [Gradient Compression] Surface C++ comm hooks to Python API as built-in comm hooks (#46959) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46959 1. Implemented the Pybind part. 2. In the reducer, once the builtin_comm_hook_type is set, a c++ comm hook instance will be created in Reducer::autograd_hook. 3. Added unit tests for the builit-in comm hooks. 
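A minimal usage sketch from the Python side (process group setup elided; `module` and `rank` are placeholders):

```
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# assumes dist.init_process_group(...) has already been called
model = DistributedDataParallel(module, device_ids=[rank])
# select a built-in C++ hook instead of registering a Python hook
model._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
```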
Original PR issue: C++ DDP Communication Hook https://github.com/pytorch/pytorch/issues/46348 ghstack-source-id: 115629230 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_builtin_ddp_comm_hooks_nccl Reviewed By: pritamdamania87 Differential Revision: D24471910 fbshipit-source-id: f96b752298549ea2067e2568189f1b394abcd99a --- test/distributed/test_c10d.py | 45 ++++++++++++++++++- torch/csrc/distributed/c10d/comm.h | 3 +- .../distributed/c10d/default_comm_hooks.h | 13 ++++++ torch/csrc/distributed/c10d/init.cpp | 44 ++++++++++++------ torch/csrc/distributed/c10d/reducer.cpp | 33 +++++++++++++- torch/csrc/distributed/c10d/reducer.h | 7 +++ .../algorithms/ddp_comm_hooks/__init__.py | 2 + torch/lib/c10d/ProcessGroupNCCL.hpp | 30 +++++++++++-- torch/nn/parallel/distributed.py | 36 ++++++++++++++- 9 files changed, 192 insertions(+), 21 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index ff9fe993460c..270acf9bf5fc 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -3398,6 +3398,21 @@ def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bu return gpu_model + def _gpu_model_with_builtin_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Register a built-in DDP communication hook if defined + if hook is not None: + gpu_model._register_builtin_comm_hook(hook) + + return gpu_model + def _run_and_verify_hook(self, model, input, expected_grad): # Run forward output = model(input, self.rank) @@ -3474,18 +3489,46 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: # check whether the grads are equal to what DDP without hook would return. self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + def _test_builtin_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether built-in DDP communication hooks ALLREDUCE and FP16_COMPRESS + can give the same result result with the case of no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + for comm_hook_type in [dist.BuiltinCommHookType.ALLREDUCE, dist.BuiltinCommHookType.FP16_COMPRESS]: + # Get GPU model with the built-in allreduce communication hook. + gpu_model = self._gpu_model_with_builtin_ddp_comm_hook( + process_group, comm_hook_type, gradient_as_bucket_view) + + # check whether the grads are equal to what DDP without hook would return. 
+ self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm def test_ddp_comm_hook_allreduce_hook_nccl(self): self._test_ddp_comm_hook_allreduce_hook_nccl() + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_builtin_ddp_comm_hooks_nccl(self): + self._test_builtin_ddp_comm_hooks_nccl() + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self): + self._test_builtin_ddp_comm_hooks_nccl(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm @@ -3603,7 +3646,7 @@ def dummy_hook(state, bucket): model._register_comm_hook(None, dummy_hook) with self.assertRaisesRegex( - RuntimeError, "register_comm_hook can only be called once." + RuntimeError, "register_comm_hook or register_builtin_comm_hook can only be called once." ): model._register_comm_hook(None, dummy_hook) diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 1eb78b201569..58f40f81ccb3 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -60,8 +60,9 @@ class TORCH_PYTHON_API CommHookInterface { // This CppCommHook interface only requires implementing runHook method that // potentially uses a state. +// Still need TORCH_PYTHON_API instead of TORCH_API to support Windows platform. template -class TORCH_API CppCommHookInterface : public CommHookInterface { +class TORCH_PYTHON_API CppCommHookInterface : public CommHookInterface { public: explicit CppCommHookInterface(T& state) : state_(state) {} diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.h b/torch/csrc/distributed/c10d/default_comm_hooks.h index dd141fe0eb58..140fc2505ae4 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.h +++ b/torch/csrc/distributed/c10d/default_comm_hooks.h @@ -5,13 +5,26 @@ namespace c10d { +enum class BuiltinCommHookType { + ALLREDUCE = 1, + FP16_COMPRESS = 2, +}; + class AllReduceCommHook : public CppCommHookInterface { + public: + explicit AllReduceCommHook(ProcessGroup* state) + : CppCommHookInterface(state) {} + ~AllReduceCommHook() override {} c10::intrusive_ptr runHook(GradBucket& bucket) override; }; class FP16CompressCommHook : public CppCommHookInterface { + public: + explicit FP16CompressCommHook(ProcessGroup* state) + : CppCommHookInterface(state) {} + ~FP16CompressCommHook() override {} c10::intrusive_ptr runHook(GradBucket& bucket) override; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 27155d81e5c1..7568e9979530 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -124,19 +124,25 @@ class PythonStore : public ::c10d::Store { } }; -// This method is called from DDP's Python API. Its inputs are -// a c10d reducer object, state, and callable comm_hook. State and -// comm_hook inputs are Python objects and this function creates a -// c10d PythonCommHook object using these inputs. It later calls -// register_comm_hook function of the reducer input to register that -// PythonCommHook object. +// Called from DDP's Python API to create a c10d Python comm hook object. +// The input state and callable comm_hook are Python objects. It later calls +// register_comm_hook function of the reducer input to register the hook. 
void _register_comm_hook( ::c10d::Reducer& reducer, py::object state, py::object comm_hook) { reducer.register_comm_hook(std::make_unique<::c10d::PythonCommHook>( std::move(state), std::move(comm_hook))); -}; +} + +// Called from DDP's Python API to create a c10d C++ comm hook. +// The input is an enum hook type. It later calls register_builtin_comm_hook +// function of the reducer input to set the hook type. +void _register_builtin_comm_hook( + ::c10d::Reducer& reducer, + ::c10d::BuiltinCommHookType comm_hook_type) { + reducer.register_builtin_comm_hook(comm_hook_type); +} PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { C10_LOG_API_USAGE_ONCE("c10d.python.import"); @@ -147,12 +153,19 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { auto module = py::handle(c10d_module).cast(); - module.def( - "_register_comm_hook", - &_register_comm_hook, - py::arg("reducer"), - py::arg("state"), - py::arg("comm_hook")); + module + .def( + "_register_comm_hook", + &_register_comm_hook, + py::arg("reducer"), + py::arg("state"), + py::arg("comm_hook"), + py::call_guard()) + .def( + "_register_builtin_comm_hook", + &_register_builtin_comm_hook, + py::arg("reducer"), + py::arg("comm_hook_type")); shared_ptr_class_<::c10d::GradBucket>(module, "_GradBucket") .def(py::init&>(), py::arg("tensors")) @@ -168,6 +181,11 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { a single tensor. )"); + py::enum_<::c10d::BuiltinCommHookType>(module, "BuiltinCommHookType", R"( +An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_COMPRESS``.)") + .value("ALLREDUCE", ::c10d::BuiltinCommHookType::ALLREDUCE) + .value("FP16_COMPRESS", ::c10d::BuiltinCommHookType::FP16_COMPRESS); + shared_ptr_class_<::c10d::Reducer>(module, "Reducer") .def( py::init< diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 6f0b22b73739..ba114560b144 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -208,7 +208,8 @@ Reducer::Reducer( // used for algorithms like Gradient Compression/GossipGrad. This hook can be // registered from Python API using `register_comm_hook`. `PythonCommHook` // enables registering a Python hook and is a subclass of `CommHookInterface`. -// `CommHookInterface` can be used to implement CPP hooks in the future. +// Additionally, there are also some built-in C++ hook implementations that can +// be specified by calling `register_builtin_comm_hook` from Python API. Reducer::~Reducer() noexcept(false) { // Remove all hooks on variables registered by this Reducer. This is necessary @@ -1369,7 +1370,8 @@ bool Reducer::rebuild_buckets() { // See Note [DDP Communication Hook] void Reducer::register_comm_hook(std::unique_ptr iface) { TORCH_CHECK( - comm_hook_ == nullptr, "register_comm_hook can only be called once."); + comm_hook_ == nullptr, + "register_comm_hook or register_builtin_comm_hook can only be called once."); // TODO(@sinannasir): Single-process multiple-device mode support for DDP // communication hook. Related to GH Issue #42542. 
TORCH_CHECK( @@ -1379,6 +1381,33 @@ void Reducer::register_comm_hook(std::unique_ptr iface) { comm_hook_ = std::move(iface); } +// See Note [DDP Communication Hook] +void Reducer::register_builtin_comm_hook( + c10d::BuiltinCommHookType comm_hook_type) { + TORCH_CHECK( + comm_hook_ == nullptr, + "register_builtin_comm_hook or register_comm_hook can only be called once."); + TORCH_CHECK( + replicas_.size() == 1, + "Communication hook does not support single-process multiple-device mode."); + + switch (comm_hook_type) { + case c10d::BuiltinCommHookType::ALLREDUCE: + comm_hook_ = + std::make_unique(process_group_.get()); + LOG(INFO) << "Built-in communication hook ALLREDUCE is registered."; + break; + case c10d::BuiltinCommHookType::FP16_COMPRESS: + comm_hook_ = + std::make_unique(process_group_.get()); + LOG(INFO) << "Built-in communication hook FP16_COMPRESS is registered."; + break; + default: + TORCH_WARN_ONCE( + "Unknown built-in DDP comm hook type is provided. No comm hook will be used."); + } +} + void Reducer::ensure_prior_reduction_finished() { // Check that any prior reduction has finished. // The variable `require_finalize_` is true until all gradients diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 53b4ecc4f981..c3125e2a3b36 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace c10d { @@ -58,8 +59,14 @@ class Reducer { // Registers a hook to the reducer. The hook is `CommHookInterface` // type to allow both Python and CPP hooks. This function can only // be called once before calling backward. + // Cannot combine with the call of `register_builtin_comm_hook`. void register_comm_hook(std::unique_ptr iface); + // Registers a built-in C++ comm hook to the reducer. This function can only + // be called once before calling backward. + // Cannot combine with the call of `register_comm_hook`. + void register_builtin_comm_hook(c10d::BuiltinCommHookType comm_hook_type); + // Returns a vector of tensors in each bucket in sequential order. std::vector> get_bucket_tensors() const; diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index 6b07e23c9476..9b5fd66bdb57 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -1,6 +1,7 @@ from enum import Enum from functools import partial +import torch.distributed as dist import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default import torch.distributed.algorithms.ddp_comm_hooks.quantization_hooks as quantization from torch.nn.parallel import DistributedDataParallel @@ -38,6 +39,7 @@ def register_ddp_comm_hook( to the DDP model. User can specify the type of hook as an enum ``DDPCommHookType`` type using ``comm_hook_type`` input. State input will be passed to the model. + Uses Python comm hook implementations. 
Example:: >>> register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, model, state) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 3b52616ee3fa..e6f589275a11 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include namespace c10d { @@ -302,8 +304,29 @@ class ProcessGroupNCCL : public ProcessGroup { // Do not free the underlying data storage of value_ before its // usage on futureNCCLCallbackStream_ finish. - TORCH_INTERNAL_ASSERT(record_stream_cb_); - record_stream_cb_(value_, futureNCCLCallbackStream_->unwrap()); + if (record_stream_cb_ != nullptr) { + // If a Python communication hook is used, record_stream_cb_ will be + // set in torch/csrc/jit/python/pybind_utils.h, which allows Python + // dependency to be imported. + record_stream_cb_(value_, futureNCCLCallbackStream_->unwrap()); + } else { + // If a C++ communication hook is used, create and set a record stream + // callback. + TORCH_INTERNAL_ASSERT( + value_.isTensorList() || value_.isTensor(), + "the future value must be either a tensor list or a tensor."); + at::Tensor tensor; + if (value_.isTensorList()) { + const auto tensors = value_.toTensorVector(); + TORCH_INTERNAL_ASSERT( + tensors.size() == 1, "expected exactly 1 tensor"); + tensor = tensors[0]; + } else { + tensor = value_.toTensor(); + } + c10::cuda::CUDACachingAllocator::recordStream( + tensor.storage().data_ptr(), *futureNCCLCallbackStream_); + } // Use the dedicated callback stream to run callback. // Cannot move capture std::function in lambda, because it cannot deduce @@ -558,7 +581,8 @@ class ProcessGroupNCCL : public ProcessGroup { // This function iterates through the list of WorkNCCL objects in the // workList_ corresponding to incomplete collectives and then aborts NCCL // communicators associated with timed out collectives. - void abortTimedOutCollectives(std::unordered_set& abortedCommIds); + void abortTimedOutCollectives( + std::unordered_set& abortedCommIds); void workCleanupLoop(); diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 7d64ed8a4174..1789fea4816b 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -975,7 +975,7 @@ def join(self, divide_by_initial_world_size=True, enable=True): def _register_comm_hook(self, state: object, hook: callable): r""" - Register a communication hook which is an enhancement that provides a + Registers a communication hook which is an enhancement that provides a flexible hook to users where they can specify how DDP aggregates gradients across multiple workers. @@ -1060,6 +1060,40 @@ def _register_comm_hook(self, state: object, hook: callable): self._check_comm_hook(hook) dist._register_comm_hook(self.reducer, state, hook) + def _register_builtin_comm_hook( + self, comm_hook_type: dist.BuiltinCommHookType + ): + r""" + Registers a built-in communication hook that specifies how DDP + aggregates gradients across multiple workers. + The built-in hooks aim to provide efficient C++ implementations for certain hooks, + which might not be as efficient if implemented in Python using a Python communication hook. + + Arguments: + comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as + ALLREDUCE, FP16_COMPRESS, etc. + + .. warning :: + DDP communication hook can only be registered once and should be registered + before calling backward. + + .. 
warning :: + DDP communication hook does not support single-process multiple-device mode. + Gradbucket tensors should consist of only a single tensor. + + .. warning :: + DDP communication hook is experimental and subject to change. + + Example:: + Below is an example of a FP16 compression where gradients are + compressed into 16-bit floating-point numbers before allreduce, and + then decompressed after allreduce. + + >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS) + + """ + dist._register_builtin_comm_hook(self.reducer, comm_hook_type) + def _distributed_broadcast_coalesced( self, tensors, buffer_size, authoritative_rank=0 ): From 54feb00bbde9f6220721259f4a352288ab610c03 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Sat, 31 Oct 2020 10:49:30 -0700 Subject: [PATCH 166/169] Create prototype for AST rewriter (#46410) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46410 Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D24665950 Pulled By: ansley fbshipit-source-id: b72110436126a24ddc294b8ee7b3f691281c1f1b --- test/test_fx_experimental.py | 111 ++++++++++++++++++++++++++++++ torch/fx/experimental/rewriter.py | 87 +++++++++++++++++++++++ torch/fx/node.py | 1 + torch/fx/symbolic_trace.py | 6 +- 4 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 torch/fx/experimental/rewriter.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index d5d3daef722b..54b140e660de 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -2,8 +2,14 @@ from torch.fx.symbolic_trace import symbolic_trace from torch.fx.experimental import GraphManipulation from torch.fx.experimental.Partitioner import Partitioner, Device, PartitionerConfig +from torch.fx.experimental.rewriter import RewritingTracer +from torch.fx.graph_module import GraphModule from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase +from typing import Union, Callable + +def symbolic_trace_with_rewrite(root: Union[torch.nn.Module, Callable]) -> GraphModule: + return GraphModule(root if isinstance(root, torch.nn.Module) else torch.nn.Module(), RewritingTracer().trace(root)) class TestFXExperimental(JitTestCase): def test_find_single_partition(self): @@ -162,5 +168,110 @@ def forward(self, a, b, offset): self.assertEqual(traced(a, b, offset), module_with_submodules(a, b, offset)) assert len(module_with_submodules.graph.nodes) == 24 + def test_call_to_assert_no_msg(self): + + class M(torch.nn.Module): + def forward(self, a, b): + assert a == b + return a + b + m = M() + traced = symbolic_trace_with_rewrite(m) + + # Make sure the graph is well-formed + traced.graph.lint(traced) + + # Check the IR to make sure there's a call_function node with target == "Assert" + self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + + # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to + traced(3, 3) + with self.assertRaisesRegex(AssertionError, ""): + traced(3, 5) + + # Confirm that the output is correct + self.assertEqual(traced(3, 3), m(3, 3)) + + def test_call_to_assert_with_msg(self): + + class M(torch.nn.Module): + def forward(self, a, b): + assert a == b, "test message" + return a + b + m = M() + traced = symbolic_trace_with_rewrite(m) + + # Make sure the graph is well-formed + traced.graph.lint(traced) + + # Check the IR to make sure there's a 
call_function node with target == "Assert" + self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + + # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to + traced(3, 3) + with self.assertRaisesRegex(AssertionError, "test message"): + traced(3, 5) + + # Confirm that the output is correct + self.assertEqual(traced(3, 3), m(3, 3)) + + def test_call_to_assert_with_empty_msg(self): + + class M(torch.nn.Module): + def forward(self, a, b): + assert a == b, "" + return a + b + m = M() + traced = symbolic_trace_with_rewrite(m) + + # Make sure the graph is well-formed + traced.graph.lint(traced) + + # Check the IR to make sure there's a call_function node with target == "Assert" + self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + + # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to + traced(3, 3) + with self.assertRaisesRegex(AssertionError, ""): + traced(3, 5) + + # Confirm that the output is correct + self.assertEqual(traced(3, 3), m(3, 3)) + + def test_call_to_assert_with_multiline_message(self): + + class M(torch.nn.Module): + def forward(self, a, b): + error_msg = """ +An error message with +terrible spacing + """ + assert a == b, error_msg + return a + b + m = M() + traced = symbolic_trace_with_rewrite(m) + + # Make sure the graph is well-formed + traced.graph.lint(traced) + + # Check the IR to make sure there's a call_function node with target == "Assert" + self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + + # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to + error_msg = """ +An error message with +terrible spacing + """ + traced(3, 3) + with self.assertRaisesRegex(AssertionError, error_msg): + traced(3, 5) + + # Confirm that the output is correct + self.assertEqual(traced(3, 3), m(3, 3)) + + def test_traceable_function_with_nonstandard_name(self): + def foo(x): + return torch.relu(x) + traced = symbolic_trace_with_rewrite(foo) + if __name__ == '__main__': run_tests() diff --git a/torch/fx/experimental/rewriter.py b/torch/fx/experimental/rewriter.py new file mode 100644 index 000000000000..9981c884780f --- /dev/null +++ b/torch/fx/experimental/rewriter.py @@ -0,0 +1,87 @@ +import ast +import inspect +import textwrap +import copy +from types import FunctionType +from typing import Union, Callable +from torch.fx.symbolic_trace import Tracer +from torch.fx.graph import Graph +from torch.jit.frontend import normalize_source_lines +import torch + +class AST_Rewriter(ast.NodeTransformer): + """ + Take a FunctionType object representing a `forward` method, then + perform an AST rewrite to swap out nodes that are not symbolically + traceable with a callsite to the FX alternative. + + To support swapping out an AST node, define a new `visit` method on + that node. 
For more details, see: + https://docs.python.org/3/library/ast.html#ast.NodeTransformer + """ + + def rewrite(self, fn: FunctionType): + + # Normalize the source lines + sourcelines, _ = inspect.getsourcelines(fn) + sourcelines = normalize_source_lines(sourcelines) + source = ''.join(sourcelines) + normalized_str = textwrap.dedent(source) + + # Rewrite the original AST + source_ast = ast.parse(normalized_str) + dest_ast = ast.fix_missing_locations(self.visit(source_ast)) + + # Pull out the compiled fucntion from the newly-created Module + code = compile(dest_ast, "", "exec") + globals_dict = copy.copy(fn.__globals__) + keys_before = set(globals_dict.keys()) + exec(code, globals_dict) + new_keys = list(set(globals_dict.keys()) - keys_before) + assert len(new_keys) == 1 + fn_compiled = globals_dict[new_keys[0]] + + # Return the correct FunctionType object + return fn_compiled + + def visit_Assert(self, node): + """ + Swap out the Assert node (Python's `assert`) with a callsite to the + symbolically-traceable torch.Assert function + """ + # Create the Call node + call_node = ast.parse('torch.Assert()', mode='eval').body + msg = node.msg if node.msg else ast.Constant(value="", kind=None) + call_node.args = [node.test, msg] + + # Ensure that the new node conforms to the Python AST grammar + expr_wrapper = ast.Expr(value=call_node) + + # Return the new Call node to signify that we want to use it as + # a replacement for the original Assert node + return ast.copy_location(expr_wrapper, node) + + +class RewritingTracer(Tracer): + def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: + return super().trace(_rewrite(root)) + + +def _rewrite(fn : Union[torch.nn.Module, Callable]) -> Union[torch.nn.Module, Callable]: + if isinstance(fn, torch.nn.Module): + # Rewrite this module's forward() and all of its recursive children's + # forward. Return the new rewritten module hierarchy. 
+ def rewrite_module(m : torch.nn.Module): + class RewrittenModule(torch.nn.Module): + def __init__(self, orig): + super().__init__() + self.__dict__ = copy.copy(orig.__dict__) + RewrittenModule.forward = AST_Rewriter().rewrite(m.forward) + new_m = RewrittenModule(m) + for name, child in new_m.named_children(): + new_m[name] = rewrite_module(child) + return new_m + return rewrite_module(fn) + else: + # Rewrite this single free function + return AST_Rewriter().rewrite(fn) diff --git a/torch/fx/node.py b/torch/fx/node.py index 118b32fe15b0..e43487961b70 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -20,6 +20,7 @@ BaseArgumentTypes ]] + class Node: def __init__(self, graph: 'Graph', name: str, op: str, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 2865c97da22b..f723c625d022 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -6,7 +6,7 @@ from .node import Argument from .graph import Graph from .graph_module import GraphModule -from .proxy import TracerBase +from .proxy import Proxy, TracerBase HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS @@ -206,6 +206,10 @@ def forward(*args, **kwargs): torch.nn.Module.__getattr__ = orig_getattr # type: ignore return self.graph + def _proxy_placeholder(self, name: str, type_expr: Optional[Any] = None) -> Proxy: + return Proxy(self.create_node('placeholder', name, (), {}, type_expr=type_expr), self) + + # Symbolic tracing API # # Given an `nn.Module` or function instance `root`, this function will return a `GraphModule` From 317b78d56ed434bb52030c3472affbd0feed8344 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Sat, 31 Oct 2020 18:05:42 -0700 Subject: [PATCH 167/169] Revert D24665950: Create prototype for AST rewriter Test Plan: revert-hammer Differential Revision: D24665950 (https://github.com/pytorch/pytorch/commit/54feb00bbde9f6220721259f4a352288ab610c03) Original commit changeset: b72110436126 fbshipit-source-id: 961412df006acd33c91a745c809832d5c6494c76 --- test/test_fx_experimental.py | 111 ------------------------------ torch/fx/experimental/rewriter.py | 87 ----------------------- torch/fx/node.py | 1 - torch/fx/symbolic_trace.py | 6 +- 4 files changed, 1 insertion(+), 204 deletions(-) delete mode 100644 torch/fx/experimental/rewriter.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 54b140e660de..d5d3daef722b 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -2,14 +2,8 @@ from torch.fx.symbolic_trace import symbolic_trace from torch.fx.experimental import GraphManipulation from torch.fx.experimental.Partitioner import Partitioner, Device, PartitionerConfig -from torch.fx.experimental.rewriter import RewritingTracer -from torch.fx.graph_module import GraphModule from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase -from typing import Union, Callable - -def symbolic_trace_with_rewrite(root: Union[torch.nn.Module, Callable]) -> GraphModule: - return GraphModule(root if isinstance(root, torch.nn.Module) else torch.nn.Module(), RewritingTracer().trace(root)) class TestFXExperimental(JitTestCase): def test_find_single_partition(self): @@ -168,110 +162,5 @@ def forward(self, a, b, offset): self.assertEqual(traced(a, b, offset), module_with_submodules(a, b, offset)) assert len(module_with_submodules.graph.nodes) == 24 - def test_call_to_assert_no_msg(self): - - class 
M(torch.nn.Module): - def forward(self, a, b): - assert a == b - return a + b - m = M() - traced = symbolic_trace_with_rewrite(m) - - # Make sure the graph is well-formed - traced.graph.lint(traced) - - # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) - - # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to - traced(3, 3) - with self.assertRaisesRegex(AssertionError, ""): - traced(3, 5) - - # Confirm that the output is correct - self.assertEqual(traced(3, 3), m(3, 3)) - - def test_call_to_assert_with_msg(self): - - class M(torch.nn.Module): - def forward(self, a, b): - assert a == b, "test message" - return a + b - m = M() - traced = symbolic_trace_with_rewrite(m) - - # Make sure the graph is well-formed - traced.graph.lint(traced) - - # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) - - # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to - traced(3, 3) - with self.assertRaisesRegex(AssertionError, "test message"): - traced(3, 5) - - # Confirm that the output is correct - self.assertEqual(traced(3, 3), m(3, 3)) - - def test_call_to_assert_with_empty_msg(self): - - class M(torch.nn.Module): - def forward(self, a, b): - assert a == b, "" - return a + b - m = M() - traced = symbolic_trace_with_rewrite(m) - - # Make sure the graph is well-formed - traced.graph.lint(traced) - - # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) - - # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to - traced(3, 3) - with self.assertRaisesRegex(AssertionError, ""): - traced(3, 5) - - # Confirm that the output is correct - self.assertEqual(traced(3, 3), m(3, 3)) - - def test_call_to_assert_with_multiline_message(self): - - class M(torch.nn.Module): - def forward(self, a, b): - error_msg = """ -An error message with -terrible spacing - """ - assert a == b, error_msg - return a + b - m = M() - traced = symbolic_trace_with_rewrite(m) - - # Make sure the graph is well-formed - traced.graph.lint(traced) - - # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) - - # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to - error_msg = """ -An error message with -terrible spacing - """ - traced(3, 3) - with self.assertRaisesRegex(AssertionError, error_msg): - traced(3, 5) - - # Confirm that the output is correct - self.assertEqual(traced(3, 3), m(3, 3)) - - def test_traceable_function_with_nonstandard_name(self): - def foo(x): - return torch.relu(x) - traced = symbolic_trace_with_rewrite(foo) - if __name__ == '__main__': run_tests() diff --git a/torch/fx/experimental/rewriter.py b/torch/fx/experimental/rewriter.py deleted file mode 100644 index 9981c884780f..000000000000 --- a/torch/fx/experimental/rewriter.py +++ /dev/null @@ -1,87 +0,0 @@ -import ast -import inspect -import textwrap -import copy -from types import FunctionType -from typing import Union, Callable -from 
torch.fx.symbolic_trace import Tracer -from torch.fx.graph import Graph -from torch.jit.frontend import normalize_source_lines -import torch - -class AST_Rewriter(ast.NodeTransformer): - """ - Take a FunctionType object representing a `forward` method, then - perform an AST rewrite to swap out nodes that are not symbolically - traceable with a callsite to the FX alternative. - - To support swapping out an AST node, define a new `visit` method on - that node. For more details, see: - https://docs.python.org/3/library/ast.html#ast.NodeTransformer - """ - - def rewrite(self, fn: FunctionType): - - # Normalize the source lines - sourcelines, _ = inspect.getsourcelines(fn) - sourcelines = normalize_source_lines(sourcelines) - source = ''.join(sourcelines) - normalized_str = textwrap.dedent(source) - - # Rewrite the original AST - source_ast = ast.parse(normalized_str) - dest_ast = ast.fix_missing_locations(self.visit(source_ast)) - - # Pull out the compiled fucntion from the newly-created Module - code = compile(dest_ast, "", "exec") - globals_dict = copy.copy(fn.__globals__) - keys_before = set(globals_dict.keys()) - exec(code, globals_dict) - new_keys = list(set(globals_dict.keys()) - keys_before) - assert len(new_keys) == 1 - fn_compiled = globals_dict[new_keys[0]] - - # Return the correct FunctionType object - return fn_compiled - - def visit_Assert(self, node): - """ - Swap out the Assert node (Python's `assert`) with a callsite to the - symbolically-traceable torch.Assert function - """ - # Create the Call node - call_node = ast.parse('torch.Assert()', mode='eval').body - msg = node.msg if node.msg else ast.Constant(value="", kind=None) - call_node.args = [node.test, msg] - - # Ensure that the new node conforms to the Python AST grammar - expr_wrapper = ast.Expr(value=call_node) - - # Return the new Call node to signify that we want to use it as - # a replacement for the original Assert node - return ast.copy_location(expr_wrapper, node) - - -class RewritingTracer(Tracer): - def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: - return super().trace(_rewrite(root)) - - -def _rewrite(fn : Union[torch.nn.Module, Callable]) -> Union[torch.nn.Module, Callable]: - if isinstance(fn, torch.nn.Module): - # Rewrite this module's forward() and all of its recursive children's - # forward. Return the new rewritten module hierarchy. 
- def rewrite_module(m : torch.nn.Module): - class RewrittenModule(torch.nn.Module): - def __init__(self, orig): - super().__init__() - self.__dict__ = copy.copy(orig.__dict__) - RewrittenModule.forward = AST_Rewriter().rewrite(m.forward) - new_m = RewrittenModule(m) - for name, child in new_m.named_children(): - new_m[name] = rewrite_module(child) - return new_m - return rewrite_module(fn) - else: - # Rewrite this single free function - return AST_Rewriter().rewrite(fn) diff --git a/torch/fx/node.py b/torch/fx/node.py index e43487961b70..118b32fe15b0 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -20,7 +20,6 @@ BaseArgumentTypes ]] - class Node: def __init__(self, graph: 'Graph', name: str, op: str, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index f723c625d022..2865c97da22b 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -6,7 +6,7 @@ from .node import Argument from .graph import Graph from .graph_module import GraphModule -from .proxy import Proxy, TracerBase +from .proxy import TracerBase HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS @@ -206,10 +206,6 @@ def forward(*args, **kwargs): torch.nn.Module.__getattr__ = orig_getattr # type: ignore return self.graph - def _proxy_placeholder(self, name: str, type_expr: Optional[Any] = None) -> Proxy: - return Proxy(self.create_node('placeholder', name, (), {}, type_expr=type_expr), self) - - # Symbolic tracing API # # Given an `nn.Module` or function instance `root`, this function will return a `GraphModule` From 19ede75eb95b2436d5e493497a38c1e6be5ec50b Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Sat, 31 Oct 2020 21:34:21 -0700 Subject: [PATCH 168/169] [JIT] Enable ModuleDict non-literal indexing (#45716) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45716 **Summary** This commit enables indexing into `ModuleDict` using a non-literal index if the `ModuleDict` is annotated with `Dict[str, X]`, where `X` is a module interface type. These annotations must be expressed using a class attribute named `__annotations__`, which is a `Dict[str, Type]` where the keys are the names of module attributes and the values are their types. The approach taken by this commit is that these annotations are stored as "hints" along with the corresponding module attributes in the `ConcreteSubmoduleTypeBuilder` instance for each module (which might be a `ModuleDict`). These hints are passed into the `ModuleValue` that is created for desugaring operations on submodules so that indexing into a `ModuleDict` can be emitted as a getitem op into a dict emitted into the graph that represents the `ModuleDict`. **Test Plan** This commit adds unit tests to `TestModuleContainers` to test this feature (`test_typed_module_dict`). 
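For illustration, a minimal sketch of the pattern this enables, mirroring the `test_typed_module_dict` case added below (class names here are illustrative):

    import torch
    from typing import Any

    @torch.jit.interface
    class ModuleInterface(torch.nn.Module):
        def forward(self, inp: Any) -> Any:
            pass

    class Impl(torch.nn.Module):
        def forward(self, inp: Any) -> Any:
            return inp

    class Mod(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.d = torch.nn.ModuleDict({"module": Impl()})

        def forward(self, x: torch.Tensor, key: str) -> Any:
            # The interface annotation on `value` is what lets the compiler emit
            # prim::ModuleDictIndex for a non-literal key.
            value: ModuleInterface = self.d[key]
            return value.forward(x)

    scripted = torch.jit.script(Mod())
    scripted(torch.randn(2, 2), "module")

Without such an annotation, indexing with a non-literal key is still rejected with the existing "Unable to extract string literal index" error.
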
Differential Revision: D24070606 Test Plan: Imported from OSS Reviewed By: ansley Pulled By: SplitInfinity fbshipit-source-id: 6019a7242d53d68fbfc1aa5a49df6cfc0507b992 --- aten/src/ATen/core/interned_strings.h | 1 + test/jit/test_freezing.py | 33 ++++++++++ test/jit/test_module_containers.py | 63 ++++++++++++++++++- torch/csrc/jit/frontend/ir_emitter.cpp | 8 ++- torch/csrc/jit/frontend/sugared_value.cpp | 20 +++++- torch/csrc/jit/frontend/sugared_value.h | 31 ++++++--- torch/csrc/jit/passes/freeze_module.cpp | 7 +++ .../csrc/jit/python/python_sugared_value.cpp | 30 ++++++++- torch/csrc/jit/python/python_sugared_value.h | 3 +- torch/csrc/jit/runtime/register_ops_utils.h | 1 + .../jit/runtime/register_prim_ops_fulljit.cpp | 12 ++++ torch/csrc/jit/serialization/python_print.cpp | 9 +++ 12 files changed, 199 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 7e841b8b5311..c29ff15c2b59 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -70,6 +70,7 @@ namespace c10 { _(prim, ListConstruct) \ _(prim, ListUnpack) \ _(prim, DictConstruct) \ + _(prim, ModuleDictIndex) \ _(prim, EnumName) \ _(prim, EnumValue) \ _(prim, StringIndex) \ diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 5f16b0229c2d..9c59b00a04cd 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -7,6 +7,7 @@ from torch.testing._internal.common_quantization import skipIfNoFBGEMM from torch.jit._recursive import wrap_cpp_module +from typing import Any import io @@ -1222,3 +1223,35 @@ def forward(self, cond: bool): mod_eager = Mod() self.assertEqual(mod_eager(True), frozen_mod(True)) self.assertEqual(mod_eager(False), frozen_mod(False)) + + def test_freeze_module_with_non_static_module_dict_index(self): + """ + Test that a Module contained a non-static ModuleDict index + cannot be frozen. + """ + @torch.jit.interface + class ModuleInterface(torch.nn.Module): + def forward(self, inp: Any) -> Any: + pass + + class ImplementsInterface(torch.nn.Module): + def forward(self, inp: Any) -> Any: + if isinstance(inp, torch.Tensor): + return torch.max(inp, dim=0) + + return inp + + # Test annotation of submodule. + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.d = torch.nn.ModuleDict({"module": ImplementsInterface()}) + + def forward(self, x: torch.Tensor, key: str) -> Any: + value: ModuleInterface = self.d[key] + return value.forward(x) + + m = torch.jit.script(Mod()) + m.eval() + with self.assertRaisesRegex(RuntimeError, "Freezing modules containing prim::ModuleDictIndex is not supported"): + mf = torch._C._freeze_module(m._c) diff --git a/test/jit/test_module_containers.py b/test/jit/test_module_containers.py index b53bf10a70c2..e261124bedb5 100644 --- a/test/jit/test_module_containers.py +++ b/test/jit/test_module_containers.py @@ -1,7 +1,7 @@ import os import sys -from typing import List +from typing import Any, List, Tuple from collections import OrderedDict import torch import torch.nn as nn @@ -428,3 +428,64 @@ def forward(self, inputs): m = MyModule() self.checkModule(m, [torch.randn(2, 2)]) + + def test_typed_module_dict(self): + """ + Test that a type annotation can be provided for a ModuleDict that allows + non-static indexing. 
+ """ + @torch.jit.interface + class ModuleInterface(torch.nn.Module): + def forward(self, inp: Any) -> Any: + pass + + class ImplementsInterface(torch.nn.Module): + def forward(self, inp: Any) -> Any: + if isinstance(inp, torch.Tensor): + return torch.max(inp, dim=0) + + return inp + + class DoesNotImplementInterface(torch.nn.Module): + def forward(self, inp: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return torch.max(inp, dim=0) + + # Test annotation of submodule. + + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.d = torch.nn.ModuleDict({"module": ImplementsInterface()}) + + def forward(self, x: torch.Tensor, key: str) -> Any: + value: ModuleInterface = self.d[key] + return value.forward(x) + + m = Mod() + self.checkModule(m, (torch.randn(2, 2), "module")) + + # Test annotation of self. + class ModDict(torch.nn.ModuleDict): + def __init__(self): + super().__init__({"module": ImplementsInterface()}) + + def forward(self, x: torch.Tensor, key: str) -> Any: + submodule: ModuleInterface = self[key] + return submodule.forward(x) + + m = ModDict() + self.checkModule(m, (torch.randn(2, 2), "module")) + + # Test error message thrown when annotated attribute does not comply with the + # annotation. + class ModWithWrongAnnotation(torch.nn.ModuleDict): + def __init__(self): + super().__init__() + self.d = torch.nn.ModuleDict({"module": DoesNotImplementInterface()}) + + def forward(self, x: torch.Tensor, key: str) -> Any: + submodule: ModuleInterface = self.d[key] + return submodule.forward(x) + + with self.assertRaisesRegex(RuntimeError, r"Attribute module is not of annotated type"): + torch.jit.script(ModWithWrongAnnotation()) diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 201afec1ce4e..a4b239418cfb 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2946,7 +2946,7 @@ struct to_ir { return emitApplyExpr(apply, n_binders, type_hint); } break; case TK_SUBSCRIPT: { - return emitSubscript(Subscript(tree)); + return emitSubscript(Subscript(tree), type_hint); } break; default: return std::make_shared(emitSimpleExpr(tree, type_hint)); @@ -3805,7 +3805,9 @@ struct to_ir { ->output(); } - std::shared_ptr emitSubscript(const Subscript& subscript) { + std::shared_ptr emitSubscript( + const Subscript& subscript, + TypePtr type_hint = nullptr) { const SugaredValuePtr sv = emitSugaredExpr(subscript.value(), 1); const List& subscript_exprs = subscript.subscript_exprs(); const SourceRange& range = subscript.range(); @@ -3875,7 +3877,7 @@ struct to_ir { return std::make_shared( emitMultidimSlicing(range, sliceable, subscript_exprs)); } else { - return sv->getitem(range, method, idx); + return sv->getitem(range, method, idx, std::move(type_hint)); } } } diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp index 57faeeb42265..8810a5a62019 100644 --- a/torch/csrc/jit/frontend/sugared_value.cpp +++ b/torch/csrc/jit/frontend/sugared_value.cpp @@ -377,7 +377,8 @@ Value* SimpleValue::len(const SourceRange& loc, Function& m) { SugaredValuePtr SimpleValue::getitem( const SourceRange& loc, Function& m, - Value* idx) { + Value* idx, + TypePtr type_hint) { Value* val = getValue(); TypePtr val_type = val->type(); Graph& g = *m.graph(); @@ -393,6 +394,17 @@ SugaredValuePtr SimpleValue::getitem( return std::make_shared( g.insert(aten::select, {val, 0, idx}, {}, loc)); } else if (auto class_type = val_type->cast()) { + // Check if this is an 
indexing operation enabled by a type hint. + // The ModuleDict has already been checked during IR generation to make + // sure its contents implement the module interface referred to by + // type_hint. + if (class_type->is_module() && type_hint) { + auto res = g.insert(prim::ModuleDictIndex, {val, idx}, {}, loc); + res->setType(type_hint); + return std::make_shared(res); + } + + // Defer to the __getitem__ attr on the class. return attr(loc, m, "__getitem__")->call(loc, m, {idx}, {}, 1); } else { throw ErrorReport(loc) << "'" << val_type->repr_str() << "'" @@ -485,7 +497,8 @@ Value* RangeValue::len(const SourceRange& loc, Function& m) { SugaredValuePtr RangeValue::getitem( const SourceRange& loc, Function& m, - Value* idx) { + Value* idx, + TypePtr type_hint) { if (has_only_end_) { return std::make_shared(idx); } else { @@ -535,7 +548,8 @@ Value* IterableTree::len(const SourceRange& loc, Function& m) { SugaredValuePtr IterableTree::getitem( const SourceRange& loc, Function& m, - Value* idx) { + Value* idx, + TypePtr type_hint) { std::vector child_items; for (const SugaredValuePtr& child : children_) { child_items.emplace_back(child->getitem(loc, m, idx)); diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index f85e75509898..28a18aceda49 100644 --- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -139,7 +139,8 @@ struct TORCH_API SugaredValue virtual std::shared_ptr getitem( const SourceRange& loc, Function& m, - Value* idx) { + Value* idx, + TypePtr type_hint = nullptr) { throw ErrorReport(loc) << "'" << kind() << "'" << " object is not subscriptable"; } @@ -193,8 +194,11 @@ struct TORCH_API SimpleValue : public SugaredValue { } Value* len(const SourceRange& loc, Function& m) override; - SugaredValuePtr getitem(const SourceRange& loc, Function& m, Value* idx) - override; + SugaredValuePtr getitem( + const SourceRange& loc, + Function& m, + Value* idx, + TypePtr type_hint = nullptr) override; private: Value* value_; @@ -251,8 +255,11 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { return "Tuple"; } - SugaredValuePtr getitem(const SourceRange& loc, Function& m, Value* idx) - override { + SugaredValuePtr getitem( + const SourceRange& loc, + Function& m, + Value* idx, + TypePtr type_hint = nullptr) override { if (!(idx->type()->cast() && toIValue(idx))) { throw ErrorReport(loc) << "Expected integer literal for index. 
" @@ -603,8 +610,11 @@ struct TORCH_API RangeValue : SugaredValue { return "range"; } Value* len(const SourceRange& loc, Function& m) override; - SugaredValuePtr getitem(const SourceRange& loc, Function& m, Value* idx) - override; + SugaredValuePtr getitem( + const SourceRange& loc, + Function& m, + Value* idx, + TypePtr type_hint = nullptr) override; std::shared_ptr iter(const SourceRange& loc, Function& m) override; @@ -679,8 +689,11 @@ struct TORCH_API IterableTree : SugaredValue { std::vector get_base_iterables(); Value* len(const SourceRange& loc, Function& m) override; - SugaredValuePtr getitem(const SourceRange& loc, Function& m, Value* idx) - override; + SugaredValuePtr getitem( + const SourceRange& loc, + Function& m, + Value* idx, + TypePtr type_hint = nullptr) override; private: c10::optional unroll_length_ = c10::nullopt; diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 6b5beb4372a8..76b6f1d234ba 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -212,6 +212,13 @@ class AttributePropagator { for (Block* sub_block : n->blocks()) { blocks.push(sub_block); } + + // Modules with prim::ModuleDictIndex cannot be frozen because they + // return InterfaceTypes. + TORCH_CHECK( + n->kind() != prim::ModuleDictIndex, + "Freezing modules containing prim::ModuleDictIndex is not supported"); + if (n->kind() == prim::SetAttr || n->kind() == prim::GetAttr) { // By default if interface attributes are present then fail freezing. // If freezingInterfaces is on then Interfaces are folded similarly diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 071347561665..7da82644150e 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -234,9 +234,11 @@ SugaredValuePtr ModuleValue::asTupleValue(const SourceRange& loc, Function& m) { SugaredValuePtr ModuleValue::getitem( const SourceRange& loc, Function& m, - Value* idx) { + Value* idx, + TypePtr type_hint) { if (concreteType_->getIterableModuleKind() == IterableModuleKind::LIST) { - return getSugaredDict(loc, m)->getModules()->getitem(loc, m, idx); + return getSugaredDict(loc, m)->getModules()->getitem( + loc, m, idx, type_hint); } else if ( concreteType_->getIterableModuleKind() == IterableModuleKind::DICT) { if (auto ivalue = toIValue(idx)) { @@ -252,6 +254,30 @@ SugaredValuePtr ModuleValue::getitem( } } throw ErrorReport(loc) << "Key Error, " << idx_str; + } else if (type_hint) { + // Check that all submodules comply with the type hint. + const auto& self_type = concreteType_->getJitType()->expect(); + for (size_t i = 0; i < self_type->numAttributes(); ++i) { + const auto& attr_type = self_type->getAttribute(i); + if (attr_type->is_module()) { + if (!attr_type->isSubtypeOf(type_hint)) { + auto loc = self_->node()->sourceRange(); + throw ErrorReport(loc) + << "Attribute " << self_type->getAttributeName(i) + << " is not of annotated type " << type_hint->annotation_str(); + } + } + } + + // Emit a prim::ModuleDictIndex operator. This is needed because it's + // difficult to construct a dict in the graph representing the ModuleDict + // and use aten::__getitem__ ops to index into it because any call to + // ModuleDict.setAttr would invalidate that emitted dict. 
+ auto graph = m.graph(); + auto* getitem_node = + graph->insertNode(graph->create(prim::ModuleDictIndex, {self_, idx})); + getitem_node->output(0)->setType(type_hint); + return std::make_shared(getitem_node->output(0)); } throw ErrorReport(loc) << "Unable to extract string literal index. " diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index a399fa543dfb..12a5d87b063e 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -201,7 +201,8 @@ struct VISIBILITY_HIDDEN ModuleValue : public SugaredValue { std::shared_ptr getitem( const SourceRange& loc, Function& m, - Value* idx) override; + Value* idx, + TypePtr type_hint) override; private: Value* self_; diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h index ae974c063ef3..5bd85d20556a 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.h +++ b/torch/csrc/jit/runtime/register_ops_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index f7dd9594347e..4706635a6a0c 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -568,6 +568,18 @@ RegisterOperators reg( }; }, aliasAnalysisSpecialCase()), + // This operator is generated inside the compiler for indexing into + // ModuleDict without a statically determinable key. Accordingly, + // self must be a ModuleType and the output must be an InterfaceType. + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA( + "prim::ModuleDictIndex(Any self, str ind) -> Any"), + [](Stack* stack) { + IValue ind = pop(stack); + IValue module_dict = pop(stack); + push(stack, module_dict.toModule().attr(ind.toStringRef())); + }, + aliasAnalysisFromSchema()), Operator( "aten::dict() -> Dict(str, Tensor)", [](Stack* stack) { diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index 0f63770aa792..9803829eb683 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -822,6 +822,15 @@ struct PythonPrintImpl { body_ << "):\n"; printBody(graph->block()); } break; + case prim::ModuleDictIndex: { + const auto dict = node->inputs().at(0); + const auto key = node->inputs().at(1); + const auto out = node->outputs().at(0); + assignValuesToTheirUniqueNames(out); + indent(); + body_ << useOf(out) << " : " << out->type()->annotation_str() << " = " + << useOf(dict) << "[" << useOf(key) << "]\n"; + } break; default: auto ss = std::make_shared(&source_range_stack_); printRHS(*ss, node); From 1cc1da541158b53c24a579fd316d190d5c27bfd1 Mon Sep 17 00:00:00 2001 From: Venkata Chintapalli Date: Sun, 1 Nov 2020 14:29:33 -0800 Subject: [PATCH 169/169] LayerNormInt8QuantizeFakeNNPI fix to match ICEREF. (#47140) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47140 LayerNorm + Int8Quantize fix to match ICEREF. (Note: this ignores all push blocking failures!) 
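As a rough NumPy sketch of the numeric intent (fp32 here, normalizing over the last axis and ignoring the optional elementwise affine parameters; eps and the qmin/qmax values are assumptions, and the real kernel emulates fp16 arithmetic): the quantized output becomes round(y * inv_scale + Y_offset) clamped to the quantized range, with no intermediate -128/+128 adjustment around the rounding step.

    import numpy as np

    def layernorm_int8_sketch(x, Y_scale, Y_offset, qmin=0, qmax=255, eps=1e-5):
        # LayerNorm over the last axis.
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        y = (x - mean) / np.sqrt(var + eps)
        # Affine quantization: apply the zero point directly, then round and clamp.
        q = np.round(y * (1.0 / Y_scale) + Y_offset)
        return np.clip(q, qmin, qmax).astype(np.uint8)
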
Test Plan: buck test --debug //caffe2/caffe2/contrib/fakelowp/test:test_layernorm_nnpi_fp16nnpi -- test_fused_ln_quantize --print-passing-details https://internalfb.com/intern/testinfra/testrun/7881299371969005 Reviewed By: hyuen Differential Revision: D24659904 fbshipit-source-id: 026d1a1f69a68eca662a39752af5ab0756bace2d --- caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h | 3 +-- caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h b/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h index 4aef84663adc..ddeea5d5f56c 100644 --- a/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h @@ -189,7 +189,7 @@ class LayerNormFakeFp16Op final : public Operator { int Nout = X.numel(); std::vector inv_scalev(Nout, inv_scale); - std::vector offsetv(Nout, Y_offset - 128.0); + std::vector offsetv(Nout, Y_offset); uint8_t* Y_uint8_data = Y_int8->t.template mutable_data(); fake_fp16::fma_fp16(Nout, Y_fp16.data(), inv_scalev.data(), offsetv.data()); @@ -200,7 +200,6 @@ class LayerNormFakeFp16Op final : public Operator { for (int i = 0; i < Nout; i++) { float halfRes = offsetv[i]; halfRes = round(halfRes); - halfRes = halfRes + 128.0; if (std::isinf(halfRes)) { if (halfRes > 0) { halfRes = qmax; diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 9ff0986116b6..5129a38c5241 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -1,8 +1,3 @@ - - - - - import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa from caffe2.proto import caffe2_pb2