From 6c327ef9d465c1b31eea71f957ca3799869e613b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 15 Apr 2021 12:29:56 -0700 Subject: [PATCH 01/45] matches_jit_signatures is dead (#53637) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/53637 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D26920687 Pulled By: ezyang fbshipit-source-id: 288bd9dca63da04ccc633d939833066a3305a68a --- aten/src/ATen/native/README.md | 14 -------------- tools/codegen/gen.py | 1 - tools/shared/cwrap_common.py | 2 -- 3 files changed, 17 deletions(-) diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 14700ed6a6cb..38843ebda856 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -365,20 +365,6 @@ you're a function that simply does not interact with any devices. In that case, code generation of the device guard can be disabled by adding `device_guard: False` to your function definition. -### `matches_jit_signature` - -``` -matches_jit_signature: False -``` - -This will indicate that the func syntax does not follow the JIT signature schema. -If you are a triggering an assert related to JIT signature compliance -try adding this field and setting it to False. In general, this serves as a means -of tracking an ongoing schema unification with the goal of aligning func syntax -with other components of PyTorch in order to reduce overall complexity. -If you find yourself having to set this field to False add @gchanan to your PR's -set of reviewers. - ### `manual_kernel_registration` ``` diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 1cb11e0df78f..99cbc56b64c9 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -628,7 +628,6 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('overload_name', str(f.func.name.overload_name)), ('manual_kernel_registration', f.manual_kernel_registration), ('category_override', f.category_override if f.category_override is not None else ''), - ('matches_jit_signature', True), ('schema_string', f'aten::{f.func}'), ('arguments', arguments), ('schema_order_cpp_signature', schema_order_cpp_signature), diff --git a/tools/shared/cwrap_common.py b/tools/shared/cwrap_common.py index bed58f03e423..d35d7d1bde3d 100644 --- a/tools/shared/cwrap_common.py +++ b/tools/shared/cwrap_common.py @@ -25,8 +25,6 @@ def set_declaration_defaults(declaration): # This happens for legacy TH bindings like # _thnn_conv_depthwise2d_backward declaration['schema_string'] = '' - if 'matches_jit_signature' not in declaration: - declaration['matches_jit_signature'] = False declaration.setdefault('arguments', []) declaration.setdefault('return', 'void') if 'cname' not in declaration: From 5ed3be799d8ca063d1b990065521868a83cd4ff1 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Thu, 15 Apr 2021 13:18:11 -0700 Subject: [PATCH 02/45] skip test_filtering_env_var for rocm (#56178) Summary: ROCM doesn't report the correct number of expected test device type. Skipping for now. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56178 Reviewed By: seemethere Differential Revision: D27802139 Pulled By: walterddr fbshipit-source-id: 2e58df1a3ba2411e690be52babf946e284c4efcc --- test/test_testing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_testing.py b/test/test_testing.py index 8a26262e9810..1dc16fb8ed5d 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -6,7 +6,7 @@ import unittest from torch.testing._internal.common_utils import \ - (IS_SANDCASTLE, IS_WINDOWS, TestCase, make_tensor, run_tests, slowTest) + (IS_SANDCASTLE, IS_WINDOWS, TestCase, make_tensor, run_tests, skipIfRocm, slowTest) from torch.testing._internal.framework_utils import calculate_shards from torch.testing._internal.common_device_type import \ (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes, @@ -685,6 +685,7 @@ def test_calculate_2_shards_against_optimal_shards(self): # All the tests should be represented by some shard self.assertEqual(sorted_tests, sorted_shard_tests) + @skipIfRocm @unittest.skipIf(IS_WINDOWS, "Skipping because doesn't work for windows") @unittest.skipIf(IS_SANDCASTLE, "Skipping because doesn't work on sandcastle") def test_filtering_env_var(self): From 63f83edcfbaaeea2c4dedb68a13779ef95151140 Mon Sep 17 00:00:00 2001 From: Xue Haotian Date: Thu, 15 Apr 2021 13:24:54 -0700 Subject: [PATCH 03/45] OpInfo porting for torch.real & torch.imag (#55134) Summary: Related https://github.com/pytorch/pytorch/issues/54298 This PR ports the method_tests() entries of torch.real & torch.imag to OpInfo. Pull Request resolved: https://github.com/pytorch/pytorch/pull/55134 Reviewed By: agolynski Differential Revision: D27793242 Pulled By: anjali411 fbshipit-source-id: 0e9a987bfef16e78a1cda81ce14970993a59e467 --- test/test_autograd.py | 2 +- torch/testing/_core.py | 2 +- .../_internal/common_methods_invocations.py | 33 ++++++++++++++++--- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index d24824b5b207..d76d3ee5731e 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -5340,7 +5340,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, # the tests for these ops which do not have 'complex' in variant should not run for complex # and only run for floating point -separate_complex_tests = ['view_as_real', 'real', 'imag', 'div', 'pow', '__rdiv__', 'add', 'sub'] +separate_complex_tests = ['view_as_real', 'div', 'pow', '__rdiv__', 'add', 'sub'] # NOTE: Some non-holomorphic are separately tested in TestAutogradComplex until gradcheck works properly # for non-holomorphic functions diff --git a/torch/testing/_core.py b/torch/testing/_core.py index cc18d2baf77e..02f165ce839e 100644 --- a/torch/testing/_core.py +++ b/torch/testing/_core.py @@ -345,7 +345,7 @@ def all_types(): def all_types_and(*dtypes): return _all_types + _validate_dtypes(*dtypes) -_complex_types = (torch.cfloat, torch.cdouble) +_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble)) def complex_types(): return _complex_types diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index bd16944ef910..54faa9f98ec4 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14,7 +14,7 @@ from typing import List, Sequence, Tuple, Dict, Any, Union from torch.testing import \ - (make_non_contiguous, 
floating_types, floating_types_and, + (make_non_contiguous, floating_types, floating_types_and, complex_types, floating_and_complex_types, floating_and_complex_types_and, all_types_and_complex_and, all_types_and, all_types_and_complex, integral_types_and, all_types) @@ -203,7 +203,9 @@ def __init__(self, # NOTE: if the op is unspecified it is assumed to be under the torch namespace self.op = op if op else _getattr_qual(torch, self.name) - self.method_variant = getattr(torch.Tensor, name, None) + method_variant = getattr(torch.Tensor, name, None) + # attributes like real, imag are not callable + self.method_variant = method_variant if callable(method_variant) else None inplace_name = name + "_" self.inplace_variant = getattr(torch.Tensor, inplace_name, None) self.operator_variant = getattr(operator, name, None) @@ -2325,7 +2327,6 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs): low=low, requires_grad=requires_grad))) - def sample_inputs_rsub(op_info, device, dtype, requires_grad, variant='tensor', **kwargs): def _make_tensor_helper(shape, low=None, high=None): return make_tensor(shape, device, dtype, low=low, high=high, requires_grad=requires_grad) @@ -3356,6 +3357,18 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics_extremal', active_if=IS_WINDOWS), )), + UnaryUfuncInfo('imag', + ref=np.imag, + dtypes=complex_types(), + dtypesIfCPU=complex_types(), + dtypesIfCUDA=complex_types(), + dtypesIfROCM=complex_types(), + supports_out=False, + supports_autograd=False, + skips=( + # Skip since real and imag don't have out variants. + SkipInfo('TestUnaryUfuncs', 'test_out_arg_all_dtypes'), + )), OpInfo('linalg.householder_product', aten_name='linalg_householder_product', op=torch.linalg.householder_product, @@ -3787,6 +3800,18 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=[torch.bfloat16]), ), safe_casts_outputs=True), + UnaryUfuncInfo('real', + ref=np.real, + dtypes=complex_types(), + dtypesIfCPU=complex_types(), + dtypesIfCUDA=complex_types(), + dtypesIfROCM=complex_types(), + supports_out=False, + supports_autograd=False, + skips=( + # Skip since real and imag don't have out variants. + SkipInfo('TestUnaryUfuncs', 'test_out_arg_all_dtypes'), + )), UnaryUfuncInfo('round', ref=np.round, dtypes=floating_types_and(torch.half), @@ -4747,8 +4772,6 @@ def method_tests(): ('expand', (), (dont_convert(()),), 'scalar_to_scalar'), ('expand', (), (1, 3, 2), 'scalar_to_dims', (False,)), ('expand_as', (S, 1, 1), (torch.rand(S, S, S),), '', (False,)), - ('real', (S, S, S), NO_ARGS, 'complex'), - ('imag', (S, S, S), NO_ARGS, 'complex'), ('view_as_real', (S, S, S), NO_ARGS, 'complex'), ('view_as_complex', (S, S, 2), NO_ARGS), ('complex', (S, S, S), ((S, S, S),), ''), From 06ea73942a01304326f02eac797322c33b9f46e7 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Thu, 15 Apr 2021 13:43:00 -0700 Subject: [PATCH 04/45] [easy] Rename fb::jpeg_decode_to_NCHW to fb::image_decode_to_NCHW (#55857) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55857 Since OpenCV supports more than just the JPEG file format. 
ghstack-source-id: 126528422 Test Plan: Build Reviewed By: JacobSzwejbka Differential Revision: D27722865 fbshipit-source-id: 6cf83bf187bb1fb3a28e3aa2a011959ef8925449 --- test/test_bundled_images.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_bundled_images.py b/test/test_bundled_images.py index 18883b9e791f..0c95ae39c582 100644 --- a/test/test_bundled_images.py +++ b/test/test_bundled_images.py @@ -69,11 +69,11 @@ def forward(self, arg): self.assertEqual(raw_data.shape, decoded_data.shape) self.assertTrue(torch.allclose(raw_data, decoded_data, atol=0.1, rtol=1e-01)) - # Check if fb::jpeg_decode_to_NCHW works as expected + # Check if fb::image_decode_to_NCHW works as expected with open("caffe2/test/test_img/p1.jpg", "rb") as fp: weight = torch.full((3,), 1.0 / 255.0).diag() bias = torch.zeros(3) byte_tensor = torch.tensor(list(fp.read())).byte() - im2_tensor = torch.ops.fb.jpeg_decode_to_NCHW(byte_tensor, weight, bias) + im2_tensor = torch.ops.fb.image_decode_to_NCHW(byte_tensor, weight, bias) self.assertEqual(raw_data.shape, im2_tensor.shape) self.assertTrue(torch.allclose(raw_data, im2_tensor, atol=0.1, rtol=1e-01)) From d56f4518209d233d7da603cf306d862c73970d8d Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 15 Apr 2021 13:57:19 -0700 Subject: [PATCH 05/45] [nnc] Separate printing of optimized llvm bitcode from assembly (#56117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56117 I was debugging an issue during instruction selection and wanted to see the input bitcode. This way we always print it before going into the asm generation pass. ghstack-source-id: 126592596 Test Plan: Run with `PYTORCH_JIT_LOG_LEVEL=">>llvm_codegen"` Reviewed By: huiguoo Differential Revision: D27781683 fbshipit-source-id: 84635d0ca2a1318ae7a9a73cc1d2df450d8b6a08 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index b25e4d6c738e..e1903d81af96 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -550,10 +550,13 @@ void LLVMCodeGenImpl::emitKernel( optimize(*module_); - // print graph debug info after optimization asmBuffer.set_size(0); module_->print(asmStream, nullptr); llvmCode = asmStream.str().str(); + GRAPH_DEBUG( + "\nLLVM module after optimizations\n\n", asmStream.str().str(), "\n"); + + // print graph debug info after optimization asmBuffer.set_size(0); llvm::legacy::PassManager PM; jit_->getTargetMachine().addPassesToEmitFile( @@ -568,8 +571,7 @@ void LLVMCodeGenImpl::emitKernel( PM.run(*module_); asmCode = asmStream.str().str(); - GRAPH_DEBUG( - "\nLLVM module after optimizations\n\n", llvmCode, "\n", asmCode, "\n"); + GRAPH_DEBUG("\nLLVM generated assembly code\n\n", asmCode, "\n"); } // TODO: The binary ops are copypasta. 
From 16820bba5ab5ab6a0f5bc51cc29e1c0e0cecc04c Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 15 Apr 2021 13:57:19 -0700 Subject: [PATCH 06/45] [nnc][trivial] Trailing underscore style for llvmCode, asmCode members (#56118) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56118 that's it ghstack-source-id: 126592595 Test Plan: compile Reviewed By: huiguoo Differential Revision: D27781682 fbshipit-source-id: 12728c279d0e02eb007093e18d9fc989456bea77 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index e1903d81af96..9d6ffd5f8642 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -176,8 +176,8 @@ class LLVMCodeGenImpl : public IRVisitor { std::unordered_map> scopeToVar_; const Block* scope_; - std::string llvmCode; - std::string asmCode; + std::string llvmCode_; + std::string asmCode_; private: llvm::LLVMContext& getContext(); @@ -268,10 +268,10 @@ class LLVMCodeGenImpl : public IRVisitor { void optimize(llvm::Module& M); std::string getLLVMCodeText() { - return llvmCode; + return llvmCode_; } std::string getASMCodeText() { - return asmCode; + return asmCode_; } }; @@ -552,7 +552,7 @@ void LLVMCodeGenImpl::emitKernel( asmBuffer.set_size(0); module_->print(asmStream, nullptr); - llvmCode = asmStream.str().str(); + llvmCode_ = asmStream.str().str(); GRAPH_DEBUG( "\nLLVM module after optimizations\n\n", asmStream.str().str(), "\n"); @@ -569,9 +569,9 @@ void LLVMCodeGenImpl::emitKernel( llvm::TargetMachine::CodeGenFileType::CGFT_AssemblyFile); #endif PM.run(*module_); - asmCode = asmStream.str().str(); + asmCode_ = asmStream.str().str(); - GRAPH_DEBUG("\nLLVM generated assembly code\n\n", asmCode, "\n"); + GRAPH_DEBUG("\nLLVM generated assembly code\n\n", asmCode_, "\n"); } // TODO: The binary ops are copypasta. From b940516061cca19bf073b6828ae069a04d8c709d Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 15 Apr 2021 13:57:19 -0700 Subject: [PATCH 07/45] [nnc] Don't fuse fp16 on CPU (#56119) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56119 There are apparently still more issues with fp16 on LLVM so let's just nuke it from orbit while we develop a robust workaround. 
ghstack-source-id: 126619411 Test Plan: compile Reviewed By: ZolotukhinM Differential Revision: D27787080 fbshipit-source-id: 9e771211fe48266f50fca1de8d40295922da5bca --- test/test_jit_fuser_te.py | 6 ++++-- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 9 ++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index bcf8b97bf0a0..35b2f2ff8bee 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -83,7 +83,8 @@ def setUp(self): torch.bool, ] self.fp_dtypes = [ - torch.float16, + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + # torch.float16, torch.float32, torch.float64, ] @@ -1349,7 +1350,8 @@ def test_masked_fill(self): torch.int16, torch.int32, torch.int64, - torch.float16, + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + # torch.float16, torch.float32, torch.float64, torch.bool, diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 0b63ce36e290..d989b07efe6e 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -905,9 +905,10 @@ class TensorExprFuser { for (const Value* v : node->inputs()) { if (auto const& tt = v->type()->cast()) { auto const& st = tt->scalarType(); + auto const& device = tt->device(); // All tensors must be typed. - if (!st) { + if (!st || !device) { return false; } @@ -917,6 +918,12 @@ class TensorExprFuser { return false; } + // Float16 has a few kinks on LLVM. Disable it until we either move to + // a more stable version or find workarounds. + if (*st == c10::ScalarType::Half && *device == c10::kCPU) { + return false; + } + // These operators only support floats, because integer divisors need to // raise ZeroDivisionError. if (node->isMemberOf(float_only_operator_set) && !isFloatingType(*st)) { From 33159b68a3f6662653922967a20aeeadd5114dbc Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Thu, 15 Apr 2021 14:04:50 -0700 Subject: [PATCH 08/45] Revert "Deprecate legacy constructor `torch.Tensor()` (#54414)" (#55831) Summary: This PR reverts https://github.com/pytorch/pytorch/pull/54414 because of https://github.com/pytorch/pytorch/issues/55780 cc ysiraichi Pull Request resolved: https://github.com/pytorch/pytorch/pull/55831 Reviewed By: agolynski Differential Revision: D27762264 Pulled By: heitorschueroff fbshipit-source-id: 8079a660cc440cafb9d22aa031d36dde121e13b3 --- docs/source/tensors.rst | 5 ----- torch/csrc/utils/tensor_new.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 1cd07cacb1c9..580a0d18d528 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -176,11 +176,6 @@ Tensor class reference - To create a tensor with similar type but different size as another tensor, use ``tensor.new_*`` creation ops. - .. warning:: - The :class:`torch.Tensor` constructor is deprecated. Instead, consider using: - :func:`torch.tensor` for creating tensors from tensor-like objects (e.g. lists and tuples); - or :func:`torch.empty` for creating uninitialized tensors with specific sizes (e.g. int). - .. autoattribute:: Tensor.T .. 
autosummary:: diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 8d47ff66b34d..6019095910f9 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -473,11 +473,6 @@ Tensor legacy_tensor_ctor(c10::DispatchKey dispatch_key, at::ScalarType scalar_t return legacy_sparse_tensor_ctor(dispatch_key, scalar_type, args, kwargs); } - TORCH_WARN_ONCE( - "Legacy tensor constructor is deprecated. " - "Use: torch.tensor(...) for creating tensors from tensor-like objects; " - "or torch.empty(...) for creating an uninitialized tensor with specific sizes."); - ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { From 14d529a368e8a88c487cc88f7db1dffa654814b7 Mon Sep 17 00:00:00 2001 From: nikithamalgi Date: Thu, 15 Apr 2021 14:05:47 -0700 Subject: [PATCH 09/45] Add support for refinement for torch.jit.Future (#56148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56148 Fixes issue: #55787 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D27796830 Pulled By: nikithamalgifb fbshipit-source-id: b7a60218010793a54eb52d6b7602d333dc5a1c9e --- test/test_jit.py | 10 ++++++++++ torch/csrc/jit/frontend/script_type_parser.cpp | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index 8e69fc0215bc..d8b57915e627 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -14590,6 +14590,16 @@ def fn(x): self.checkScript(fn, ("h",)) self.checkScript(fn, ("hello",)) + def test_multiline_optional_future_refinement(self): + @torch.jit.script + def fun() -> int: + future: Optional[ + torch.jit.Future[Tuple[torch.Tensor]] + ] = None + + return 1 + self.assertEqual(fun(), 1) + @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") def test_attribute_unpickling(self): tensor = torch.randn(2, 2) diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 19c9747218e9..1ea596f4c881 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -65,7 +65,7 @@ TypePtr ScriptTypeParser::subscriptToType( parseTypeFromExprImpl(*subscript.subscript_exprs().begin()); return OptionalType::create(elem_type); - } else if (typeName == "Future") { + } else if (typeName == "Future" || typeName == "torch.jit.Future") { if (subscript.subscript_exprs().size() != 1) { throw ErrorReport(subscript) << " expected exactly one element type but found " From aae1023bed2106952805435effd0f79c2631cfd0 Mon Sep 17 00:00:00 2001 From: Adam Simpkins Date: Thu, 15 Apr 2021 14:42:20 -0700 Subject: [PATCH 10/45] [caffe2] allow passing options to the DB in Save operations (#55935) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55935 Add a new `DB::SetOptions()` method to allow passing options to the DB as part of Save operations. This can be used for passing in options to control the serialization behavior, such as rate limits or other parameters. The serialization options are passed is an opaque string, so that different DB implementations may choose their own options and options format. This also adds a new `db_options` parameter to the `Save` operator. This allows users to pass in the DB options when saving data. ghstack-source-id: 126589771 Test Plan: I don't have any tests in this diff since no DB implements options yet. 
The next diff in the stack includes an options implementation, along with unit tests that verify the options are passed in correctly. Differential Revision: D27729461 fbshipit-source-id: 4d03250c389c66a049cdee1d05e082f5649ac0f0 --- caffe2/core/db.h | 16 +++++++++++++++- caffe2/operators/load_save_op.cc | 4 ++++ caffe2/operators/load_save_op.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 97657793a70a..812404358a5a 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -3,7 +3,8 @@ #include -#include "c10/util/Registry.h" +#include +#include #include "caffe2/core/blob_serialization.h" #include "caffe2/proto/caffe2_pb.h" @@ -98,6 +99,19 @@ class TORCH_API DB { */ virtual std::unique_ptr NewTransaction() = 0; + /** + * Set DB options. + * + * These options should apply for the lifetime of the DB, or until a + * subsequent SetOptions() call overrides them. + * + * This is used by the Save operator to allow the client to pass in + * DB-specific options to control the behavior. This is an opaque string, + * where the format is specific to the DB type. DB types may pass in a + * serialized protobuf message here if desired. + */ + virtual void SetOptions(c10::string_view /* options */) {} + protected: Mode mode_; diff --git a/caffe2/operators/load_save_op.cc b/caffe2/operators/load_save_op.cc index b409c60f2640..cb4c1839135e 100644 --- a/caffe2/operators/load_save_op.cc +++ b/caffe2/operators/load_save_op.cc @@ -56,6 +56,7 @@ SaveOpImpl::SaveOpImpl( : operator_(op), strip_prefix_(op->template GetSingleArgument("strip_prefix", "")), db_type_(op->template GetSingleArgument("db_type", "")), + db_options_(op->template GetSingleArgument("db_options", "")), blob_names_( op->template GetRepeatedArgument("blob_name_overrides")) { CAFFE_ENFORCE_GT(db_type_.size(), 0, "Must specify a db type."); @@ -165,6 +166,9 @@ bool SaveOpImpl::RunOnDevice() { " (while trying to open ", full_db_name_, ")"); + if (!db_options_.empty()) { + out_db->SetOptions(db_options_); + } BlobSerializerBase::SerializationAcceptor acceptor = [&](const std::string& blobName, const std::string& data) { diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h index 9859e10351d7..07170cb85441 100644 --- a/caffe2/operators/load_save_op.h +++ b/caffe2/operators/load_save_op.h @@ -309,6 +309,7 @@ class TORCH_API SaveOpImpl { std::string strip_prefix_; std::string full_db_name_; std::string db_type_; + std::string db_options_; std::vector blob_names_; SerializationOptions options_; }; From 9bfe16a308c8d3ccc5dcc318eaa12a4bb42f4614 Mon Sep 17 00:00:00 2001 From: mattip Date: Thu, 15 Apr 2021 14:44:28 -0700 Subject: [PATCH 11/45] should_check_autodiff is now should_autodiff_node (#56013) Summary: The name `should_check_autodiff` became `should_autodiff_node` but documentation did not change. The identifier is used in `test/test_jit.py`. 
It seems the file is too big for github to link to the line, but it is the return value from `normalize_check_ad` Pull Request resolved: https://github.com/pytorch/pytorch/pull/56013 Reviewed By: agolynski Differential Revision: D27800008 Pulled By: Lilyjjo fbshipit-source-id: 88a43c14c0f48fb3f94792e3fd6de2bd6a59a1a2 --- torch/csrc/jit/OVERVIEW.md | 6 +++--- torch/testing/_internal/common_methods_invocations.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md index e96ce664a6f5..49abf278611d 100644 --- a/torch/csrc/jit/OVERVIEW.md +++ b/torch/csrc/jit/OVERVIEW.md @@ -1361,13 +1361,13 @@ If your PR adds/updates a gradient formula for `torch`/`nn` functions, you **MUS - `torch` functions: `method_tests` in [common_method_tests.py](../../../test/common_method_tests.py) - `nn` functions: `nn_functional_tests` in [test_jit.py](../../../test/test_jit.py) -To turn on autodiff check, you can add an optional `check_ad(should_check_autodiff[bool], nonfusible_nodes[str|list[str]], fusible_nodes[str|list[str]])` tuple after the optional test variant name field. -If `should_check_autodiff=True`, the differentiated traced/script forward graph must have a `prim::DifferentiableGraph`. +To turn on autodiff check, you can add an optional `check_ad(should_autodiff_node[bool], nonfusible_nodes[str|list[str]], fusible_nodes[str|list[str]])` tuple after the optional test variant name field. +If `should_autodiff_node=True`, the differentiated traced/script forward graph must have a `prim::DifferentiableGraph`. All nodes in `nonfusible_nodes` should show up in at least once in `prim::DifferentiableGraph` subgraphs. When fusion is enabled, all nodes in `fusible_nodes` should show up in one of `prim::FusionGroup` graphs attached to `prim::DifferentiableGraph`, otherwise they're checked as `nonfusible_nodes` as well. -On the other hand, if `should_check_autodiff=False`, the graph can still have `prim::DifferentiableGraph` with other nodes, but not `nonfusible_nodes` and `fusible_nodes`. +On the other hand, if `should_autodiff_node=False`, the graph can still have `prim::DifferentiableGraph` with other nodes, but not `nonfusible_nodes` and `fusible_nodes`. To make writing test easier, you only need to write out node names if it's different from the function name. 
Below are a few examples: ```python diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 54faa9f98ec4..93d2539e6d22 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -4624,7 +4624,7 @@ def ident(x): # input size/constructing fn, # args (tuple represents shape of a tensor arg), # test variant name (will be used at test name suffix), // optional -# (should_check_autodiff[bool], nonfusible_nodes, fusible_nodes) for autodiff, // optional +# (should_autodiff_node[bool], nonfusible_nodes, fusible_nodes) for autodiff, // optional # indices for possible dim arg, // optional # fn mapping output to part that should be gradcheck'ed, // optional # kwargs // optional From e2036ea3427b7e1135598ad1899354a5d6672c72 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Thu, 15 Apr 2021 14:44:57 -0700 Subject: [PATCH 12/45] Revert D27758303: [20/n][torch/elastic][upstream] Move torchelastic.distributed.tests to pytorch.distributed Test Plan: revert-hammer Differential Revision: D27758303 (https://github.com/pytorch/pytorch/commit/9f6fed8a15a6f64cc2f915f221f2eb4ab0289713) Original commit changeset: c987d4764f47 fbshipit-source-id: 90846dcd5c8512dd615c7f44dc3663f124cf4a25 --- .../launcher/bin/test_script_local_rank.py | 40 -- .../launcher/elastic_launch_test.py | 425 ------------------ test/distributed/launcher/launch_test.py | 86 ---- 3 files changed, 551 deletions(-) delete mode 100755 test/distributed/launcher/bin/test_script_local_rank.py delete mode 100644 test/distributed/launcher/elastic_launch_test.py delete mode 100644 test/distributed/launcher/launch_test.py diff --git a/test/distributed/launcher/bin/test_script_local_rank.py b/test/distributed/launcher/bin/test_script_local_rank.py deleted file mode 100755 index 7c01dcaf6029..000000000000 --- a/test/distributed/launcher/bin/test_script_local_rank.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import os - - -def parse_args(): - parser = argparse.ArgumentParser(description="test script") - - parser.add_argument( - "--local_rank", - type=int, - required=True, - help="The rank of the node for multi-node distributed " "training", - ) - - return parser.parse_args() - - -def main(): - print("Start execution") - args = parse_args() - expected_rank = int(os.environ["LOCAL_RANK"]) - actual_rank = args.local_rank - if expected_rank != actual_rank: - raise RuntimeError( - "Parameters passed: --local_rank that has different value " - f"from env var: expected: {expected_rank}, got: {actual_rank}" - ) - print("End execution") - - -if __name__ == "__main__": - main() diff --git a/test/distributed/launcher/elastic_launch_test.py b/test/distributed/launcher/elastic_launch_test.py deleted file mode 100644 index 346cf5b3ec9d..000000000000 --- a/test/distributed/launcher/elastic_launch_test.py +++ /dev/null @@ -1,425 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-import multiprocessing as mp -import os -import shutil -import subprocess -import tempfile -import unittest -import uuid -from contextlib import closing -from unittest import mock -from unittest.mock import Mock, patch - -import torch.distributed.elastic_launch as launch -from torch.distributed.elastic.agent.server.api import RunResult, WorkerState -from torch.distributed.elastic.multiprocessing.errors import ChildFailedError -from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer -from torch.distributed.elastic.utils import get_socket_with_port -from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, - TEST_WITH_TSAN, -) - - -def launch_in_proc(args): - launch.main(args) - - -def path(script): - return os.path.join(os.path.dirname(__file__), script) - - -def get_child_pids(pid): - pgrep = subprocess.Popen(args=f"pgrep -P {pid}", shell=True, stdout=subprocess.PIPE) - pgrep.wait() - out = pgrep.stdout.read().decode("utf-8").rstrip().split("\n") - pids = [] - for pid in out: - if pid: - pids.append(int(pid)) - return pids - - -def pid_exists(pid): - try: - os.kill(pid, 0) - return True - except OSError: - return False - - -class MockException(Exception): - pass - - -class ElasticLaunchTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - # start a standalone, single process etcd server to use for all tests - cls._etcd_server = EtcdServer() - cls._etcd_server.start() - cls._etcd_endpoint = cls._etcd_server.get_endpoint() - - @classmethod - def tearDownClass(cls): - # stop the standalone etcd server - cls._etcd_server.stop() - - def setUp(self): - self.test_dir = tempfile.mkdtemp() - - # remove any lingering environment variables - for env in os.environ.keys(): - if env.startswith("PET_"): - del os.environ[env] - - # set a sentinel env var on the parent proc - # this should be present on the child and gets - # asserted in ``bin/test_script.py`` - os.environ["TEST_SENTINEL_PARENT"] = "FOOBAR" - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def test_launch_user_script_python(self): - run_id = str(uuid.uuid4().int) - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - launch.main(args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - def test_launch_user_script_python_caffe2_bc(self): - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - sock = get_socket_with_port() - with closing(sock): - master_port = sock.getsockname()[1] - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--monitor_interval=1", - "--start_method=fork", - "--master_addr=localhost", - f"--master_port={master_port}", - "--node_rank=0", - "--use_env", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - launch.main(args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - def test_launch_user_script_bash(self): - run_id = str(uuid.uuid4().int) - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * 
nproc_per_node - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--start_method=fork", - "--no_python", - ] - - script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] - - with self.assertRaises(ValueError): - # --no_python cannot be used with --module - launch.main(args + ["--module"] + script_args) - - launch.main(args + script_args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - def test_launch_with_env_vars(self): - run_id = str(uuid.uuid4().int) - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - - os.environ["PET_NNODES"] = str(nnodes) - os.environ["PET_NPROC_PER_NODE"] = str(nproc_per_node) - os.environ["PET_RDZV_BACKEND"] = "etcd" - os.environ["PET_RDZV_ENDPOINT"] = self._etcd_endpoint - os.environ["PET_RDZV_ID"] = run_id - os.environ["PET_MONITOR_INTERVAL"] = "1" - os.environ["PET_START_METHOD"] = "fork" - os.environ["PET_NO_PYTHON"] = "1" - - script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] - - with self.assertRaises(ValueError): - # --no_python cannot be used with --module - os.environ["PET_MODULE"] = "1" - launch.main(script_args) - - os.environ["PET_MODULE"] = "0" - launch.main(script_args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - def _test_nproc_launch_configuration(self, nproc_type, expected_number): - run_id = str(uuid.uuid4().int) - nnodes = 1 - - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_type}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--start_method=fork", - "--no_python", - ] - - script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] - - launch.main(args + script_args) - - world_size = nnodes * expected_number - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_nproc_launch_auto_configurations(self): - self._test_nproc_launch_configuration("auto", os.cpu_count()) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_nproc_launch_number_configurations(self): - self._test_nproc_launch_configuration("4", 4) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_nproc_launch_unknown_configurations(self): - with self.assertRaises(ValueError): - self._test_nproc_launch_configuration("unknown", 4) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - @patch("torch.cuda.is_available", return_value=True) - @patch("torch.cuda.device_count", return_value=3) - def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): - self._test_nproc_launch_configuration("auto", 3) - self._test_nproc_launch_configuration("gpu", 3) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_launch_elastic(self): - run_id = str(uuid.uuid4().int) - min_nodes = 1 - max_nodes = 2 - nproc_per_node = 4 - # 
we are only launching 1 node (even though max = 2) - world_size = nproc_per_node - args = [ - f"--nnodes={min_nodes}:{max_nodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - launch.main(args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - @mock.patch("torch.distributed.elastic.events.record") - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_launch_elastic_worker_raise_exception(self, record_mock): - """ - Asserts that when the worker program fails and lancher raieses exception - to indicate that worker process failed - - """ - run_id = str(uuid.uuid4().int) - min_nodes = 1 - max_nodes = 2 - nproc_per_node = 4 - args = [ - f"--nnodes={min_nodes}:{max_nodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--max_restarts=0", - "--start_method=fork", - path("bin/test_script.py"), - "--fail", - ] - with self.assertRaises(ChildFailedError): - launch.main(args) - - record_mock.assert_called_once() - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - @mock.patch( - "torch.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent.run" - ) - @mock.patch("torch.distributed.elastic.events.record") - def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run): - """ - Asserts that when the agent raises an exception - the launcher re-raises the original exception - """ - run_id = str(uuid.uuid4().int) - min_nodes = 1 - max_nodes = 2 - nproc_per_node = 4 - args = [ - f"--nnodes={min_nodes}:{max_nodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - "--max_restarts=0", - "--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - - mock_agent_run.side_effect = MockException - with self.assertRaises(MockException): - launch.main(args) - record_mock.assert_called_once() - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_launch_standalone(self): - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--standalone", - "--monitor_interval=1", - "--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - launch.main(args) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") - def test_launch_elastic_multiple_agents(self): - run_id = str(uuid.uuid4().int) - min_nodes = 1 - max_nodes = 2 - nproc_per_node = 4 - nnodes = 2 - world_size = nnodes * nproc_per_node - args = [ - f"--nnodes={min_nodes}:{max_nodes}", - f"--nproc_per_node={nproc_per_node}", - "--rdzv_backend=etcd", - f"--rdzv_endpoint={self._etcd_endpoint}", - f"--rdzv_id={run_id}", - "--monitor_interval=1", - 
"--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - procs = [] - for _ in range(nnodes - 1): - p = mp.Process(target=launch.main, args=[args]) - procs.append(p) - p.start() - launch.main(args) - for i in range(nnodes - 1): - p = procs[i] - p.join() - self.assertEqual(0, p.exitcode) - - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) - - def test_min_max_nodes_parse(self): - min_nodes, max_nodes = launch.parse_min_max_nnodes("1") - self.assertTrue(min_nodes, max_nodes) - self.assertTrue(1, min_nodes) - min_nodes, max_nodes = launch.parse_min_max_nnodes("2:20") - self.assertTrue(2, min_nodes) - self.assertTrue(20, max_nodes) - with self.assertRaises(RuntimeError): - launch.parse_min_max_nnodes("2:20:30") - - @patch("torch.distributed.launcher.api.LocalElasticAgent") - def test_launch_shutdown(self, agent_mock_cls): - nnodes = 1 - nproc_per_node = 4 - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--monitor_interval=1", - "--start_method=fork", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - agent_mock = Mock() - agent_mock.run.return_value = RunResult(WorkerState.SUCCEEDED) - agent_mock_cls.return_value = agent_mock - rdzv_handler_mock = Mock() - with patch( - "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler" - ) as param_mock: - param_mock.return_value = rdzv_handler_mock - launch.main(args) - rdzv_handler_mock.shutdown.assert_called_once() diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py deleted file mode 100644 index 9a5547c9cf2d..000000000000 --- a/test/distributed/launcher/launch_test.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-import os -import shutil -import tempfile -import unittest -from contextlib import closing - -import torch.distributed.launch as launch -from torch.distributed.elastic.utils import get_socket_with_port -from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, - TEST_WITH_TSAN, -) - - -def path(script): - return os.path.join(os.path.dirname(__file__), script) - - -class LaunchTest(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - # set a sentinel env var on the parent proc - # this should be present on the child and gets - # asserted in ``bin/test_script.py`` - os.environ["TEST_SENTINEL_PARENT"] = "FOOBAR" - - def tearDown(self): - shutil.rmtree(self.test_dir) - - @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" - ) - def test_launch_without_env(self): - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - sock = get_socket_with_port() - with closing(sock): - master_port = sock.getsockname()[1] - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--monitor_interval=1", - "--start_method=fork", - "--master_addr=localhost", - f"--master_port={master_port}", - "--node_rank=0", - path("bin/test_script_local_rank.py"), - ] - launch.main(args) - - @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" - ) - def test_launch_with_env(self): - nnodes = 1 - nproc_per_node = 4 - world_size = nnodes * nproc_per_node - sock = get_socket_with_port() - with closing(sock): - master_port = sock.getsockname()[1] - args = [ - f"--nnodes={nnodes}", - f"--nproc_per_node={nproc_per_node}", - "--monitor_interval=1", - "--start_method=fork", - "--master_addr=localhost", - f"--master_port={master_port}", - "--node_rank=0", - "--use_env", - path("bin/test_script.py"), - f"--touch_file_dir={self.test_dir}", - ] - launch.main(args) - # make sure all the workers ran - # each worker touches a file with its global rank as the name - self.assertSetEqual( - {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) - ) From 512c744f2e641ebc1861110e53e673d6d48267b5 Mon Sep 17 00:00:00 2001 From: Can Balioglu Date: Thu, 15 Apr 2021 14:45:55 -0700 Subject: [PATCH 13/45] [torch/elastic] Introduce `PeriodicTimer` (#55919) Summary: This PR introduces a basic timer type that periodically calls a specified function. Its main use in the upcoming `DynamicRendezvousHandler` implementation will be to send periodic keep-alive updates in a background thread. Pull Request resolved: https://github.com/pytorch/pytorch/pull/55919 Reviewed By: tierex Differential Revision: D27740823 Pulled By: cbalioglu fbshipit-source-id: e46fc848ab033995946a38a29c01d67d387a4cf5 --- .../elastic/rendezvous/utils_test.py | 105 ++++++++++++++++++ torch/distributed/elastic/rendezvous/utils.py | 73 +++++++++++- 2 files changed, 177 insertions(+), 1 deletion(-) diff --git a/test/distributed/elastic/rendezvous/utils_test.py b/test/distributed/elastic/rendezvous/utils_test.py index 8635710506fd..19260217d28c 100644 --- a/test/distributed/elastic/rendezvous/utils_test.py +++ b/test/distributed/elastic/rendezvous/utils_test.py @@ -4,10 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import threading +import time import socket +from datetime import timedelta +from typing import List from unittest import TestCase from torch.distributed.elastic.rendezvous.utils import ( + _PeriodicTimer, _matches_machine_hostname, _parse_rendezvous_config, parse_rendezvous_endpoint, @@ -236,3 +241,103 @@ def test_matches_machine_hostname_returns_false_if_hostname_does_not_match( for host in hosts: with self.subTest(host=host): self.assertFalse(_matches_machine_hostname(host)) + + +class PeriodicTimerTest(TestCase): + def test_start_can_be_called_only_once(self): + timer = _PeriodicTimer(timedelta(seconds=1), lambda: None) + + timer.start() + + with self.assertRaisesRegex(RuntimeError, r"^The timer has already started.$"): + timer.start() + + timer.cancel() + + def test_cancel_can_be_called_multiple_times(self): + timer = _PeriodicTimer(timedelta(seconds=1), lambda: None) + + timer.start() + + timer.cancel() + timer.cancel() + + def test_cancel_stops_background_thread(self): + timer = _PeriodicTimer(timedelta(seconds=1), lambda: None) + + timer.start() + + self.assertTrue(any(t.name == "PeriodicTimer" for t in threading.enumerate())) + + timer.cancel() + + self.assertTrue(all(t.name != "PeriodicTimer" for t in threading.enumerate())) + + def test_delete_stops_background_thread(self): + timer = _PeriodicTimer(timedelta(seconds=1), lambda: None) + + timer.start() + + self.assertTrue(any(t.name == "PeriodicTimer" for t in threading.enumerate())) + + del timer + + self.assertTrue(all(t.name != "PeriodicTimer" for t in threading.enumerate())) + + def test_timer_calls_background_thread_at_regular_intervals(self): + begin_time = time.monotonic() + + # Call our function every 200ms. + call_interval = 0.2 + + # Keep the log of intervals between each consecutive call. + actual_call_intervals: List[float] = [] + + # Keep the number of times the function was called. + call_count = 0 + + # In order to prevent a flaky test instead of asserting that the + # function was called an exact number of times we use a lower bound + # that is guaranteed to be true for a correct implementation. + min_required_call_count = 4 + + timer_stop_event = threading.Event() + + def log_call(self): + nonlocal begin_time, call_count + + actual_call_intervals.append(time.monotonic() - begin_time) + + call_count += 1 + if call_count == min_required_call_count: + timer_stop_event.set() + + begin_time = time.monotonic() + + timer = _PeriodicTimer(timedelta(seconds=call_interval), log_call, self) + + timer.start() + + # Although this is theoretically non-deterministic, if our timer, which + # has a 200ms call interval, does not get called 4 times in 60 seconds, + # there is very likely something else going on. 
+ timer_stop_event.wait(60) + + timer.cancel() + + self.longMessage = False + + self.assertGreaterEqual( + call_count, + min_required_call_count, + f"The function has been called {call_count} time(s) but expected to be called at least " + f"{min_required_call_count} time(s).", + ) + + for actual_call_interval in actual_call_intervals: + self.assertGreaterEqual( + actual_call_interval, + call_interval, + f"The interval between two function calls was {actual_call_interval} second(s) but " + f"expected to be at least {call_interval} second(s).", + ) diff --git a/torch/distributed/elastic/rendezvous/utils.py b/torch/distributed/elastic/rendezvous/utils.py index 2e6b697792d3..289cab9f9a7c 100644 --- a/torch/distributed/elastic/rendezvous/utils.py +++ b/torch/distributed/elastic/rendezvous/utils.py @@ -7,7 +7,9 @@ import ipaddress import re import socket -from typing import Dict, Optional, Tuple +from datetime import timedelta +from threading import Event, Thread +from typing import Any, Callable, Dict, Optional, Tuple def _parse_rendezvous_config(config_str: str) -> Dict[str, str]: @@ -138,3 +140,72 @@ def _matches_machine_hostname(host: str) -> bool: return True return False + + +class _PeriodicTimer: + """Represents a timer that periodically runs a specified function. + + Args: + interval: + The interval, in seconds, between each run. + function: + The function to run. + """ + + # The state of the timer is hold in a separate context object to avoid a + # reference cycle between the timer and the background thread. + class _Context: + interval: float + function: Callable[..., None] + args: Tuple[Any, ...] + kwargs: Dict[str, Any] + stop_event: Event + + _thread: Optional[Thread] + + # The context that is shared between the timer and the background thread. 
+ _ctx: _Context + + def __init__( + self, + interval: timedelta, + function: Callable[..., None], + *args: Any, + **kwargs: Any, + ) -> None: + self._ctx = self._Context() + self._ctx.interval = interval.total_seconds() + self._ctx.function = function # type: ignore + self._ctx.args = args or () + self._ctx.kwargs = kwargs or {} + self._ctx.stop_event = Event() + + self._thread = None + + def __del__(self) -> None: + self.cancel() + + def start(self) -> None: + """Start the timer.""" + if self._thread: + raise RuntimeError("The timer has already started.") + + self._thread = Thread( + target=self._run, name="PeriodicTimer", args=(self._ctx,), daemon=True + ) + + self._thread.start() + + def cancel(self) -> None: + """Stop the timer at the next opportunity.""" + if not self._thread: + return + + self._ctx.stop_event.set() + + self._thread.join() + + @staticmethod + def _run(ctx) -> None: + while not ctx.stop_event.wait(ctx.interval): + ctx.function(*ctx.args, **ctx.kwargs) From 90e103ddfeee3746848ca72e92f6e7b409a4e98c Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Thu, 15 Apr 2021 14:47:30 -0700 Subject: [PATCH 14/45] Revert D27753803: [19/n][torch/elastic][upstream] Replace pytorch.distributed.launch with torchelastic launcher Test Plan: revert-hammer Differential Revision: D27753803 (https://github.com/pytorch/pytorch/commit/7c708ef4ea740972a6cb9a06285772a708f0cc55) Original commit changeset: 5f24bcfdcb70 fbshipit-source-id: 650e229b788d046450615364e5cba65065a95e3b --- test/distributed/argparse_util_test.py | 135 ------ torch/distributed/argparse_util.py | 103 ----- torch/distributed/elastic_launch.py | 606 ------------------------- torch/distributed/launch.py | 223 +++++++-- 4 files changed, 195 insertions(+), 872 deletions(-) delete mode 100644 test/distributed/argparse_util_test.py delete mode 100644 torch/distributed/argparse_util.py delete mode 100644 torch/distributed/elastic_launch.py diff --git a/test/distributed/argparse_util_test.py b/test/distributed/argparse_util_test.py deleted file mode 100644 index fb63feda1d82..000000000000 --- a/test/distributed/argparse_util_test.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-import os -import unittest -from argparse import ArgumentParser - -from torch.distributed.argparse_util import check_env, env - - -class ArgParseUtilTest(unittest.TestCase): - def setUp(self): - # remove any lingering environment variables - for e in os.environ.keys(): - if e.startswith("PET_"): - del os.environ[e] - - def test_env_string_arg_no_env(self): - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, default="bar") - - self.assertEqual("bar", parser.parse_args([]).foo) - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_env_string_arg_env(self): - os.environ["PET_FOO"] = "env_baz" - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, default="bar") - - self.assertEqual("env_baz", parser.parse_args([]).foo) - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_env_int_arg_no_env(self): - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, default=1, type=int) - - self.assertEqual(1, parser.parse_args([]).foo) - self.assertEqual(2, parser.parse_args(["-f", "2"]).foo) - self.assertEqual(2, parser.parse_args(["--foo", "2"]).foo) - - def test_env_int_arg_env(self): - os.environ["PET_FOO"] = "3" - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, default=1, type=int) - - self.assertEqual(3, parser.parse_args([]).foo) - self.assertEqual(2, parser.parse_args(["-f", "2"]).foo) - self.assertEqual(2, parser.parse_args(["--foo", "2"]).foo) - - def test_env_no_default_no_env(self): - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env) - - self.assertIsNone(parser.parse_args([]).foo) - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_env_no_default_env(self): - os.environ["PET_FOO"] = "env_baz" - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env) - - self.assertEqual("env_baz", parser.parse_args([]).foo) - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_env_required_no_env(self): - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, required=True) - - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_env_required_env(self): - os.environ["PET_FOO"] = "env_baz" - parser = ArgumentParser() - parser.add_argument("-f", "--foo", action=env, default="bar", required=True) - - self.assertEqual("env_baz", parser.parse_args([]).foo) - self.assertEqual("baz", parser.parse_args(["-f", "baz"]).foo) - self.assertEqual("baz", parser.parse_args(["--foo", "baz"]).foo) - - def test_check_env_no_env(self): - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env) - - self.assertFalse(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["-v"]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) - - def test_check_env_default_no_env(self): - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env, default=True) - - self.assertTrue(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["-v"]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) - - def test_check_env_env_zero(self): - 
os.environ["PET_VERBOSE"] = "0" - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env) - - self.assertFalse(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) - - def test_check_env_env_one(self): - os.environ["PET_VERBOSE"] = "1" - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env) - - self.assertTrue(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) - - def test_check_env_default_env_zero(self): - os.environ["PET_VERBOSE"] = "0" - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env, default=True) - - self.assertFalse(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) - - def test_check_env_default_env_one(self): - os.environ["PET_VERBOSE"] = "1" - parser = ArgumentParser() - parser.add_argument("-v", "--verbose", action=check_env, default=True) - - self.assertTrue(parser.parse_args([]).verbose) - self.assertTrue(parser.parse_args(["--verbose"]).verbose) diff --git a/torch/distributed/argparse_util.py b/torch/distributed/argparse_util.py deleted file mode 100644 index b525f20e4407..000000000000 --- a/torch/distributed/argparse_util.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import os -from argparse import Action - - -class env(Action): - """ - Gets argument values from ``PET_{dest}`` before defaulting - to the given ``default`` value. For flags (e.g. ``--standalone``) - use ``check_env`` instead. - - .. note:: when multiple option strings are specified, ``dest`` is - the longest option string (e.g. for ``"-f", "--foo"`` - the env var to set is ``PET_FOO`` not ``PET_F``) - - Example: - - :: - - parser.add_argument("-f", "--foo", action=env, default="bar") - - ./program -> args.foo="bar" - ./program -f baz -> args.foo="baz" - ./program --foo baz -> args.foo="baz" - PET_FOO="env_bar" ./program -f baz -> args.foo="baz" - PET_FOO="env_bar" ./program --foo baz -> args.foo="baz" - PET_FOO="env_bar" ./program -> args.foo="env_bar" - - parser.add_argument("-f", "--foo", action=env, required=True) - - ./program -> fails - ./program -f baz -> args.foo="baz" - PET_FOO="env_bar" ./program -> args.foo="env_bar" - PET_FOO="env_bar" ./program -f baz -> args.foo="baz" - """ - - def __init__(self, dest, default=None, required=False, **kwargs) -> None: - env_name = f"PET_{dest.upper()}" - default = os.environ.get(env_name, default) - - # ``required`` means that it NEEDS to be present in the command-line args - # rather than "this option requires a value (either set explicitly or default" - # so if we found default then we don't "require" it to be in the command-line - # so set it to False - if default: - required = False - - super().__init__(dest=dest, default=default, required=required, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, values) - - -class check_env(Action): - """ - For flags, checks whether the env var ``PET_{dest}`` exists - before defaulting to the given ``default`` value. Equivalent to - ``store_true`` argparse built-in action except that the argument can - be omitted from the commandline if the env var is present and has a - non-zero value. - - .. 
note:: it is redundant to pass ``default=True`` for arguments - that use this action because a flag should be ``True`` - when present and ``False`` otherwise. - - Example: - - :: - - parser.add_argument("--verbose", action=check_env) - - ./program -> args.verbose=False - ./program --verbose -> args.verbose=True - PET_VERBOSE=1 ./program -> args.verbose=True - PET_VERBOSE=0 ./program -> args.verbose=False - PET_VERBOSE=0 ./program --verbose -> args.verbose=True - - Anti-pattern (don't do this): - - :: - - parser.add_argument("--verbose", action=check_env, default=True) - - ./program -> args.verbose=True - ./program --verbose -> args.verbose=True - PET_VERBOSE=1 ./program -> args.verbose=True - PET_VERBOSE=0 ./program -> args.verbose=False - - """ - - def __init__(self, dest, default=False, **kwargs) -> None: - env_name = f"PET_{dest.upper()}" - default = bool(int(os.environ.get(env_name, "1" if default else "0"))) - super().__init__(dest=dest, const=True, default=default, nargs=0, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, self.const) diff --git a/torch/distributed/elastic_launch.py b/torch/distributed/elastic_launch.py deleted file mode 100644 index 0c5f5b01778a..000000000000 --- a/torch/distributed/elastic_launch.py +++ /dev/null @@ -1,606 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -r""" -This module provides similar functionality as ``torch.distributed.launch``, -with the following additional functionalities: - -1. Worker failures are handled gracefully by restarting all workers. - -2. Worker ``RANK`` and ``WORLD_SIZE`` are assigned automatically. - -3. Number of nodes is allowed to change between min and max sizes (elasticity). - -**Usage:** - -1. Single-node multi-worker (with sidecar etcd server) - -:: - - >>> python -m torch.distributed.elastic_launch - --standalone - --nnodes=1 - --nproc_per_node=$NUM_TRAINERS - YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) - -2. Fault tolerant (fixed sized number of workers, no elasticity).: - -:: - - >>> python -m torch.distributed.elastic_launch - --nnodes=$NUM_NODES - --nproc_per_node=$NUM_TRAINERS - --rdzv_id=$JOB_ID - --rdzv_backend=etcd - --rdzv_endpoint=$ETCD_HOST:$ETCD_PORT - YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) - -3. Elastic (``min=1``, ``max=4``): - -:: - - >>> python -m torch.distributed.elastic_launch - --nnodes=1:4 - --nproc_per_node=$NUM_TRAINERS - --rdzv_id=$JOB_ID - --rdzv_backend=etcd - --rdzv_endpoint=$ETCD_HOST:$ETCD_PORT - YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) - -**Note on rendezvous backend**: - -For multi-node training you need to specify: - -1. ``--rdzv_id``: a unique job id (shared by all nodes participating in the job) -2. ``--rdzv_backend``: an implementation of ``torch.distributed.elastic.rendevous.RendezvousHandler`` -3. ``--rdzv_endpoint``: ``host:port``-style endpoint where the rdzv backend is running. - -Currently only ``etcd`` rdzv backend is supported out of the box. -To use ``etcd``, setup an etcd server with the ``v2`` api enabled -(e.g. ``--enable-v2``). - -.. warning:: ``EtcdRendezvous`` uses etcd api v2. You MUST enable the v2 - api on the etcd server. Our tests use etcd v3.4.3. - -**Definitions:** - -1. ``Node`` - Physical instance or container. 
- Maps to the unit that the job manager works with. - -2. ``Worker`` - A worker in the context of distributed training. - -3. ``Worker Group`` - Workers that execute the same function (e.g. trainers) - -4. ``Local Worker Group`` - Subset of the workers in the - worker group running on the same Node - -5. ``RANK`` - rank of the worker within a worker group. - -6. ``WORLD_SIZE`` - total number of workers in a worker group. - -7. ``LOCAL_RANK`` - rank of the worker within a local worker group - -8. ``LOCAL_WORLD_SIZE`` - size of the local worker group - -9. ``rdzv_id`` - user defined id that uniquely identifies the worker group - for a job. This id is used by each node to join as a member of a particular - worker group. - -9. ``rdzv_backend`` - the backend store of rendezvous (e.g. etcd). This is - typically a strongly consistent key-value store. - -10. ``rdzv_endpoint`` - rdzv backend server endpoint in ``host:port`` format. - -A ``Node`` runs ``LOCAL_WORLD_SIZE`` workers which comprise a ``LocalWorkerGroup``. -The union of all ``LocalWorkerGroups`` in the nodes in the job comprise the -``WorkerGroup``. - -**Environment Variables:** - -The following environment variables are made available to you in your -script: - -1. ``LOCAL_RANK`` - local rank - -2. ``RANK`` - global rank - -3. ``GROUP_RANK`` - rank of the worker group. A number between 0 - ``max_nnodes``. - When running a single worker group per node, this is the rank of the node. - -4. ``ROLE_RANK`` - the rank of the worker across all the workers tha have the same - role. The role of the worker is specified in the ``WorkerSpec``. - -5. ``LOCAL_WORLD_SIZE`` - local world size (e.g. number of workers running locally). - Equal to ``--nproc_per_node`` specified on ``torch.distributed.elastic_launch``. - -6. ``WORLD_SIZE`` - world size (total number of workers in the job). - -7. ``ROLE_WORLD_SIZE`` - the total number of workers that was launched with the same - role specified in ``WorkerSpec``. - -8. ``MASTER_ADDR`` - fqdn of the host that is running worker with rank 0. - Used to initialize torch distributed backend. - -9. ``MASTER_PORT`` - port on the ``MASTER_ADDR`` that can be used to - host the tcp ``c10d`` store. - -10. ``TORCHELASTIC_RESTART_COUNT`` - number of worker group restarts so far. - -11. ``TORCHELASTIC_MAX_RESTARTS`` - configured max number of restarts. - -12. ``TORCHELASTIC_RUN_ID`` - equal to rdzv run_id (e.g. unique job id). - -**Deployment:** - -1. Start the rdzv backend server and get the endpoint - (to be passed as ``--rdzv_endpoint`` to the launcher script) - -2. Single-node multi-worker - start the launcher on the host to start - the agent process which creates and monitors a local worker group. - -3. Multi-node multi-worker - Start the launcher with the same arguments - on all the nodes participating in training. - -When using a job/cluster manager the entry point command to the multi-node -job is invoking this launcher. - -**Failure Modes:** - -1. Worker failure - For a training job with ``n`` workers, if ``k <= n`` workers fail - all workers are stopped and restarted up to ``max_restarts``. - -2. Agent failure - An agent failure results in local worker group failure, - it is up to the job manager to fail the entire job (gang semantics) or attempt - to replace the node. Both behaviors are supported by the agent. - -3. Node failure - Same as agent failure. - -**Membership Changes:** - -1. 
Node departure (scale-down) - agent is notified of the departure, - all existing workers are stopped, a new ``Worker Group`` is formed and all - workers are started with a new ``RANK`` and ``WORLD_SIZE``. - -2. Node arrival (scale-up) - the new node is admitted to the job, - all existing workers are stopped, a new ``Worker Group`` is formed and all - workers are started with a new ``RANK`` and ``WORLD_SIZE``. - - -**Important Notices:** - -1. All the items in the important notices section of ``torch.distributed.launch`` - apply to this module as well - -2. The environment variables necessary to initialize a torch process group - are provided to you by this module, no need for you to pass ``RANK`` manually. - To initialize a process group in your training script, simply run - -:: - - >>> import torch.distributed as dist - >>> dist.init_process_group(backend="gloo|nccl") - -3. On failures or membership changes ALL surviving workers are killed - immediately. Make sure to checkpoint your progress. The frequency of - checkpoints should depend on your job's tolerance for lost work. - -4. This module only supports homogeneous ``LOCAL_WORLD_SIZE``. That is, - it is assumed that all nodes run the same number of local workers (per role). - -5. ``RANK`` is NOT stable. Between restarts, the local workers on a node - can be assgined a different range of ranks than before. NEVER hard code - any assumptions about the stable-ness of ranks or some correlation between - ``RANK`` and ``LOCAL_RANK``. - -6. When using elasticity (``min_size != max_size``) DO NOT hard code - assumptions about ``WORLD_SIZE`` as the world size can change as - nodes are allowed to leave and join. - -7. It is recommended your script have the following structure - -:: - - def main(): - load_checkpoint(checkpoint_path) - initialize() - train() - - def train(): - for batch in iter(dataset): - train_step(batch) - - if should_checkpoint: - save_checkpoint(checkpoint_path) -""" -import logging -import os -import sys -import uuid -from argparse import REMAINDER, ArgumentParser -from typing import List, Tuple - -import torch -from torch.distributed.argparse_util import check_env, env -from torch.distributed.elastic.multiprocessing import Std -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer -from torch.distributed.elastic.rendezvous.utils import _parse_rendezvous_config -from torch.distributed.elastic.utils import macros -from torch.distributed.elastic.utils.logging import get_logger -from torch.distributed.launcher.api import LaunchConfig, elastic_launch - - -log = get_logger() - - -def get_args_parser() -> ArgumentParser: - """ - Helper function parsing the command line options. 
- """ - - parser = ArgumentParser(description="torchelastic elastic training launcher") - - # Arguments for the launch helper - # worker/node size related arguments - parser.add_argument( - "--nnodes", - action=env, - type=str, - default="1:1", - help="number of nodes or MIN_NODES:MAX_NODES", - ) - parser.add_argument( - "--nproc_per_node", - action=env, - type=str, - default="auto", - help="number of workers per node, supported values: [auto, cpu, gpu, int]", - ) - - # rendezvous related arguments - parser.add_argument( - "--rdzv_backend", - action=env, - type=str, - default="static", - help="rendezvous backend", - ) - parser.add_argument( - "--rdzv_endpoint", - action=env, - type=str, - default="", - help="rendezvous backend server host:port", - ) - parser.add_argument( - "--rdzv_id", - action=env, - default="", - type=str, - help="user defined group id", - ) - parser.add_argument( - "--rdzv_conf", - action=env, - type=str, - default="", - help="additional rdzv configuration (conf1=v1,conf2=v2,...)", - ) - - # sidecar embed rdzv backend that defaults to etcd - parser.add_argument( - "--standalone", - action=check_env, - help="starts a local, standalone rdzv backend that is represented by" - " etcd server on a random free port" - "using the etcd binary specified in TORCHELASTIC_ETCD_BINARY_PATH" - " env var or the one found in PATH." - " Useful when launching single-node, multi-worker job." - " If specified --rdzv_backend, --rdzv_endpoint, --rdzv_id" - " are autoassigned, any explicitly set values are ignored", - ) - - # user-code launch related arguments - parser.add_argument( - "--max_restarts", - action=env, - type=int, - default=3, - help="max number of worker group restarts before failing", - ) - parser.add_argument( - "--monitor_interval", - action=env, - type=float, - default=5, - help="interval (in seconds) to monitor the state of workers", - ) - parser.add_argument( - "--start_method", - action=env, - type=str, - default="spawn", - choices=["spawn", "fork", "forkserver"], - help="multiprocessing start_method to use when creating workers", - ) - parser.add_argument( - "--role", - action=env, - type=str, - default="default", - help="user-defined role for the workers", - ) - parser.add_argument( - "-m", - "--module", - action=check_env, - help="Changes each process to interpret the launch script " - "as a python module, executing with the same behavior as" - "'python -m'.", - ) - parser.add_argument( - "--no_python", - action=check_env, - help='Do not prepend the training script with "python" - just exec ' - "it directly. Useful when the script is not a Python script.", - ) - - parser.add_argument( - "--log_dir", - action=env, - type=str, - default=None, - help="base dir to use for log files (e.g. /var/log/torchelastic)" - " can reuse the same dir for multiple runs " - "(a unique job-level subdir is created with rdzv_id as the prefix)", - ) - - parser.add_argument( - "-r", - "--redirects", - action=env, - type=str, - default="0", - help="std streams to redirect into a log file in the log_dir" - " (e.g. 
[-r 3] redirects both stdout+stderr for all workers," - " [-r 0:1,1:2] redirects stdout for local rank 0 and stderr for local rank 1)", - ) - - parser.add_argument( - "-t", - "--tee", - action=env, - type=str, - default="0", - help="tee std streams into a log file and also to console (see --redirects for format)", - ) - - # backwards compatible params with caffe2.distributed.launch - - parser.add_argument( - "--node_rank", - type=int, - action=env, - default=0, - help="The rank of the node for multi-node distributed " "training", - ) - - parser.add_argument( - "--master_addr", - default="127.0.0.1", - type=str, - action=env, - help="Master node (rank 0)'s address, should be either " - "the IP address or the hostname of node 0, for " - "single node multi-proc training, the " - "--master_addr can simply be 127.0.0.1" - "IPV6 should have the following pattern: `[0:0:0:0:0:0:0:1]`", - ) - parser.add_argument( - "--master_port", - default=29500, - type=int, - action=env, - help="Master node (rank 0)'s free port that needs to " - "be used for communication during distributed " - "training", - ) - - # positional - parser.add_argument( - "training_script", - type=str, - help="The full path to the single GPU training " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "training script", - ) - - # rest from the training program - parser.add_argument("training_script_args", nargs=REMAINDER) - return parser - # return parser.parse_args(args) - - -def parse_args(args): - parser = get_args_parser() - parser.add_argument( - "--use_env", - default=True, - action="store_true", - help="Use environment variable to pass " - "'local rank'. For legacy reasons, the default value is False. " - "If set to True, the script will not pass " - "--local_rank as argument, and will instead set LOCAL_RANK.", - ) - return parser.parse_args(args) - - -def parse_min_max_nnodes(nnodes: str): - arr = nnodes.split(":") - - if len(arr) == 1: - min_nodes = max_nodes = int(arr[0]) - elif len(arr) == 2: - min_nodes = int(arr[0]) - max_nodes = int(arr[1]) - else: - raise RuntimeError(f'nnodes={nnodes} is not in "MIN:MAX" format') - - return min_nodes, max_nodes - - -def determine_local_world_size(nproc_per_node: str): - try: - logging.info(f"Using nproc_per_node={nproc_per_node}.") - return int(nproc_per_node) - except ValueError: - if nproc_per_node == "cpu": - num_proc = os.cpu_count() - device_type = "cpu" - elif nproc_per_node == "gpu": - if not torch.cuda.is_available(): - raise ValueError("Cuda is not available.") - device_type = "gpu" - num_proc = torch.cuda.device_count() - elif nproc_per_node == "auto": - if torch.cuda.is_available(): - num_proc = torch.cuda.device_count() - device_type = "gpu" - else: - num_proc = os.cpu_count() - device_type = "cpu" - else: - raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}") - - log.info( - f"Using nproc_per_node={nproc_per_node}," - f" seting to {num_proc} since the instance " - f"has {os.cpu_count()} {device_type}" - ) - return num_proc - - -def get_rdzv_endpoint(args): - if args.rdzv_backend == "static": - return f"{args.master_addr}:{args.master_port}" - else: - return args.rdzv_endpoint - - -def config_from_args(args) -> Tuple[LaunchConfig, List[str]]: - # If ``args`` not passed, defaults to ``sys.argv[:1]`` - min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes) - assert 0 < min_nodes <= max_nodes - assert args.max_restarts >= 0 - - nproc_per_node = determine_local_world_size(args.nproc_per_node) - if 
"OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1: - omp_num_threads = 1 - print( - f"*****************************************\n" - f"Setting OMP_NUM_THREADS environment variable for each process to be " - f"{omp_num_threads} in default, to avoid your system being overloaded, " - f"please further tune the variable for optimal performance in " - f"your application as needed. \n" - f"*****************************************" - ) - # This env variable will be passed down to the subprocesses - os.environ["OMP_NUM_THREADS"] = str(omp_num_threads) - - rdzv_configs = _parse_rendezvous_config(args.rdzv_conf) - - if args.rdzv_backend == "static": - rdzv_configs["rank"] = args.node_rank - - rdzv_endpoint = get_rdzv_endpoint(args) - - config = LaunchConfig( - min_nodes=min_nodes, - max_nodes=max_nodes, - nproc_per_node=nproc_per_node, - run_id=args.rdzv_id, - role=args.role, - rdzv_endpoint=rdzv_endpoint, - rdzv_backend=args.rdzv_backend, - rdzv_configs=rdzv_configs, - max_restarts=args.max_restarts, - monitor_interval=args.monitor_interval, - start_method=args.start_method, - redirects=Std.from_str(args.redirects), - tee=Std.from_str(args.tee), - ) - - with_python = not args.no_python - cmd = [] - if with_python: - cmd = [sys.executable, "-u"] - if args.module: - cmd.append("-m") - else: - if not args.use_env: - raise ValueError( - "When using the '--no_python' flag," - " you must also set the '--use_env' flag." - ) - if args.module: - raise ValueError( - "Don't use both the '--no_python' flag" - " and the '--module' flag at the same time." - ) - cmd.append(args.training_script) - if not args.use_env: - log.warning( - "`torch.distributed.launch` is Deprecated. Use torch.distributed.elastic_launch" - ) - cmd.append(f"--local_rank={macros.local_rank}") - cmd.extend(args.training_script_args) - - return config, cmd - - -@record -def run(args): - if args.standalone: - etcd_server = EtcdServer() - etcd_server.start() - args.rdzv_backend = "etcd" - args.rdzv_endpoint = etcd_server.get_endpoint() - args.rdzv_id = str(uuid.uuid4()) - log.info( - f"\n**************************************\n" - f"Rendezvous info:\n" - f"--rdzv_backend={args.rdzv_backend} " - f"--rdzv_endpoint={args.rdzv_endpoint} " - f"--rdzv_id={args.rdzv_id}\n" - f"**************************************\n" - ) - - config, cmd = config_from_args(args) - - try: - elastic_launch( - config=config, - entrypoint=cmd[0], - )(*cmd[1:]) - finally: - if args.standalone: - etcd_server.stop() - - -def main(args=None): - args = parse_args(args) - run(args) - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, format="[%(levelname)s] %(asctime)s %(module)s: %(message)s" - ) - log.info(f"Running torch.distributed.elastic_launch with args: {sys.argv}") - main() diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py index 5cbb25ad3e2d..a6a5b26e6d40 100644 --- a/torch/distributed/launch.py +++ b/torch/distributed/launch.py @@ -2,8 +2,6 @@ `torch.distributed.launch` is a module that spawns up multiple distributed training processes on each of the training nodes. -NOTE: This module is deprecated, use torch.distributed.elastic_launch. - The utility can be used for single-node distributed training, in which one or more processes per node will be spawned. The utility can be used for either CPU training or GPU training. 
If the utility is used for GPU training, @@ -138,36 +136,205 @@ """ -import logging - -from torch.distributed.elastic_launch import get_args_parser, run - -logger = logging.getLogger(__name__) - -def parse_args(args): - parser = get_args_parser() +import time +import signal +import sys +import subprocess +import os +from argparse import ArgumentParser, REMAINDER +from typing import Optional, IO, List, Any + +node_local_rank_stdout_filename = "node_{}_local_rank_{}_stdout" +node_local_rank_stderr_filename = "node_{}_local_rank_{}_stderr" + +def parse_args(): + """ + Helper function parsing the command line options + @retval ArgumentParser + """ + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utility that will spawn up " + "multiple distributed processes") + + # Optional arguments for the launch helper + parser.add_argument("--nnodes", type=int, default=1, + help="The number of nodes to use for distributed " + "training") + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--nproc_per_node", type=int, default=1, + help="The number of processes to launch on each node, " + "for GPU training, this is recommended to be set " + "to the number of GPUs in your system so that " + "each process can be bound to a single GPU.") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=29500, type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communication during distributed " + "training") + parser.add_argument("--use_env", default=False, action="store_true", + help="Use environment variable to pass " + "'local rank'. For legacy reasons, the default value is False. " + "If set to True, the script will not pass " + "--local_rank as argument, and will instead set LOCAL_RANK.") + parser.add_argument("-m", "--module", default=False, action="store_true", + help="Changes each process to interpret the launch script " + "as a python module, executing with the same behavior as" + "'python -m'.") + parser.add_argument("--no_python", default=False, action="store_true", + help="Do not prepend the training script with \"python\" - just exec " + "it directly. Useful when the script is not a Python script.") parser.add_argument( - "--use_env", - default=False, - action="store_true", - help="Use environment variable to pass " - "'local rank'. For legacy reasons, the default value is False. " - "If set to True, the script will not pass " - "--local_rank as argument, and will instead set LOCAL_RANK.", - ) - return parser.parse_args(args) - - -def main(args=None): - logger.warn( - "The module torch.distributed.launch is deprecated " - "and going to be removed in future." - "Migrate to torch.distributed.elastic_launch" + "--logdir", + default=None, + type=str, + help=f"""Relative path to write subprocess logs to. Passing in a relative + path will create a directory if needed, and write the stdout and stderr to files + {node_local_rank_stdout_filename} and {node_local_rank_stderr_filename}. 
Note that + successive runs with the same path to write logs to will overwrite existing logs, + so be sure to save logs as needed.""", ) - args = parse_args(args) - run(args) + # positional + parser.add_argument("training_script", type=str, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + # rest from the training program + parser.add_argument('training_script_args', nargs=REMAINDER) + return parser.parse_args() + +def main(): + args = parse_args() + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["WORLD_SIZE"] = str(dist_world_size) + + processes: List[Any] = [] + + if 'OMP_NUM_THREADS' not in os.environ and args.nproc_per_node > 1: + current_env["OMP_NUM_THREADS"] = str(1) + print("*****************************************\n" + "Setting OMP_NUM_THREADS environment variable for each process " + "to be {} in default, to avoid your system being overloaded, " + "please further tune the variable for optimal performance in " + "your application as needed. \n" + "*****************************************".format(current_env["OMP_NUM_THREADS"])) + + if args.logdir: + # Possibly create the directory to write subprocess log output to. + if os.path.exists(args.logdir): + if not os.path.isdir(args.logdir): + raise ValueError("argument --logdir must be a path to a directory.") + else: + # create the relative directory + os.mkdir(os.path.join(os.getcwd(), args.logdir)) + + subprocess_file_handles = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # spawn the processes + with_python = not args.no_python + cmd = [] + if with_python: + cmd = [sys.executable, "-u"] + if args.module: + cmd.append("-m") + else: + if not args.use_env: + raise ValueError("When using the '--no_python' flag, you must also set the '--use_env' flag.") + if args.module: + raise ValueError("Don't use both the '--no_python' flag and the '--module' flag at the same time.") + + cmd.append(args.training_script) + + if not args.use_env: + cmd.append("--local_rank={}".format(local_rank)) + + cmd.extend(args.training_script_args) + + stdout_handle: Optional[IO] + stderr_handle: Optional[IO] + if args.logdir: + directory_path = os.path.join(os.getcwd(), args.logdir) + node_rank = args.node_rank + stdout_file_name = node_local_rank_stdout_filename.format(node_rank, local_rank) + stderr_file_name = node_local_rank_stderr_filename.format(node_rank, local_rank) + stdout_handle = open(os.path.join(directory_path, stdout_file_name), "w") + stderr_handle = open(os.path.join(directory_path, stderr_file_name), "w") + subprocess_file_handles.append((stdout_handle, stderr_handle)) + stdout_name = stdout_handle.name + stderr_name = stderr_handle.name + print(f"""Note: Stdout and stderr for node {node_rank} rank {local_rank} will + be written to {stdout_name}, {stderr_name} respectively.""") + + sig_names = {2: "SIGINT", 15: "SIGTERM"} + last_return_code = None + + def sigkill_handler(signum, frame): + for process in processes: + print(f"Killing subprocess {process.pid}") + try: + process.kill() + 
except Exception: + pass + if last_return_code is not None: + raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) + if signum in sig_names: + print(f"Main process received {sig_names[signum]}, exiting") + sys.exit(1) + + # pass SIGINT/SIGTERM to children if the parent is being terminated + signal.signal(signal.SIGINT, sigkill_handler) + signal.signal(signal.SIGTERM, sigkill_handler) + + stdout_handle = None if not subprocess_file_handles else subprocess_file_handles[local_rank][0] + stderr_handle = None if not subprocess_file_handles else subprocess_file_handles[local_rank][1] + process = subprocess.Popen(cmd, env=current_env, stdout=stdout_handle, stderr=stderr_handle) + processes.append(process) + + try: + alive_processes = set(processes) + while len(alive_processes): + finished_processes = [] + for process in alive_processes: + if process.poll() is None: + # the process is still running + continue + else: + if process.returncode != 0: + last_return_code = process.returncode # for sigkill_handler + sigkill_handler(signal.SIGTERM, None) # not coming back + else: + # exited cleanly + finished_processes.append(process) + alive_processes = set(alive_processes) - set(finished_processes) + + time.sleep(1) + finally: + # close open file descriptors + for (stdout_handle, stderr_handle) in subprocess_file_handles: + stdout_handle.close() + stderr_handle.close() if __name__ == "__main__": main() From a3a75bd35ec6e1ad7371511cb06c085d74fde6ba Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Thu, 15 Apr 2021 15:05:55 -0700 Subject: [PATCH 15/45] Add complex autograd support for `torch.cross` (#55854) Summary: Fixes https://github.com/pytorch/pytorch/issues/53512 Pull Request resolved: https://github.com/pytorch/pytorch/pull/55854 Reviewed By: nikithamalgifb Differential Revision: D27737571 Pulled By: anjali411 fbshipit-source-id: 38165b952cc4c9213d61c7d98b549b984c154927 --- test/test_linalg.py | 16 ++++++++++++++++ tools/autograd/derivatives.yaml | 4 ++-- tools/autograd/gen_variable_type.py | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py index d28c515e1145..1ad4467b2957 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -4727,6 +4727,22 @@ def test_cross(self, device, dtype): torch.cross(x, y, out=res2) self.assertEqual(res1, res2) + # TODO: This test should be removed and OpInfo should enable complex + # types after this PR is merged: + # https://github.com/pytorch/pytorch/pull/55483 + @dtypes(torch.cdouble) + def test_cross_autograd(self, device, dtype): + x = torch.rand(100, 3, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(100, 3, dtype=dtype, device=device, requires_grad=True) + + if torch.device(device).type == 'cuda' and dtype.is_complex: + # TODO: Remove this error when cross CUDA supports complex + with self.assertRaisesRegex(RuntimeError, r'_th_cross_kernel_out not supported on CUDAType for Complex'): + gradcheck(torch.cross, [x, y]) + else: + gradcheck(torch.cross, [x, y]) + gradgradcheck(torch.cross, [x, y], atol=1e-3, check_batched_grad=False) + @onlyCPU @dtypes(torch.float) def test_cross_with_and_without_dim(self, device, dtype): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 55ba140f39e7..5f35e1faee71 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -376,8 +376,8 @@ output_differentiability: [False] - name: cross(Tensor self, Tensor other, int? 
dim=None) -> Tensor - self: other.cross(grad, dim) - other: grad.cross(self, dim) + self: other.conj().cross(grad, dim) + other: grad.cross(self.conj(), dim) - name: logcumsumexp(Tensor self, int dim) -> Tensor self: logcumsumexp_backward(grad, self, result, dim) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 81b8174d28ba..350916be1f42 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -97,7 +97,7 @@ 'replication_pad1d_backward', 'replication_pad2d_backward', 'replication_pad3d_backward', 'diag', 'masked_scatter', 'masked_select', 'index_fill', 'trace', 'polar', 'cumsum', 'rsub', 'eig', 'lerp', 'linalg_vector_norm', 'cumprod', 'prod', 'index_copy', 'lu', 'unfold', 'unfold_backward', - 'index', 'masked_fill' + 'index', 'masked_fill', 'cross' } # Some operators invalidate the grad_accumulator. Let's reset it. From f26a6cb372569b957318c7f2a44c92428cc9686b Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 15 Apr 2021 15:10:26 -0700 Subject: [PATCH 16/45] [quantization] Fix deepcopy on quantized ConvNd (#56154) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56154 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D27796268 Pulled By: jamesr66a fbshipit-source-id: cb693dc16582a9334c93f46201c42eb0f4b794b3 --- test/quantization/test_quantized_module.py | 23 ++++++++++++++++++++++ torch/nn/quantized/modules/conv.py | 10 ++++++++++ 2 files changed, 33 insertions(+) diff --git a/test/quantization/test_quantized_module.py b/test/quantization/test_quantized_module.py index 22009bdb3d3a..3c301718997f 100644 --- a/test/quantization/test_quantized_module.py +++ b/test/quantization/test_quantized_module.py @@ -31,6 +31,7 @@ import torch.testing._internal.hypothesis_utils as hu hu.assert_deadline_disabled() +import copy import io import numpy as np import itertools @@ -328,6 +329,7 @@ def _test_conv_api_impl( np.testing.assert_array_almost_equal( Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0) + # Test serialization b = io.BytesIO() torch.save(qconv_module, b) b.seek(0) @@ -338,6 +340,27 @@ def _test_conv_api_impl( self.assertEqual(loaded_conv.zero_point, qconv_module.zero_point) + # Test copy and deepcopy + copied_conv = copy.copy(qconv_module) + self.assertEqual(copied_conv.bias(), qconv_module.bias()) + self.assertEqual(copied_conv.scale, qconv_module.scale) + self.assertEqual(copied_conv.zero_point, + qconv_module.zero_point) + Y_copied = copied_conv(X_q) + if not is_reference: + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0) + + deepcopied_conv = copy.deepcopy(qconv_module) + self.assertEqual(deepcopied_conv.bias(), qconv_module.bias()) + self.assertEqual(deepcopied_conv.scale, qconv_module.scale) + self.assertEqual(deepcopied_conv.zero_point, + qconv_module.zero_point) + Y_deepcopied = copied_conv(X_q) + if not is_reference: + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0) + # JIT testing self.checkScriptable( qconv_module, [[X_q]], diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index ed892fb39a44..9f0e942a1b59 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -174,6 +174,16 @@ def __setstate__(self, state): self.zero_point = state[13] self.training = state[14] + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + 
torch.nn.Module.__init__(new_instance) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def __copy__(self): + return self.__deepcopy__({}) + @classmethod def get_qconv(cls, mod, activation_post_process, weight_post_process=None): r"""Creates a qconv object and returns it. From bb245b6444d3c5b9f586d93121730390985a0bae Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Thu, 15 Apr 2021 15:15:24 -0700 Subject: [PATCH 17/45] [optim] refactor adamax to use functional API (#55830) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55830 ghstack-source-id: 126325537 Reviewed By: driazati Differential Revision: D26561017 fbshipit-source-id: 41273d200e546d4ac08d39b57865d63c624f143a --- torch/optim/_functional.py | 39 ++++++++++++++++++++++++++++ torch/optim/adamax.py | 52 ++++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index 3f702a2d17ed..e93c6798d30f 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -238,3 +238,42 @@ def rmsprop(params: List[Tensor], param.add_(buf, alpha=-lr) else: param.addcdiv_(grad, avg, value=-lr) + + +def adamax(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_infs: List[Tensor], + state_steps: List[int], + eps: float, + beta1: float, + beta2: float, + lr: float, + weight_decay: float): + r"""Functional API that performs adamax algorithm computation. + + See :class:`~torch.optim.Adamax` for details. + """ + + for i, param in enumerate(params): + grad = grads[i] + exp_avg = exp_avgs[i] + exp_inf = exp_infs[i] + step = state_steps[i] + + if weight_decay != 0: + grad = grad.add(param, alpha=weight_decay) + + # Update biased first moment estimate. + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + # Update the exponentially weighted infinity norm. + norm_buf = torch.cat([ + exp_inf.mul_(beta2).unsqueeze(0), + grad.abs().add_(eps).unsqueeze_(0) + ], 0) + torch.amax(norm_buf, 0, keepdim=False, out=exp_inf) + + bias_correction = 1 - beta1 ** step + clr = lr / bias_correction + + param.addcdiv_(exp_avg, exp_inf, value=-clr) diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py index fa028f469b43..e5591f5c158a 100644 --- a/torch/optim/adamax.py +++ b/torch/optim/adamax.py @@ -1,4 +1,5 @@ import torch +from . import _functional as F from .optimizer import Optimizer @@ -50,12 +51,25 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_infs = [] + state_steps = [] + + beta1, beta2 = group['betas'] + eps = group['eps'] + lr = group['lr'] + weight_decay = group['weight_decay'] + for p in group['params']: if p.grad is None: continue - grad = p.grad - if grad.is_sparse: + params_with_grad.append(p) + if p.grad.is_sparse: raise RuntimeError('Adamax does not support sparse gradients') + grads.append(p.grad) + state = self.state[p] # State initialization @@ -64,27 +78,21 @@ def step(self, closure=None): state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) state['exp_inf'] = torch.zeros_like(p, memory_format=torch.preserve_format) - exp_avg, exp_inf = state['exp_avg'], state['exp_inf'] - beta1, beta2 = group['betas'] - eps = group['eps'] + exp_avgs.append(state['exp_avg']) + exp_infs.append(state['exp_inf']) state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(p, alpha=group['weight_decay']) - - # Update biased first moment estimate. 
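                # (Equivalently: exp_avg <- beta1 * exp_avg + (1 - beta1) * grad;
                # the block below then keeps exp_inf = max(beta2 * exp_inf, |grad| + eps),
                # the infinity-norm analogue of Adam's second-moment estimate.)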
- exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - # Update the exponentially weighted infinity norm. - norm_buf = torch.cat([ - exp_inf.mul_(beta2).unsqueeze(0), - grad.abs().add_(eps).unsqueeze_(0) - ], 0) - torch.amax(norm_buf, 0, keepdim=False, out=exp_inf) - - bias_correction = 1 - beta1 ** state['step'] - clr = group['lr'] / bias_correction - - p.addcdiv_(exp_avg, exp_inf, value=-clr) + state_steps.append(state['step']) + + F.adamax(params_with_grad, + grads, + exp_avgs, + exp_infs, + state_steps, + eps, + beta1, + beta2, + lr, + weight_decay) return loss From 8ef13cf97637a6132c6d7f6e930f3fdcea0e8f94 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Thu, 15 Apr 2021 15:15:24 -0700 Subject: [PATCH 18/45] [optim] refactor rprop to use functional API (#55832) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55832 ghstack-source-id: 126325541 Reviewed By: driazati Differential Revision: D27703877 fbshipit-source-id: 34d4ce7b7d124c0cd75e2f6d0bc8f836713b7301 --- torch/optim/_functional.py | 37 +++++++++++++++++++++++++++++++++++++ torch/optim/rprop.py | 38 ++++++++++++++++++++------------------ 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index e93c6798d30f..0e03f3594bd8 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -240,6 +240,43 @@ def rmsprop(params: List[Tensor], param.addcdiv_(grad, avg, value=-lr) +def rprop(params: List[Tensor], + grads: List[Tensor], + prevs: List[Tensor], + step_sizes: List[Tensor], + step_size_min: float, + step_size_max: float, + etaminus: float, + etaplus: float): + r"""Functional API that performs rprop algorithm computation. + + See :class:`~torch.optim.Rprop` for details. + """ + + for i, param in enumerate(params): + grad = grads[i] + prev = prevs[i] + step_size = step_sizes[i] + + sign = grad.mul(prev).sign() + sign[sign.gt(0)] = etaplus + sign[sign.lt(0)] = etaminus + sign[sign.eq(0)] = 1 + + # update stepsizes with step size updates + step_size.mul_(sign).clamp_(step_size_min, step_size_max) + + # for dir<0, dfdx=0 + # for dir>=0 dfdx=dfdx + grad = grad.clone(memory_format=torch.preserve_format) + grad[sign.eq(etaminus)] = 0 + + # update parameters + param.addcmul_(grad.sign(), step_size, value=-1) + + prev.copy_(grad) + + def adamax(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index ec2a5f1f222a..0b71ec29174c 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -1,4 +1,5 @@ import torch +from . 
import _functional as F from .optimizer import Optimizer @@ -39,12 +40,20 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: + params = [] + grads = [] + prevs = [] + step_sizes = [] + for p in group['params']: if p.grad is None: continue + params.append(p) grad = p.grad if grad.is_sparse: raise RuntimeError('Rprop does not support sparse gradients') + + grads.append(grad) state = self.state[p] # State initialization @@ -53,28 +62,21 @@ def step(self, closure=None): state['prev'] = torch.zeros_like(p, memory_format=torch.preserve_format) state['step_size'] = grad.new().resize_as_(grad).fill_(group['lr']) + prevs.append(state['prev']) + step_sizes.append(state['step_size']) + etaminus, etaplus = group['etas'] step_size_min, step_size_max = group['step_sizes'] - step_size = state['step_size'] state['step'] += 1 - sign = grad.mul(state['prev']).sign() - sign[sign.gt(0)] = etaplus - sign[sign.lt(0)] = etaminus - sign[sign.eq(0)] = 1 - - # update stepsizes with step size updates - step_size.mul_(sign).clamp_(step_size_min, step_size_max) - - # for dir<0, dfdx=0 - # for dir>=0 dfdx=dfdx - grad = grad.clone(memory_format=torch.preserve_format) - grad[sign.eq(etaminus)] = 0 - - # update parameters - p.addcmul_(grad.sign(), step_size, value=-1) - - state['prev'].copy_(grad) + F.rprop(params, + grads, + prevs, + step_sizes, + step_size_min, + step_size_max, + etaminus, + etaplus) return loss From 4e9e7200f2062a9223464f257ad220747fd7042b Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Thu, 15 Apr 2021 15:15:24 -0700 Subject: [PATCH 19/45] [dist_optim] Add distributed functional Adamax optimizer (#55833) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55833 Add distributed functional Adamax optimizer, to support in TorchScript ghstack-source-id: 126325538 Reviewed By: rohan-varma Differential Revision: D26696540 fbshipit-source-id: 6242faebd2476847831a05df7f8b0d616f2b5355 --- torch/distributed/optim/functional_adamax.py | 102 ++++++++++++++++++ torch/distributed/optim/optimizer.py | 4 +- .../distributed/rpc/dist_optimizer_test.py | 1 + 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 torch/distributed/optim/functional_adamax.py diff --git a/torch/distributed/optim/functional_adamax.py b/torch/distributed/optim/functional_adamax.py new file mode 100644 index 000000000000..7daa315636ef --- /dev/null +++ b/torch/distributed/optim/functional_adamax.py @@ -0,0 +1,102 @@ +from typing import List, Dict, Optional, Tuple +import torch +import torch.optim._functional as F + +from torch import Tensor + +# Define a TorchScript compatible Functional Adamax Optimizer +# where we use these optimizer in a functional way. +# Instead of using the `param.grad` when updating parameters, +# we explicitly allow the distributed optimizer pass gradients to +# the `step` function. In this way, we could separate the gradients +# and parameters and allow multithreaded trainer to update the +# parameters without data traces on accumulating to the same .grad. +# NOTE: This should be only used by distributed optimizer internals +# and not meant to expose to the user. 
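# A rough sketch of the calling convention described above (hypothetical tensors;
# gradients are supplied positionally, aligned with the parameter list, and may
# contain None for parameters that received no gradient):
#
#     opt = _FunctionalAdamax([w1, w2], lr=1e-3)
#     opt.step([g1, None])   # updates w1 from g1; w2 is skipped, .grad is never read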
+@torch.jit.script +class _FunctionalAdamax(object): + def __init__( + self, + params: List[Tensor], + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0.0, + ): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + self.defaults = { + "lr": lr, + "eps": eps, + "beta1": betas[0], + "beta2": betas[1], + "weight_decay": weight_decay, + } + self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) + + if len(params) == 0: + raise ValueError("optimizer got an empty parameter list") + + # NOTE: we only have one param_group and don't allow user to add additional + # param group as it's not a common use case. + self.param_group = {"params": params} + + def step(self, gradients: List[Optional[Tensor]]): + params = self.param_group['params'] + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_infs = [] + state_steps: List[int] = [] + + if len(params) != len(gradients): + raise ValueError( + "the gradients passed in does not equal to the size of the parameters!" + + f"Params length: {len(params)}. " + + f"Gradients length: {len(gradients)}" + ) + + for param, gradient in zip(self.param_group['params'], gradients): + if gradient is not None: + params_with_grad.append(param) + grads.append(gradient) + # Lazy state initialization + if param not in self.state: + self.state[param] = {} + state = self.state[param] + state['step'] = torch.tensor(0.0) + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_inf'] = torch.zeros_like(param, memory_format=torch.preserve_format) + + state = self.state[param] + + exp_avgs.append(state['exp_avg']) + exp_infs.append(state['exp_inf']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step'].item()) + + with torch.no_grad(): + F.adamax(params_with_grad, + grads, + exp_avgs, + exp_infs, + state_steps, + self.defaults['eps'], + self.defaults['beta1'], + self.defaults['beta2'], + self.defaults['lr'], + self.defaults['weight_decay']) diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py index 6200f6df8e50..dc32238a26d6 100644 --- a/torch/distributed/optim/optimizer.py +++ b/torch/distributed/optim/optimizer.py @@ -13,6 +13,7 @@ from .functional_sgd import _FunctionalSGD from .functional_adadelta import _FunctionalAdadelta from .functional_rmsprop import _FunctionalRMSprop +from .functional_adamax import _FunctionalAdamax import torch.distributed.autograd as dist_autograd @@ -24,7 +25,7 @@ # XXX: we define a _ScriptModuleOptimizer here to explicitly # compile the FunctionalOptimizer class into TorchScript # This is because ScriptClass instance still lives in -# python unless you explictly compile it as an attribute +# python unless you explicitly compile it as an attribute # in ScriptModule or pass it to a ScriptFunction # _ScriptLocalOptimizerInterface serves as a common # interface type for 
Optimizer ScriptModules. @@ -200,6 +201,7 @@ class DistributedOptimizer: optim.SGD: _FunctionalSGD, optim.Adadelta: _FunctionalAdadelta, optim.RMSprop: _FunctionalRMSprop, + optim.Adamax: _FunctionalAdamax, } def __init__(self, optimizer_class, params_rref, *args, **kwargs): diff --git a/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py b/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py index a8f953539c54..be4b49d0b410 100644 --- a/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py @@ -207,3 +207,4 @@ def test_dist_optim(self): self._test_dist_optim_base(optim.SGD, lr=1e-3, momentum=1, weight_decay=1, nesterov=True) self._test_dist_optim_base(optim.Adadelta, rho=0.95) self._test_dist_optim_base(optim.RMSprop, lr=0.05) + self._test_dist_optim_base(optim.Adamax, lr=0.05) From dd090e72b23f0035de0d97b9ee264fd744cb00ae Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Thu, 15 Apr 2021 15:15:24 -0700 Subject: [PATCH 20/45] [dist_optim] add distributed functional rprop optimizer (#55834) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55834 ghstack-source-id: 126325536 Reviewed By: rohan-varma Differential Revision: D27703878 fbshipit-source-id: 5c8ec9a4ccb4442b2b51d48d75ea5cd506179f14 --- torch/distributed/optim/functional_rprop.py | 81 +++++++++++++++++++ torch/distributed/optim/optimizer.py | 2 + .../distributed/rpc/dist_optimizer_test.py | 1 + 3 files changed, 84 insertions(+) create mode 100644 torch/distributed/optim/functional_rprop.py diff --git a/torch/distributed/optim/functional_rprop.py b/torch/distributed/optim/functional_rprop.py new file mode 100644 index 000000000000..93f6e08d2317 --- /dev/null +++ b/torch/distributed/optim/functional_rprop.py @@ -0,0 +1,81 @@ +from typing import List, Dict, Optional, Tuple +import torch +import torch.optim._functional as F + +from torch import Tensor + +# Define a TorchScript compatible Functional Rprop Optimizer +# where we use these optimizer in a functional way. +# Instead of using the `param.grad` when updating parameters, +# we explicitly allow the distributed optimizer pass gradients to +# the `step` function. In this way, we could separate the gradients +# and parameters and allow multithreaded trainer to update the +# parameters without data traces on accumulating to the same .grad. +# NOTE: This should be only used by distributed optimizer internals +# and not meant to expose to the user. +@torch.jit.script +class _FunctionalRprop(object): + def __init__( + self, + params: List[Tensor], + lr: float = 1e-2, + etas: Tuple[float, float] = (0.5, 1.2), + step_sizes: Tuple[float, float] = (1e-6, 50) + ): + self.defaults = { + "lr": lr, + } + self.etas = etas + self.step_sizes = step_sizes + + if len(params) == 0: + raise ValueError("optimizer got an empty parameter list") + + # NOTE: we only have one param_group and don't allow user to add additional + # param group as it's not a common use case. + self.param_group = {"params": params} + + self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) + + def step(self, gradients: List[Optional[Tensor]]): + params = self.param_group['params'] + grads = [] + prevs = [] + step_sizes = [] + lr = self.defaults['lr'] + etaminus, etaplus = self.etas + step_size_min, step_size_max = self.step_sizes + + if len(params) != len(gradients): + raise ValueError( + "the gradients passed in does not equal to the size of the parameters!" 
+ + f"Params length: {len(params)}. " + + f"Gradients length: {len(gradients)}" + ) + + for param, gradient in zip(params, gradients): + if gradient is not None: + grads.append(gradient) + # Lazy state initialization + if param not in self.state: + self.state[param] = {} + state = self.state[param] + state['step'] = torch.tensor(0.0) + state['prev'] = torch.zeros_like(param, memory_format=torch.preserve_format) + state['step_size'] = torch.full_like(gradient, lr) + + state = self.state[param] + prevs.append(state['prev']) + step_sizes.append(state['step_size']) + + state['step'] += 1 + + with torch.no_grad(): + F.rprop(params, + grads, + prevs, + step_sizes, + step_size_min, + step_size_max, + etaminus, + etaplus) diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py index dc32238a26d6..8785fb0903c0 100644 --- a/torch/distributed/optim/optimizer.py +++ b/torch/distributed/optim/optimizer.py @@ -13,6 +13,7 @@ from .functional_sgd import _FunctionalSGD from .functional_adadelta import _FunctionalAdadelta from .functional_rmsprop import _FunctionalRMSprop +from .functional_rprop import _FunctionalRprop from .functional_adamax import _FunctionalAdamax import torch.distributed.autograd as dist_autograd @@ -201,6 +202,7 @@ class DistributedOptimizer: optim.SGD: _FunctionalSGD, optim.Adadelta: _FunctionalAdadelta, optim.RMSprop: _FunctionalRMSprop, + optim.Rprop: _FunctionalRprop, optim.Adamax: _FunctionalAdamax, } diff --git a/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py b/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py index be4b49d0b410..250f0e382a75 100644 --- a/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_optimizer_test.py @@ -208,3 +208,4 @@ def test_dist_optim(self): self._test_dist_optim_base(optim.Adadelta, rho=0.95) self._test_dist_optim_base(optim.RMSprop, lr=0.05) self._test_dist_optim_base(optim.Adamax, lr=0.05) + self._test_dist_optim_base(optim.Rprop, lr=0.05) From f02454f9571cec7c730335c422b35fac88aa8d1c Mon Sep 17 00:00:00 2001 From: h6197627 <44726212+h6197627@users.noreply.github.com> Date: Thu, 15 Apr 2021 15:35:00 -0700 Subject: [PATCH 21/45] Fix ChanelShuffle named tensor warnings (#55911) Summary: Fixes https://github.com/pytorch/pytorch/issues/54846 Pull Request resolved: https://github.com/pytorch/pytorch/pull/55911 Reviewed By: agolynski Differential Revision: D27798078 Pulled By: jbschlosser fbshipit-source-id: 1ebd325ac8a21f82c395d2eafac7ef2ecd1f32b1 --- aten/src/ATen/native/ChanelShuffle.cpp | 2 +- test/test_nn.py | 20 +++++++++++++++----- torch/nn/modules/channelshuffle.py | 9 ++++++--- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index 81604eabeb7d..80a09c4b7508 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -54,7 +54,7 @@ Tensor channel_shuffle(const Tensor& self, int64_t groups) { .reshape(self.sizes()); return namedinference::propagate_names_if_nonempty( output_tensor, - self.names()); + self.has_names() ? 
self.names() : at::ArrayRef{}); } }} // namespace at::native diff --git a/test/test_nn.py b/test/test_nn.py index d93c5cf90276..27636dfeaddd 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9194,7 +9194,9 @@ def test_channel_shuffle(self): ]] ) # ChannelsFirst - y = F.channel_shuffle(x, 2) + with warnings.catch_warnings(record=True) as w: + y = F.channel_shuffle(x, 2) + self.assertEqual(len(w), 0) self.assertEqual(y, y_ref) # ChannelsLast not supported for 3dim @@ -9222,10 +9224,14 @@ def test_channel_shuffle(self): ]] ) # ChannelsFirst NCHW - y = F.channel_shuffle(x, 2) + with warnings.catch_warnings(record=True) as w: + y = F.channel_shuffle(x, 2) + self.assertEqual(len(w), 0) self.assertEqual(y, y_ref) # ChannelsLast NHWC - y = F.channel_shuffle(x.contiguous(memory_format=torch.channels_last), 2) + with warnings.catch_warnings(record=True) as w: + y = F.channel_shuffle(x.contiguous(memory_format=torch.channels_last), 2) + self.assertEqual(len(w), 0) y = y.contiguous(memory_format=torch.contiguous_format) self.assertEqual(y, y_ref) @@ -9253,10 +9259,14 @@ def test_channel_shuffle(self): ]] ) # ChannelsFirst NCHW - y = F.channel_shuffle(x, 2) + with warnings.catch_warnings(record=True) as w: + y = F.channel_shuffle(x, 2) + self.assertEqual(len(w), 0) self.assertEqual(y, y_ref) # ChannelsLast NHWC - y = F.channel_shuffle(x.contiguous(memory_format=torch.channels_last_3d), 2) + with warnings.catch_warnings(record=True) as w: + y = F.channel_shuffle(x.contiguous(memory_format=torch.channels_last_3d), 2) + self.assertEqual(len(w), 0) y = y.contiguous(memory_format=torch.contiguous_format) self.assertEqual(y, y_ref) diff --git a/torch/nn/modules/channelshuffle.py b/torch/nn/modules/channelshuffle.py index 66839bff3d71..740ee6022ca2 100644 --- a/torch/nn/modules/channelshuffle.py +++ b/torch/nn/modules/channelshuffle.py @@ -1,6 +1,8 @@ from .module import Module from .. import functional as F +from torch import Tensor + class ChannelShuffle(Module): r"""Divide the channels in a tensor of shape :math:`(*, C , H, W)` @@ -37,13 +39,14 @@ class ChannelShuffle(Module): ]] """ __constants__ = ['groups'] + groups: int - def __init__(self, groups): + def __init__(self, groups: int) -> None: super(ChannelShuffle, self).__init__() self.groups = groups - def forward(self, input): + def forward(self, input: Tensor) -> Tensor: return F.channel_shuffle(input, self.groups) - def extra_repr(self): + def extra_repr(self) -> str: return 'groups={}'.format(self.groups) From e1752ffa045be43612e75bb10500b01af7116575 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 15 Apr 2021 15:44:25 -0700 Subject: [PATCH 22/45] [reland][ROCm] use hiprtc precompiled header (#55965) Summary: Revert "Revert D27449031 (https://github.com/pytorch/pytorch/commit/2a7df657feef7534bfe4bce14da06dca29d38b0f): [pytorch][PR] [ROCm] use hiprtc precompiled header". Reland PR https://github.com/pytorch/pytorch/issues/54350. This reverts commit 204ac21bf1457022caab197001788239720b96d6. The original PR was reverted under suspicion that it was causing CI instability, but it was instead due to a hardware failure. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55965 Reviewed By: jbschlosser Differential Revision: D27755907 Pulled By: malfet fbshipit-source-id: 75bf0b9d888df3dee62f00a366b1123757e0474e --- cmake/Dependencies.cmake | 1 + cmake/public/LoadHIP.cmake | 2 ++ torch/csrc/jit/codegen/cuda/executor.cpp | 2 ++ .../csrc/jit/codegen/cuda/executor_utils.cpp | 3 +++ torch/csrc/jit/codegen/fuser/codegen.cpp | 18 +++++++++++++ .../jit/codegen/fuser/cuda/fused_kernel.cpp | 5 +++- .../jit/codegen/fuser/cuda/resource_strings.h | 4 +-- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 25 +++++++++++++------ 8 files changed, 49 insertions(+), 11 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 01558ae162c6..e9ad94f5f5c8 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1215,6 +1215,7 @@ if(USE_ROCM) list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -DROCM_VERSION=${ROCM_VERSION_DEV_INT}) list(APPEND HIP_CXX_FLAGS -std=c++14) if(CMAKE_BUILD_TYPE MATCHES Debug) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 4e7dccb741de..9cdde76e53a1 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -161,12 +161,14 @@ if(HIP_FOUND) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3}) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") + math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") endif() message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}") + message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}") message("\n***** Library versions from dpkg *****\n") execute_process(COMMAND dpkg -l COMMAND grep rocm-dev COMMAND awk "{print $2 \" VERSION: \" $3}") diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 76ba1faf6641..92a561c40bd3 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -28,8 +28,10 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) { // generating cuda code; std::string code = ""; #ifdef __HIP_PLATFORM_HCC__ +#if ROCM_VERSION < 40200 code += std::string("#include \n") + std::string("#include \n"); +#endif #endif code += std::string("namespace ") + FusionExecutor::kernelNamespace() + " {\n" + executor_utils::kernelPreamble() + kernel + "}\n"; diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index df0a5168ea86..ed413ddebfc5 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -281,6 +281,9 @@ NvrtcFunction nvrtcCompile( #ifdef __HIP_PLATFORM_HCC__ std::vector args = {"--std=c++14"}; +#if ROCM_VERSION >= 40200 + args.push_back("-hip-pch"); +#endif #else const std::string compute = std::string("--gpu-architecture=") + #if CUDA_VERSION >= 11010 diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp index 9146994e29af..5c8fa69f1269 100644 
--- a/torch/csrc/jit/codegen/fuser/codegen.cpp +++ b/torch/csrc/jit/codegen/fuser/codegen.cpp @@ -659,6 +659,24 @@ std::string generateKernel( env.s("RandInit", ""); } + // HIP headers must be included until precompiled header feature is available + // clang-format off +#ifdef __HIP_PLATFORM_HCC__ +#if ROCM_VERSION < 40200 + if (use_cuda && has_half_tensor) { + env.s("RuntimeHeader", R"( +#include +#include +)"); + } else if (use_cuda) { + env.s("RuntimeHeader", R"( +#include +)"); + } +#endif +#endif + // clang-format on + // Instantiates the CUDA or CPU-specific templates env.s("tensorOffsets", tensorOffsets.str()); env.s("tensorChecks", tensorChecks.str()); diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp index 1201cef8e51e..5ce1f034f14b 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp @@ -121,7 +121,10 @@ FusedKernelCUDA::FusedKernelCUDA( &program, code_.c_str(), nullptr, 0, nullptr, nullptr)); #ifdef __HIP_PLATFORM_HCC__ - std::vector args = {}; + std::vector args = {"--std=c++14"}; +#if ROCM_VERSION >= 40200 + args.push_back("-hip-pch"); +#endif #else const std::string compute = std::string("--gpu-architecture=") + #if CUDA_VERSION >= 11010 diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h index 58416584352f..83a14e030213 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h +++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h @@ -15,7 +15,7 @@ cases*/ #ifdef __HIP_PLATFORM_HCC__ static auto type_declarations_template = CodeTemplate(R"( -#include +${RuntimeHeader} ${HalfHeader} ${RandHeader} @@ -213,8 +213,6 @@ void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { #ifdef __HIP_PLATFORM_HCC__ constexpr auto half_support_literal = R"( -#include - typedef __half half; )"; #else diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 371d7bd8eaa9..155ab5357469 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -887,7 +887,6 @@ static std::ostream& operator<<( #ifdef USE_ROCM static const char* device_resource_string = R"( -#include #define POS_INFINITY INFINITY #define NEG_INFINITY -INFINITY @@ -930,17 +929,26 @@ void CudaCodeGen::Initialize() { metavar_rewriter_ = std::make_unique(cuda_analysis_.get()); + // Check whether the statement uses the Half type, if so add the + // half_support_literal. + Stmt* stmt_v = stmt(); + HalfChecker halfChecker(buffer_args()); + stmt_v->accept(&halfChecker); + +#if __HIP_PLATFORM_HCC__ +#if ROCM_VERSION < 40200 + os() << "#include " << std::endl; + if (halfChecker.hasHalf()) { + os() << "#include " << std::endl; + } +#endif +#endif os() << device_resource_string << shared_resource_string; if (has_random_) { os() << philox_random_string << std::endl; } - // Check whether the statement uses the Half type, if so add the - // half_support_literal. 
- Stmt* stmt_v = stmt(); - HalfChecker halfChecker(buffer_args()); - stmt_v->accept(&halfChecker); if (halfChecker.hasHalf()) { os() << fuser::cuda::half_support_literal << std::endl; } @@ -1203,7 +1211,10 @@ void CudaCodeGen::CompileToNVRTC( &program, code.c_str(), nullptr, 0, nullptr, nullptr)); #ifdef __HIP_PLATFORM_HCC__ - std::vector args = {}; + std::vector args = {"--std=c++14"}; +#if ROCM_VERSION >= 40200 + args.push_back("-hip-pch"); +#endif #else const std::string compute = std::string("--gpu-architecture=") + #if CUDA_VERSION >= 11010 From 8e82e932f334714b8ca33e3458ff3de2c8d75317 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 15 Apr 2021 15:47:03 -0700 Subject: [PATCH 23/45] Reland: D27652485: [nnc] Enable CPU fusion only when num_threads == 1" (#56120) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56120 This reverts commit ad17fadbfc786dc1ccb42e822208ff03c2a2b72c (D27786457). The big annoyance here is that depending on the threading mode you may not be able to toggle num_threads at will, so the fusion tests won't fail. I hate this solution, but I'm adding a secondary override for the TE fuser. Now you need to both turn on fusion (_jit_override_can_fuse_on_cpu), and you're OK if you're running with 1 thread, or you can add `_jit_set_texpr_parallel_cpu_enabled` to enable it anyways. This is (a) mainly for tests, since a real user probably won't fiddle aimlessly with the thread count, and (b) will go away once NNC's threading support is fully baked. Test Plan: Imported from OSS Reviewed By: Krovatkin Differential Revision: D27788199 Pulled By: bertmaher fbshipit-source-id: 070d04474f15e9689dbdf8cc1fde43050c6506b1 --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 6 +++++- test/jit/test_profiler.py | 3 +++ test/test_jit_fuser_te.py | 5 +++++ test/test_tensorexpr.py | 4 ++++ torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 ++++++++++++++++++- torch/csrc/jit/passes/tensorexpr_fuser.h | 2 ++ torch/csrc/jit/python/init.cpp | 2 ++ 7 files changed, 39 insertions(+), 2 deletions(-) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index adaf14593cf7..62e9200df3af 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -15,15 +15,19 @@ namespace jit { using namespace torch::jit::tensorexpr; struct WithCPUFuser { - WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) { + WithCPUFuser(bool val = true) + : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) { overrideCanFuseOnCPU(val); + setTexprParallelCPUEnabled(true); } ~WithCPUFuser() { overrideCanFuseOnCPU(cpuFuserEnabled); + setTexprParallelCPUEnabled(parallel); } bool cpuFuserEnabled; + bool parallel; }; TEST(TEFuserPass, FuserPass_1) { diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index b9ed9d0b78eb..aa8be0518385 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -29,6 +29,8 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) @@ -40,6 +42,7 @@ def tearDown(self): torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) 
torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def test_tensor_type_not_determined_by_inputs(self): @torch.jit.script diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 35b2f2ff8bee..c2635651d267 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -74,6 +74,10 @@ def setUp(self): self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + # TODO: CPU fuser currently is disabled when multithreading. + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) + self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] self.int_dtypes = [ torch.int8, @@ -101,6 +105,7 @@ def tearDown(self): torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 9b24c3cad781..68b8cfd510fd 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -27,6 +27,9 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + # TODO: CPU fuser currently is disabled when multithreading. + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -39,6 +42,7 @@ def tearDown(self): torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index d989b07efe6e..908f51ffaca4 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -232,6 +233,15 @@ bool isSupported(Node* node) { } // namespace tensorexpr static bool texpr_fuser_enabled_ = true; +static bool texpr_parallel_cpu_enabled = false; + +bool texprParallelCPUEnabled() { + return texpr_parallel_cpu_enabled; +} + +void setTexprParallelCPUEnabled(bool val) { + texpr_parallel_cpu_enabled = val; +} void setTensorExprFuserEnabled(bool val) { texpr_fuser_enabled_ = val; @@ -854,7 +864,14 @@ class TensorExprFuser { return false; } if (device->is_cpu()) { - return canFuseOnCPU(); + // CPU fusion is only supported for single-thread. 
+ if (!canFuseOnCPU()) { + return false; + } + if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) { + return true; + } + return false; } else if (device->is_cuda()) { return canFuseOnGPU(); } else if (device->is_xpu()) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index 992d03a6915e..cc8b427030de 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -24,6 +24,8 @@ TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); TORCH_API bool setTexprReductionsEnabled(bool value); TORCH_API bool texprReductionsEnabled(); +TORCH_API bool texprParallelCPUEnabled(); +TORCH_API void setTexprParallelCPUEnabled(bool val); TORCH_API void RemoveProfileNodesAndSpecializeTypes( std::shared_ptr& graph); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 4c32e54a6bea..f5b06db24e36 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -631,6 +631,8 @@ void initJITBindings(PyObject* module) { .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed) .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled) .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled) + .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled) + .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled) .def( "_jit_set_te_generate_block_code", [](bool gen_block_code) { From 3fbca31be3ebfa83839c7d522f16b6422f90b0a3 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Thu, 15 Apr 2021 15:52:11 -0700 Subject: [PATCH 24/45] port addmv to structured kernels (#55746) Summary: Per title I've revamped size checks a bit to provide better error message if `self` is of the wrong size, also added check that inplace variant has correct `self` size Ref: https://github.com/pytorch/pytorch/issues/55070 Pull Request resolved: https://github.com/pytorch/pytorch/pull/55746 Reviewed By: ezyang Differential Revision: D27782980 Pulled By: ngimel fbshipit-source-id: 6ba949b682b8fd1170d0304da0ed348dd1a7b8c7 --- aten/src/ATen/NamedTensorUtils.cpp | 10 +- aten/src/ATen/NamedTensorUtils.h | 3 +- aten/src/ATen/native/Blas.cpp | 114 +++++++++--------- aten/src/ATen/native/cuda/Blas.cu | 74 ++++++++---- aten/src/ATen/native/native_functions.yaml | 16 +-- .../check_backward_compatibility.py | 14 +-- 6 files changed, 116 insertions(+), 115 deletions(-) diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 5f8de486dc78..ce3af0fa451c 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -369,18 +369,16 @@ static std::vector compute_matmul_outnames( return result; } -void propagate_names_for_addmv( - Tensor& result, +std::vector propagate_names_for_addmv( const Tensor& mat, const Tensor& vec, const Tensor& bias) { - if (!result.has_names() && !mat.has_names() && + if (!mat.has_names() && !vec.has_names() && !bias.has_names()) { - return; + return std::vector{}; } auto mv_outnames = compute_matmul_outnames(mat.names(), vec.names()); - auto add_outnames = unify_from_right(mv_outnames, bias.names()); - propagate_names(result, add_outnames); + return unify_from_right(mv_outnames, bias.names()); } void propagate_names_for_addmm( diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index af5584157550..cbfcc2e00d06 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -146,8 
+146,7 @@ TORCH_API void propagate_names_for_addmm( const Tensor& m2, const Tensor& bias); -TORCH_API void propagate_names_for_addmv( - Tensor& result, +TORCH_API std::vector propagate_names_for_addmv( const Tensor& mat, const Tensor& vec, const Tensor& bias); diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 7f82188be828..d29b9fb31171 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -4,7 +4,24 @@ #include #include -namespace at { namespace native { +namespace at { +namespace meta { +TORCH_META_FUNC(addmv)(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta, const Scalar& alpha) { + TORCH_CHECK((mat.dim() == 2 && vec.dim() == 1 && self.dim() <= 1), + "vector + matrix @ vector expected, got ", self.dim(), ", ", mat.dim(), ", ", vec.dim()); + + TORCH_CHECK(mat.size(1) == vec.size(0) && (mat.size(0) == self.numel() || self.numel() == 1), + "size mismatch, got ", self.size(0), ", ", mat.size(0), "x", mat.size(1), ",", vec.size(0)); + auto names = at::namedinference::propagate_names_for_addmv(mat, vec, self); + set_output(0, IntArrayRef(mat.sizes().data(), 1), {}, mat.options(), names); + auto result = maybe_get_output(0); + //this check can fire for inplace op only, for all other versions result is guaranteed to be correct size + TORCH_CHECK(result.dim() == 1 && result.sizes()[0] == mat.sizes()[0], "output of addmv operation should be 1D with ", + "size equal to mat.size(0), yet got output size ", result.sizes(), " and mat.size(0) ", mat.size(0)); +} +} + +namespace native { template void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy); @@ -19,86 +36,69 @@ constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { return n == 1 || lda >= std::max(1L, m); } -Tensor &addmv_impl_cpu(Tensor& result, const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_) { - auto r_stride = result.stride(0); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, mat.scalar_type(), "addmv_impl_cpu", [&] { - auto beta = beta_.to(); - auto alpha = alpha_.to(); - if (mat.stride(0) == 1 && lda_cond(mat.size(0), mat.size(1), mat.stride(1))) { - gemv('n', mat.size(0), mat.size(1), alpha, mat.data_ptr(), mat.stride(1), - vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); - } - else if (mat.stride(1) == 1 && lda_cond(mat.size(1), mat.size(0), mat.stride(0))) { - gemv('t', mat.size(1), mat.size(0), alpha, mat.data_ptr(), mat.stride(0), - vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); - } - else { - Tensor cmat = mat.contiguous(); - gemv('t', mat.size(1), mat.size(0), alpha, cmat.data_ptr(), cmat.stride(0), - vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); - } - }); - return result; -} - -Tensor &addmv_out(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta, const Scalar& alpha, Tensor& result) { - { // scope of NoNamesGuard - at::NoNamesGuard guard; - result.resize_({mat.size(0)}); - - Tensor self_ = self; - if (self.dim() == 0 || self.size(0) == 1) { - self_ = self.expand({mat.size(0)}); - } - TORCH_CHECK((mat.dim() == 2 && vec.dim() == 1 && self_.dim() == 1), - "vector + matrix @ vector expected, got ", self_.dim(), ", ", mat.dim(), ", ", vec.dim()); - TORCH_CHECK((mat.size(1) == vec.size(0) && mat.size(0) == self_.size(0)), - "size mismatch, get ", self_.size(0), ", ", mat.size(0), "x", mat.size(1), ",", 
vec.size(0)); +TORCH_IMPL_FUNC(addmv_out_cpu)(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_, const Tensor& result) { + c10::MaybeOwned self_ = expand_size(self, {mat.size(0)}); + auto betaval = beta_.toComplexDouble(); if (mat.numel() == 0) { + // shortcut for an empty matrix // By definition, when beta==0, values in self should be ignored. nans and infs // should not propagate - if (beta.toComplexDouble() == 0.0) { + if (betaval == 0.0) { result.zero_(); } else { at::cpu::mul_out( - result, + const_cast(result), self, at::native::scalar_tensor( - beta, self.scalar_type(), c10::nullopt /* layout */, at::kCPU, c10::nullopt /* pin_memory */)); + beta_, self.scalar_type(), c10::nullopt /* layout */, at::kCPU, c10::nullopt /* pin_memory */)); } } else { - if (!result.is_same(self_)) { - at::native::copy_(result, self_); + if (!result.is_same(*self_) && betaval != 0.0) { //if beta is 0, result contents is ignored + at::native::copy_(const_cast(result), *self_); } if (result.numel() != 0) { - at::_addmv_impl_(result, self_, mat, vec, beta, alpha); + auto r_stride = result.stride(0); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, mat.scalar_type(), "addmv_impl_cpu", [&] { + auto beta = beta_.to(); + auto alpha = alpha_.to(); + if (mat.stride(0) == 1 && lda_cond(mat.size(0), mat.size(1), mat.stride(1))) { + gemv('n', mat.size(0), mat.size(1), alpha, mat.data_ptr(), mat.stride(1), + vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); + } + else if (mat.stride(1) == 1 && lda_cond(mat.size(1), mat.size(0), mat.stride(0))) { + gemv('t', mat.size(1), mat.size(0), alpha, mat.data_ptr(), mat.stride(0), + vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); + } + else { + Tensor cmat = mat.contiguous(); + gemv('t', mat.size(1), mat.size(0), alpha, cmat.data_ptr(), cmat.stride(0), + vec.data_ptr(), vec.stride(0), beta, result.data_ptr(), r_stride); + } + }); } } - - } // scope of NoNamesGuard - at::namedinference::propagate_names_for_addmv(result, mat, vec, self); - return result; -} - -Tensor addmv(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta, const Scalar& alpha) { - Tensor result = at::empty({mat.size(0)}, mat.options()); - return native::addmv_out(self, mat, vec, beta, alpha, result); -} - -Tensor &addmv_(Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta, const Scalar& alpha) { - return native::addmv_out(self, mat, vec, beta, alpha, self); } Tensor &mv_out(const Tensor &self, const Tensor &vec, Tensor& result) { - return native::addmv_out(result, self, vec, 0, 1, result); + //self arg sent to addmv_out cannot be resized + //here we use result as self argument for addmv, and result is user supplied and can be wrong size + //it's not a hard error, because we allow resizing result, but it becomes a hard error + //in addmv, because addmv expects self to satisfy proper conditions + //to avoid this, supply correctly sized self, its contents doesn't matter because beta is 0 + if (result.dim() > 1 || (result.numel() != self.size(0) || result.numel() !=1)) { + Tensor self_addmv = at::empty({self.size(0)}, self.options()); + return at::addmv_out(result, self_addmv, self, vec, 0, 1); + } + return at::addmv_out(result, result, self, vec, 0, 1); } Tensor mv(const Tensor &self, const Tensor &vec) { Tensor result = at::empty({self.size(0)}, self.options()); - return native::mv_out(self, vec, result); + //inplace version is more efficient if we can use it + return 
at::addmv_(result, self, vec, 0, 1); } inline void dot_check(const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/cuda/Blas.cu b/aten/src/ATen/native/cuda/Blas.cu index 339eaf6c4335..292204ce441b 100644 --- a/aten/src/ATen/native/cuda/Blas.cu +++ b/aten/src/ATen/native/cuda/Blas.cu @@ -1,38 +1,60 @@ #include #include #include +#include namespace at { namespace native { -Tensor &addmv_impl_cuda(Tensor& result, const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_) { - auto r_stride = result.stride(0); - auto vec_stride = vec.stride(0); - - // Check for contiguity of `vec` and update `vec_stride` accordingly - const auto vec_contiguous = vec_stride == 0 ? vec.contiguous() : vec; - vec_stride = vec_contiguous.stride(0); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "addmv_impl_cuda", [&] { - auto beta = beta_.to(); - auto alpha = alpha_.to(); - if (mat.stride(0) == 1 && mat.stride(1) >= std::max(1, mat.size(0))) { - at::cuda::blas::gemv('n', - mat.size(0), mat.size(1), alpha, mat.data_ptr(), mat.stride(1), vec_contiguous.data_ptr(), - vec_stride, beta, result.data_ptr(), r_stride); +TORCH_IMPL_FUNC(addmv_out_cuda)(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_, const Tensor& result) { + c10::MaybeOwned self_ = expand_size(self, {mat.size(0)}); + auto betaval = beta_.toComplexDouble(); + if (mat.numel() == 0) { + // shortcut for an empty matrix + // By definition, when beta==0, values in self should be ignored. nans and infs + // should not propagate + if (betaval == 0.0) { + result.zero_(); + } else { + at::mul_out( + const_cast(result), + self, + at::native::scalar_tensor( + beta_, self.scalar_type(), c10::nullopt /* layout */, at::kCPU, c10::nullopt /* pin_memory */)); } - else if (mat.stride(1) == 1 && mat.stride(0) >= std::max(1, mat.size(1))) { - at::cuda::blas::gemv('t', - mat.size(1), mat.size(0), alpha, mat.data_ptr(), mat.stride(0), - vec_contiguous.data_ptr(), vec_stride, beta, result.data_ptr(), r_stride); + } else { + if (!result.is_same(*self_) && betaval != 0.0) { //if beta is 0, result contents will be zeroed later + at::native::copy_(const_cast(result), *self_); } - else { - Tensor cmat = mat.contiguous(); - at::cuda::blas::gemv('t', - mat.size(1), mat.size(0), alpha, cmat.data_ptr(), cmat.stride(0), - vec_contiguous.data_ptr(), vec_stride, beta, result.data_ptr(), r_stride); + if (result.numel() != 0) { + auto r_stride = result.stride(0); + auto vec_stride = vec.stride(0); + + // Check for contiguity of `vec` and update `vec_stride` accordingly + const auto vec_contiguous = vec_stride == 0 ? 
vec.contiguous() : vec; + vec_stride = vec_contiguous.stride(0); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "addmv_impl_cuda", [&] { + auto beta = beta_.to(); + auto alpha = alpha_.to(); + if (mat.stride(0) == 1 && mat.stride(1) >= std::max(1, mat.size(0))) { + at::cuda::blas::gemv('n', + mat.size(0), mat.size(1), alpha, mat.data_ptr(), mat.stride(1), vec_contiguous.data_ptr(), + vec_stride, beta, result.data_ptr(), r_stride); + } + else if (mat.stride(1) == 1 && mat.stride(0) >= std::max(1, mat.size(1))) { + at::cuda::blas::gemv('t', + mat.size(1), mat.size(0), alpha, mat.data_ptr(), mat.stride(0), + vec_contiguous.data_ptr(), vec_stride, beta, result.data_ptr(), r_stride); + } + else { + Tensor cmat = mat.contiguous(); + at::cuda::blas::gemv('t', + mat.size(1), mat.size(0), alpha, cmat.data_ptr(), cmat.stride(0), + vec_contiguous.data_ptr(), vec_stride, beta, result.data_ptr(), r_stride); + } + }); } - }); - return result; + } } }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 897be42b8ab8..c0a342415eb8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -378,24 +378,18 @@ CompositeExplicitAutograd: add_ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + structured_delegate: addmv.out variants: function, method - dispatch: - CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + structured_delegate: addmv.out variants: function, method - dispatch: - CPU, CUDA: addmv_ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU, CUDA: addmv_out - - -- func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) 
- dispatch: - CPU: addmv_impl_cpu - CUDA: addmv_impl_cuda + CPU: addmv_out_cpu + CUDA: addmv_out_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: function, method diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index b0877b542e0a..4387aac9ae4d 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -64,19 +64,6 @@ ("aten::fake_quantize_per_tensor_affine_backward", datetime.date(2021, 2, 20)), ("aten::fake_quantize_per_channel_affine_backward", datetime.date(2021, 2, 20)), ("aten::rowwise_prune", datetime.date(9999, 1, 1)), - ("aten::_foreach_mul_", datetime.date(2021, 4, 2)), - ("aten::_foreach_addcdiv_", datetime.date(2021, 4, 2)), - ("aten::_foreach_div", datetime.date(2021, 4, 2)), - ("aten::_foreach_addcmul_", datetime.date(2021, 4, 2)), - ("aten::_foreach_sub", datetime.date(2021, 4, 2)), - ("aten::_foreach_add", datetime.date(2021, 4, 2)), - ("aten::_foreach_sub_", datetime.date(2021, 4, 2)), - ("aten::_foreach_add_", datetime.date(2021, 4, 2)), - ("aten::_foreach_mul", datetime.date(2021, 4, 2)), - ("aten::_foreach_div_", datetime.date(2021, 4, 2)), - ("aten::_foreach_addcdiv", datetime.date(2021, 4, 2)), - ("aten::_foreach_addcmul", datetime.date(2021, 4, 2)), - ("aten::mkldnn_linear", datetime.date(2021, 3, 2)), ("aten::_mode*", datetime.date(2021, 5, 2)), ("aten::linalg_multi_dot", datetime.date(2021, 3, 25)), ("aten::coalesce", datetime.date(2021, 4, 15)), @@ -87,6 +74,7 @@ ("aten::assert_async", datetime.date(2021, 5, 1)), ("aten::cumprod_backward", datetime.date(2021, 5, 1)), ("aten::_triangular_solve_helper", datetime.date(9999, 1, 1)), + ("aten::_addmv_impl_", datetime.date(2021, 5, 15)), ("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)), ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)), ] From ff1498e668d9f22920e23ba4b53242976cdd655c Mon Sep 17 00:00:00 2001 From: Chao Kong Date: Thu, 15 Apr 2021 15:53:07 -0700 Subject: [PATCH 25/45] Add cost inference for MulGradient operator Summary: Add cost inference for MulGradient operator; also whitelist MulGradient in COMPUTE_OP_TYPES in dense_perf_estimation Test Plan: buck run //caffe2/caffe2/python/operator_test:elementwise_ops_test Reviewed By: CrazySherman Differential Revision: D27614003 fbshipit-source-id: 30901e5e2b6ce7e2183c2362d1bf9f895046cf55 --- caffe2/operators/elementwise_ops_schema.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/elementwise_ops_schema.cc b/caffe2/operators/elementwise_ops_schema.cc index 676e08155765..7bf88853869d 100644 --- a/caffe2/operators/elementwise_ops_schema.cc +++ b/caffe2/operators/elementwise_ops_schema.cc @@ -320,7 +320,8 @@ OPERATOR_SCHEMA(MulGradient) .NumInputs(3) .NumOutputs(2) .TensorInferenceFunction(ElementwiseGradientOpShapeInference) - .AllowInplace({{0, 0}, {0, 1}}); + .AllowInplace({{0, 0}, {0, 1}}) + .CostInferenceFunction(PointwiseCostInference<2>); OPERATOR_SCHEMA(Div) .NumInputs(2) From 3c4e1cd141002a6f70c15062845ec63414c1d7e8 Mon Sep 17 00:00:00 2001 From: Horace He Date: Thu, 15 Apr 2021 15:54:35 -0700 Subject: [PATCH 26/45] remove annoying warnings from common_nn.py (#55982) Summary: ^^ Pull Request resolved: https://github.com/pytorch/pytorch/pull/55982 Reviewed By: mruberry Differential Revision: D27776380 Pulled By: Chillee fbshipit-source-id: 
22b3a8de73416821bed56b75b68dca1c33a21250 --- torch/testing/_internal/common_nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index fd474fb9905b..436d4aad0daa 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -1192,7 +1192,7 @@ def multimarginloss_weights_no_reduce_test(): def fractional_max_pool2d_test(test_case): - random_samples = torch.DoubleTensor(1, 3, 2).uniform_() + random_samples = torch.empty((1, 3, 2), dtype=torch.double).uniform_() if test_case == 'ratio': return dict( constructor=lambda: nn.FractionalMaxPool2d( @@ -1228,7 +1228,7 @@ def fractional_max_pool2d_test(test_case): def fractional_max_pool3d_test(test_case): - random_samples = torch.DoubleTensor(2, 4, 3).uniform_() + random_samples = torch.empty((2, 4, 3), dtype=torch.double).uniform_() if test_case == 'ratio': return dict( constructor=lambda: nn.FractionalMaxPool3d( From cfc97162469d759231cf65efee17b0adaeb6c825 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 15 Apr 2021 15:55:49 -0700 Subject: [PATCH 27/45] Change all unary functions stubs to use TensorIteratorBase& (#56078) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56078 This is in preparation for making all unary functions structured. I don't actually have to make them structured yet as TensorIterator& casts to TensorIteratorBase& Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D27777768 Pulled By: ezyang fbshipit-source-id: 05a3a95f200698eef72c5c74fff85fe881e1c4a3 --- aten/src/ATen/native/Distributions.cpp | 22 ++--- aten/src/ATen/native/TensorFactories.cpp | 1 + aten/src/ATen/native/TensorFactories.h | 2 - aten/src/ATen/native/UnaryOps.h | 46 +++++------ .../ATen/native/cpu/DistributionTemplates.h | 32 ++++---- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 82 +++++++++---------- aten/src/ATen/native/cuda/AbsKernel.cu | 2 +- .../native/cuda/DistributionCauchyKernel.cu | 2 +- .../cuda/DistributionExponentialKernel.cu | 2 +- .../cuda/DistributionGeometricKernel.cu | 2 +- .../cuda/DistributionLogNormalKernel.cu | 2 +- .../native/cuda/DistributionRandomKernel.cu | 6 +- .../ATen/native/cuda/DistributionTemplates.h | 42 +++++----- .../ATen/native/cuda/DistributionUniform.cu | 2 +- .../ATen/native/cuda/UnaryComplexKernels.cu | 8 +- .../ATen/native/cuda/UnaryFractionKernels.cu | 12 +-- .../src/ATen/native/cuda/UnaryGammaKernels.cu | 8 +- .../ATen/native/cuda/UnaryGeometricKernels.cu | 14 ++-- aten/src/ATen/native/cuda/UnaryLogKernels.cu | 8 +- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 20 ++--- aten/src/ATen/native/cuda/UnarySignKernels.cu | 10 +-- .../ATen/native/cuda/UnarySpecialOpsKernel.cu | 18 ++-- 22 files changed, 171 insertions(+), 172 deletions(-) diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 6ca7ae1e1d46..135775afcd75 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -168,7 +168,7 @@ Tensor& bernoulli_(Tensor& self, double p, c10::optional gen) { template struct LogNormalStub { - void operator()(TensorIterator& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { log_normal_stub(iter.device_type(), iter, mean, std, gen); } }; @@ -181,7 +181,7 @@ Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional struct CauchyStub 
{ - void operator()(TensorIterator& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { cauchy_stub(iter.device_type(), iter, median, sigma, gen); } }; @@ -194,7 +194,7 @@ Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional struct ExponentialStub { - void operator()(TensorIterator& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { exponential_stub(iter.device_type(), iter, lambda, gen); } }; @@ -207,7 +207,7 @@ Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) template struct GeometricStub { - void operator()(TensorIterator& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { geometric_stub(iter.device_type(), iter, p, gen); } }; @@ -220,7 +220,7 @@ Tensor& geometric_(Tensor& self, double p, c10::optional gen) { template struct UniformStub { - void operator()(TensorIterator& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { uniform_stub(iter.device_type(), iter, from, to, gen); } }; @@ -228,7 +228,7 @@ struct UniformStub { template struct UniformMeta { // No-op! - void operator()(TensorIterator& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { } }; @@ -286,7 +286,7 @@ Tensor normal(const Tensor& mean, const Tensor& std, c10::optional ge template struct RandomStub { - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { random_stub(iter.device_type(), iter, gen); } }; @@ -297,10 +297,10 @@ Tensor& random_(Tensor& self, c10::optional gen) { template struct RandomFromToStub { - void operator()(TensorIterator& iter, uint64_t range, int64_t from, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, c10::optional gen) { random_from_to_stub(iter.device_type(), iter, range, from, gen); } - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { random_full_64_bits_range_stub(iter.device_type(), iter, gen); } }; @@ -308,9 +308,9 @@ struct RandomFromToStub { template struct RandomFromToMeta { // No-op! 
- void operator()(TensorIterator& iter, uint64_t range, int64_t from, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, c10::optional gen) { } - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { } }; diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index eb800b5f4653..6a6bb2d4561d 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index d5943ac55ae5..a844779349dc 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -90,7 +90,5 @@ using binary_fn = void (*)(TensorIterator&); DECLARE_DISPATCH(binary_fn, complex_stub); DECLARE_DISPATCH(binary_fn, polar_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t, const double), kaiser_window_stub); - } // namespace native } // namespace at diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 69bb0bf1676f..71126530d339 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -5,13 +5,10 @@ #include #include -namespace at { struct TensorIterator; } - namespace at { namespace native { -using unary_fn = void(*)(TensorIterator&); -using unary_fn_with_scalar = void(*)(TensorIterator&, const Scalar& a); -using structured_unary_fn = void(*)(TensorIteratorBase&); +using unary_fn = void(*)(TensorIteratorBase&); +using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a); DECLARE_DISPATCH(unary_fn, abs_stub); DECLARE_DISPATCH(unary_fn, angle_stub); @@ -19,7 +16,7 @@ DECLARE_DISPATCH(unary_fn, real_stub); DECLARE_DISPATCH(unary_fn, imag_stub); DECLARE_DISPATCH(unary_fn, conj_stub); DECLARE_DISPATCH(unary_fn, acos_stub); -DECLARE_DISPATCH(structured_unary_fn, acosh_stub); +DECLARE_DISPATCH(unary_fn, acosh_stub); DECLARE_DISPATCH(unary_fn, asinh_stub); DECLARE_DISPATCH(unary_fn, atanh_stub); DECLARE_DISPATCH(unary_fn, asin_stub); @@ -29,8 +26,8 @@ DECLARE_DISPATCH(unary_fn, logical_not_stub); DECLARE_DISPATCH(unary_fn, ceil_stub); DECLARE_DISPATCH(unary_fn_with_scalar, clamp_max_stub); DECLARE_DISPATCH(unary_fn_with_scalar, clamp_min_stub); -DECLARE_DISPATCH(structured_unary_fn, cos_stub); -DECLARE_DISPATCH(structured_unary_fn, cosh_stub); +DECLARE_DISPATCH(unary_fn, cos_stub); +DECLARE_DISPATCH(unary_fn, cosh_stub); DECLARE_DISPATCH(unary_fn, digamma_stub); DECLARE_DISPATCH(unary_fn, entr_stub); DECLARE_DISPATCH(unary_fn, erf_stub); @@ -43,7 +40,7 @@ DECLARE_DISPATCH(unary_fn, floor_stub); DECLARE_DISPATCH(unary_fn, frac_stub); DECLARE_DISPATCH(unary_fn, frexp_stub); DECLARE_DISPATCH(unary_fn, i0_stub); -DECLARE_DISPATCH(structured_unary_fn, i0e_stub); +DECLARE_DISPATCH(unary_fn, i0e_stub); DECLARE_DISPATCH(unary_fn, log_stub); DECLARE_DISPATCH(unary_fn, log10_stub); DECLARE_DISPATCH(unary_fn, log1p_stub); @@ -58,9 +55,9 @@ DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); DECLARE_DISPATCH(unary_fn, sgn_stub); -DECLARE_DISPATCH(structured_unary_fn, sin_stub); -DECLARE_DISPATCH(structured_unary_fn, sinc_stub); -DECLARE_DISPATCH(structured_unary_fn, sinh_stub); +DECLARE_DISPATCH(unary_fn, sin_stub); +DECLARE_DISPATCH(unary_fn, sinc_stub); +DECLARE_DISPATCH(unary_fn, 
sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); DECLARE_DISPATCH(unary_fn, tan_stub); DECLARE_DISPATCH(unary_fn, tanh_stub); @@ -68,25 +65,28 @@ DECLARE_DISPATCH(unary_fn, trigamma_stub); DECLARE_DISPATCH(unary_fn, trunc_stub); DECLARE_DISPATCH(unary_fn, lgamma_stub); +// NB: these are actually defined in Distribution DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional), bernoulli_tensor_stub); DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional), bernoulli_scalar_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const double, const double, c10::optional), cauchy_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const double, c10::optional), exponential_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const double, c10::optional), geometric_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const double, const double, c10::optional), log_normal_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const double, const double, c10::optional), uniform_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional), normal_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_full_64_bits_range_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t), polygamma_stub); -DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar& a, const Scalar& b), clamp_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); + +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub); DECLARE_DISPATCH( void (*)(Tensor&, const Tensor&, int64_t, c10::optional), multinomial_with_replacement_stub); DECLARE_DISPATCH( void (*)( - TensorIterator&, + TensorIteratorBase&, c10::optional, c10::optional, c10::optional), diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 7e59ec2d9f4f..fdbb67227913 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -22,7 +22,7 @@ namespace { // ==================================================== Random ======================================================== template -void random_from_to_kernel(TensorIterator& iter, uint64_t range, int64_t base, RNG generator) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) { AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, 
iter.dtype(), "random_from_to_kernel_cpu", [&] { std::lock_guard lock(generator->mutex_); cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t { @@ -36,7 +36,7 @@ void random_from_to_kernel(TensorIterator& iter, uint64_t range, int64_t base, R // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) template -void random_full_64_bits_range_kernel(TensorIterator& iter, RNG generator) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) { AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] { std::lock_guard lock(generator->mutex_); if (std::is_same::value || @@ -55,16 +55,16 @@ void random_full_64_bits_range_kernel(TensorIterator& iter, RNG generator) { template struct RandomFromToKernel { - void operator()(TensorIterator& iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; template -void random_kernel(TensorIterator& iter, RNG generator) { +void random_kernel(TensorIteratorBase& iter, RNG generator) { std::lock_guard lock(generator->mutex_); AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] { cpu_serial_kernel(iter, [generator]() -> scalar_t { @@ -76,7 +76,7 @@ void random_kernel(TensorIterator& iter, RNG generator) { template struct RandomKernel { - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { random_kernel(iter, check_generator(gen)); } }; @@ -205,7 +205,7 @@ struct NormalKernel { // ==================================================== Uniform ======================================================= template -void uniform_kernel(TensorIterator& iter, double from_, double to_, RNG generator) { +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) { AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() { std::lock_guard lock(generator->mutex_); auto from = static_cast(from_); @@ -219,7 +219,7 @@ void uniform_kernel(TensorIterator& iter, double from_, double to_, RNG generato template struct UniformKernel { - void operator()(TensorIterator& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -227,7 +227,7 @@ struct UniformKernel { // ==================================================== Cauchy ======================================================== template -void cauchy_kernel(TensorIterator& iter, double median, double sigma, RNG generator) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) { AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() { std::lock_guard lock(generator->mutex_); at::cauchy_distribution cauchy(median, sigma); @@ -239,7 +239,7 @@ void cauchy_kernel(TensorIterator& iter, double median, double sigma, RNG genera template struct CauchyKernel { - void operator()(TensorIterator& iter, double median, double sigma, 
c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -247,7 +247,7 @@ struct CauchyKernel { // ================================================== LogNormal ======================================================= template -void log_normal_kernel(TensorIterator& iter, double mean, double std, RNG generator) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() { std::lock_guard lock(generator->mutex_); at::lognormal_distribution logNormal(mean, std); @@ -259,7 +259,7 @@ void log_normal_kernel(TensorIterator& iter, double mean, double std, RNG genera template struct LogNormalKernel { - void operator()(TensorIterator& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; @@ -267,7 +267,7 @@ struct LogNormalKernel { // =================================================== Geometric ====================================================== template -void geometric_kernel(TensorIterator& iter, double p, RNG generator) { +void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() { std::lock_guard lock(generator->mutex_); at::geometric_distribution geometric(p); @@ -279,7 +279,7 @@ void geometric_kernel(TensorIterator& iter, double p, RNG generator) { template struct GeometricKernel { - void operator()(TensorIterator& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -287,7 +287,7 @@ struct GeometricKernel { // ================================================== Exponential ===================================================== template -void exponential_kernel(TensorIterator& iter, double lambda, RNG generator) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() { std::lock_guard lock(generator->mutex_); at::exponential_distribution exponential(lambda); @@ -299,7 +299,7 @@ void exponential_kernel(TensorIterator& iter, double lambda, RNG generator) { template struct ExponentialKernel { - void operator()(TensorIterator& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 7a3fc4b066f7..093dbe88f34b 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -34,7 +34,7 @@ namespace { using namespace vec256; -static void sigmoid_kernel(TensorIterator& iter) { +static void sigmoid_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), "sigmoid_cpu", [&]() { cpu_kernel_vec( iter, @@ -74,7 +74,7 @@ void VmlLog(int64_t N, const double* X, double* Y) { } template -void LogitMKLKernel(T eps, TensorIterator* it) { +void LogitMKLKernel(T eps, TensorIteratorBase* 
it) { if (!it->can_use_32bit_indexing()) { for (auto& sub_it : it->with_32bit_indexing()) { LogitMKLKernel(eps, &sub_it); @@ -111,13 +111,13 @@ void LogitMKLKernel(T eps, TensorIterator* it) { #else template -void LogitMKLKernel(T eps, TensorIterator* it) { +void LogitMKLKernel(T eps, TensorIteratorBase* it) { TORCH_CHECK(false, "ATen not compiled with MKL"); } #endif // AT_MKL_ENABLED -void logit_kernel(TensorIterator& iter, const Scalar& eps_scalar) { +void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { AT_DISPATCH_FLOATING_TYPES_AND( kBFloat16, iter.common_dtype(), "logit_cpu", [&]() { const scalar_t eps = eps_scalar.to(); @@ -157,7 +157,7 @@ void logit_kernel(TensorIterator& iter, const Scalar& eps_scalar) { }); } -static void abs_kernel(TensorIterator& iter) { +static void abs_kernel(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "abs_cpu", [&]() { cpu_kernel_vec( iter, @@ -166,7 +166,7 @@ static void abs_kernel(TensorIterator& iter) { }); } -static void angle_kernel(TensorIterator& iter) { +static void angle_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "angle_cpu", [&]() { cpu_kernel_vec( iter, @@ -175,7 +175,7 @@ static void angle_kernel(TensorIterator& iter) { }); } -static void real_kernel(TensorIterator& iter) { +static void real_kernel(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "real_cpu", [&]() { cpu_kernel_vec( iter, @@ -184,7 +184,7 @@ static void real_kernel(TensorIterator& iter) { }); } -static void imag_kernel(TensorIterator& iter) { +static void imag_kernel(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "imag_cpu", [&]() { cpu_kernel_vec( iter, @@ -193,7 +193,7 @@ static void imag_kernel(TensorIterator& iter) { }); } -static void conj_kernel(TensorIterator& iter) { +static void conj_kernel(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cpu", [&]() { cpu_kernel_vec( @@ -203,7 +203,7 @@ static void conj_kernel(TensorIterator& iter) { }); } -static void bitwise_not_kernel(TensorIterator& iter) { +static void bitwise_not_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Bool) { // Boolean type does not work with ~ (bitwise NOT) in C++. bitwise_not wraps this operation for both Boolean and // integral types. @@ -226,7 +226,7 @@ static void bitwise_not_kernel(TensorIterator& iter) { } } -static void frac_kernel(TensorIterator& iter) { +static void frac_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "frac_cpu", [&]() { cpu_kernel_vec( iter, @@ -235,7 +235,7 @@ static void frac_kernel(TensorIterator& iter) { }); } -static void logical_not_kernel(TensorIterator& iter) { +static void logical_not_kernel(TensorIteratorBase& iter) { // NOTE: this implementation differs from the CUDA implementation which only does single dispatch // (to avoid expensive compilation) because CPU kernels don't handle dynamic_casting // (see needs_dynamic_casting). 
@@ -247,7 +247,7 @@ static void logical_not_kernel(TensorIterator& iter) { }); } -static void reciprocal_kernel(TensorIterator& iter) { +static void reciprocal_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "reciprocal_cpu", [&]() { cpu_kernel_vec( iter, @@ -256,7 +256,7 @@ static void reciprocal_kernel(TensorIterator& iter) { }); } -static void neg_kernel(TensorIterator& iter) { +static void neg_kernel(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "neg_cpu", [&]() { cpu_kernel_vec( iter, @@ -265,7 +265,7 @@ static void neg_kernel(TensorIterator& iter) { }); } -static void sign_kernel(TensorIterator& iter){ +static void sign_kernel(TensorIteratorBase& iter){ if(iter.dtype() == ScalarType::Bool){ cpu_kernel(iter, [=](bool x) -> bool { return x; }); } else { @@ -288,13 +288,13 @@ static void sign_kernel(TensorIterator& iter){ } } -static void signbit_kernel(TensorIterator& iter){ +static void signbit_kernel(TensorIteratorBase& iter){ AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a < 0; }); }); } -static void sgn_kernel(TensorIterator& iter){ +static void sgn_kernel(TensorIteratorBase& iter){ AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cpu", [&]() { cpu_kernel_vec( iter, @@ -344,7 +344,7 @@ static void acosh_kernel(TensorIteratorBase& iter) { }); } -static void asinh_kernel(TensorIterator& iter) { +static void asinh_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "asinh_cpu", [&]() { cpu_kernel( iter, @@ -352,7 +352,7 @@ static void asinh_kernel(TensorIterator& iter) { }); } -static void atanh_kernel(TensorIterator& iter) { +static void atanh_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "atanh_cpu", [&]() { cpu_kernel( iter, @@ -360,7 +360,7 @@ static void atanh_kernel(TensorIterator& iter) { }); } -static void digamma_kernel(TensorIterator& iter) { +static void digamma_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "digamma", [&]() { cpu_kernel( iter, @@ -368,7 +368,7 @@ static void digamma_kernel(TensorIterator& iter) { }); } -static void trigamma_kernel(TensorIterator& iter) { +static void trigamma_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "trigamma", [&]() { cpu_kernel( iter, @@ -376,7 +376,7 @@ static void trigamma_kernel(TensorIterator& iter) { }); } -static void exp2_kernel(TensorIterator& iter) { +static void exp2_kernel(TensorIteratorBase& iter) { // Supports only floating types as std::exp2 doesn't have // complex overloads. 
AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "exp2", [&]() { @@ -386,7 +386,7 @@ static void exp2_kernel(TensorIterator& iter) { }); } -static void polygamma_kernel(TensorIterator& iter, int64_t n) { +static void polygamma_kernel(TensorIteratorBase& iter, int64_t n) { if (n == 0) { digamma_kernel(iter); } else if (n == 1) { @@ -400,7 +400,7 @@ static void polygamma_kernel(TensorIterator& iter, int64_t n) { } static void nan_to_num_kernel( - TensorIterator& iter, + TensorIteratorBase& iter, c10::optional nan, c10::optional pos_inf, c10::optional neg_inf) { @@ -426,7 +426,7 @@ static void nan_to_num_kernel( }); } -static void clamp_kernel(TensorIterator& iter, const Scalar& min_scalar, const Scalar& max_scalar) { +static void clamp_kernel(TensorIteratorBase& iter, const Scalar& min_scalar, const Scalar& max_scalar) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.dtype(), "clamp_cpu", [&]() { auto min = min_scalar.to(); auto max = max_scalar.to(); @@ -438,7 +438,7 @@ static void clamp_kernel(TensorIterator& iter, const Scalar& min_scalar, const S }); } -static void clamp_max_kernel(TensorIterator& iter, const Scalar& max_scalar) { +static void clamp_max_kernel(TensorIteratorBase& iter, const Scalar& max_scalar) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.dtype(), "clamp_max_cpu", [&]() { auto max = max_scalar.to(); auto max_vec = Vec256(max); @@ -448,7 +448,7 @@ static void clamp_max_kernel(TensorIterator& iter, const Scalar& max_scalar) { }); } -static void clamp_min_kernel(TensorIterator& iter, const Scalar& min_scalar) { +static void clamp_min_kernel(TensorIteratorBase& iter, const Scalar& min_scalar) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.dtype(), "clamp_min_cpu", [&]() { auto min = min_scalar.to(); auto min_vec = Vec256(min); @@ -458,7 +458,7 @@ static void clamp_min_kernel(TensorIterator& iter, const Scalar& min_scalar) { }); } -static void kaiser_window_kernel(TensorIterator& iter, int64_t window_length, double beta){ +static void kaiser_window_kernel(TensorIteratorBase& iter, int64_t window_length, double beta){ AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "kaiser_window_cpu", [&](){ const scalar_t alpha = static_cast((window_length - 1) / 2.0); cpu_kernel(iter, [=](scalar_t a){ @@ -467,7 +467,7 @@ static void kaiser_window_kernel(TensorIterator& iter, int64_t window_length, do }); } -static void cauchy_kernel(TensorIterator& iter, double median, double sigma, c10::optional gen) { +static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } @@ -544,22 +544,22 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional ge } #endif -static void exponential_kernel(TensorIterator& iter, double lambda, c10::optional gen) { +static void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } -static void geometric_kernel(TensorIterator& iter, double p, c10::optional gen) { +static void geometric_kernel(TensorIteratorBase& iter, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } -static void log_normal_kernel(TensorIterator& iter, double mean, double std, 
c10::optional gen) { +static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } -void uniform_kernel(TensorIterator& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::uniform_kernel(iter, from, to, generator); } @@ -569,12 +569,12 @@ void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, base, generator); } -static void random_kernel(TensorIterator& iter, c10::optional gen) { +static void random_kernel(TensorIteratorBase& iter, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -582,12 +582,12 @@ static void random_kernel(TensorIterator& iter, c10::optional gen) { // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) -static void random_full_64_bits_range_kernel(TensorIterator& iter, c10::optional gen) { +static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } -static void rsqrt_kernel(TensorIterator& iter) { +static void rsqrt_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "rsqrt_cpu", [&] { cpu_kernel_vec( iter, @@ -598,7 +598,7 @@ static void rsqrt_kernel(TensorIterator& iter) { }); } -static void entr_kernel(TensorIterator& iter) { +static void entr_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND( kBFloat16, iter.common_dtype(), "entr_cpu", [&] { cpu_kernel(iter, [](scalar_t x) -> scalar_t { @@ -614,7 +614,7 @@ static void entr_kernel(TensorIterator& iter) { }); } -static void frexp_kernel(TensorIterator& iter) { +static void frexp_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(kHalf, // The iter.dtype() here is the dtype of mantissa output. // It's a floating point type and must be the same as the input's dtype. 
@@ -668,7 +668,7 @@ static void i0e_kernel(TensorIteratorBase& iter) { } #define IMPLEMENT_FLOAT_KERNEL(op) \ - static void op##_kernel(TensorIterator& iter) { \ + static void op##_kernel(TensorIteratorBase& iter) { \ TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \ AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), #op "_vml_cpu", [&]() { \ iter.serial_for_each( \ @@ -679,7 +679,7 @@ static void i0e_kernel(TensorIteratorBase& iter) { REGISTER_DISPATCH(op##_stub, &op##_kernel) #define IMPLEMENT_COMPLEX_KERNEL(op) \ - static void op##_kernel(TensorIterator& iter) { \ + static void op##_kernel(TensorIteratorBase& iter) { \ TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \ AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), #op "_vml_cpu", [&]() { \ iter.serial_for_each( \ diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 649b235bf654..926f58a1ac8f 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -13,7 +13,7 @@ struct AbsFunctor { } }; -void abs_kernel_cuda(TensorIterator& iter) { +void abs_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() { gpu_kernel(iter, AbsFunctor()); }); diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index 6ad0fbe869cc..35a1e6ef5a98 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -29,7 +29,7 @@ namespace at { namespace native { -void cauchy_kernel(TensorIterator& iter, double median, double sigma, c10::optional gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::cauchy_kernel(iter, median, sigma, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 2f99ba4c6cc1..b4cf288bcb7b 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -29,7 +29,7 @@ namespace at { namespace native { -void exponential_kernel(TensorIterator& iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::exponential_kernel(iter, lambda, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 8d66c626a93a..eb71ab3231f1 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -29,7 +29,7 @@ namespace at { namespace native { -void geometric_kernel(TensorIterator& iter, double p_, c10::optional gen) { +void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::geometric_kernel(iter, p_, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index 90c7cce1d20d..89b9c04b3a68 100644 --- 
a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -29,7 +29,7 @@ namespace at { namespace native { -void log_normal_kernel(TensorIterator& iter, double mean, double std, c10::optional gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::log_normal_kernel(iter, mean, std, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index e6ffb56cc016..8d6614b9010d 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -29,17 +29,17 @@ namespace at { namespace native { -void random_from_to_kernel(TensorIterator& iter, uint64_t range, int64_t base, c10::optional gen_) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_from_to_kernel(iter, range, base, gen); } -void random_full_64_bits_range_kernel(TensorIterator& iter, c10::optional gen_) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_full_64_bits_range_kernel(iter, gen); } -void random_kernel(TensorIterator& iter, c10::optional gen_) { +void random_kernel(TensorIteratorBase& iter, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_kernel(iter, gen); } diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index fcc6e401e82f..953a834887a4 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -111,7 +111,7 @@ template -void distribution_nullary_kernel(at::TensorIterator& iter, +void distribution_nullary_kernel(at::TensorIteratorBase& iter, RNG gen, const dist_t& dist_func, const transform_t transform_func) { @@ -282,7 +282,7 @@ namespace cuda { // ==================================================== Random ======================================================== template -void random_from_to_kernel(TensorIterator& iter, uint64_t range, int64_t base, RNG gen) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_from_to_kernel_cuda", [&] { if (( std::is_same::value || @@ -322,7 +322,7 @@ void random_from_to_kernel(TensorIterator& iter, uint64_t range, int64_t base, R // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) template -void random_full_64_bits_range_kernel(TensorIterator& iter, RNG gen) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] { if (std::is_same::value || std::is_same::value || @@ -349,16 +349,16 @@ void random_full_64_bits_range_kernel(TensorIterator& iter, RNG gen) { template struct RandomFromToKernel { - void operator()(TensorIterator& 
iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIterator& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, c10::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; template -void random_kernel(TensorIterator& iter, RNG gen) { +void random_kernel(TensorIteratorBase& iter, RNG gen) { AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cuda", [&] { if (std::is_same::value || std::is_same::value) { auto random_func = [] __device__ (uint64_t rand) { @@ -389,7 +389,7 @@ void random_kernel(TensorIterator& iter, RNG gen) { template struct RandomKernel { - void operator()(TensorIterator& iter, RNG gen) { + void operator()(TensorIteratorBase& iter, RNG gen) { random_kernel(iter, gen); } }; @@ -397,7 +397,7 @@ struct RandomKernel { // ==================================================================================================================== template -void uniform_and_transform(TensorIterator& iter, RNG gen, transform_t transform) { +void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { if (std::is_same::value) { distribution_nullary_kernel(iter, gen, @@ -412,7 +412,7 @@ void uniform_and_transform(TensorIterator& iter, RNG gen, transform_t transform) } template -void normal_and_transform(TensorIterator& iter, RNG gen, transform_t transform) { +void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { if (std::is_same::value) { distribution_nullary_kernel(iter, gen, @@ -453,7 +453,7 @@ struct NormalKernel { // ==================================================== Uniform ======================================================== template -void uniform_kernel(TensorIterator& iter, double from_, double to_, RNG gen) { +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] { auto from = static_cast(from_); auto to = static_cast(to_); @@ -475,7 +475,7 @@ void uniform_kernel(TensorIterator& iter, double from_, double to_, RNG gen) { template struct UniformKernel { - void operator()(TensorIterator& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -483,7 +483,7 @@ struct UniformKernel { // ================================================== LogNormal ======================================================= template -void log_normal_kernel(TensorIterator& iter, double mean_, double std_, RNG gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] { using accscalar_t = at::acc_type; auto mean = static_cast(mean_); @@ -498,7 +498,7 @@ void log_normal_kernel(TensorIterator& iter, double mean_, double std_, RNG gen) template struct LogNormalKernel { - void operator()(TensorIterator& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; 
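The signature changes running through these hunks all rest on the same C++ property: `TensorIterator` derives from `TensorIteratorBase`, so widening a kernel parameter from `TensorIterator&` to `TensorIteratorBase&` keeps every existing call site compiling while also accepting iterators built by structured meta functions. A minimal stand-alone sketch of that conversion, using placeholder types rather than the real ATen classes (`some_kernel` is hypothetical):

```
// Stand-ins for the real ATen types; only the inheritance relationship matters here.
struct TensorIteratorBase { /* shared iteration state and methods */ };
struct TensorIterator : TensorIteratorBase { /* construction helpers used by eager call sites */ };

// After the port, kernels accept the base class by reference.
void some_kernel(TensorIteratorBase& iter) { (void)iter; }

int main() {
  TensorIterator it;   // existing call sites still build the derived type
  some_kernel(it);     // and bind implicitly via the derived-to-base reference conversion
  return 0;
}
```

Because the conversion is implicit, only the declarations in these files need to change; the call sites do not.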
@@ -506,7 +506,7 @@ struct LogNormalKernel { // =================================================== Geometric ====================================================== template -void geometric_kernel(TensorIterator& iter, double p, RNG gen) { +void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] { using accscalar_t = at::DiscreteDistributionType::type; // define lambda for geometric transformation @@ -519,7 +519,7 @@ void geometric_kernel(TensorIterator& iter, double p, RNG gen) { template struct GeometricKernel { - void operator()(TensorIterator& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -527,7 +527,7 @@ struct GeometricKernel { // ================================================== Exponential ===================================================== template -void exponential_kernel(TensorIterator& iter, double lambda_, RNG gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] { using accscalar_t = at::acc_type; auto lambda = static_cast(lambda_); @@ -541,7 +541,7 @@ void exponential_kernel(TensorIterator& iter, double lambda_, RNG gen) { template struct ExponentialKernel { - void operator()(TensorIterator& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; @@ -549,7 +549,7 @@ struct ExponentialKernel { // ==================================================== Cauchy ======================================================== template -void cauchy_kernel(TensorIterator& iter, double median_, double sigma_, RNG gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] { using accscalar_t = at::acc_type; auto median = static_cast(median_); @@ -564,7 +564,7 @@ void cauchy_kernel(TensorIterator& iter, double median_, double sigma_, RNG gen) template struct CauchyKernel { - void operator()(TensorIterator& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -637,7 +637,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { } template -void bernoulli_kernel(TensorIterator& iter, double p, RNG gen) { +void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { AT_DISPATCH_ALL_TYPES_AND3( at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] { using accscalar_t = at::DiscreteDistributionType::type; @@ -651,7 +651,7 @@ void bernoulli_kernel(TensorIterator& iter, double p, RNG gen) { template struct BernoulliKernel { - void operator()(TensorIterator& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu index 
b33c75a71f52..8cbd5ce36c51 100644 --- a/aten/src/ATen/native/cuda/DistributionUniform.cu +++ b/aten/src/ATen/native/cuda/DistributionUniform.cu @@ -7,7 +7,7 @@ namespace at { namespace native { -void uniform_kernel(TensorIterator& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); templates::cuda::uniform_kernel(iter, from, to, generator); } diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu index 6afda194d37a..7fb9acd37fcc 100644 --- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu @@ -23,7 +23,7 @@ __host__ __device__ static inline c10::complex angle_wrapper(c10::complex return std::arg(v); } -void angle_kernel_cuda(TensorIterator& iter) { +void angle_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "angle_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return angle_wrapper(a); @@ -42,7 +42,7 @@ __host__ __device__ static inline c10::complex real_wrapper(c10::complex v return v.real(); } -void real_kernel_cuda(TensorIterator& iter) { +void real_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "real_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return real_wrapper(a); @@ -61,7 +61,7 @@ __host__ __device__ static inline c10::complex imag_wrapper(c10::complex v return v.imag(); } -void imag_kernel_cuda(TensorIterator& iter) { +void imag_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "imag_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return imag_wrapper(a); @@ -80,7 +80,7 @@ __host__ __device__ static inline c10::complex conj_wrapper(c10::complex v return std::conj(v); } -void conj_kernel_cuda(TensorIterator& iter) { +void conj_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { diff --git a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu index f147ae788461..06655d1bba0e 100644 --- a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu @@ -21,7 +21,7 @@ __host__ __device__ static inline std::complex ceil_wrapper(std::complex v return std::complex(std::ceil(v.real()), std::ceil(v.imag())); } -void ceil_kernel_cuda(TensorIterator& iter) { +void ceil_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "ceil_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ceil_wrapper(a); @@ -29,7 +29,7 @@ void ceil_kernel_cuda(TensorIterator& iter) { }); } -void frac_kernel_cuda(TensorIterator& iter) { +void frac_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "frac_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return a - ::trunc(a); @@ -48,7 +48,7 @@ __host__ __device__ static inline std::complex floor_wrapper(std::complex return std::complex(std::floor(v.real()), std::floor(v.imag())); } -void floor_kernel_cuda(TensorIterator& iter) { +void floor_kernel_cuda(TensorIteratorBase& iter) { 
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "floor_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return floor_wrapper(a); @@ -87,7 +87,7 @@ __host__ __device__ static inline c10::complex reciprocal_wrapper(c10::comple return one/v; } -void reciprocal_kernel_cuda(TensorIterator& iter) { +void reciprocal_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "reciprocal_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return reciprocal_wrapper(a); @@ -116,7 +116,7 @@ __host__ __device__ static inline c10::complex nearbyint_wrapper(c10::co } #pragma pop -void round_kernel_cuda(TensorIterator& iter) { +void round_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "round_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { // We do not use std::round because we would like to round midway numbers to the nearest even integer. @@ -143,7 +143,7 @@ __host__ __device__ static inline c10::complex trunc_wrapper(c10::comple return c10::complex(::trunc(static_cast(a.real())), ::trunc(static_cast(a.imag()))); } -void trunc_kernel_cuda(TensorIterator& iter) { +void trunc_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "trunc_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return trunc_wrapper(a); diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu index cdcf92e719d8..2f0341007e2f 100644 --- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu @@ -10,7 +10,7 @@ namespace at { namespace native { -void digamma_kernel_cuda(TensorIterator& iter) { +void digamma_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "digamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_digamma(a); @@ -18,7 +18,7 @@ void digamma_kernel_cuda(TensorIterator& iter) { }); } -void trigamma_kernel_cuda(TensorIterator& iter) { +void trigamma_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "trigamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_trigamma(a); @@ -26,7 +26,7 @@ void trigamma_kernel_cuda(TensorIterator& iter) { }); } -void polygamma_kernel_cuda(TensorIterator& iter, int64_t n) { +void polygamma_kernel_cuda(TensorIteratorBase& iter, int64_t n) { if (n == 0) { digamma_kernel_cuda(iter); } else if (n == 1) { @@ -40,7 +40,7 @@ void polygamma_kernel_cuda(TensorIterator& iter, int64_t n) { } } -void lgamma_kernel_cuda(TensorIterator& iter) { +void lgamma_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "lgamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::lgamma(a); diff --git a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu index e242c8dbc494..c31584be408b 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu @@ -10,7 +10,7 @@ namespace at { namespace native { -void acos_kernel_cuda(TensorIterator& iter) { +void acos_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "acos_cuda", [&]() { 
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::acos(a); @@ -18,7 +18,7 @@ void acos_kernel_cuda(TensorIterator& iter) { }); } -void asin_kernel_cuda(TensorIterator& iter) { +void asin_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "asin_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::asin(a); @@ -26,7 +26,7 @@ void asin_kernel_cuda(TensorIterator& iter) { }); } -void atan_kernel_cuda(TensorIterator& iter) { +void atan_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "atan_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::atan(a); @@ -66,7 +66,7 @@ void cosh_kernel_cuda(TensorIteratorBase& iter) { }); } -void tanh_kernel_cuda(TensorIterator& iter) { +void tanh_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "tanh_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::tanh(a); @@ -82,7 +82,7 @@ void acosh_kernel_cuda(TensorIteratorBase& iter) { }); } -void asinh_kernel_cuda(TensorIterator& iter) { +void asinh_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "asinh_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::asinh(a); @@ -90,7 +90,7 @@ void asinh_kernel_cuda(TensorIterator& iter) { }); } -void atanh_kernel_cuda(TensorIterator& iter) { +void atanh_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "atanh_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::atanh(a); @@ -98,7 +98,7 @@ void atanh_kernel_cuda(TensorIterator& iter) { }); } -void tan_kernel_cuda(TensorIterator& iter) { +void tan_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "tan_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::tan(a); diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu index 84edc13e14ae..7c08ec506342 100644 --- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu @@ -10,7 +10,7 @@ namespace at { namespace native { -void log_kernel_cuda(TensorIterator& iter) { +void log_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log(a); @@ -18,7 +18,7 @@ void log_kernel_cuda(TensorIterator& iter) { }); } -void log10_kernel_cuda(TensorIterator& iter) { +void log10_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log10(a); @@ -26,7 +26,7 @@ void log10_kernel_cuda(TensorIterator& iter) { }); } -void log1p_kernel_cuda(TensorIterator& iter) { +void log1p_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log1p_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return 
::log1p(a); @@ -34,7 +34,7 @@ void log1p_kernel_cuda(TensorIterator& iter) { }); } -void log2_kernel_cuda(TensorIterator& iter) { +void log2_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log2(a); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 153365c90119..0c9a897944ae 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -19,7 +19,7 @@ namespace at { namespace native { -void bitwise_not_kernel_cuda(TensorIterator& iter) { +void bitwise_not_kernel_cuda(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Bool) { gpu_kernel(iter, []GPU_LAMBDA(bool a) { return !a; @@ -33,7 +33,7 @@ void bitwise_not_kernel_cuda(TensorIterator& iter) { } } -void exp_kernel_cuda(TensorIterator& iter) { +void exp_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "exp_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::exp(a); @@ -41,7 +41,7 @@ void exp_kernel_cuda(TensorIterator& iter) { }); } -void expm1_kernel_cuda(TensorIterator& iter) { +void expm1_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "expm1_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::expm1(a); @@ -62,7 +62,7 @@ __host__ __device__ static inline c10::complex rsqrt_wrapper(c10::complex return one / ::sqrt(v); } -void rsqrt_kernel_cuda(TensorIterator& iter) { +void rsqrt_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "rsqrt_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { // In CUDA, ::rsqrt is overloaded for float and at::Half here is implicitly cast to float. 
@@ -71,7 +71,7 @@ void rsqrt_kernel_cuda(TensorIterator& iter) { }); } -void sqrt_kernel_cuda(TensorIterator& iter) { +void sqrt_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "sqrt_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::sqrt(a); @@ -79,7 +79,7 @@ void sqrt_kernel_cuda(TensorIterator& iter) { }); } -void clamp_kernel_cuda(TensorIterator& iter, const Scalar& min_value, const Scalar& max_value) { +void clamp_kernel_cuda(TensorIteratorBase& iter, const Scalar& min_value, const Scalar& max_value) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_cuda", [&]() { auto lower = min_value.to(); auto upper = max_value.to(); @@ -94,7 +94,7 @@ void clamp_kernel_cuda(TensorIterator& iter, const Scalar& min_value, const Scal }); } -void clamp_min_kernel_cuda(TensorIterator& iter, const Scalar& min_value) { +void clamp_min_kernel_cuda(TensorIteratorBase& iter, const Scalar& min_value) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_min_cuda", [&]() { auto lower = min_value.to(); gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { @@ -108,7 +108,7 @@ void clamp_min_kernel_cuda(TensorIterator& iter, const Scalar& min_value) { }); } -void clamp_max_kernel_cuda(TensorIterator& iter, const Scalar& max_value) { +void clamp_max_kernel_cuda(TensorIteratorBase& iter, const Scalar& max_value) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_max_cuda", [&]() { auto upper = max_value.to(); gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { @@ -123,7 +123,7 @@ void clamp_max_kernel_cuda(TensorIterator& iter, const Scalar& max_value) { } void nan_to_num_kernel_cuda( - TensorIterator& iter, + TensorIteratorBase& iter, c10::optional nan, c10::optional pos_inf, c10::optional neg_inf) { @@ -148,7 +148,7 @@ void nan_to_num_kernel_cuda( }); } -void frexp_kernel_cuda(TensorIterator& iter) { +void frexp_kernel_cuda(TensorIteratorBase& iter) { #ifdef __HIP_PLATFORM_HCC__ // Reference: https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP-MATH.html // https://github.com/ROCm-Developer-Tools/HIP/issues/2169 diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 617f09c4fbd6..aef3462bf22a 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -11,7 +11,7 @@ namespace at { namespace native { -void logical_not_kernel_cuda(TensorIterator& iter) { +void logical_not_kernel_cuda(TensorIteratorBase& iter) { // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND_COMPLEX_AND3(...) // so we don't have to maintain a separate list or to do double dispatch. 
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cuda", [&]() {}); @@ -21,7 +21,7 @@ void logical_not_kernel_cuda(TensorIterator& iter) { }); } -void neg_kernel_cuda(TensorIterator& iter) { +void neg_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "neg_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return -a; @@ -29,7 +29,7 @@ void neg_kernel_cuda(TensorIterator& iter) { }); } -void sign_kernel_cuda(TensorIterator& iter){ +void sign_kernel_cuda(TensorIteratorBase& iter){ if (iter.dtype() == ScalarType::Bool) { gpu_kernel(iter, []GPU_LAMBDA(bool a){ return a; @@ -44,7 +44,7 @@ void sign_kernel_cuda(TensorIterator& iter){ } } -void signbit_kernel_cuda(TensorIterator& iter){ +void signbit_kernel_cuda(TensorIteratorBase& iter){ AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !std::is_unsigned::value && a < 0; }); }); @@ -59,7 +59,7 @@ __host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) } } -void sgn_kernel_cuda(TensorIterator& iter){ +void sgn_kernel_cuda(TensorIteratorBase& iter){ AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return sgn_wrapper(a); diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu index fe8ae6a6af4a..90799183ef4b 100644 --- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu @@ -19,7 +19,7 @@ namespace at { namespace native { -void exp2_kernel_cuda(TensorIterator& iter) { +void exp2_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "exp2_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::exp2(a); @@ -27,7 +27,7 @@ void exp2_kernel_cuda(TensorIterator& iter) { }); } -void i0_kernel_cuda(TensorIterator& iter) { +void i0_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "i0_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_i0(a); @@ -43,7 +43,7 @@ void i0e_kernel_cuda(TensorIteratorBase& iter) { }); } -void sigmoid_kernel_cuda(TensorIterator& iter) { +void sigmoid_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "sigmoid_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { scalar_t one = scalar_t(1); @@ -66,7 +66,7 @@ void sinc_kernel_cuda(TensorIteratorBase& iter) { }); } -void logit_kernel_cuda(TensorIterator& iter, const Scalar& eps_scalar) { +void logit_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scalar) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -93,7 +93,7 @@ void logit_kernel_cuda(TensorIterator& iter, const Scalar& eps_scalar) { }); } -void erf_kernel_cuda(TensorIterator& iter) { +void erf_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "erf_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::erf(a); @@ -101,7 +101,7 @@ void erf_kernel_cuda(TensorIterator& iter) { }); } -void erfc_kernel_cuda(TensorIterator& iter) { +void 
erfc_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "erfc_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::erfc(a); @@ -109,7 +109,7 @@ void erfc_kernel_cuda(TensorIterator& iter) { }); } -void erfinv_kernel_cuda(TensorIterator& iter) { +void erfinv_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "erfinv_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::erfinv(a); @@ -117,7 +117,7 @@ void erfinv_kernel_cuda(TensorIterator& iter) { }); } -void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, double beta_){ +void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, double beta_){ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ using T_ACC = acc_type; const T_ACC inv_alpha = static_cast(2.0 / (window_length - 1)); @@ -131,7 +131,7 @@ void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, doub }); } -void entr_kernel_cuda(TensorIterator& iter) { +void entr_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, From f17c9ea2ed7c8b40011ca7459417ae4ec08638b8 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 15 Apr 2021 15:55:49 -0700 Subject: [PATCH 28/45] Port all unary float functions to structured (#56082) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56082 The native_functions.yaml changes were done by codemod using the following script: ``` import ruamel.yaml from ruamel.yaml.tokens import CommentToken from ruamel.yaml.error import CommentMark from tools.codegen.model import * # noqa: F403 with open("aten/src/ATen/native/native_functions.yaml", "r") as f: contents = f.read() yaml = ruamel.yaml.YAML() yaml.preserve_quotes = True yaml.width = 1000 yaml.boolean_representation = ['False', 'True'] r = yaml.load(contents) convert = '''\ acos acosh asin asinh atan atanh cos cosh digamma erf erfc erfinv exp expm1 exp2 lgamma log log10 log1p log2 reciprocal sigmoid sin sinc sinh special_entr sqrt tan tanh'''.split() for e in r: f = NativeFunction.from_yaml(e, Location("", 0)) if f.structured or f.structured_delegate is not None: continue n = f.func.name.name.base if n not in convert: continue # mutate e to make changes if f.func.kind() == SchemaKind.out: e.insert(1, 'structured', True) e.insert(2, 'structured_inherits', 'TensorIteratorBase') else: # TODO: The .out overload assumption is not sound in general e.insert(1, 'structured_delegate', f'{n}.out') e['dispatch'].pop('CPU', None) e['dispatch'].pop('CUDA', None) e['dispatch'].pop('CPU, CUDA', None) e['dispatch'].pop('CompositeExplicitAutograd', None) *_, last_k = e.keys() needs_fixup = False if not e['dispatch']: if last_k == 'dispatch': needs_fixup = True del e['dispatch'] # Manually fix up newlines at the end, because ruamel # made some bad life choices about where to associate trailing # whitespace for nested dicts; see # https://stackoverflow.com/questions/42172399/modifying-yaml-using-ruamel-yaml-adds-extra-new-lines if needs_fixup: *_, last_k = e.keys() # post_key, pre_key, post_value, pre_value e.ca.items[last_k] = [None, None, CommentToken('\n\n', CommentMark(0), None), None] with open("aten/src/ATen/native/native_functions.yaml.new", "w") as f: yaml.dump(r, f) ``` Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D27777769 Pulled By: ezyang fbshipit-source-id: 1ecbac7cb3e0093167bb61c7d2b1ecb95b8ae17c --- aten/src/ATen/native/UnaryOps.cpp | 180 +++++++----------- aten/src/ATen/native/UnaryOps.h | 4 +- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 20 +- .../ATen/native/cuda/UnarySpecialOpsKernel.cu | 4 +- aten/src/ATen/native/native_functions.yaml | 173 +++++++++-------- tools/codegen/model.py | 4 +- torch/csrc/jit/runtime/static/ops.cpp | 4 +- 7 files changed, 171 insertions(+), 218 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 3e172049f2fe..b012af2db810 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -27,20 +27,43 @@ namespace at { namespace meta { -#define CREATE_UNARY_META_FUNC(func) \ +// Unary float operations always produce floating point +// outputs, even if their inputs are integer +#define CREATE_UNARY_FLOAT_META_FUNC(func) \ TORCH_META_FUNC(func) (const Tensor& self) { \ build_unary_float_op(maybe_get_output(), self); \ } -// Use this macro iff you need to dispatch and don't need -// extra error checking/have a different signature, etc. -CREATE_UNARY_META_FUNC(sin) -CREATE_UNARY_META_FUNC(sinc) -CREATE_UNARY_META_FUNC(sinh) -CREATE_UNARY_META_FUNC(cosh) -CREATE_UNARY_META_FUNC(acosh) -CREATE_UNARY_META_FUNC(cos) -CREATE_UNARY_META_FUNC(special_i0e) +CREATE_UNARY_FLOAT_META_FUNC(acos) +CREATE_UNARY_FLOAT_META_FUNC(acosh) +CREATE_UNARY_FLOAT_META_FUNC(asin) +CREATE_UNARY_FLOAT_META_FUNC(asinh) +CREATE_UNARY_FLOAT_META_FUNC(atan) +CREATE_UNARY_FLOAT_META_FUNC(atanh) +CREATE_UNARY_FLOAT_META_FUNC(cos) +CREATE_UNARY_FLOAT_META_FUNC(cosh) +CREATE_UNARY_FLOAT_META_FUNC(digamma) +CREATE_UNARY_FLOAT_META_FUNC(erf) +CREATE_UNARY_FLOAT_META_FUNC(erfc) +CREATE_UNARY_FLOAT_META_FUNC(erfinv) +CREATE_UNARY_FLOAT_META_FUNC(exp) +CREATE_UNARY_FLOAT_META_FUNC(exp2) +CREATE_UNARY_FLOAT_META_FUNC(expm1) +CREATE_UNARY_FLOAT_META_FUNC(lgamma) +CREATE_UNARY_FLOAT_META_FUNC(log) +CREATE_UNARY_FLOAT_META_FUNC(log10) +CREATE_UNARY_FLOAT_META_FUNC(log1p) +CREATE_UNARY_FLOAT_META_FUNC(log2) +CREATE_UNARY_FLOAT_META_FUNC(reciprocal) +CREATE_UNARY_FLOAT_META_FUNC(sigmoid) +CREATE_UNARY_FLOAT_META_FUNC(sin) +CREATE_UNARY_FLOAT_META_FUNC(sinc) +CREATE_UNARY_FLOAT_META_FUNC(sinh) +CREATE_UNARY_FLOAT_META_FUNC(special_entr) +CREATE_UNARY_FLOAT_META_FUNC(special_i0e) +CREATE_UNARY_FLOAT_META_FUNC(sqrt) +CREATE_UNARY_FLOAT_META_FUNC(tan) +CREATE_UNARY_FLOAT_META_FUNC(tanh) } // namespace meta @@ -55,6 +78,37 @@ TORCH_IMPL_FUNC(func##_out) (const Tensor& self, const Tensor& result) { \ func##_stub(device_type(), *this); \ } +CREATE_UNARY_TORCH_IMPL_FUNC(acos) +CREATE_UNARY_TORCH_IMPL_FUNC(acosh) +CREATE_UNARY_TORCH_IMPL_FUNC(asin) +CREATE_UNARY_TORCH_IMPL_FUNC(asinh) +CREATE_UNARY_TORCH_IMPL_FUNC(atan) +CREATE_UNARY_TORCH_IMPL_FUNC(atanh) +CREATE_UNARY_TORCH_IMPL_FUNC(cos) +CREATE_UNARY_TORCH_IMPL_FUNC(cosh) +CREATE_UNARY_TORCH_IMPL_FUNC(digamma) +CREATE_UNARY_TORCH_IMPL_FUNC(erf) +CREATE_UNARY_TORCH_IMPL_FUNC(erfc) +CREATE_UNARY_TORCH_IMPL_FUNC(erfinv) +CREATE_UNARY_TORCH_IMPL_FUNC(exp) +CREATE_UNARY_TORCH_IMPL_FUNC(exp2) +CREATE_UNARY_TORCH_IMPL_FUNC(expm1) +CREATE_UNARY_TORCH_IMPL_FUNC(lgamma) +CREATE_UNARY_TORCH_IMPL_FUNC(log) +CREATE_UNARY_TORCH_IMPL_FUNC(log10) +CREATE_UNARY_TORCH_IMPL_FUNC(log1p) +CREATE_UNARY_TORCH_IMPL_FUNC(log2) +CREATE_UNARY_TORCH_IMPL_FUNC(reciprocal) +CREATE_UNARY_TORCH_IMPL_FUNC(sigmoid) +CREATE_UNARY_TORCH_IMPL_FUNC(sin) 
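Each added `CREATE_UNARY_FLOAT_META_FUNC` / `CREATE_UNARY_TORCH_IMPL_FUNC` pair yields a meta function plus an out-kernel for one operator. Hand-expanding the pair for `acos` (a sketch worked out from the macro bodies above, not generated output):

```
// Meta step: build_unary_float_op fixes the output dtype/shape, promoting
// integer inputs to a floating-point result.
TORCH_META_FUNC(acos) (const Tensor& self) {
  build_unary_float_op(maybe_get_output(), self);
}

// Impl step: the structured machinery has already allocated the output, so the
// kernel only forwards the TensorIteratorBase (*this) to the per-device stub.
TORCH_IMPL_FUNC(acos_out) (const Tensor& self, const Tensor& result) {
  acos_stub(device_type(), *this);
}
```

The functional `acos` and in-place `acos_` entries are then generated from the `acos.out` structured kernel (the codemod above sets `structured_delegate: acos.out`), which is why the hand-written `acos_out` / `acos` / `acos_` wrappers are deleted further down in this diff.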
+CREATE_UNARY_TORCH_IMPL_FUNC(sinc) +CREATE_UNARY_TORCH_IMPL_FUNC(sinh) +CREATE_UNARY_TORCH_IMPL_FUNC(special_entr) +CREATE_UNARY_TORCH_IMPL_FUNC(special_i0e) +CREATE_UNARY_TORCH_IMPL_FUNC(sqrt) +CREATE_UNARY_TORCH_IMPL_FUNC(tan) +CREATE_UNARY_TORCH_IMPL_FUNC(tanh) + template static inline Tensor& unary_op_impl_out(Tensor& result, const Tensor& self, Stub& stub) { auto iter = TensorIterator::unary_op(result, self); @@ -139,10 +193,6 @@ static inline Tensor& unary_op_impl_(Tensor& self, OutImpl& out_impl) { return out_impl(self, self); } -Tensor& acos_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, acos_stub); } -Tensor acos(const Tensor& self) { return unary_op_impl_float(self, acos_stub); } -Tensor& acos_(Tensor& self) { return unary_op_impl_(self, at::acos_out); } - // arccos, alias for acos Tensor& arccos_out(const Tensor& self, Tensor& result) { return at::acos_out(result, self); } Tensor arccos(const Tensor& self) { return self.acos(); } @@ -184,19 +234,11 @@ Tensor deg2rad(const Tensor& self) { } Tensor& deg2rad_(Tensor& self) { return unary_op_impl_(self, at::deg2rad_out); } -Tensor& asin_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, asin_stub); } -Tensor asin(const Tensor& self) { return unary_op_impl_float(self, asin_stub); } -Tensor& asin_(Tensor& self) { return unary_op_impl_(self, at::asin_out); } - // arcsin, alias of asin Tensor& arcsin_out(const Tensor& self, Tensor& result) { return at::asin_out(result, self); } Tensor arcsin(const Tensor& self) { return self.asin(); } Tensor& arcsin_(Tensor& self) { return self.asin_(); } -Tensor& atan_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, atan_stub); } -Tensor atan(const Tensor& self) { return unary_op_impl_float(self, atan_stub); } -Tensor& atan_(Tensor& self) { return unary_op_impl_(self, at::atan_out); } - // arctan, alias of atan Tensor& arctan_out(const Tensor& self, Tensor& result) { return at::atan_out(result, self); } Tensor arctan(const Tensor& self) { return self.atan(); } @@ -287,18 +329,6 @@ Tensor& ceil_out(const Tensor& self, Tensor& result) { Tensor ceil(const Tensor& self) { return unary_op_impl(self, at::ceil_out); } Tensor& ceil_(Tensor& self) { return unary_op_impl_(self, at::ceil_out); } -Tensor& exp_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, exp_stub); } -Tensor exp(const Tensor& self) { return unary_op_impl_float(self, exp_stub); } -Tensor& exp_(Tensor& self) { return unary_op_impl_(self, at::exp_out); } - -Tensor& exp2_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, exp2_stub); } -Tensor exp2(const Tensor& self) { return unary_op_impl_float(self, exp2_stub); } -Tensor& exp2_(Tensor& self) { return unary_op_impl_(self, at::exp2_out); } - -Tensor& expm1_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, expm1_stub); } -Tensor expm1(const Tensor& self) { return unary_op_impl_float(self, expm1_stub); } -Tensor& expm1_(Tensor& self) { return unary_op_impl_(self, at::expm1_out); } - // special_exp2, alias for exp2 Tensor& special_exp2_out(const Tensor& self, Tensor& result) { return at::exp2_out(result, self); } Tensor special_exp2(const Tensor& self) { return self.exp2(); } @@ -307,18 +337,6 @@ Tensor special_exp2(const Tensor& self) { return self.exp2(); } Tensor& special_expm1_out(const Tensor& self, Tensor& result) { return at::expm1_out(result, self); } Tensor 
special_expm1(const Tensor& self) { return self.expm1(); } -Tensor& erf_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, erf_stub); } -Tensor erf(const Tensor& self) { return unary_op_impl_float(self, erf_stub); } -Tensor& erf_(Tensor& self) { return unary_op_impl_(self, at::erf_out); } - -Tensor& erfc_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, erfc_stub); } -Tensor erfc(const Tensor& self) { return unary_op_impl_float(self, erfc_stub); } -Tensor& erfc_(Tensor& self) { return unary_op_impl_(self, at::erfc_out); } - -Tensor& erfinv_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, erfinv_stub); } -Tensor erfinv(const Tensor& self) { return unary_op_impl_float(self, erfinv_stub); } -Tensor& erfinv_(Tensor& self) { return unary_op_impl_(self, at::erfinv_out); } - // special_erf, alias for erf Tensor& special_erf_out(const Tensor& self, Tensor& result) { return at::erf_out(result, self); } Tensor special_erf(const Tensor& self) { return self.erf(); } @@ -349,38 +367,10 @@ Tensor& i0_out(const Tensor& self, Tensor& result) { return unary_op_impl_out(re Tensor i0(const Tensor& self) { return unary_op_impl(self, at::i0_out); } Tensor& i0_(Tensor& self) { return unary_op_impl_(self, at::i0_out); } -TORCH_IMPL_FUNC(special_i0e_out) (const Tensor& self, const Tensor& result) { - i0e_stub(device_type(), *this); -} - -Tensor& log_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, log_stub); } -Tensor log(const Tensor& self) { return unary_op_impl_float(self, log_stub); } -Tensor& log_(Tensor& self) { return unary_op_impl_(self, at::log_out); } - -Tensor& log10_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, log10_stub); } -Tensor log10(const Tensor& self) { return unary_op_impl_float(self, log10_stub); } -Tensor& log10_(Tensor& self) { return unary_op_impl_(self, at::log10_out); } - -Tensor& log1p_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, log1p_stub); } -Tensor log1p(const Tensor& self) { return unary_op_impl_float(self, log1p_stub); } -Tensor& log1p_(Tensor& self) { return unary_op_impl_(self, at::log1p_out); } - -Tensor& log2_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, log2_stub); } -Tensor log2(const Tensor& self) { return unary_op_impl_float(self, log2_stub); } -Tensor& log2_(Tensor& self) { return unary_op_impl_(self, at::log2_out); } - Tensor& round_out(const Tensor& self, Tensor& result) { return unary_op_impl_out(result, self, round_stub); } Tensor round(const Tensor& self) { return unary_op_impl(self, at::round_out); } Tensor& round_(Tensor& self) { return unary_op_impl_(self, at::round_out); } -Tensor& digamma_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, digamma_stub); } -Tensor digamma(const Tensor& self) { return unary_op_impl_float(self, digamma_stub); } -Tensor& digamma_(Tensor& self) { return unary_op_impl_(self, digamma_out); } - -Tensor& reciprocal_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, reciprocal_stub); } -Tensor reciprocal(const Tensor& self) { return unary_op_impl_float(self, reciprocal_stub); } -Tensor& reciprocal_(Tensor& self) { return unary_op_impl_(self, at::reciprocal_out); } - Tensor& rsqrt_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, rsqrt_stub); } @@ -408,48 +398,25 
@@ Tensor& sgn_out(const Tensor& self, Tensor& result) { Tensor sgn(const Tensor& self) { return unary_op_impl(self, at::sgn_out); } Tensor& sgn_(Tensor& self) { return unary_op_impl_(self, at::sgn_out); } -CREATE_UNARY_TORCH_IMPL_FUNC(sin) -CREATE_UNARY_TORCH_IMPL_FUNC(cos) -CREATE_UNARY_TORCH_IMPL_FUNC(sinc) -CREATE_UNARY_TORCH_IMPL_FUNC(sinh) -CREATE_UNARY_TORCH_IMPL_FUNC(cosh) -CREATE_UNARY_TORCH_IMPL_FUNC(acosh) - // arccosh, alias for acosh Tensor& arccosh_out(const Tensor& self, Tensor& result) { return at::acosh_out(result, self); } Tensor arccosh(const Tensor& self) { return at::acosh(self); } Tensor& arccosh_(Tensor& self) { return at::acosh_(self); } -Tensor& asinh_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, asinh_stub); } -Tensor asinh(const Tensor& self) { return unary_op_impl_float(self, asinh_stub); } -Tensor& asinh_(Tensor& self) { return unary_op_impl_(self, at::asinh_out); } - // arcsinh, alias for asinh Tensor& arcsinh_out(const Tensor& self, Tensor& result) { return at::asinh_out(result, self); } Tensor arcsinh(const Tensor& self) { return self.asinh(); } Tensor& arcsinh_(Tensor& self) { return self.asinh_(); } -Tensor& atanh_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, atanh_stub); } -Tensor atanh(const Tensor& self) { return unary_op_impl_float(self, atanh_stub); } -Tensor& atanh_(Tensor& self) { return unary_op_impl_(self, at::atanh_out); } - // arctanh, alias for atanh Tensor& arctanh_out(const Tensor& self, Tensor& result) { return at::atanh_out(result, self); } Tensor arctanh(const Tensor& self) { return self.atanh(); } Tensor& arctanh_(Tensor& self) { return self.atanh_(); } -Tensor& sqrt_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, sqrt_stub); } -Tensor sqrt(const Tensor& self) { return unary_op_impl_float(self, sqrt_stub); } -Tensor& sqrt_(Tensor& self) { return unary_op_impl_(self, at::sqrt_out); } - Tensor& square_out(const Tensor& self, Tensor& result) { return at::pow_out(result, self, 2); } Tensor square(const Tensor& self) { return at::pow(self, 2); } Tensor& square_(Tensor& self) { return self.pow_(2); } -Tensor& sigmoid_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, sigmoid_stub); } -Tensor sigmoid(const Tensor& self) { return unary_op_impl_float(self, sigmoid_stub); } -Tensor& sigmoid_(Tensor& self) { return unary_op_impl_(self, at::sigmoid_out); } - Tensor& logit_out(const Tensor& self, c10::optional eps, Tensor& result) { @@ -519,14 +486,6 @@ Tensor& nan_to_num_( return at::nan_to_num_out(self, self, nan, pos_inf, neg_inf); } -Tensor& tanh_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, tanh_stub); } -Tensor tanh(const Tensor& self) { return unary_op_impl_float(self, tanh_stub); } -Tensor& tanh_(Tensor& self) { return unary_op_impl_(self, at::tanh_out); } - -Tensor& tan_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, tan_stub); } -Tensor tan(const Tensor& self) { return unary_op_impl_float(self, tan_stub); } -Tensor& tan_(Tensor& self) { return unary_op_impl_(self, at::tan_out); } - Tensor& trunc_out(const Tensor& self, Tensor& result) { // Note: this is consistent with NumPy TORCH_CHECK(!self.is_complex(), @@ -727,10 +686,6 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi) * QUARTER)); } -Tensor& lgamma_out(const Tensor& self, Tensor& 
result) { return unary_op_impl_float_out(result, self, lgamma_stub); } -Tensor lgamma(const Tensor& self) { return unary_op_impl_float(self, lgamma_stub); } -Tensor& lgamma_(Tensor& self) { return unary_op_impl_(self, at::lgamma_out); } - std::tuple frexp(const Tensor& self) { Tensor mantissa = at::empty_like(self); Tensor exponent = at::empty_like(self, self.options().dtype(at::kInt)); @@ -770,9 +725,6 @@ std::tuple frexp_out(const Tensor& self, Tensor special_gammaln(const Tensor& self) { return self.lgamma(); } Tensor& special_gammaln_out(const Tensor& self, Tensor& result) { return at::lgamma_out(result, self); } -Tensor special_entr(const Tensor& self) { return unary_op_impl_float(self, entr_stub); } -Tensor& special_entr_out(const Tensor& self, Tensor& result) { return unary_op_impl_float_out(result, self, entr_stub);} - DEFINE_DISPATCH(abs_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(angle_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(real_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -792,7 +744,7 @@ DEFINE_DISPATCH(clamp_min_stub); // NOLINT(cppcoreguidelines-avoid-non-const-glo DEFINE_DISPATCH(cos_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(cosh_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(digamma_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(entr_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(special_entr_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(erf_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(erfc_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(erfinv_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -803,7 +755,7 @@ DEFINE_DISPATCH(floor_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global- DEFINE_DISPATCH(frac_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(frexp_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(i0_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(i0e_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(special_i0e_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(log_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(log10_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(log1p_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 71126530d339..dd0af3895adf 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -29,7 +29,7 @@ DECLARE_DISPATCH(unary_fn_with_scalar, clamp_min_stub); DECLARE_DISPATCH(unary_fn, cos_stub); DECLARE_DISPATCH(unary_fn, cosh_stub); DECLARE_DISPATCH(unary_fn, digamma_stub); -DECLARE_DISPATCH(unary_fn, entr_stub); +DECLARE_DISPATCH(unary_fn, special_entr_stub); DECLARE_DISPATCH(unary_fn, erf_stub); DECLARE_DISPATCH(unary_fn, erfc_stub); DECLARE_DISPATCH(unary_fn, erfinv_stub); @@ -40,7 +40,7 @@ DECLARE_DISPATCH(unary_fn, floor_stub); DECLARE_DISPATCH(unary_fn, frac_stub); DECLARE_DISPATCH(unary_fn, frexp_stub); DECLARE_DISPATCH(unary_fn, i0_stub); 
-DECLARE_DISPATCH(unary_fn, i0e_stub); +DECLARE_DISPATCH(unary_fn, special_i0e_stub); DECLARE_DISPATCH(unary_fn, log_stub); DECLARE_DISPATCH(unary_fn, log10_stub); DECLARE_DISPATCH(unary_fn, log1p_stub); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 093dbe88f34b..de88dbae5fef 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -675,6 +675,7 @@ static void i0e_kernel(TensorIteratorBase& iter) { IMPLEMENT_ITERATOR_LAMBDA(op), \ {0, iter.numel()}); \ }); \ + iter.cast_outputs(); \ } \ REGISTER_DISPATCH(op##_stub, &op##_kernel) @@ -686,17 +687,6 @@ static void i0e_kernel(TensorIteratorBase& iter) { IMPLEMENT_ITERATOR_LAMBDA(op), \ {0, iter.numel()}); \ }); \ - } \ - REGISTER_DISPATCH(op##_stub, &op##_kernel) - - #define IMPLEMENT_COMPLEX_STRUCTURED_KERNEL(op) \ - static void op##_kernel(TensorIteratorBase& iter) { \ - TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \ - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), #op "_vml_cpu", [&]() { \ - iter.serial_for_each( \ - IMPLEMENT_ITERATOR_LAMBDA(op), \ - {0, iter.numel()}); \ - }); \ iter.cast_outputs(); \ } \ REGISTER_DISPATCH(op##_stub, &op##_kernel) @@ -745,16 +735,16 @@ REGISTER_DISPATCH(clamp_stub, &clamp_kernel); REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel); REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel); -REGISTER_DISPATCH(entr_stub, &entr_kernel); +REGISTER_DISPATCH(special_entr_stub, &entr_kernel); REGISTER_DISPATCH(frexp_stub, &frexp_kernel); -REGISTER_DISPATCH(i0e_stub, &i0e_kernel); +REGISTER_DISPATCH(special_i0e_stub, &i0e_kernel); IMPLEMENT_COMPLEX_KERNEL(acos) IMPLEMENT_COMPLEX_KERNEL(asin) IMPLEMENT_COMPLEX_KERNEL(atan) IMPLEMENT_FLOAT_KERNEL(ceil) -IMPLEMENT_COMPLEX_STRUCTURED_KERNEL(cos) +IMPLEMENT_COMPLEX_KERNEL(cos) IMPLEMENT_FLOAT_KERNEL(erf) IMPLEMENT_FLOAT_KERNEL(erfc) IMPLEMENT_FLOAT_KERNEL(erfinv) @@ -767,7 +757,7 @@ IMPLEMENT_FLOAT_KERNEL(log1p) IMPLEMENT_COMPLEX_KERNEL(log2) IMPLEMENT_FLOAT_KERNEL(i0) IMPLEMENT_FLOAT_KERNEL(round) -IMPLEMENT_COMPLEX_STRUCTURED_KERNEL(sin) +IMPLEMENT_COMPLEX_KERNEL(sin) IMPLEMENT_COMPLEX_KERNEL(sqrt) IMPLEMENT_COMPLEX_KERNEL(tan) IMPLEMENT_COMPLEX_KERNEL(tanh) diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu index 90799183ef4b..00295cb06fec 100644 --- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu @@ -153,7 +153,7 @@ void entr_kernel_cuda(TensorIteratorBase& iter) { REGISTER_DISPATCH(exp2_stub, &exp2_kernel_cuda); REGISTER_DISPATCH(i0_stub, &i0_kernel_cuda); -REGISTER_DISPATCH(i0e_stub, &i0e_kernel_cuda); +REGISTER_DISPATCH(special_i0e_stub, &i0e_kernel_cuda); REGISTER_DISPATCH(sigmoid_stub, &sigmoid_kernel_cuda); REGISTER_DISPATCH(sinc_stub, &sinc_kernel_cuda); REGISTER_DISPATCH(logit_stub, &logit_kernel_cuda); @@ -161,7 +161,7 @@ REGISTER_DISPATCH(erf_stub, &erf_kernel_cuda); REGISTER_DISPATCH(erfc_stub, &erfc_kernel_cuda); REGISTER_DISPATCH(erfinv_stub, &erfinv_kernel_cuda); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel_cuda); -REGISTER_DISPATCH(entr_stub, &entr_kernel_cuda); +REGISTER_DISPATCH(special_entr_stub, &entr_kernel_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c0a342415eb8..bf52225f185e 100644 --- 
a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -297,15 +297,15 @@ - func: acos(Tensor self) -> Tensor variants: function, method - dispatch: - CPU, CUDA: acos + structured_delegate: acos.out - func: acos_(Tensor(a!) self) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: acos_ + structured_delegate: acos.out - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acos_out @@ -509,15 +509,15 @@ - func: asinh(Tensor self) -> Tensor variants: function, method - dispatch: - CPU, CUDA: asinh + structured_delegate: asinh.out - func: asinh_(Tensor(a!) self) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: asinh_ + structured_delegate: asinh.out - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asinh_out @@ -531,16 +531,16 @@ - func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atanh(Tensor self) -> Tensor + structured_delegate: atanh.out variants: function, method - dispatch: - CPU, CUDA: atanh - func: atanh_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: atanh.out variants: function, method - dispatch: - CompositeExplicitAutograd: atanh_ - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atanh_out @@ -568,17 +568,19 @@ - func: asin(Tensor self) -> Tensor variants: function, method + structured_delegate: asin.out dispatch: - CPU, CUDA: asin SparseCPU, SparseCUDA: asin_sparse - func: asin_(Tensor(a!) self) -> Tensor(a!) variants: function, method + structured_delegate: asin.out dispatch: - CPU, CUDA: asin_ SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asin_out SparseCPU, SparseCUDA: asin_out_sparse @@ -593,16 +595,16 @@ - func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atan(Tensor self) -> Tensor + structured_delegate: atan.out variants: function, method - dispatch: - CPU, CUDA: atan - func: atan_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: atan.out variants: function, method - dispatch: - CompositeExplicitAutograd: atan_ - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan_out @@ -1554,72 +1556,72 @@ Meta: empty_strided_meta - func: erf(Tensor self) -> Tensor + structured_delegate: erf.out variants: function, method - dispatch: - CPU, CUDA: erf - func: erf_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: erf.out variants: function, method - dispatch: - CompositeExplicitAutograd: erf_ - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor + structured_delegate: erfc.out variants: function, method - dispatch: - CPU, CUDA: erfc - func: erfc_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: erfc.out variants: function, method - dispatch: - CompositeExplicitAutograd: erfc_ - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor + structured_delegate: exp.out variants: function, method - dispatch: - CPU, CUDA: exp - func: exp_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: exp.out variants: function, method - dispatch: - CompositeExplicitAutograd: exp_ - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor + structured_delegate: exp2.out variants: function, method - dispatch: - CPU, CUDA: exp2 - func: exp2_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: exp2.out variants: function, method - dispatch: - CompositeExplicitAutograd: exp2_ - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor + structured_delegate: expm1.out variants: function, method - dispatch: - CPU, CUDA: expm1 - func: expm1_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: expm1.out variants: function, method - dispatch: - CompositeExplicitAutograd: expm1_ - func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: expm1_out @@ -2091,61 +2093,63 @@ CUDA: linspace_cuda_out - func: log(Tensor self) -> Tensor + structured_delegate: log.out variants: function, method - dispatch: - CPU, CUDA: log - func: log_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: log.out variants: function, method - dispatch: - CompositeExplicitAutograd: log_ - func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor + structured_delegate: log10.out variants: function, method - dispatch: - CPU, CUDA: log10 - func: log10_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: log10.out variants: function, method - dispatch: - CompositeExplicitAutograd: log10_ - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor + structured_delegate: log1p.out variants: function, method dispatch: - CPU, CUDA: log1p SparseCPU, SparseCUDA: log1p_sparse - func: log1p_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: log1p.out variants: function, method dispatch: - CPU, CUDA: log1p_ SparseCPU, SparseCUDA: log1p_sparse_ - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log1p_out SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor + structured_delegate: log2.out variants: function, method - dispatch: - CPU, CUDA: log2 - func: log2_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: log2.out variants: function, method - dispatch: - CompositeExplicitAutograd: log2_ - func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log2_out @@ -2939,16 +2943,16 @@ variants: function, method - func: reciprocal(Tensor self) -> Tensor + structured_delegate: reciprocal.out variants: function, method - dispatch: - CPU, CUDA: reciprocal - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) 
+ structured_delegate: reciprocal.out variants: function, method - dispatch: - CompositeExplicitAutograd: reciprocal_ - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: reciprocal_out @@ -3147,19 +3151,21 @@ CompositeImplicitAutograd: math_silu_backward - func: sigmoid(Tensor self) -> Tensor + structured_delegate: sigmoid.out variants: function, method dispatch: - CPU, CUDA: sigmoid QuantizedCPU: sigmoid_quantized_cpu MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: sigmoid.out variants: function, method dispatch: - CPU, CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_out @@ -3441,17 +3447,18 @@ device_guard: False - func: sqrt(Tensor self) -> Tensor + structured_delegate: sqrt.out variants: function, method dispatch: - CPU, CUDA: sqrt SparseCPU, SparseCUDA: sqrt_sparse - func: sqrt_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: sqrt.out variants: function, method - dispatch: - CPU, CUDA: sqrt_ - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sqrt_out SparseCPU, SparseCUDA: sqrt_out_sparse @@ -3530,33 +3537,34 @@ CompositeExplicitAutograd: t_ - func: tan(Tensor self) -> Tensor + structured_delegate: tan.out variants: function, method - dispatch: - CPU, CUDA: tan - func: tan_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: tan.out variants: function, method - dispatch: - CompositeExplicitAutograd: tan_ - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor + structured_delegate: tanh.out variants: function, method dispatch: - CPU, CUDA: tanh QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh - func: tanh_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: tanh.out variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ - CompositeExplicitAutograd: tanh_ - - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_out @@ -5023,9 +5031,8 @@ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: digamma.out variants: method - dispatch: - CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) variants: method @@ -5765,27 +5772,28 @@ CPU, CUDA: multinomial - func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: lgamma_out - func: lgamma_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: lgamma.out variants: method - dispatch: - CPU, CUDA: lgamma_ - func: lgamma(Tensor self) -> Tensor + structured_delegate: lgamma.out variants: method, function - dispatch: - CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor + structured_delegate: digamma.out variants: method, function - dispatch: - CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -5797,16 +5805,16 @@ CompositeExplicitAutograd: polygamma - func: erfinv(Tensor self) -> Tensor + structured_delegate: erfinv.out variants: method, function - dispatch: - CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: erfinv.out variants: method - dispatch: - CPU, CUDA: erfinv_ - func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erfinv_out @@ -8355,12 +8363,13 @@ # The "special_" names should be hidden from the user and not documented. - func: special_entr(Tensor self) -> Tensor + structured_delegate: special_entr.out python_module: special variants: function - dispatch: - CPU, CUDA: special_entr - func: special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: special variants: function dispatch: diff --git a/tools/codegen/model.py b/tools/codegen/model.py index e40ecf8cc8ea..24fde3b1d8ee 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -333,7 +333,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': "strictly subsumes the other. If you wanted to provide an explicit autograd " \ "implementation, specify CompositeExplicitAutograd; otherwise specify CompositeImplicitAutograd only" - e.pop('__line__') + # don't care if it exists or not; make it easier to use this function + # with other yaml parsers that aren't setting __line__ in the dict + e.pop('__line__', None) assert not e, f"leftover entries: {e}" return NativeFunction( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 00ae73626967..fd75601a44cf 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -534,7 +534,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::tanh, aten_tanh, [](Node* n) -> SROperator { auto& out_t = p_node->Output(0).toTensor(); if (!te->supports(in0_t)) { fastResizeToZero(out_t); - at::native::tanh_out(in0_t, out_t); + at::cpu::tanh_out(out_t, in0_t); } else { at::native::resize_(out_t, in0_t.sizes(), c10::nullopt); (*te)(out_t.data_ptr(), in0_t.data_ptr(), in0_t.numel()); @@ -555,7 +555,7 @@ REGISTER_OPERATOR_FUNCTOR( auto& out_t = p_node->Output(0).toTensor(); if (!te->supports(in0_t)) { fastResizeToZero(out_t); - at::native::sigmoid_out(in0_t, out_t); + at::cpu::sigmoid_out(out_t, in0_t); } else { at::native::resize_(out_t, in0_t.sizes(), c10::nullopt); (*te)( From 52f1a07b63dddf7a57c63f4f47a8fc3a6589b275 Mon Sep 17 00:00:00 2001 From: Victor Bittorf Date: Thu, 15 Apr 2021 15:56:00 -0700 Subject: [PATCH 29/45] Python API for Vitals (#53238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/53238 There is a tension for the Vitals design: (1) we want a macro based logging API for C++ and (2) we want a clean python API. Furthermore, we want to this to work with "print on destruction" semantics. The unfortunate resolution is that there are (2) ways to define vitals: (1) Use the macros for local use only within C++ - this keeps the semantics people enjoy (2) For vitals to be used through either C++ or Python, we use a global VitalsAPI object. Both these go to the same place for the user: printing to stdout as the globals are destructed. The long history on this diff shows many different ways to try to avoid having 2 different paths... we tried weak pointers & shared pointers, verbose switch cases, etc. 
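For reference, a minimal sketch of how the resulting Python path is exercised, modeled on the bindings in torch/csrc/Module.cpp and the tests in test/test_torch.py added below; the 'ON', 'Dataloader', 'basic_unit_test', and 'TEST' strings are taken from those tests and are illustrative only:

```
# Minimal sketch, assuming the Python bindings added below; vitals are
# gated by the TORCH_VITAL environment variable, as in the new tests.
import os
import torch

os.environ['TORCH_VITAL'] = 'ON'  # illustrative value, mirrors the tests

if torch.vitals_enabled():
    # routes through at::vitals::VitalsAPI.setVital(...) on the C++ side
    torch.set_vital('Dataloader', 'basic_unit_test', 'TEST')
```

The macro path (TORCH_VITAL_DEFINE / TORCH_VITAL) stays available for purely in-C++ call sites.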
Ultimately each ran into an ugly trade-off and this cuts the difference better the alternatives. Test Plan: buck test mode/dev caffe2/test:torch -- --regex vital buck test //caffe2/aten:vitals Reviewed By: orionr Differential Revision: D26736443 fbshipit-source-id: ccab464224913edd07c1e8532093f673cdcb789f --- aten/src/ATen/core/Vitals.cpp | 24 +++++++++++++++++++++ aten/src/ATen/core/Vitals.h | 40 +++++++++++++++++++++++++++++------ aten/src/ATen/test/vitals.cpp | 21 ++++++++++++++++++ test/test_torch.py | 35 ++++++++++++++++++++++++++++++ tools/build_variables.bzl | 1 + torch/csrc/Module.cpp | 20 ++++++++++++------ torch/overrides.py | 2 ++ 7 files changed, 130 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 722e275d62b2..babbc305905e 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -1,9 +1,12 @@ #include #include + namespace at { namespace vitals { +APIVitals VitalsAPI; + TorchVitalAttr& TorchVital::create(const std::string& attr) { if (!torchVitalEnabled()) { static TorchVitalAttr disabled; @@ -30,5 +33,26 @@ bool torchVitalEnabled() { return enabled; } +bool APIVitals::setVital( + const std::string& vital_name, + const std::string& attr_name, + const std::string& value) { + if (!torchVitalEnabled()) { + return false; + } + + auto iter = name_map_.find(vital_name); + TorchVital *vital = nullptr; + if (iter == name_map_.end()) { + auto r = name_map_.emplace(std::make_pair(vital_name, TorchVital(vital_name))); + vital = &r.first->second; + } else { + vital = &iter->second; + } + + vital->create(attr_name) << value; + return true; +} + } // namespace at } // namespace vitals diff --git a/aten/src/ATen/core/Vitals.h b/aten/src/ATen/core/Vitals.h index 6d4f1ba3d171..e62b947e1d01 100644 --- a/aten/src/ATen/core/Vitals.h +++ b/aten/src/ATen/core/Vitals.h @@ -1,15 +1,19 @@ #pragma once #include #include +#include #include #include +#include + +#include namespace at { namespace vitals { -bool torchVitalEnabled(); +TORCH_API bool torchVitalEnabled(); -struct TorchVitalAttr { +struct TORCH_API TorchVitalAttr { // always initialized to empty std::string value = ""; template @@ -23,7 +27,7 @@ struct TorchVitalAttr { } }; -struct TorchVital { +struct TORCH_API TorchVital { std::string name; std::unordered_map attrs; @@ -40,11 +44,35 @@ struct TorchVital { } }; +// A way to access vitals by string names instead of by global reference. +// This enables access to vitals from the PythonAPI. +class TORCH_API APIVitals +{ +public: + // Set any vital sign that was added to the map. 
+ bool setVital(const std::string &vital_name, const std::string &attr_name, const std::string &value); + + APIVitals(): name_map_() { } + + // Ensure this stays a singleton + APIVitals(APIVitals const& other) = delete; + APIVitals(APIVitals&& other) = delete; + APIVitals& operator=(const APIVitals&) = delete; + APIVitals& operator=(APIVitals&&) = delete; + +private: + std::unordered_map name_map_; +}; + +extern TORCH_API APIVitals VitalsAPI; + } // namespace at } // namespace vitals -#define TORCH_VITAL_DECLARE(name) extern TorchVital TorchVital_##name; +#define TORCH_VITAL_DECLARE(name) TORCH_API at::vitals::TorchVital TorchVital_##name; + +#define TORCH_VITAL_DEFINE(name) TORCH_API at::vitals::TorchVital TorchVital_##name(#name); -#define TORCH_VITAL_DEFINE(name) TorchVital TorchVital_##name(#name); +#define TORCH_VITAL_BASE(name) TorchVital_##name -#define TORCH_VITAL(name, attr) TorchVital_##name.create(#attr) +#define TORCH_VITAL(name, attr) TORCH_VITAL_BASE(name).create(#attr) diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index e019bcd69eb5..47c9bb635a25 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -89,3 +89,24 @@ TEST(Vitals, OnAndOff) { } } } + +TEST(Vitals, APIVitals) { + std::stringstream buffer; + bool rvalue; + std::streambuf* sbuf = std::cout.rdbuf(); + std::cout.rdbuf(buffer.rdbuf()); + { +#ifdef _WIN32 + _putenv("TORCH_VITAL=1"); +#else + setenv("TORCH_VITAL", "1", 1); +#endif + APIVitals api_vitals; + rvalue = api_vitals.setVital("TestingSetVital", "TestAttr", "TestValue"); + } + std::cout.rdbuf(sbuf); + + auto s = buffer.str(); + ASSERT_TRUE(rvalue); + ASSERT_TRUE(s.find("TestingSetVital.TestAttr\t\t TestValue") != std::string::npos); +} diff --git a/test/test_torch.py b/test/test_torch.py index 22392cd2e80a..7edbd3aaefca 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2,12 +2,14 @@ import torch import numpy as np +import contextlib import io import inspect import math import random import re import copy +import os import tempfile import unittest import warnings @@ -2882,6 +2884,39 @@ def add_neg_dim_tests(): setattr(AbstractTestCases._TestTorchMixin, test_name, make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim)) +@contextlib.contextmanager +def torch_vital_set(value): + stash = None + if 'TORCH_VITAL' in os.environ: + stash = os.environ['TORCH_VITAL'] + os.environ['TORCH_VITAL'] = value + try: + yield + finally: + if stash: + os.environ['TORCH_VITAL'] = stash + else: + del os.environ['TORCH_VITAL'] + + +# Tests Vital Signs for Torch +class TestVitalSigns(TestCase): + def test_basic_vitals(self): + with torch_vital_set(''): + self.assertFalse(torch.vitals_enabled()) + with torch_vital_set('ON'): + self.assertTrue(torch.vitals_enabled()) + + def test_write_vital(self): + with torch_vital_set('ON'): + self.assertTrue(torch.vitals_enabled()) + # This tests the code path of setting a vital + self.assertTrue(torch.set_vital('Dataloader', 'basic_unit_test', 'TEST')) + # Ideally we would have a read test for vitals, though because the the C++ design + # pattern of loggers we use, we can't know the whole list of vitals until the + # global C++ namespace is destructed. + + # Device-generic tests. Instantiated below and not run directly. 
class TestTorchDeviceType(TestCase): exact_dtype = True diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index f83def2c3ad5..40af105c6c91 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -696,6 +696,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/core/Tensor.cpp", "aten/src/ATen/core/VariableFallbackKernel.cpp", "aten/src/ATen/core/VariableHooksInterface.cpp", + "aten/src/ATen/core/Vitals.cpp", "aten/src/ATen/core/boxing/KernelFunction.cpp", "aten/src/ATen/core/custom_class.cpp", "aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp", diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index a08f0c913517..f3f726bb0ca2 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -5,20 +5,21 @@ #include #endif -#include -#include -#include -#include -#include #include -#include -#include #include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include +#include #include #include @@ -934,6 +935,11 @@ PyObject* initModule() { py_module.def("_demangle", &c10::demangle); py_module.def("_log_api_usage_once", &LogAPIUsageOnceFromPython); + py_module.def("vitals_enabled", &at::vitals::torchVitalEnabled); + py_module.def("set_vital", [](const std::string &vital, const std::string &attr, const std::string value){ + return at::vitals::VitalsAPI.setVital(vital, attr, value); + }); + py_module.def( "init_num_threads", torch::wrap_pybind_function(at::init_num_threads), diff --git a/torch/overrides.py b/torch/overrides.py index 09244712244a..f9211d440305 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -193,6 +193,8 @@ def get_ignored_functions() -> Set[Callable]: torch.unify_type_list, torch.is_warn_always_enabled, torch.set_warn_always, + torch.vitals_enabled, + torch.set_vital, Tensor.__delitem__, Tensor.__dir__, Tensor.__getattribute__, From 1ca51f0fba5246be482b5485774690d6bda2e02d Mon Sep 17 00:00:00 2001 From: Jay Chae Date: Thu, 15 Apr 2021 15:59:14 -0700 Subject: [PATCH 30/45] [kineto] deprecate metdata args from ClientTraceActivity (#55988) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55988 Pull Request resolved: https://github.com/pytorch/kineto/pull/165 as part of the ClientTraceActivity -> GenericTraceActivity migration, move all the metadata fields to JSON encoded string Test Plan: - `buck build` - tested with subsequent diffs Reviewed By: gdankel Differential Revision: D27340314 fbshipit-source-id: f55b77a779e4bda1fb8667cb4e0f4252b93af5ea --- torch/csrc/autograd/profiler_kineto.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 52831bdf9a0b..f0b5e2d22c7e 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -72,12 +72,14 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { // } // Not setting atm +#ifndef USE_KINETO_UPDATED op.inputTypes = "[]"; op.arguments = "[]"; op.outputDims = "[]"; op.outputTypes = "[]"; op.inputNames = "[]"; op.outputNames = "[]"; +#endif // setting both pthread and linux tid for Kineto op.sysThreadId = cachedTid(); @@ -139,13 +141,23 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { void finalizeCPUTrace() { TORCH_INTERNAL_ASSERT(cpu_trace->activities.size() == kineto_events_.size()); for (size_t idx = 0; idx < cpu_trace->activities.size(); ++idx) { +#ifdef USE_KINETO_UPDATED + if 
(kineto_events_[idx].hasShapes()) { + cpu_trace->activities[idx].addMetadata("Input Dims", shapesToStr(kineto_events_[idx].shapes())); + } +#else if (kineto_events_[idx].hasShapes()) { cpu_trace->activities[idx].inputDims = shapesToStr(kineto_events_[idx].shapes()); } else { cpu_trace->activities[idx].inputDims = "[]"; } +#endif if (kineto_events_[idx].hasStack()) { +#ifdef USE_KINETO_UPDATED + cpu_trace->activities[idx].addMetadata("Call stack", stacksToStr(kineto_events_[idx].stack())); +#else cpu_trace->activities[idx].callStack = stacksToStr(kineto_events_[idx].stack()); +#endif } } } From 5ad3bc715c540bbf7b32723d007703957a10e801 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 31/45] ns for fx: change node I/O determination to strict allowlist (#55434) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55434 Before this PR, there was some hacky logic which determined the input and output types of nodes based on heuristics such as inspecting `__module__`, or assuming that an op has an I/O dtype of `torch.float` when the heuristics did not find any matches. This is problematic because the heuristics were not exact, and this could result in nonsensical shadow graphs when the heuristics would return an incorrect dtype. This PR switches the dtype determination to an allowlist system, where we specify exactly what the dtypes are for the nodes or modules which are in an allowlist, and we add an `UNKNOWN` type for everything else. The shadow logic is changed to skip inserting shadows on any function or module where the I/O dtype is unknown. The current allowlist only contains functions necessary for the currently existing tests. Filling out the allowlist with all necessary torch functions is left for a future PR. As a result of this, we can do the following (also implemented in this PR): 1. enable graph matching on nodes with equal types (for example, F.linear and F.linear). The previous restriction that nodes with equal types do not get matched was in the code only as a placeholder; it is better to allow comparisons of nodes of equal types. One case where this is useful is unshadowed activations. 2. enable models with user-defined modules to be passed to Numeric Suite APIs without errors, as sketched below.
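To make point 2 concrete, here is a hedged sketch of the flow this enables, modeled on the test_user_module test added below; module names, shapes, and the use of the internal _extract_weights_impl helper mirror that test and are illustrative only:

```
# Sketch only, mirroring test_user_module below; names and shapes are
# illustrative, and the internal _extract_weights_impl helper is used
# exactly as in that test.
import copy
import torch
import torch.nn as nn
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.quantization._numeric_suite_fx import _extract_weights_impl

class UserModule(nn.Module):
    def forward(self, x):
        return x

class M(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)
        self.user_module = UserModule()

    def forward(self, x):
        return self.user_module(self.linear(x))

m = M().eval()
qconfig_dict = {'': torch.quantization.default_qconfig}
# quantize without tracing through the user-defined module
mp = prepare_fx(m, qconfig_dict, {'non_traceable_module_name': ['user_module']})
mp(torch.randn(1, 1, 1))
mq = convert_fx(copy.deepcopy(mp))

# with the allowlist in place, the unknown user module no longer breaks this
weights = _extract_weights_impl('fp32_prepared', mp, 'int8', mq)
```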
Test Plan: ``` python test/test_quantization.py TestFXGraphMatcher python test/test_quantization.py TestFXGraphMatcherModels python test/test_quantization.py TestFXNumericSuiteCoreAPIs python test/test_quantization.py TestFXNumericSuiteCoreAPIsModels ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27622418 fbshipit-source-id: 40dcba0222c01154c141467640c1eb89725f33a7 --- test/quantization/test_numeric_suite_fx.py | 97 +++++++++++++++--- torch/quantization/ns/graph_matcher.py | 9 +- torch/quantization/ns/graph_passes.py | 56 ++++++++--- torch/quantization/ns/utils.py | 112 +++++++++++++++------ torch/quantization/ns/weight_utils.py | 38 ++++--- 5 files changed, 233 insertions(+), 79 deletions(-) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index 33cc06886008..3ff1eeb1519f 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -34,8 +34,10 @@ extract_weights, _extract_weights_impl, add_loggers, + _add_loggers_impl, OutputLogger, add_shadow_loggers, + _add_shadow_loggers_impl, extract_logger_info, extract_shadow_logger_info, ) @@ -233,11 +235,7 @@ def forward(self, x0): self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) @skipIfNoFBGEMM - def test_nodes_with_equal_types_do_not_get_matched(self): - # verifies that by default, nodes with equivalent types do not get matched. - # This is important for user defined types, for which we do not know - # the weight extraction functions or input type. In the future, this can - # be made configurable. + def test_nodes_with_equal_types_get_matched(self): class M(nn.Module): def __init__(self): super().__init__() @@ -264,16 +262,16 @@ def forward(self, x): mq = convert_fx(mp_copy) results = get_matching_subgraph_pairs(mp, mq) - # Conv2 should not be matched because we disabled quantization for it, - # so its type is the same in mp and mq. sigmoid should not be - # matched because they use the same function in mp and mq. relu should - # be matched because it is in the allowlist of functions with same - # signature across dtypes. + # all of these should be matched expected_types = { - 'base_op_torch.nn.Conv2d_0': + 'base_op_torch.nn.Conv2d_1': ((nn.Conv2d, nn.Conv2d), (nnq.Conv2d, nnq.Conv2d)), + 'base_op_torch.nn.Conv2d_0': + ((nn.Conv2d, nn.Conv2d), (nn.Conv2d, nn.Conv2d)), 'base_op_torch.mul_0': ((torch.mul, torch.mul), (toq.mul, toq.mul)), 'base_op_torch.relu_0': ((F.relu, F.relu), (F.relu, F.relu)), + 'base_op_torch.sigmoid_0': + ((torch.sigmoid, torch.sigmoid), (torch.sigmoid, torch.sigmoid)), } self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) @@ -588,7 +586,7 @@ def test_add_shadow_loggers_mod(self): nn.Conv2d(1, 1, 1), nn.Conv2d(1, 1, 1), ).eval() - self._test_match_shadow_activations( + res = self._test_match_shadow_activations( m, (torch.randn(1, 1, 4, 4),), results_len=2) @skipIfNoFBGEMM @@ -610,7 +608,7 @@ def forward(self, x): return x m = M().eval() - self._test_match_shadow_activations( + res = self._test_match_shadow_activations( m, (torch.randn(4, 4),), results_len=2) @skipIfNoFBGEMM @@ -720,6 +718,75 @@ def test_linear_fp16_shadow_activations(self): qconfig_dict=qconfig_dict, should_log_inputs=should_log_inputs) + @skipIfNoFBGEMM + def test_user_module(self): + """ + For user defined modules, + 1. weight extraction should not crash + 2. unshadowed activations should have loggers, loggers will only log if + the output dtype is in the allowlist + 3. 
shadowed activations should not have loggers + (since I/O dtype is unknown) + """ + class UserModule(nn.Module): + def forward(self, x): + return x + + class M(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(1, 1) + self.user_module = UserModule() + + def forward(self, x): + x = self.linear(x) + x = self.user_module(x) + return x + + m = M().eval() + + # quantize without tracing through UserModule + qconfig_dict = {'': torch.quantization.default_qconfig} + prepare_custom_config_dict = {'non_traceable_module_name': ['user_module']} + mp = prepare_fx(m, qconfig_dict, prepare_custom_config_dict) + mp(torch.randn(1, 1, 1)) + mq = convert_fx(copy.deepcopy(mp)) + + # weight extraction should not crash + weights = _extract_weights_impl('fp32_prepared', mp, 'int8', mq) + + # unshadowed activations should have loggers + + # add loggers, without retracing + # note: converting again because we cannot copy a quantized linear + mp_ns, mq_ns = _add_loggers_impl( + 'fp32_prepared', copy.deepcopy(mp), 'int8', + convert_fx(copy.deepcopy(mp)), OutputLogger, + should_log_inputs=True) + # both fp32 and int8 models should have 4 loggers each, 2 for I/O + # of linear, and 2 for I/O of user_module + unshadowed_expected_occurrence = { + ns.call_module(OutputLogger): 4, + } + self.checkGraphModuleNodes( + mp_ns, expected_node_occurrence=unshadowed_expected_occurrence) + self.checkGraphModuleNodes( + mq_ns, expected_node_occurrence=unshadowed_expected_occurrence) + + # shadowed activations should only have loggers for nodes where + # the types are known and we can do a dtype cast + + # add shadow loggers, without retracing + mp_shadows_mq_ns = _add_shadow_loggers_impl( + 'fp32_prepared', mp, 'int8', mq, OutputLogger, + should_log_inputs=True) + # 2 loggers for I/O of linear, 0 loggers for I/O of user_module + shadowed_expected_occurrence = { + ns.call_module(OutputLogger): 2, + } + self.checkGraphModuleNodes( + mp_shadows_mq_ns, expected_node_occurrence=unshadowed_expected_occurrence) + class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase): """ @@ -841,7 +908,7 @@ def test_sparsenn_compare_activations(self): x = torch.randn(2, 4) self._test_match_activations( sparse_nn, (idx, offsets, x), - results_len=4, + results_len=5, should_log_inputs=should_log_inputs) @skipIfNoFBGEMM @@ -851,7 +918,7 @@ def test_sparsenn_shadow(self): idx = torch.LongTensor([1, 2, 4, 5, 4, 3, 2, 9]) offsets = torch.LongTensor([0, 4]) x = torch.randn(2, 4) - self._test_match_activations( + self._test_match_shadow_activations( sparse_nn, (idx, offsets, x), results_len=4, should_log_inputs=should_log_inputs) diff --git a/torch/quantization/ns/graph_matcher.py b/torch/quantization/ns/graph_matcher.py index 8102a5c33ad9..e51784a30c85 100644 --- a/torch/quantization/ns/graph_matcher.py +++ b/torch/quantization/ns/graph_matcher.py @@ -109,6 +109,10 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: 'torch.nn.MaxPool2d': set([ nn.MaxPool2d, ]), + # sigmoid + 'torch.sigmoid': set([ + torch.sigmoid, + ]), } return base_name_to_sets_of_related_ops @@ -137,6 +141,7 @@ def get_non_matchable_functions() -> Set[Callable]: # TODO(future PR): allow customizations return set([ torch.quantize_per_tensor, + operator.getitem, ]) def get_non_matchable_modules() -> Set[Callable]: @@ -646,10 +651,6 @@ def get_matching_subgraph_pairs( ({cur_subgraph_a}, {type_start_a}) and ({cur_subgraph_b}, {type_start_b}) are not related""" raise GraphMatchingException(msg) - elif subgraph_relationship 
== SubgraphTypeRelationship.EQUAL: - # For now, skip nodes with equal types. In the future, this can - # be made configurable. - continue key_name_a = _get_name_for_subgraph( cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops, existing_names_a) diff --git a/torch/quantization/ns/graph_passes.py b/torch/quantization/ns/graph_passes.py index a4899e081bc9..c342825b38d8 100644 --- a/torch/quantization/ns/graph_passes.py +++ b/torch/quantization/ns/graph_passes.py @@ -177,7 +177,8 @@ def _insert_dtype_cast_after_node( dtype_cast_mod_cls = torch.nn.Identity else: raise AssertionError( - f"dtype cast from {node_input_type_c} to {node_input_type_a} needs to be implemented") + f"dtype cast from {node_input_type_c} {node_c.format_node()} to " + + f"{node_input_type_a} {node_a.format_node()} needs to be implemented") if isinstance(prev_node_c, Node): new_dtype_cast_name = \ @@ -328,13 +329,17 @@ def _insert_copy_of_node_a_after_input_node_c( for node_a_arg in node_a.args[num_non_param_args:]: if isinstance(node_a_arg, Node): arg_a = return_first_non_observer_node(node_a_arg, gm_a) - arg_a_copy_name = \ - get_new_attr_name_with_prefix(arg_a.name + '_shadow_copy_')(gm_b) # type: ignore - arg_a_obj = getattr_from_fqn(gm_a, arg_a.target) # type: ignore - setattr(gm_b, arg_a_copy_name, arg_a_obj.detach()) - node_a_arg_copy = graph_c.create_node( - 'get_attr', arg_a_copy_name, (), {}, arg_a_copy_name) - new_args.append(node_a_arg_copy) + if arg_a.op == 'get_attr': + arg_a_copy_name = \ + get_new_attr_name_with_prefix(arg_a.name + '_shadow_copy_')(gm_b) # type: ignore + arg_a_obj = getattr_from_fqn(gm_a, arg_a.target) # type: ignore + setattr(gm_b, arg_a_copy_name, arg_a_obj.detach()) + node_a_arg_copy = graph_c.create_node( + 'get_attr', arg_a_copy_name, (), {}, arg_a_copy_name) + new_args.append(node_a_arg_copy) + else: + raise AssertionError( + f"handling of node with op {arg_a.op} is not implemented") else: raise AssertionError( f"handling for arg of type {type(node_a_arg)} is not implemented") @@ -438,16 +443,17 @@ def load_arg(a): graph_c.output(map_arg(node_b.args[0], load_arg)) continue - if node_b.op == 'call_module' and is_activation_post_process(modules[node_b.target]): + # calculate the flags to determine what to do with this node + node_b_is_observer = \ + node_b.op == 'call_module' and is_activation_post_process(modules[node_b.target]) + node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name + node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name + + if node_b_is_observer: # remove activation post process node env_c[node_b.name] = env_c[node_b.args[0].name] # type: ignore - elif ( - node_b in start_node_b_to_matched_subgraph_a_and_name or - node_b in end_node_b_to_matched_subgraph_a_and_name - ): - node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name - node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name + elif (node_b_is_start_node or node_b_is_end_node): if node_b_is_start_node: subgraph_a, ref_name = \ @@ -457,6 +463,26 @@ def load_arg(a): subgraph_a, ref_name = \ end_node_b_to_matched_subgraph_a_and_name[node_b] + # For both start_node and end_node verify that we know how to do + # the dtype cast. If we do not, skip. 
+ node_input_type_a, node_output_type_a = \ + get_node_first_input_and_output_type(subgraph_a.start_node, gm_a, logger_cls) + node_input_type_b, node_output_type_b = \ + get_node_first_input_and_output_type(node_b, gm_b, logger_cls) + node_io_types_known_a_and_b = ( + node_input_type_a != NodeInputOrOutputType.UNKNOWN and + node_output_type_a != NodeInputOrOutputType.UNKNOWN and + node_input_type_b != NodeInputOrOutputType.UNKNOWN and + node_output_type_b != NodeInputOrOutputType.UNKNOWN + ) + if not node_io_types_known_a_and_b: + print( + f'skipping shadow loggers for node_b: {node_b.format_node()}' + + f', start_node_a: {subgraph_a.start_node.format_node()}' + + ', unknown dtype cast') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + if node_b_is_start_node: # if necessary, log the input of node_c diff --git a/torch/quantization/ns/utils.py b/torch/quantization/ns/utils.py index f49a84958850..12e2fa715ded 100644 --- a/torch/quantization/ns/utils.py +++ b/torch/quantization/ns/utils.py @@ -2,6 +2,10 @@ import torch import torch.nn as nn +import torch.nn.functional as F +import torch.nn.quantized as nnq +import torch.nn.quantized.dynamic as nnqd +toq = torch.ops.quantized from torch.fx import GraphModule from torch.fx.graph import Node from torch.quantization.fx.quantize import is_activation_post_process @@ -24,28 +28,73 @@ class NodeInputOrOutputType(enum.Enum): FP32 = enum.auto() # torch.float INT8 = enum.auto() # torch.qint8 or torch.quint8 FP16 = enum.auto() # torch.float16 + UNKNOWN = enum.auto() # we cannot determine input/output dtype # TODO(future PRs): dynamic quant, fake quant, etc +# TODO(future PR): make configurable +# TODO(future PR): fill out coverage +fp32_funs = set([ + F.linear, + F.conv1d, + F.conv2d, + F.conv3d, + # TODO(future PR): move this to a new category, since + # i/o can be fp32 or int8 + torch.cat, + F.relu, +]) + +# TODO(future PR): make configurable +# TODO(future PR): fill out coverage +int8_funs = set([ + toq.linear, + toq.linear_relu, + toq.conv1d, + toq.conv1d_relu, + toq.conv2d, + toq.conv2d_relu, + toq.conv3d, + toq.conv3d_relu, + toq.cat, +]) + +# TODO(future PR): make configurable +# TODO(future PR): fill out coverage +fp32_mods = set([ + nn.Linear, + nn.Conv1d, + nn.Conv2d, + nn.Conv3d, + nn.LSTM, + # note: nnqd.Linear is an instance of nnq.Linear, so this + # check has to happen before the int8 module check + nnqd.Linear, + nnqd.LSTM, +]) + +# TODO(future PR): make configurable +# TODO(future PR): fill out coverage +int8_mods = set([ + nnq.Linear, + nnq.Conv1d, + nnq.Conv2d, + nnq.Conv3d, +]) + + def get_node_first_input_and_output_type( node: Node, gm: GraphModule, logger_cls: Callable, ) -> Tuple[NodeInputOrOutputType, NodeInputOrOutputType]: if node.op == 'call_function': - fp32_fun_target_names = ('torch.nn.functional', 'torch.nn') - # hack alert: this is not ready for production - # TODO(future PR): use a real mapping - fp32_funs = (torch.cat,) - int8_fun_target_names = ('torch._ops.quantized',) - # For now, hacky check to see which op is in which namespace - # TODO(future PR): use a real mapping - if node.target.__module__ in fp32_fun_target_names or node.target in fp32_funs: + if node.target in fp32_funs: return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) - else: - assert node.target.__module__ in int8_fun_target_names, \ - 'unknown node target %s with module %s' % (node.target, node.target.__module__) + elif node.target in int8_funs: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) + else: + 
return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) elif node.op == 'call_module': assert node.op == 'call_module' @@ -61,20 +110,18 @@ def get_node_first_input_and_output_type( first_arg, gm, logger_cls) return (prev_node_output_type, prev_node_output_type) # For now, hacky check to see which mod is in which namespace - # TODO(future PR): use a real mapping - is_known_fp32_input_module = ( - mod.__module__.startswith('torch.nn.modules') or - mod.__module__.startswith('torch.nn.quantized.dynamic') + is_known_fp32_input_module = any( + isinstance(mod, target_type) for target_type in fp32_mods + ) + is_known_int8_input_module = any( + isinstance(mod, target_type) for target_type in int8_mods ) if is_known_fp32_input_module: return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) - else: - is_known_int8_module = ( - mod.__module__.startswith('torch.nn.quantized') or - mod.__module__.startswith('torch.nn.intrinsic.quantized') - ) - assert is_known_int8_module, 'unknown node target %s' % mod + elif is_known_int8_input_module: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) + else: + return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) elif node.op == 'call_method': if node.target == 'dequantize': @@ -103,11 +150,9 @@ def get_node_first_input_and_output_type( return (prev_node_output_type, NodeInputOrOutputType.FP16) - # TODO(future PR): improve this instead of guessing - return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) + return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) else: - # TODO(future PR): improve this instead of guessing - return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) + return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) def return_first_non_observer_node( node: Node, @@ -122,18 +167,19 @@ def return_first_non_observer_node( graph: (node_non_obs -> obs0), node = obs0 : returns node_non_obs graph: (node_non_obs -> obs0 -> fq0), node = fq0 : returns node_non_obs """ - node_obj = getattr_from_fqn(gm, node.target) # type: ignore - if is_activation_post_process(node_obj): - assert len(node.args) == 1 - assert isinstance(node.args[0], Node) - node = node.args[0] - # code duplication intended, not worth refactoring - assert isinstance(node.target, str) - node_obj = getattr_from_fqn(gm, node.target) + if node.op == 'call_module': + node_obj = getattr_from_fqn(gm, node.target) # type: ignore if is_activation_post_process(node_obj): assert len(node.args) == 1 assert isinstance(node.args[0], Node) node = node.args[0] + # code duplication intended, not worth refactoring + assert isinstance(node.target, str) + node_obj = getattr_from_fqn(gm, node.target) + if is_activation_post_process(node_obj): + assert len(node.args) == 1 + assert isinstance(node.args[0], Node) + node = node.args[0] return node def get_number_of_non_param_args( diff --git a/torch/quantization/ns/weight_utils.py b/torch/quantization/ns/weight_utils.py index 988f274e308f..9484b3cbaa15 100644 --- a/torch/quantization/ns/weight_utils.py +++ b/torch/quantization/ns/weight_utils.py @@ -177,7 +177,6 @@ def extract_weight_from_node( # check that A is one the modules we need # assume B is related (this is done by graph matcher) - # TODO(future PR): 1d and 3d convs related_to_conv1d_mod = isinstance(mod, nn.Conv1d) or \ (type(mod), nn.Conv1d) in type_a_related_to_b related_to_conv2d_mod = isinstance(mod, nn.Conv2d) or \ @@ -189,20 +188,35 @@ def extract_weight_from_node( related_to_lstm_mod = isinstance(mod, 
nn.LSTM) or \ (type(mod), nn.LSTM) in type_a_related_to_b - # TODO(future PR): other module types if related_to_conv1d_mod or related_to_conv2d_mod or related_to_conv3d_mod: weights = [get_conv_mod_weight(mod)] + return { + 'type': res_type, + 'values': weights, + 'prev_node_name': node.name, + 'prev_node_target_type': str(type(mod)), + 'ref_node_name': node.name, + 'index_within_arg': 0, + } elif related_to_lstm_mod: weights = get_lstm_mod_weights(mod) - else: - assert related_to_linear_mod, f"module type {type(mod)} not handled yet" + return { + 'type': res_type, + 'values': weights, + 'prev_node_name': node.name, + 'prev_node_target_type': str(type(mod)), + 'ref_node_name': node.name, + 'index_within_arg': 0, + } + elif related_to_linear_mod: weights = [get_linear_mod_weight(mod)] - return { - 'type': res_type, - 'values': weights, - 'prev_node_name': node.name, - 'prev_node_target_type': str(type(mod)), - 'ref_node_name': node.name, - 'index_within_arg': 0, - } + return { + 'type': res_type, + 'values': weights, + 'prev_node_name': node.name, + 'prev_node_target_type': str(type(mod)), + 'ref_node_name': node.name, + 'index_within_arg': 0, + } + return None From 3786c2719de3b62f4257f5636cd87b6e6a94aaf9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 32/45] ns for fx: make NSTracer inherit from QuantizationTracer (#55505) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55505 This necessary to add support in NS for QAT modules, to avoid duplicating logic between NSTracer and QuantizationTracer. The eng work to expose the custom module and class names to the user will be in a future PR. Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs python test/test_quantization.py TestFXNumericSuiteCoreAPIsModels ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27650407 fbshipit-source-id: 431f47c5353b41c11371c5efa79657bfd085459a --- torch/quantization/_numeric_suite_fx.py | 22 +++++++++++++++++----- torch/quantization/ns/graph_passes.py | 14 ++++---------- torch/quantization/ns/utils.py | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/torch/quantization/_numeric_suite_fx.py b/torch/quantization/_numeric_suite_fx.py index 0ae63cba7f5b..80fdbf9f30ee 100644 --- a/torch/quantization/_numeric_suite_fx.py +++ b/torch/quantization/_numeric_suite_fx.py @@ -2,9 +2,9 @@ import torch import torch.nn as nn +import torch.quantization.quantize_fx as quantize_fx from torch.fx import GraphModule from torch.fx.graph import Node -from torch.fx.symbolic_trace import Tracer from torch.quantization.ns.graph_matcher import ( get_matching_subgraph_pairs, get_base_name_to_sets_of_related_ops, @@ -94,7 +94,7 @@ def __repr__(self): results_type={self.results_type}, index_within_arg={self.index_within_arg})""" -class NSTracer(Tracer): +class NSTracer(quantize_fx.QuantizationTracer): """ Just like a regular tracer, but treats observers and fake_quantize modules as leaf modules. 
@@ -163,7 +163,11 @@ def extract_weights( type_a_related_to_b = \ get_type_a_related_to_b(base_name_to_sets_of_related_ops) - tracer_a, tracer_b = NSTracer(), NSTracer() + # TODO(future PR): expose these + skipped_module_names: List[str] = [] + skipped_module_classes: List[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) gm_a = GraphModule(model_a, tracer_a.trace(model_a)) gm_b = GraphModule(model_b, tracer_b.trace(model_b)) return _extract_weights_impl(model_name_a, gm_a, model_name_b, gm_b) @@ -233,7 +237,11 @@ def add_loggers( logger_cls: Callable, should_log_inputs : bool = False, ) -> Tuple[nn.Module, nn.Module]: - tracer_a, tracer_b = NSTracer(), NSTracer() + # TODO(future PR): expose these + skipped_module_names: List[str] = [] + skipped_module_classes: List[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) gm_a = GraphModule(model_a, tracer_a.trace(model_a)) gm_b = GraphModule(model_b, tracer_b.trace(model_b)) return _add_loggers_impl( @@ -330,7 +338,11 @@ def add_shadow_loggers( Same thing as add_loggers, but for an `a_shadows_b` model. TODO(future PR): real docblock """ - tracer_a, tracer_b = NSTracer(), NSTracer() + # TODO(future PR): expose these + skipped_module_names: List[str] = [] + skipped_module_classes: List[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) gm_a = GraphModule(model_a, tracer_a.trace(model_a)) gm_b = GraphModule(model_b, tracer_b.trace(model_b)) return _add_shadow_loggers_impl( diff --git a/torch/quantization/ns/graph_passes.py b/torch/quantization/ns/graph_passes.py index c342825b38d8..f39519a7be93 100644 --- a/torch/quantization/ns/graph_passes.py +++ b/torch/quantization/ns/graph_passes.py @@ -10,6 +10,7 @@ NodeInputOrOutputType, return_first_non_observer_node, get_number_of_non_param_args, + get_target_type_str, ) from .ns_types import ( @@ -43,14 +44,7 @@ def _insert_logger_after_node( # create new name logger_node_name = \ get_new_attr_name_with_prefix(node.name + logger_node_name_suffix)(gm) - # create a string representation of the node's target type - target_type = '' - if node.op == 'call_function': - target_type = str(node.target) - elif node.op == 'call_module': - assert isinstance(node.target, str) - target_mod = getattr_from_fqn(gm, node.target) - target_type = str(type(target_mod)) + target_type = get_target_type_str(node, gm) # create the logger object logger_obj = logger_cls( ref_node_name, node.name, model_name, ref_name, target_type, @@ -477,8 +471,8 @@ def load_arg(a): ) if not node_io_types_known_a_and_b: print( - f'skipping shadow loggers for node_b: {node_b.format_node()}' + - f', start_node_a: {subgraph_a.start_node.format_node()}' + + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + ', unknown dtype cast') env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) continue diff --git a/torch/quantization/ns/utils.py b/torch/quantization/ns/utils.py index 12e2fa715ded..3871ba4ef8ba 100644 --- a/torch/quantization/ns/utils.py +++ b/torch/quantization/ns/utils.py @@ -4,6 +4,8 @@ import torch.nn as nn import torch.nn.functional as F import torch.nn.quantized as nnq +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.quantized as 
nniq import torch.nn.quantized.dynamic as nnqd toq = torch.ops.quantized from torch.fx import GraphModule @@ -65,6 +67,7 @@ class NodeInputOrOutputType(enum.Enum): nn.Linear, nn.Conv1d, nn.Conv2d, + nni.ConvReLU2d, nn.Conv3d, nn.LSTM, # note: nnqd.Linear is an instance of nnq.Linear, so this @@ -79,6 +82,7 @@ class NodeInputOrOutputType(enum.Enum): nnq.Linear, nnq.Conv1d, nnq.Conv2d, + nniq.ConvReLU2d, nnq.Conv3d, ]) @@ -206,3 +210,17 @@ def get_number_of_non_param_args( # default is 1 return 1 + +def get_target_type_str(node: Node, gm: GraphModule) -> str: + """ + Returns a string representation of the type of the function or module + pointed to by this node, or '' for other op types. + """ + target_type = '' + if node.op == 'call_function': + target_type = str(node.target) + elif node.op == 'call_module': + assert isinstance(node.target, str) + target_mod = getattr_from_fqn(gm, node.target) + target_type = str(type(target_mod)) + return target_type From 1cbc4023e94dfe26fa52dac5d1b0113a60d92ef9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 33/45] ns for fx: add qat handling for weight extraction (#55506) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55506 Makes the NS weight extraction tests also test QAT, and fixes the mappings where necessary to cover all the fusions and make the tests pass. Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_extract_weights_mod_ptq python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_extract_weights_mod_qat ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27650409 fbshipit-source-id: c5bd9268d1bc559afc27d4c5109effd77bf1538a --- test/quantization/test_numeric_suite_fx.py | 144 ++++++++++++++------- torch/quantization/ns/graph_matcher.py | 6 + torch/quantization/ns/weight_utils.py | 4 - 3 files changed, 102 insertions(+), 52 deletions(-) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index 3ff1eeb1519f..76aaa43d935d 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -61,6 +61,89 @@ def forward(self, x): return x +class AllConvAndLinearFusionModules(torch.nn.Module): + def __init__(self): + super().__init__() + # conv1d + self.conv1d_0 = nn.Conv1d(1, 1, 1) + # conv1d - relu + self.conv1d_1 = nn.Conv1d(1, 1, 1) + self.relu_0 = nn.ReLU() + # conv1d - bn (qat only) + self.conv1d_2 = nn.Conv1d(1, 1, 1) + self.bn1d_0 = nn.BatchNorm1d(1) + # conv1d - bn - relu (qat only) + self.conv1d_3 = nn.Conv1d(1, 1, 1) + self.bn1d_1 = nn.BatchNorm1d(1) + self.relu_4 = nn.ReLU() + # conv2d + self.conv2d_0 = nn.Conv2d(1, 1, 1) + # conv2d - relu + self.conv2d_1 = nn.Conv2d(1, 1, 1) + self.relu_1 = nn.ReLU() + # conv2d - bn (qat only) + self.conv2d_2 = nn.Conv2d(1, 1, 1) + self.bn2d_0 = nn.BatchNorm2d(1) + # conv2d - bn - relu (qat only) + self.conv2d_3 = nn.Conv2d(1, 1, 1) + self.bn2d_1 = nn.BatchNorm2d(1) + self.relu_5 = nn.ReLU() + # conv3d + self.conv3d_0 = nn.Conv3d(1, 1, 1) + # conv3d - relu + self.conv3d_1 = nn.Conv3d(1, 1, 1) + self.relu_2 = nn.ReLU() + # conv3d - bn (qat only) + self.conv3d_2 = nn.Conv3d(1, 1, 1) + self.bn3d_0 = nn.BatchNorm3d(1) + # conv3d - bn - relu (qat only) + self.conv3d_3 = nn.Conv3d(1, 1, 1) + self.bn3d_1 = nn.BatchNorm3d(1) + self.relu_6 = nn.ReLU() + # linear + self.linear_0 = nn.Linear(1, 1) + # linear - relu + self.linear_1 = nn.Linear(1, 1) + self.relu_3 = nn.ReLU() + + def forward(self, 
x): + # conv1d + x = self.conv1d_0(x) + x = self.conv1d_1(x) + x = self.relu_0(x) + x = self.conv1d_2(x) + x = self.bn1d_0(x) + x = self.conv1d_3(x) + x = self.bn1d_1(x) + x = self.relu_4(x) + # conv2d + x = x.reshape(1, 1, 1, 1) + x = self.conv2d_0(x) + x = self.conv2d_1(x) + x = self.relu_1(x) + x = self.conv2d_2(x) + x = self.bn2d_0(x) + x = self.conv2d_3(x) + x = self.bn2d_1(x) + x = self.relu_5(x) + # conv3d + x = x.reshape(1, 1, 1, 1, 1) + x = self.conv3d_0(x) + x = self.conv3d_1(x) + x = self.relu_2(x) + x = self.conv3d_2(x) + x = self.bn3d_0(x) + x = self.conv3d_3(x) + x = self.bn3d_1(x) + x = self.relu_6(x) + # linear + x = x.reshape(1, 1) + x = self.linear_0(x) + x = self.linear_1(x) + x = self.relu_3(x) + return x + + class TestFXGraphMatcher(QuantizationTestCase): @override_qengines @@ -308,10 +391,12 @@ def test_mobilenet_v2_qat(self): class FXNumericSuiteQuantizationTestCase(QuantizationTestCase): - def _test_extract_weights(self, m, results_len=0, qconfig_dict=None): + def _test_extract_weights( + self, m, results_len=0, qconfig_dict=None, prepare_fn=prepare_fx + ): if qconfig_dict is None: qconfig_dict = {'': torch.quantization.default_qconfig} - mp = prepare_fx(m, qconfig_dict) + mp = prepare_fn(m, qconfig_dict) # TODO(future PR): prevent the need for copying here, we can copy the # modules but should reuse the underlying tensors mp_copy = copy.deepcopy(mp) @@ -406,53 +491,16 @@ def _test_match_shadow_activations( class TestFXNumericSuiteCoreAPIs(FXNumericSuiteQuantizationTestCase): @skipIfNoFBGEMM - def test_extract_weights_mod(self): + def test_extract_weights_mod_ptq(self): + m = AllConvAndLinearFusionModules().eval() + self._test_extract_weights(m, results_len=14) - class M(torch.nn.Module): - def __init__(self): - super().__init__() - # conv1d - self.conv1d_0 = nn.Conv1d(1, 1, 1) - # conv1d - relu - self.conv1d_1 = nn.Conv1d(1, 1, 1) - self.relu_0 = nn.ReLU() - # conv2d - self.conv2d_0 = nn.Conv2d(1, 1, 1) - # conv2d - relu - self.conv2d_1 = nn.Conv2d(1, 1, 1) - self.relu_1 = nn.ReLU() - # conv3d - self.conv3d_0 = nn.Conv3d(1, 1, 1) - # conv3d - relu - self.conv3d_1 = nn.Conv3d(1, 1, 1) - self.relu_2 = nn.ReLU() - # linear - self.linear_0 = nn.Linear(1, 1) - # linear - relu - self.linear_1 = nn.Linear(1, 1) - self.relu_3 = nn.ReLU() - - - def forward(self, x): - x = self.conv1d_0(x) - x = self.conv1d_1(x) - x = self.relu_0(x) - x = x.reshape(1, 1, 1, 1) - x = self.conv2d_0(x) - x = self.conv2d_1(x) - x = self.relu_1(x) - x = x.reshape(1, 1, 1, 1, 1) - x = self.conv3d_0(x) - x = self.conv3d_1(x) - x = self.relu_2(x) - x = x.reshape(1, 1) - x = self.linear_0(x) - x = self.linear_1(x) - x = self.relu_3(x) - return x - - m = M().eval() - self._test_extract_weights(m, results_len=8) + @skipIfNoFBGEMM + def test_extract_weights_mod_qat(self): + m = AllConvAndLinearFusionModules().train() + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} + self._test_extract_weights( + m, results_len=14, qconfig_dict=qconfig_dict, prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM def test_extract_weights_linear_fun(self): diff --git a/torch/quantization/ns/graph_matcher.py b/torch/quantization/ns/graph_matcher.py index e51784a30c85..61818f1706e7 100644 --- a/torch/quantization/ns/graph_matcher.py +++ b/torch/quantization/ns/graph_matcher.py @@ -30,6 +30,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: nn.Conv1d, nnq.Conv1d, nniqat.ConvBn1d, + nniqat.ConvBnReLU1d, nniq.ConvReLU1d, nni.ConvReLU1d, ]), @@ -38,6 +39,8 @@ def 
get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: nnq.Conv2d, nnqat.Conv2d, nniqat.ConvBn2d, + nniqat.ConvBnReLU2d, + nniqat.ConvReLU2d, nniq.ConvReLU2d, nni.ConvReLU2d, ]), @@ -46,6 +49,8 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: nnq.Conv3d, nnqat.Conv3d, nniqat.ConvBn3d, + nniqat.ConvBnReLU3d, + nniqat.ConvReLU3d, nniq.ConvReLU3d, nni.ConvReLU3d, ]), @@ -73,6 +78,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: nniq.LinearReLU, nnqat.Linear, nnqd.Linear, + nniqat.LinearReLU, ]), # linear functionals 'torch.nn.functional.linear': set([ diff --git a/torch/quantization/ns/weight_utils.py b/torch/quantization/ns/weight_utils.py index 9484b3cbaa15..259fc18c785e 100644 --- a/torch/quantization/ns/weight_utils.py +++ b/torch/quantization/ns/weight_utils.py @@ -17,7 +17,6 @@ from typing import List, Optional, Set, Tuple, Callable def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor: - # TODO(future PR): handle QAT variants if ( isinstance(mod, nn.Conv1d) or isinstance(mod, nn.Conv2d) or @@ -34,7 +33,6 @@ def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor: return mod._weight_bias()[0] # type: ignore def get_linear_mod_weight(mod: nn.Module) -> torch.Tensor: - # TODO(future PR): make more generic, handle everything if isinstance(mod, nn.Linear): return mod.weight.detach() elif isinstance(mod, nni.LinearReLU): @@ -138,8 +136,6 @@ def extract_weight_from_node( res_type = NSSingleResultValuesType.WEIGHT.value if node.op == 'call_function': - # linear - # TODO(future PR): other function types related_to_linear = node.target in (F.linear,) or \ (node.target, F.linear) in type_a_related_to_b related_to_conv1d = node.target in (F.conv1d,) or \ From f6a3936ab3e77909ef7df07cbfbf0e8813d4bdcb Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 34/45] ns for fx: extend functional weight extraction testing to QAT (#55507) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55507 As titled, extends the test cases for weight extraction from functionals to cover QAT. 
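For context, a minimal standalone sketch of the two preparation paths the parametrized tests toggle between via `prepare_fn`; the toy module and qconfig choices below are illustrative assumptions, not taken from the patch:

```
# Illustrative sketch only: PTQ vs. QAT preparation as selected by `prepare_fn`.
import copy
import torch.nn as nn
from torch.quantization import default_qconfig, get_default_qat_qconfig
from torch.quantization.quantize_fx import prepare_fx, prepare_qat_fx

m = nn.Sequential(nn.Conv2d(1, 1, 1), nn.ReLU())

# PTQ path: eval-mode model, post-training qconfig, observers inserted
mp_ptq = prepare_fx(copy.deepcopy(m).eval(), {'': default_qconfig})

# QAT path: train-mode model, QAT qconfig, fake-quant modules inserted
mp_qat = prepare_qat_fx(
    copy.deepcopy(m).train(), {'': get_default_qat_qconfig('fbgemm')})
```
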
Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27650408 fbshipit-source-id: 8ce87d56bbc0da7c2330ece71a897d6d8c5110a0 --- test/quantization/test_numeric_suite_fx.py | 142 ++++++++++++--------- 1 file changed, 82 insertions(+), 60 deletions(-) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index 76aaa43d935d..c0a58635c766 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -61,6 +61,20 @@ def forward(self, x): return x +class LinearReluLinearFunctional(nn.Module): + def __init__(self): + super().__init__() + self.w = nn.Parameter(torch.Tensor(4, 4)) + self.b = nn.Parameter(torch.zeros(4)) + torch.nn.init.kaiming_uniform_(self.w, a=math.sqrt(5)) + + def forward(self, x): + x = F.linear(x, self.w, self.b) + x = F.relu(x) + x = F.linear(x, self.w, self.b) + return x + + class AllConvAndLinearFusionModules(torch.nn.Module): def __init__(self): super().__init__() @@ -144,6 +158,51 @@ def forward(self, x): return x +class AllConvFunctional(torch.nn.Module): + def __init__(self, weight1d, weight2d, weight3d, bias1d, bias2d, bias3d): + super().__init__() + self.weight1d = torch.nn.Parameter(weight1d) + self.weight2d = torch.nn.Parameter(weight2d) + self.weight3d = torch.nn.Parameter(weight3d) + self.bias1d = torch.nn.Parameter(bias1d) + self.bias2d = torch.nn.Parameter(bias2d) + self.bias3d = torch.nn.Parameter(bias3d) + self.stride1d = 1 + self.padding1d = 0 + self.dilation1d = 1 + self.stride2d = (1, 1) + self.padding2d = (0, 0) + self.dilation2d = (1, 1) + self.groups = 1 + self.stride3d = (1, 1, 1) + self.padding3d = (0, 0, 0) + self.dilation3d = (1, 1, 1) + + def forward(self, x): + x = F.conv1d( + x, self.weight1d, self.bias1d, self.stride1d, self.padding1d, + self.dilation1d, self.groups) + x = F.conv1d( + x, self.weight1d, self.bias1d, self.stride1d, self.padding1d, + self.dilation1d, self.groups) + x = F.relu(x) + x = F.conv2d( + x, self.weight2d, self.bias2d, self.stride2d, self.padding2d, + self.dilation2d, self.groups) + x = F.conv2d( + x, self.weight2d, self.bias2d, self.stride2d, self.padding2d, + self.dilation2d, self.groups) + x = F.relu(x) + x = F.conv3d( + x, self.weight3d, self.bias3d, self.stride3d, self.padding3d, + self.dilation3d, self.groups) + x = F.conv3d( + x, self.weight3d, self.bias3d, self.stride3d, self.padding3d, + self.dilation3d, self.groups) + x = F.relu(x) + return x + + class TestFXGraphMatcher(QuantizationTestCase): @override_qengines @@ -503,78 +562,41 @@ def test_extract_weights_mod_qat(self): m, results_len=14, qconfig_dict=qconfig_dict, prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM - def test_extract_weights_linear_fun(self): - class M(nn.Module): - def __init__(self): - super().__init__() - self.w = nn.Parameter(torch.empty(4, 4)) - self.b = nn.Parameter(torch.zeros(4)) - torch.nn.init.kaiming_uniform_(self.w, a=math.sqrt(5)) - - def forward(self, x): - x = F.linear(x, self.w, self.b) - x = F.relu(x) - x = F.linear(x, self.w, self.b) - return x - - m = M().eval() + def test_extract_weights_linear_fun_ptq(self): + m = LinearReluLinearFunctional().eval() self._test_extract_weights(m, results_len=2) @skipIfNoFBGEMM - def test_extract_weights_conv_fun(self): - class M(torch.nn.Module): - def __init__(self, weight1d, weight2d, weight3d, bias1d, bias2d, bias3d): - super().__init__() - self.weight1d = torch.nn.Parameter(weight1d) - self.weight2d = 
torch.nn.Parameter(weight2d) - self.weight3d = torch.nn.Parameter(weight3d) - self.bias1d = torch.nn.Parameter(bias1d) - self.bias2d = torch.nn.Parameter(bias2d) - self.bias3d = torch.nn.Parameter(bias3d) - self.stride1d = 1 - self.padding1d = 0 - self.dilation1d = 1 - self.stride2d = (1, 1) - self.padding2d = (0, 0) - self.dilation2d = (1, 1) - self.groups = 1 - self.stride3d = (1, 1, 1) - self.padding3d = (0, 0, 0) - self.dilation3d = (1, 1, 1) - - def forward(self, x): - x = F.conv1d( - x, self.weight1d, self.bias1d, self.stride1d, self.padding1d, - self.dilation1d, self.groups) - x = F.conv1d( - x, self.weight1d, self.bias1d, self.stride1d, self.padding1d, - self.dilation1d, self.groups) - x = F.relu(x) - x = F.conv2d( - x, self.weight2d, self.bias2d, self.stride2d, self.padding2d, - self.dilation2d, self.groups) - x = F.conv2d( - x, self.weight2d, self.bias2d, self.stride2d, self.padding2d, - self.dilation2d, self.groups) - x = F.relu(x) - x = F.conv3d( - x, self.weight3d, self.bias3d, self.stride3d, self.padding3d, - self.dilation3d, self.groups) - x = F.conv3d( - x, self.weight3d, self.bias3d, self.stride3d, self.padding3d, - self.dilation3d, self.groups) - x = F.relu(x) - return x + def test_extract_weights_linear_fun_qat(self): + m = LinearReluLinearFunctional().train() + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} + self._test_extract_weights( + m, results_len=2, qconfig_dict=qconfig_dict, prepare_fn=prepare_qat_fx) + @skipIfNoFBGEMM + def test_extract_weights_conv_fun_ptq(self): w1d = torch.randn(1, 1, 1) w2d = torch.randn(1, 1, 1, 1) w3d = torch.randn(1, 1, 1, 1, 1) b1d = torch.randn(1) b2d = torch.randn(1) b3d = torch.randn(1) - m = M(w1d, w2d, w3d, b1d, b2d, b3d).eval() + m = AllConvFunctional(w1d, w2d, w3d, b1d, b2d, b3d).eval() self._test_extract_weights(m, results_len=6) + @skipIfNoFBGEMM + def test_extract_weights_conv_fun_qat(self): + w1d = torch.randn(1, 1, 1) + w2d = torch.randn(1, 1, 1, 1) + w3d = torch.randn(1, 1, 1, 1, 1) + b1d = torch.randn(1) + b2d = torch.randn(1) + b3d = torch.randn(1) + m = AllConvFunctional(w1d, w2d, w3d, b1d, b2d, b3d).train() + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} + self._test_extract_weights( + m, results_len=6, qconfig_dict=qconfig_dict, prepare_fn=prepare_qat_fx) + @skipIfNoFBGEMM def test_extract_weights_dynamic(self): # TODO(future PR): add Linear-ReLU, after #55393 is fixed. From 37fbc069f16292280254b6f733f36b45daacc4ea Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 35/45] ns for fx: qat test cases for unshadowed activations (#55508) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55508 Adds QAT test cases for unshadowed activation APIs. 
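A hypothetical condensed form of the `prepare_fn` branching this change threads through the activation-matching helper; `_prepare_for_matching` is a stand-in name for illustration, not an API from this patch:

```
# Stand-in helper mirroring the prepare_fn branching added below; the helper
# name and the convert step are assumptions for illustration.
import copy
import torch
from torch.quantization.quantize_fx import prepare_fx, prepare_qat_fx, convert_fx

def _prepare_for_matching(m, data, prepare_fn=prepare_fx, qconfig_dict=None):
    if qconfig_dict is None:
        qconfig_dict = {'': torch.quantization.default_qconfig}
    if prepare_fn is prepare_fx:
        m.eval()   # PTQ: prepare an eval-mode model
    else:
        m.train()  # QAT: prepare a train-mode model
    mp = prepare_fn(m, qconfig_dict)
    mp(*data)  # calibration (PTQ) or a fake-quant forward pass (QAT)
    mq = convert_fx(copy.deepcopy(mp))
    return mp, mq
```
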
Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27650406 fbshipit-source-id: bcbbdf1d32b8f8627c30d6aaf22607f34d1e2e08 --- test/quantization/test_numeric_suite_fx.py | 53 +++++++++++++--------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index c0a58635c766..b27193159f92 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -474,10 +474,15 @@ def _test_match_activations( should_log_inputs=False, qconfig_dict=None, skip_scripting=False, + prepare_fn=prepare_fx, ): if qconfig_dict is None: qconfig_dict = {'': torch.quantization.default_qconfig} - mp = prepare_fx(m, qconfig_dict) + if prepare_fn == prepare_fx: + m.eval() + else: + m.train() + mp = prepare_fn(m, qconfig_dict) mp(*data) # TODO(future PR): prevent the need for copying here, we can copy the # modules but should reuse the underlying tensors @@ -608,47 +613,51 @@ def test_extract_weights_dynamic(self): } self._test_extract_weights(m, results_len=1, qconfig_dict=qconfig_dict) - @skipIfNoFBGEMM - def test_match_activations_mod(self): + def _test_match_activations_mod_impl(self, prepare_fn=prepare_fx): m = nn.Sequential( torch.quantization.QuantStub(), nn.Conv2d(1, 1, 1), nn.Conv2d(1, 1, 1), ).eval() + qconfig_dict = None + if prepare_fn == prepare_qat_fx: + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} expected_occurrence = { ns.call_module(OutputLogger): 2, } self._test_match_activations( m, (torch.randn(2, 1, 2, 2),), prepared_expected_node_occurrence=expected_occurrence, - results_len=2) + results_len=2, qconfig_dict=qconfig_dict, prepare_fn=prepare_fn) - @override_qengines - def test_match_activations_fun(self): - class M(nn.Module): - def __init__(self): - super().__init__() - self.w1 = nn.Parameter(torch.empty(4, 4)) - self.b1 = nn.Parameter(torch.zeros(4)) - self.w2 = nn.Parameter(torch.empty(4, 4)) - self.b2 = nn.Parameter(torch.zeros(4)) - torch.nn.init.kaiming_uniform_(self.w1, a=math.sqrt(5)) - torch.nn.init.kaiming_uniform_(self.w2, a=math.sqrt(5)) + @skipIfNoFBGEMM + def test_match_activations_mod_ptq(self): + self._test_match_activations_mod_impl(prepare_fn=prepare_fx) - def forward(self, x): - x = F.linear(x, self.w1, self.b1) - x = F.linear(x, self.w2, self.b2) - x = F.relu(x) - return x + @skipIfNoFBGEMM + def test_match_activations_mod_qat(self): + self._test_match_activations_mod_impl(prepare_fn=prepare_qat_fx) - m = M().eval() + def _test_match_activations_fun_impl(self, prepare_fn=prepare_fx): + m = LinearReluLinearFunctional().eval() + qconfig_dict = None + if prepare_fn == prepare_qat_fx: + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} expected_occurrence = { ns.call_module(OutputLogger): 2, } self._test_match_activations( m, (torch.randn(4, 4),), prepared_expected_node_occurrence=expected_occurrence, - results_len=2) + results_len=2, prepare_fn=prepare_fn, qconfig_dict=qconfig_dict) + + @skipIfNoFBGEMM + def test_match_activations_fun_ptq(self): + self._test_match_activations_fun_impl(prepare_fn=prepare_fx) + + @skipIfNoFBGEMM + def test_match_activations_fun_qat(self): + self._test_match_activations_fun_impl(prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM def test_add_shadow_loggers_mod(self): From 84b5f67d9bc7ec24cc64bb68946aa44e17e0d886 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 
2021 16:01:22 -0700 Subject: [PATCH 36/45] ns for fx: add qat tests cases for shadowed activations (#55614) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55614 Adds testing for shadowed activations APIs and QAT. Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_add_shadow_loggers_mod_ptq python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_add_shadow_loggers_mod_qat python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_add_shadow_loggers_fun_ptq python test/test_quantization.py TestFXNumericSuiteCoreAPIs.test_add_shadow_loggers_qat_qat ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27650405 fbshipit-source-id: c5138d98aa072e2927a54329c87e755413adeb5d --- test/quantization/test_numeric_suite_fx.py | 53 +++++++++++++--------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index b27193159f92..1be630ced87a 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -518,10 +518,15 @@ def _test_match_activations( def _test_match_shadow_activations( self, m, data, prepared_expected_node_occurrence=None, results_len=0, should_log_inputs=False, qconfig_dict=None, skip_scripting=False, + prepare_fn=prepare_fx, ): if qconfig_dict is None: qconfig_dict = {'': torch.quantization.default_qconfig} - mp = prepare_fx(m, qconfig_dict) + if prepare_fn == prepare_fx: + m.eval() + else: + m.train() + mp = prepare_fn(m, qconfig_dict) mp(*data) # TODO(future PR): prevent the need for copying here, we can copy the # modules but should reuse the underlying tensors @@ -659,36 +664,42 @@ def test_match_activations_fun_ptq(self): def test_match_activations_fun_qat(self): self._test_match_activations_fun_impl(prepare_fn=prepare_qat_fx) - @skipIfNoFBGEMM - def test_add_shadow_loggers_mod(self): + def _test_add_shadow_loggers_mod_impl(self, prepare_fn=prepare_fx): m = nn.Sequential( nn.Conv2d(1, 1, 1), nn.Conv2d(1, 1, 1), ).eval() + qconfig_dict = None + if prepare_fn == prepare_qat_fx: + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} res = self._test_match_shadow_activations( - m, (torch.randn(1, 1, 4, 4),), results_len=2) + m, (torch.randn(1, 1, 4, 4),), results_len=2, + prepare_fn=prepare_fn, qconfig_dict=qconfig_dict) @skipIfNoFBGEMM - def test_add_shadow_loggers_fun(self): - class M(nn.Module): - def __init__(self): - super().__init__() - self.w1 = nn.Parameter(torch.empty(4, 4)) - self.b1 = nn.Parameter(torch.zeros(4)) - self.w2 = nn.Parameter(torch.empty(4, 4)) - self.b2 = nn.Parameter(torch.zeros(4)) - torch.nn.init.kaiming_uniform_(self.w1, a=math.sqrt(5)) - torch.nn.init.kaiming_uniform_(self.w2, a=math.sqrt(5)) + def test_add_shadow_loggers_mod_ptq(self): + self._test_add_shadow_loggers_mod_impl(prepare_fn=prepare_fx) - def forward(self, x): - x = F.linear(x, self.w1, self.b1) - x = F.linear(x, self.w2, self.b2) - x = F.relu(x) - return x + @skipIfNoFBGEMM + def test_add_shadow_loggers_mod_qat(self): + self._test_add_shadow_loggers_mod_impl(prepare_fn=prepare_qat_fx) - m = M().eval() + def _test_add_shadow_loggers_fun_impl(self, prepare_fn=prepare_fx): + m = LinearReluLinearFunctional() + qconfig_dict = None + if prepare_fn == prepare_qat_fx: + qconfig_dict = {'': torch.quantization.get_default_qat_qconfig('fbgemm')} res = self._test_match_shadow_activations( - m, (torch.randn(4, 4),), results_len=2) + m, (torch.randn(4, 4),), 
results_len=2, prepare_fn=prepare_fn, + qconfig_dict=qconfig_dict) + + @skipIfNoFBGEMM + def test_add_shadow_loggers_fun_ptq(self): + self._test_add_shadow_loggers_fun_impl(prepare_fn=prepare_fx) + + @skipIfNoFBGEMM + def test_add_shadow_loggers_fun_qat(self): + self._test_add_shadow_loggers_fun_impl(prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM def test_add_shadow_loggers_multiple_dtype_casts(self): From b461104554edba950ca8179e4917df5ee9ce022e Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 37/45] ns for fx: make get_reversed_fusions reuse quantization fusions (#55803) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55803 Makes the NS `graph_matcher.get_reversed_fusions` use the fusions defined the FX quantization code instead of duplicating them. Test Plan: ``` python test/test_quantization.py TestFXNumericSuiteCoreAPIs python test/test_quantization.py TestFXNumericSuiteCoreAPIsModels ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27719980 fbshipit-source-id: 12e3183405181bb9001f10e765cfb4d2ffdfdd88 --- torch/quantization/ns/graph_matcher.py | 47 +++++++++++++++----------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/torch/quantization/ns/graph_matcher.py b/torch/quantization/ns/graph_matcher.py index 61818f1706e7..0cd4276ec750 100644 --- a/torch/quantization/ns/graph_matcher.py +++ b/torch/quantization/ns/graph_matcher.py @@ -17,6 +17,7 @@ from .utils import getattr_from_fqn from .ns_types import NSSubgraph +from torch.quantization.fx.pattern_utils import get_default_quant_patterns from typing import Dict, Tuple, List, Optional, Set, Callable, Any, Union @@ -183,28 +184,36 @@ def get_reversed_fusions() -> Set[Tuple[NSFusionType, int]]: of 0 represents the first op in regular (non-reverse) order, 1 represents the second op, etc. """ - # TODO(future PR): remove the custom syntax for defining fusion patterns - # and reuse either quantization's syntax or something else. - return set([ - # linear functionals - ((F.relu, F.linear), 0), - # conv functionals - ((F.relu, F.conv1d), 0), - ((F.relu, F.conv2d), 0), - ((F.relu, F.conv3d), 0), - # conv modules - ((nn.ReLU, nn.Conv1d), 0), - ((nn.ReLU, nn.Conv2d), 0), - ((nn.ReLU, nn.Conv3d), 0), - # linear modules - ((nn.ReLU, nn.Linear), 0), + results: Set[Tuple[NSFusionType, int]] = set([]) + + # Possible syntaxes: + # * single op: torch.nn.Conv2d + # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d) + # For fusions, we only care about patterns composed of multiple ops. + # TODO(future PR): allow customizations from default patterns. + all_quant_patterns = get_default_quant_patterns() + default_base_op_idx = 0 + for quant_pattern, _quant_handler in all_quant_patterns.items(): + # this only takes patterns of multiple ops + if isinstance(quant_pattern, tuple): + results.add((quant_pattern, default_base_op_idx)) # type: ignore + + # After this point, results countains values such as + # [..., ((torch.nn.Relu, torch.nn.Conv2d), 0), ...] + + # Patterns for matching fp16 emulation are not specified in the quantization + # fusion mappings. For now, define them here. 
+ fp16_em_base_op_idx = 1 + patterns_to_add = [ # linear-relu fp16 emulation: # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16 - ((("to", torch.float16), F.relu, F.linear, "dequantize"), 1), - ]) + ((("to", torch.float16), F.relu, F.linear, "dequantize"), fp16_em_base_op_idx,), + ] + for p in patterns_to_add: + results.add(p) + + return results -# TODO(future PR): we should see if we can reuse quantization's fusion -# patterns here. def end_node_matches_reversed_fusion( end_node: Node, reversed_fusion: NSFusionType, From c8209a73369edcda996299964ad8ef516ff9ebdf Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 38/45] ns for fx: move pattern utils to separate file (#55805) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55805 No logic change, just moving util functions to separate file. Test Plan: ``` python test/test_quantization.py TestFXGraphMatcher python test/test_quantization.py TestFXNumericSuiteCoreAPIs ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27719982 fbshipit-source-id: c80d5397c1efeb9fc83eacaa532ecbde557cca3f --- torch/quantization/ns/graph_matcher.py | 255 +----------------------- torch/quantization/ns/pattern_utils.py | 264 +++++++++++++++++++++++++ 2 files changed, 271 insertions(+), 248 deletions(-) create mode 100644 torch/quantization/ns/pattern_utils.py diff --git a/torch/quantization/ns/graph_matcher.py b/torch/quantization/ns/graph_matcher.py index 0cd4276ec750..f5f2d0b90fe9 100644 --- a/torch/quantization/ns/graph_matcher.py +++ b/torch/quantization/ns/graph_matcher.py @@ -4,12 +4,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torch.nn.quantized as nnq -import torch.nn.quantized.dynamic as nnqd -import torch.nn.qat as nnqat -import torch.nn.intrinsic.quantized as nniq -import torch.nn.intrinsic.qat as nniqat -import torch.nn.intrinsic as nni toq = torch.ops.quantized from torch.fx import GraphModule @@ -17,130 +11,18 @@ from .utils import getattr_from_fqn from .ns_types import NSSubgraph -from torch.quantization.fx.pattern_utils import get_default_quant_patterns +from .pattern_utils import ( + get_base_name_to_sets_of_related_ops, + get_type_a_related_to_b, + get_reversed_fusions, + end_node_matches_reversed_fusion, +) -from typing import Dict, Tuple, List, Optional, Set, Callable, Any, Union +from typing import Dict, Tuple, List, Optional, Set, Callable, Any def _get_output_nodes(g: Graph) -> List[Node]: return [n for n in g.nodes if n.op == 'output'] -def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: - base_name_to_sets_of_related_ops: Dict[str, Set[Callable]] = { - # conv modules - 'torch.nn.Conv1d': set([ - nn.Conv1d, - nnq.Conv1d, - nniqat.ConvBn1d, - nniqat.ConvBnReLU1d, - nniq.ConvReLU1d, - nni.ConvReLU1d, - ]), - 'torch.nn.Conv2d': set([ - nn.Conv2d, - nnq.Conv2d, - nnqat.Conv2d, - nniqat.ConvBn2d, - nniqat.ConvBnReLU2d, - nniqat.ConvReLU2d, - nniq.ConvReLU2d, - nni.ConvReLU2d, - ]), - 'torch.nn.Conv3d': set([ - nn.Conv3d, - nnq.Conv3d, - nnqat.Conv3d, - nniqat.ConvBn3d, - nniqat.ConvBnReLU3d, - nniqat.ConvReLU3d, - nniq.ConvReLU3d, - nni.ConvReLU3d, - ]), - # conv functionals - 'torch.nn.functional.conv1d': set([ - F.conv1d, - toq.conv1d, - toq.conv1d_relu, - ]), - 'torch.nn.functional.conv2d': set([ - F.conv2d, - toq.conv2d, - toq.conv2d_relu, - ]), - 'torch.nn.functional.conv3d': set([ - F.conv3d, - toq.conv3d, - toq.conv3d_relu, - ]), - # linear modules - 'torch.nn.Linear': set([ - nn.Linear, - 
nnq.Linear, - nni.LinearReLU, - nniq.LinearReLU, - nnqat.Linear, - nnqd.Linear, - nniqat.LinearReLU, - ]), - # linear functionals - 'torch.nn.functional.linear': set([ - F.linear, - toq.linear, - toq.linear_relu, - ]), - # LSTM - 'torch.nn.LSTM': set([ - nn.LSTM, - nnqd.LSTM, - ]), - # add - 'torch.add': set([ - torch.add, - toq.add, - operator.add, # x + y - ]), - # cat - 'torch.cat': set([ - torch.cat, - toq.cat, - ]), - # mul - 'torch.mul': set([ - torch.mul, - toq.mul, - ]), - # relu - 'torch.relu': set([ - F.relu, - ]), - # maxpool2d - 'torch.nn.MaxPool2d': set([ - nn.MaxPool2d, - ]), - # sigmoid - 'torch.sigmoid': set([ - torch.sigmoid, - ]), - } - return base_name_to_sets_of_related_ops - -def get_type_a_related_to_b( - base_name_to_sets_of_related_ops: Dict[str, Set[Callable]], -) -> Set[Tuple[Callable, Callable]]: - # TODO(future PR): allow customizations - # TODO(future PR): reuse existing quantization mappings - # TODO(future PR): add the rest of modules and ops here - type_a_related_to_b: Set[Tuple[Callable, Callable]] = set() - - for base_name, s in base_name_to_sets_of_related_ops.items(): - s_list = list(s) - # add every bidirectional pair - for idx_0 in range(0, len(s_list) - 1): - for idx_1 in range(idx_0 + 1, len(s_list)): - type_a_related_to_b.add((s_list[idx_0], s_list[idx_1])) - type_a_related_to_b.add((s_list[idx_1], s_list[idx_0])) - - return type_a_related_to_b - def get_non_matchable_functions() -> Set[Callable]: """ `call_function` nodes pointing to these functions are non-matchable. @@ -161,129 +43,6 @@ def get_non_matchable_modules() -> Set[Callable]: torch.quantization.FakeQuantizeBase, ]) -NSFusionElType = Union[ - Callable, # call_function or call_module type, example: F.linear or nn.Conv2d - str, # call_method name, example: "dequantize" - Tuple[str, Any], # call_method name and first argument, example: ("to", torch.float16) -] -NSFusionType = Union[ - Tuple[NSFusionElType, NSFusionElType], - Tuple[NSFusionElType, NSFusionElType, NSFusionElType, NSFusionElType], -] - -def get_reversed_fusions() -> Set[Tuple[NSFusionType, int]]: - """ - Set of potential fusions, in reverse order. The order is reversed - to match how fusion patterns are defined in quantization code. - - Fusion format: - ((fusion_op_0, fusion_op_1), base_op_idx) - - Where base_op_idx is the idx of the op we should use to match other related - ops. Note: base_op_idx is specified in non-reverse order, i.e. a base_op_idx - of 0 represents the first op in regular (non-reverse) order, 1 represents the - second op, etc. - """ - results: Set[Tuple[NSFusionType, int]] = set([]) - - # Possible syntaxes: - # * single op: torch.nn.Conv2d - # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d) - # For fusions, we only care about patterns composed of multiple ops. - # TODO(future PR): allow customizations from default patterns. - all_quant_patterns = get_default_quant_patterns() - default_base_op_idx = 0 - for quant_pattern, _quant_handler in all_quant_patterns.items(): - # this only takes patterns of multiple ops - if isinstance(quant_pattern, tuple): - results.add((quant_pattern, default_base_op_idx)) # type: ignore - - # After this point, results countains values such as - # [..., ((torch.nn.Relu, torch.nn.Conv2d), 0), ...] - - # Patterns for matching fp16 emulation are not specified in the quantization - # fusion mappings. For now, define them here. 
- fp16_em_base_op_idx = 1 - patterns_to_add = [ - # linear-relu fp16 emulation: - # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16 - ((("to", torch.float16), F.relu, F.linear, "dequantize"), fp16_em_base_op_idx,), - ] - for p in patterns_to_add: - results.add(p) - - return results - -def end_node_matches_reversed_fusion( - end_node: Node, - reversed_fusion: NSFusionType, - gm: GraphModule, -) -> bool: - """ - Returns true if a pattern ending with `end_node` matches - the fusion pattern. - """ - cur_node = end_node - for fusion_idx in range(len(reversed_fusion)): - cur_fusion_el = reversed_fusion[fusion_idx] - - if cur_node.op == 'call_function': - fusion_el_is_fun = (not isinstance(cur_fusion_el, str)) and \ - (not isinstance(cur_fusion_el, type)) - if fusion_el_is_fun: - if cur_node.target != cur_fusion_el: - return False - if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): - cur_node = cur_node.args[0] - else: - return False - else: - return False - - elif cur_node.op == 'call_module': - fusion_el_is_mod = isinstance(cur_fusion_el, type) - if fusion_el_is_mod: - assert isinstance(cur_node.target, str) - target_mod = getattr_from_fqn(gm, cur_node.target) - if not isinstance(cur_fusion_el, type): - return False - if not isinstance(target_mod, cur_fusion_el): - return False - if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): - cur_node = cur_node.args[0] - else: - return False - else: - return False - - elif cur_node.op == 'call_method': - fusion_el_is_meth_with_second_arg = \ - isinstance(cur_fusion_el, tuple) and len(cur_fusion_el) == 2 - fusion_el_is_meth_without_args = isinstance(cur_fusion_el, str) - if fusion_el_is_meth_without_args or fusion_el_is_meth_with_second_arg: - if fusion_el_is_meth_without_args: - if cur_node.target != cur_fusion_el: - return False - else: - assert isinstance(cur_fusion_el, tuple) - if cur_node.target != cur_fusion_el[0]: - return False - elif len(cur_node.args) < 2: - return False - elif cur_node.args[1] != cur_fusion_el[1]: - return False - - if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): - cur_node = cur_node.args[0] - else: - return False - else: - return False - else: - return False - - return True - class _NSGraphMatchableSubgraphsIterator: """ diff --git a/torch/quantization/ns/pattern_utils.py b/torch/quantization/ns/pattern_utils.py new file mode 100644 index 000000000000..ed35f77ed137 --- /dev/null +++ b/torch/quantization/ns/pattern_utils.py @@ -0,0 +1,264 @@ +import operator + +import torch +import torch.nn as nn +import torch.nn.functional as F +toq = torch.ops.quantized + +import torch.nn.quantized as nnq +import torch.nn.quantized.dynamic as nnqd +import torch.nn.qat as nnqat +import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.qat as nniqat +import torch.nn.intrinsic as nni + +from torch.fx import GraphModule +from torch.fx.graph import Node + +from .utils import getattr_from_fqn +from torch.quantization.fx.pattern_utils import get_default_quant_patterns + +from typing import Dict, Tuple, Set, Callable, Any, Union + +def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: + base_name_to_sets_of_related_ops: Dict[str, Set[Callable]] = { + # conv modules + 'torch.nn.Conv1d': set([ + nn.Conv1d, + nnq.Conv1d, + nniqat.ConvBn1d, + nniqat.ConvBnReLU1d, + nniq.ConvReLU1d, + nni.ConvReLU1d, + ]), + 'torch.nn.Conv2d': set([ + nn.Conv2d, + nnq.Conv2d, + nnqat.Conv2d, + nniqat.ConvBn2d, + nniqat.ConvBnReLU2d, + nniqat.ConvReLU2d, + nniq.ConvReLU2d, + nni.ConvReLU2d, + 
]), + 'torch.nn.Conv3d': set([ + nn.Conv3d, + nnq.Conv3d, + nnqat.Conv3d, + nniqat.ConvBn3d, + nniqat.ConvBnReLU3d, + nniqat.ConvReLU3d, + nniq.ConvReLU3d, + nni.ConvReLU3d, + ]), + # conv functionals + 'torch.nn.functional.conv1d': set([ + F.conv1d, + toq.conv1d, + toq.conv1d_relu, + ]), + 'torch.nn.functional.conv2d': set([ + F.conv2d, + toq.conv2d, + toq.conv2d_relu, + ]), + 'torch.nn.functional.conv3d': set([ + F.conv3d, + toq.conv3d, + toq.conv3d_relu, + ]), + # linear modules + 'torch.nn.Linear': set([ + nn.Linear, + nnq.Linear, + nni.LinearReLU, + nniq.LinearReLU, + nnqat.Linear, + nnqd.Linear, + nniqat.LinearReLU, + ]), + # linear functionals + 'torch.nn.functional.linear': set([ + F.linear, + toq.linear, + toq.linear_relu, + ]), + # LSTM + 'torch.nn.LSTM': set([ + nn.LSTM, + nnqd.LSTM, + ]), + # add + 'torch.add': set([ + torch.add, + toq.add, + operator.add, # x + y + ]), + # cat + 'torch.cat': set([ + torch.cat, + toq.cat, + ]), + # mul + 'torch.mul': set([ + torch.mul, + toq.mul, + ]), + # relu + 'torch.relu': set([ + F.relu, + ]), + # maxpool2d + 'torch.nn.MaxPool2d': set([ + nn.MaxPool2d, + ]), + # sigmoid + 'torch.sigmoid': set([ + torch.sigmoid, + ]), + } + return base_name_to_sets_of_related_ops + + +def get_type_a_related_to_b( + base_name_to_sets_of_related_ops: Dict[str, Set[Callable]], +) -> Set[Tuple[Callable, Callable]]: + # TODO(future PR): allow customizations + # TODO(future PR): reuse existing quantization mappings + # TODO(future PR): add the rest of modules and ops here + type_a_related_to_b: Set[Tuple[Callable, Callable]] = set() + + for base_name, s in base_name_to_sets_of_related_ops.items(): + s_list = list(s) + # add every bidirectional pair + for idx_0 in range(0, len(s_list) - 1): + for idx_1 in range(idx_0 + 1, len(s_list)): + type_a_related_to_b.add((s_list[idx_0], s_list[idx_1])) + type_a_related_to_b.add((s_list[idx_1], s_list[idx_0])) + + return type_a_related_to_b + + +NSFusionElType = Union[ + Callable, # call_function or call_module type, example: F.linear or nn.Conv2d + str, # call_method name, example: "dequantize" + Tuple[str, Any], # call_method name and first argument, example: ("to", torch.float16) +] +NSFusionType = Union[ + Tuple[NSFusionElType, NSFusionElType], + Tuple[NSFusionElType, NSFusionElType, NSFusionElType, NSFusionElType], +] + +def get_reversed_fusions() -> Set[Tuple[NSFusionType, int]]: + """ + Set of potential fusions, in reverse order. The order is reversed + to match how fusion patterns are defined in quantization code. + + Fusion format: + ((fusion_op_0, fusion_op_1), base_op_idx) + + Where base_op_idx is the idx of the op we should use to match other related + ops. Note: base_op_idx is specified in non-reverse order, i.e. a base_op_idx + of 0 represents the first op in regular (non-reverse) order, 1 represents the + second op, etc. + """ + results: Set[Tuple[NSFusionType, int]] = set([]) + + # Possible syntaxes: + # * single op: torch.nn.Conv2d + # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d) + # For fusions, we only care about patterns composed of multiple ops. + # TODO(future PR): allow customizations from default patterns. 
+ all_quant_patterns = get_default_quant_patterns() + default_base_op_idx = 0 + for quant_pattern, _quant_handler in all_quant_patterns.items(): + # this only takes patterns of multiple ops + if isinstance(quant_pattern, tuple): + results.add((quant_pattern, default_base_op_idx)) # type: ignore + + # After this point, results countains values such as + # [..., ((torch.nn.Relu, torch.nn.Conv2d), 0), ...] + + # Patterns for matching fp16 emulation are not specified in the quantization + # fusion mappings. For now, define them here. + fp16_em_base_op_idx = 1 + patterns_to_add = [ + # linear-relu fp16 emulation: + # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16 + ((("to", torch.float16), F.relu, F.linear, "dequantize"), fp16_em_base_op_idx,), + ] + for p in patterns_to_add: + results.add(p) + + return results + + +def end_node_matches_reversed_fusion( + end_node: Node, + reversed_fusion: NSFusionType, + gm: GraphModule, +) -> bool: + """ + Returns true if a pattern ending with `end_node` matches + the fusion pattern. + """ + cur_node = end_node + for fusion_idx in range(len(reversed_fusion)): + cur_fusion_el = reversed_fusion[fusion_idx] + + if cur_node.op == 'call_function': + fusion_el_is_fun = (not isinstance(cur_fusion_el, str)) and \ + (not isinstance(cur_fusion_el, type)) + if fusion_el_is_fun: + if cur_node.target != cur_fusion_el: + return False + if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): + cur_node = cur_node.args[0] + else: + return False + else: + return False + + elif cur_node.op == 'call_module': + fusion_el_is_mod = isinstance(cur_fusion_el, type) + if fusion_el_is_mod: + assert isinstance(cur_node.target, str) + target_mod = getattr_from_fqn(gm, cur_node.target) + if not isinstance(cur_fusion_el, type): + return False + if not isinstance(target_mod, cur_fusion_el): + return False + if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): + cur_node = cur_node.args[0] + else: + return False + else: + return False + + elif cur_node.op == 'call_method': + fusion_el_is_meth_with_second_arg = \ + isinstance(cur_fusion_el, tuple) and len(cur_fusion_el) == 2 + fusion_el_is_meth_without_args = isinstance(cur_fusion_el, str) + if fusion_el_is_meth_without_args or fusion_el_is_meth_with_second_arg: + if fusion_el_is_meth_without_args: + if cur_node.target != cur_fusion_el: + return False + else: + assert isinstance(cur_fusion_el, tuple) + if cur_node.target != cur_fusion_el[0]: + return False + elif len(cur_node.args) < 2: + return False + elif cur_node.args[1] != cur_fusion_el[1]: + return False + + if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node): + cur_node = cur_node.args[0] + else: + return False + else: + return False + else: + return False + + return True From f59244ec16dc8e0843829ab65ab89ffd94a08bea Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 15 Apr 2021 16:01:22 -0700 Subject: [PATCH 39/45] ns for fx: add test for op relationship coverage (#55837) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55837 Adds a test that checks that all of the relevant op pairs defined in `quantization_mappings.py` are also defined as related by Numerical Suite. Note: this does not cover all the ops, just the ones in `quantization_mappings.py`. A future PR will fill out the remainder. 
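The core of the new check, condensed here to the static module mappings only (the full test also covers dynamic mappings, quantized operator mappings, and the QuantizeHandler patterns):

```
# Condensed form of the coverage check for static module mappings; the skip
# list matches the quant/dequant stubs the test intentionally ignores.
import torch
import torch.nn.quantized as nnq
from torch.quantization.quantization_mappings import (
    get_default_static_quant_module_mappings,
)
from torch.quantization.ns.pattern_utils import (
    get_base_name_to_sets_of_related_ops,
    get_type_a_related_to_b,
)

related = get_type_a_related_to_b(get_base_name_to_sets_of_related_ops())
skip = (torch.quantization.QuantStub, torch.quantization.DeQuantStub,
        nnq.FloatFunctional)
for fp32_type, int8_type in get_default_static_quant_module_mappings().items():
    if fp32_type in skip:
        continue
    assert (fp32_type, int8_type) in related, (
        f"{fp32_type} and {int8_type} need a relationship mapping")
```
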
Test Plan: ``` python test/test_quantization.py TestFXGraphMatcher.test_op_relationship_mapping ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D27719979 fbshipit-source-id: 9e852ef94da5f7a653ea15ba52c68a89c8e30208 --- test/quantization/test_numeric_suite_fx.py | 144 ++++++++++++++++++++ torch/quantization/ns/pattern_utils.py | 107 +++++++++++++++ torch/quantization/quantization_mappings.py | 4 + 3 files changed, 255 insertions(+) diff --git a/test/quantization/test_numeric_suite_fx.py b/test/quantization/test_numeric_suite_fx.py index 1be630ced87a..a442f974af62 100644 --- a/test/quantization/test_numeric_suite_fx.py +++ b/test/quantization/test_numeric_suite_fx.py @@ -1,5 +1,6 @@ import copy import math +import operator import torch import torch.nn as nn @@ -24,8 +25,19 @@ SparseNNModel, skip_if_no_torchvision, ) +from torch.quantization.quantization_mappings import ( + get_default_static_quant_module_mappings, + get_default_dynamic_quant_module_mappings, + get_default_float_to_quantized_operator_mappings, +) from torch.testing._internal.common_quantization import NodeSpec as ns from torch.testing._internal.common_quantized import override_qengines +from torch.quantization.fx.pattern_utils import get_default_quant_patterns +import torch.quantization.fx.quantization_patterns as qp +from torch.quantization.ns.pattern_utils import ( + get_base_name_to_sets_of_related_ops, + get_type_a_related_to_b, +) from torch.quantization.ns.graph_matcher import ( get_matching_subgraph_pairs, GraphMatchingException, @@ -417,6 +429,138 @@ def forward(self, x): } self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) + def test_op_relationship_mapping(self): + """ + Tests that the mapping of op relationships is complete. + """ + base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops() + type_a_related_to_b = \ + get_type_a_related_to_b(base_name_to_sets_of_related_ops) + + # 1. check static quant module mappings + static_quant_mod_mappings = get_default_static_quant_module_mappings() + for fp32_type, int8_type in static_quant_mod_mappings.items(): + # skip quants and dequants, for the purposes of Numerical Suite + types_to_skip = ( + torch.quantization.QuantStub, + torch.quantization.DeQuantStub, + nnq.FloatFunctional, + ) + if fp32_type in types_to_skip: + continue + + # verify relatedness + in_type_a_related_to_b = \ + (fp32_type, int8_type) in type_a_related_to_b + self.assertTrue( + in_type_a_related_to_b, + f"{fp32_type} and {int8_type} need a relationship mapping") + + # 2. check static quant op mappings + static_quant_fun_mappings = get_default_float_to_quantized_operator_mappings() + for fp32_type, int8_type in static_quant_fun_mappings.items(): + # verify relatedness + in_type_a_related_to_b = \ + (fp32_type, int8_type) in type_a_related_to_b + self.assertTrue( + in_type_a_related_to_b, + f"{fp32_type} and {int8_type} need a relationship mapping") + + # 3. check dynamic quant mappings + dynamic_quant_mappings = get_default_dynamic_quant_module_mappings() + for fp32_type, int8_type in dynamic_quant_mappings.items(): + # TODO(future PR): enable correct weight extraction for these + # and remove from this list. + types_to_skip = ( + nn.GRUCell, + nn.GRU, + nn.LSTMCell, + nn.RNNCell, + ) + if fp32_type in types_to_skip: + continue + # verify relatedness + in_type_a_related_to_b = \ + (fp32_type, int8_type) in type_a_related_to_b + self.assertTrue( + in_type_a_related_to_b, + f"{fp32_type} and {int8_type} need a relationship mapping") + + # 4. 
go through the ops mapped to each QuantizeHandler type, and verify + # correctness. + def _op_in_base_sets_of_related_ops(op): + for name, ops in base_name_to_sets_of_related_ops.items(): + if op in ops: + return True + return False + + default_quant_patterns = get_default_quant_patterns() + for pattern, qhandler_cls in default_quant_patterns.items(): + base_op = None + if isinstance(pattern, tuple): + base_op = pattern[-1] + elif isinstance(pattern, str): + # TODO(future PR): add handling for these + continue + else: + base_op = pattern + + qhandler_cls_all_ops_quantizeable = [ + qp.CatQuantizeHandler, + qp.ConvReluQuantizeHandler, + qp.LinearReLUQuantizeHandler, + qp.BatchNormQuantizeHandler, + qp.EmbeddingQuantizeHandler, + qp.RNNDynamicQuantizeHandler, + qp.ELUQuantizeHandler, + ] + + qhandler_cls_quant_op_same_signature = [ + qp.FixedQParamsOpQuantizeHandler, + qp.CopyNodeQuantizeHandler, + ] + + if qhandler_cls == qp.BinaryOpQuantizeHandler: + # these ops do not have quantized equivalents + ops_to_skip = [ + torch.bmm, + torch.sum, + torch.div, + torch.sub, + operator.truediv, + operator.sub + ] + if base_op in ops_to_skip: + continue + self.assertTrue( + _op_in_base_sets_of_related_ops(base_op), + f"{base_op} not in sets of related ops") + elif qhandler_cls == qp.RNNDynamicQuantizeHandler: + # TODO(future PR): add support for all classes in + # RNNDynamicQuantizeHandler + pass + elif qhandler_cls == qp.DefaultNodeQuantizeHandler: + ops_to_skip = [ + torch.nn.SiLU, + torch.nn.functional.silu, + ] + if base_op in ops_to_skip: + continue + self.assertTrue( + _op_in_base_sets_of_related_ops(base_op), + f"{base_op} not in sets of related ops") + elif qhandler_cls in qhandler_cls_quant_op_same_signature: + # these ops use the same op signature for fp32 and quantized + # tensors + pass + elif qhandler_cls in qhandler_cls_all_ops_quantizeable: + self.assertTrue( + _op_in_base_sets_of_related_ops(base_op), + f"{base_op} not in sets of related ops") + else: + raise AssertionError( + f"handing for {qhandler_cls} not implemented") + class TestFXGraphMatcherModels(QuantizationTestCase): diff --git a/torch/quantization/ns/pattern_utils.py b/torch/quantization/ns/pattern_utils.py index ed35f77ed137..6de252bc9128 100644 --- a/torch/quantization/ns/pattern_utils.py +++ b/torch/quantization/ns/pattern_utils.py @@ -76,6 +76,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: nnqat.Linear, nnqd.Linear, nniqat.LinearReLU, + nn.modules.linear._LinearWithBias, ]), # linear functionals 'torch.nn.functional.linear': set([ @@ -103,6 +104,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: 'torch.mul': set([ torch.mul, toq.mul, + operator.mul, ]), # relu 'torch.relu': set([ @@ -116,6 +118,111 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[Callable]]: 'torch.sigmoid': set([ torch.sigmoid, ]), + # BatchNorm + 'torch.nn.BatchNorm2d': set([ + nn.BatchNorm2d, + nnq.BatchNorm2d, + ]), + 'torch.nn.BatchNorm3d': set([ + nn.BatchNorm3d, + nnq.BatchNorm3d, + ]), + # ConvTranspose + 'torch.nn.ConvTranspose1d': set([ + nn.ConvTranspose1d, + nnq.ConvTranspose1d, + ]), + 'torch.nn.ConvTranspose2d': set([ + nn.ConvTranspose2d, + nnq.ConvTranspose2d, + ]), + # ELU + 'torch.nn.ELU': set([ + nn.ELU, + nnq.ELU, + ]), + # Embedding + 'torch.nn.Embedding': set([ + nn.Embedding, + nnq.Embedding, + ]), + # EmbeddingBag + 'torch.nn.EmbeddingBag': set([ + nn.EmbeddingBag, + nnq.EmbeddingBag, + ]), + # GroupNorm + 'torch.nn.GroupNorm': set([ + nn.GroupNorm, + 
nnq.GroupNorm, + ]), + # Hardswish + 'torch.nn.Hardswish': set([ + nn.Hardswish, + nnq.Hardswish, + ]), + # InstanceNorm + 'torch.nn.InstanceNorm1d': set([ + nn.InstanceNorm1d, + nnq.InstanceNorm1d, + ]), + 'torch.nn.InstanceNorm2d': set([ + nn.InstanceNorm2d, + nnq.InstanceNorm2d, + ]), + 'torch.nn.InstanceNorm3d': set([ + nn.InstanceNorm3d, + nnq.InstanceNorm3d, + ]), + # LayerNorm + 'torch.nn.LayerNorm': set([ + nn.LayerNorm, + nnq.LayerNorm, + ]), + # LeakyReLU + 'torch.nn.LeakyReLU': set([ + nn.LeakyReLU, + nnq.LeakyReLU, + ]), + # ReLU6 + 'torch.nn.ReLU6': set([ + nn.ReLU6, + nnq.ReLU6, + ]), + # BNReLU2d + 'torch.nn.intrinsic.BNReLU2d': set([ + nni.BNReLU2d, + nniq.BNReLU2d, + ]), + 'torch.nn.intrinsic.BNReLU3d': set([ + nni.BNReLU3d, + nniq.BNReLU3d, + ]), + # F.elu + 'torch.nn.functional.elu': set([ + F.elu, + toq.elu, + ]), + # F.hardswish + 'torch.nn.functional.hardswish': set([ + F.hardswish, + toq.hardswish, + ]), + # F.instance_norm + 'torch.nn.functional.instance_norm': set([ + F.instance_norm, + toq.instance_norm, + ]), + # F.layer_norm + 'torch.nn.functional.layer_norm': set([ + F.layer_norm, + toq.layer_norm, + ]), + # F.leaky_relu + 'torch.nn.functional.leaky_relu': set([ + F.leaky_relu, + toq.leaky_relu, + ]), } return base_name_to_sets_of_related_ops diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index f8e1d55fa7d9..7adf2869a9b9 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -199,6 +199,10 @@ def get_default_compare_output_module_list() -> Set[Callable]: ) return copy.deepcopy(NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST) +def get_default_float_to_quantized_operator_mappings( +) -> Dict[Union[Callable, str], Callable]: + return copy.deepcopy(DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS) + # TODO: merge with get_static_quant_module_class def get_quantized_operator(float_op: Union[Callable, str]) -> Callable: ''' Get the quantized operator corresponding to the float operator From bde53cfd9ad3992b21ff30120018017252e8a643 Mon Sep 17 00:00:00 2001 From: Hui Guo Date: Thu, 15 Apr 2021 16:08:41 -0700 Subject: [PATCH 40/45] [tensorexpr] Add missing python bindings for NNC Stmts (#55570) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55570 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D27634987 Pulled By: huiguoo fbshipit-source-id: 220a00b1dcc4d42d93b6600b730d35432316eff6 --- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 79 ++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index b951c5b344db..ec7f6f9f4669 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -127,7 +127,15 @@ void initTensorExprBindings(PyObject* module) { [](Placeholder& self, const std::vector& v) { return self.load(v); }) - .def("buf", [](Placeholder& self) { return BufHandle(self.data()); }); + .def( + "store", + [](Placeholder& self, + const std::vector& args, + const ExprHandle& val) { return self.store(args, val); }) + .def( + "data", + [](Placeholder& self) { return BufHandle(self.data()); }, + py::return_value_policy::reference); py::class_>(te, "Tensor") .def(py::init( [](BufHandle& b, Stmt* s) { return new Tensor(b.node(), s); })) @@ -222,6 +230,30 @@ void initTensorExprBindings(PyObject* module) { return Reduce(func_name, dim_args, reducer, 
buffer, reduce_args); }, py::return_value_policy::reference); + te.def( + "Reduce", + [](const std::string& func_name, + const std::vector& dim_args, + const Reducer& reducer, + const std::function&)>& + body_func, + const std::vector& reduce_args) { + return Reduce(func_name, dim_args, reducer, body_func, reduce_args); + }, + py::return_value_policy::reference); + te.def( + "Reduce", + [](const std::string& func_name, + const std::vector& dim_args, + const Reducer& reducer, + const std::function&)>& + init_func, + const std::function&)>& + body_func, + const std::vector& reduce_args) { + return Reduce(func_name, dim_args, reducer, body_func, reduce_args); + }, + py::return_value_policy::reference); py::class_>(te, "Stmt") .def(py::init([](const std::vector& stmts) { @@ -232,17 +264,48 @@ void initTensorExprBindings(PyObject* module) { ss << self; return ss.str(); }); + py::class_>(te, "Store") + .def_static( + "make", + [](const BufHandle& buf, + std::vector& indicies, + const ExprHandle& value) { + return Store::make(buf, indicies, value); + }, + py::return_value_policy::reference); + py::class_>(te, "For") .def( "index_var", [](const For& self) { return VarHandle(self.var()); }, py::return_value_policy::reference) - .def("body", &For::body, py::return_value_policy::reference); + .def("body", &For::body, py::return_value_policy::reference) + .def("set_parallel", &For::set_parallel) + .def_static( + "make", + [](const VarHandle& var, + const ExprHandle& start, + const ExprHandle& stop, + Stmt* body) { return For::make(var, start, stop, body); }, + py::return_value_policy::reference); + + py::class_>(te, "Cond") + .def_static( + "make", + [](const ExprHandle& condition, Stmt* true_stmt, Stmt* false_stmt) { + return new Cond(condition.node(), true_stmt, false_stmt); + }, + py::return_value_policy::reference) + .def("true_stmt", &Cond::true_stmt, py::return_value_policy::reference) + .def("false_stmt", &Cond::false_stmt, py::return_value_policy::reference); py::class_< tensorexpr::Block, Stmt, std::unique_ptr>(te, "Block") + .def(py::init([](const std::vector& stmts) { + return tensorexpr::Block::make(stmts); + })) .def( "stmts", &tensorexpr::Block::stmts, @@ -280,6 +343,18 @@ void initTensorExprBindings(PyObject* module) { return self.getAllLoopNestsWritingToBuf(b.node()); }, py::return_value_policy::reference) + .def( + "get_innermost_loops_for", + [](const LoopNest& self, const BufHandle* b) { + return self.getAllInnermostLoopsWritingToBuf(b->node()); + }, + py::return_value_policy::reference) + .def( + "get_parent_loop", + [](const LoopNest& self, const Stmt* s) { + return self.getParentLoop(s); + }, + py::return_value_policy::reference) .def( "split_with_tail", [](const LoopNest& self, For* f, int factor) { From 400398006fb902e924bd4a7339ab2b47be08e3ac Mon Sep 17 00:00:00 2001 From: Jay Chae Date: Thu, 15 Apr 2021 16:10:02 -0700 Subject: [PATCH 41/45] [PARAM] Param comms debug info (#55976) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55976 - Define a concrete `DebugInfo` to collect Param comms. 
- Add a macro to easily log `DebugInfo` Test Plan: Tested on `ads:simplified_launcher` with `dyno gputrace` locally tested in libkinetoObserver that it can collect the debug Infobase Reviewed By: kingchc, ilia-cher Differential Revision: D26773447 fbshipit-source-id: a8eeede2d6dbf34d7a1b3614843b4a1baba94448 --- c10/util/ThreadLocalDebugInfo.h | 1 + torch/lib/c10d/CMakeLists.txt | 2 + torch/lib/c10d/ParamCommsUtils.cpp | 25 ++++++ torch/lib/c10d/ParamCommsUtils.hpp | 81 +++++++++++++++++++ torch/lib/c10d/ProcessGroupNCCL.cpp | 118 +++++++++++++++++++++++++--- 5 files changed, 216 insertions(+), 11 deletions(-) create mode 100644 torch/lib/c10d/ParamCommsUtils.cpp create mode 100644 torch/lib/c10d/ParamCommsUtils.hpp diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index a1d167d0652d..89b2d559e266 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -14,6 +14,7 @@ enum class C10_API_ENUM DebugInfoKind : uint8_t { MOBILE_RUNTIME_INFO, PROFILER_STATE, INFERENCE_CONTEXT, // for inference usage + PARAM_COMMS_INFO, TEST_INFO, // used only in tests TEST_INFO_2, // used only in tests diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 4e72e2e32fbf..e04a4d9ad870 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,6 +45,7 @@ endfunction() set(C10D_SRCS FileStore.cpp + ParamCommsUtils.cpp PrefixStore.cpp ProcessGroup.cpp Store.cpp @@ -121,6 +122,7 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) +copy_header(ParamCommsUtils.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) diff --git a/torch/lib/c10d/ParamCommsUtils.cpp b/torch/lib/c10d/ParamCommsUtils.cpp new file mode 100644 index 000000000000..02b70ee12274 --- /dev/null +++ b/torch/lib/c10d/ParamCommsUtils.cpp @@ -0,0 +1,25 @@ +// (c) Facebook, Inc. and its affiliates. Confidential and proprietary. 
+ +#include + +namespace torch { + +extern const std::string kParamCommsCallName = "record_param_comms"; + +ParamCommsDebugInfo::ParamCommsDebugInfo( + int rank, + std::string&& colName, + int inSize, + int outSize, + at::ScalarType dType, + std::vector&& inSplitSizes, + std::vector&& outSplitSizes) : + rank_(rank), + columnName_(colName), + inMessageSize_(inSize), + outMessageSize_(outSize), + dType_(dType), + inputSplitSizes_(inSplitSizes), + outputSplitSizes_(outSplitSizes) {} + +} // namespace torch diff --git a/torch/lib/c10d/ParamCommsUtils.hpp b/torch/lib/c10d/ParamCommsUtils.hpp new file mode 100644 index 000000000000..c7ffaec930a7 --- /dev/null +++ b/torch/lib/c10d/ParamCommsUtils.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include +#include + +namespace torch { + +extern const std::string kParamCommsCallName; + +class ParamCommsDebugInfo + : public c10::DebugInfoBase { + + public: + ParamCommsDebugInfo() = default; + ParamCommsDebugInfo( + int rank, + std::string&& colName, + int inSize, + int outSize, + at::ScalarType dType, + std::vector&& inSplitSizes, + std::vector&& outSplitSizes); + + ~ParamCommsDebugInfo() override = default; + + int getRank() const { + return rank_; + } + + const std::string getColumnName() const { + return columnName_; + } + + int getInMessageSize() const { + return inMessageSize_; + } + + int getOutMessageSize() const { + return outMessageSize_; + } + + at::ScalarType getDType() const { + return dType_; + } + + const std::vector& getInputSplitSizes() const { + return inputSplitSizes_; + } + + const std::vector& getOutputSplitSizes() const { + return outputSplitSizes_; + } + + private: + int rank_{}; + std::string columnName_; + int inMessageSize_{}; + int outMessageSize_{}; + at::ScalarType dType_ = at::kByte; + std::vector inputSplitSizes_; + std::vector outputSplitSizes_; +}; + +// TODO(jchae): handle non empty in/out split sizes +#define RECORD_PARAM_COMMS(rank, colName, inSize, outSize, dType, inSplitSizes, outSplitSizes) \ + std::vector iss; \ + std::vector oss; \ + auto paramCommsInfo = std::make_shared( \ + rank, \ + colName, \ + inSize, \ + outSize, \ + dType, \ + std::move(iss), \ + std::move(oss)); \ + c10::DebugInfoGuard g(c10::DebugInfoKind::PARAM_COMMS_INFO, paramCommsInfo); \ + RECORD_FUNCTION(torch::kParamCommsCallName, std::vector()); + +} // namespace torch diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 7ee88b6339c4..28a9893b198b 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -397,6 +398,14 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal( // Same as calling synchronize(). bool ProcessGroupNCCL::WorkNCCL::wait(std::chrono::milliseconds timeout) { + RECORD_PARAM_COMMS( + rank_, // rank + "wait", // colName + 0, // inSize + 0, // outSize + at::kByte, // dType + {}, // inSplitSizes + {}); // outSplitSizes synchronizeInternal(timeout); // Always return true, because abort API is not implemented. 
return true; @@ -1219,6 +1228,17 @@ c10::intrusive_ptr ProcessGroupNCCL::allreduce( const AllreduceOptions& opts) { check_gpu_tensors(tensors); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + RECORD_PARAM_COMMS( + rank_, // rank + "allreduce", // colName + tensor.numel(), // inSize + tensor.numel(), // outSize + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector()); // outSplitSizes + return collective( tensors, tensors, @@ -1251,6 +1271,17 @@ c10::intrusive_ptr ProcessGroupNCCL::broadcast( const BroadcastOptions& opts) { check_gpu_tensors(tensors); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + RECORD_PARAM_COMMS( + rank_, // rank + "broadcast", // colName + tensor.numel(), // inSize + tensor.numel(), // outSize + tensor.scalar_type(), // dType + {}, // inSplitSizes + {}); // outSplitSizes + return collective( tensors, tensors, @@ -1275,6 +1306,16 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce( std::vector& tensors, const ReduceOptions& opts) { check_gpu_tensors(tensors); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + RECORD_PARAM_COMMS( + rank_, // rank + "reduce", // colName + tensor.numel(), // inSize + tensor.numel(), // outSize + tensor.scalar_type(), // dType + {}, // inSplitSizes + {}); // outSplitSizes return collective( tensors, @@ -1308,6 +1349,19 @@ c10::intrusive_ptr ProcessGroupNCCL::allgather( flatten_for_scatter_gather(outputTensors, inputTensors, size_); check_gpu_tensors(outputFlattened); + // @lint-ignore CLANGTIDY + auto tensor = inputTensors.back(); + RECORD_PARAM_COMMS( + rank_, // rank + "all_gather", // colName + tensor.numel(), // inSize + tensor.numel() * // outSize + this->getSize(), // dType + tensor.scalar_type(), // inSplitSizes + {}, // outSplitSizes + {}); + + return collective( inputTensors, outputFlattened, @@ -1357,6 +1411,18 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( const ReduceScatterOptions& opts) { check_gpu_tensors(outputTensors); + // @lint-ignore CLANGTIDY + auto tensor = outputTensors.back(); + RECORD_PARAM_COMMS( + rank_, // rank + "reduce_scatter", // colName + tensor.numel() * // inSize + this->getSize(), // outSize + tensor.numel(), // dType + tensor.scalar_type(), // inSplitSizes + {}, // outSplitSizes + {}); + auto inputFlattened = flatten_for_scatter_gather(inputTensors, outputTensors, size_); check_gpu_tensors(inputFlattened); @@ -1399,6 +1465,16 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { + + RECORD_PARAM_COMMS( + rank_, // rank + "barrier", // colName + 0, // inSize + 0, // outSize + at::kByte, // dType + {}, // inSplitSizes + {}); // outSplitSizes + std::vector devices; // Use user defined GPU device ids if provided @@ -1464,6 +1540,16 @@ c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; + + RECORD_PARAM_COMMS( + rank_, // rank + "all_to_all", // colName + inputTensor.numel(), // inSize + outputTensor.numel(), // outSize + at::kByte, // dType + {}, // inSplitSizes + {}); // outSplitSizes + return collective( inputTensors, outputTensors, @@ -1489,6 +1575,16 @@ c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; + + RECORD_PARAM_COMMS( + rank_, // rank + "all_to_allv", // 
colName + inputTensor.numel(), // inSize + outputTensor.numel(), // outSize + at::kByte, // dType + std::move(inputSplitSizes), // inSplitSizes + std::move(outputSplitSizes)); // outSplitSizes + return collective( inputTensors, outputTensors, @@ -1534,21 +1630,21 @@ c10::intrusive_ptr ProcessGroupNCCL::alltoall( check_gpu_single_tensor(outputTensors[r]); check_gpu_single_tensor(inputTensors[r]); TORCH_CHECK(device == outputTensors[r].device() && device == inputTensors[r].device(), - "Tensors must be on the same device") + "Tensors must be on the same device") } std::vector inputTensor0 = {inputTensors[0]}; std::vector outputTensor0 = {outputTensors[0]}; return collective( - inputTensor0, - outputTensor0, - [&](at::Tensor& /* unused */, - at::Tensor& /* unused */, - ncclComm_t comm, - at::cuda::CUDAStream& stream) { - torch::cuda::nccl::all2all(outputTensors, inputTensors, comm, stream); - return ncclSuccess; - }, - OpType::ALLTOALL); + inputTensor0, + outputTensor0, + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ncclComm_t comm, + at::cuda::CUDAStream& stream) { + torch::cuda::nccl::all2all(outputTensors, inputTensors, comm, stream); + return ncclSuccess; + }, + OpType::ALLTOALL); } c10::intrusive_ptr ProcessGroupNCCL::send( From 7d410bc3c8131c29083117a145b13a8bc1d87260 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Thu, 15 Apr 2021 16:52:12 -0700 Subject: [PATCH 42/45] .github: Add initial linux CI workflow (#55176) Summary: This is a commandeer of https://github.com/pytorch/pytorch/issues/54091. TODO: - [x] understand why the build is [failing](https://github.com/pytorch/pytorch/pull/55176/checks?check_run_id=2254742265) here when it was [succeeding](https://github.com/pytorch/pytorch/pull/54091/checks?check_run_id=2177844748) on https://github.com/pytorch/pytorch/issues/54091 - [x] fix the build failure - [x] fix the test failure(s) - [x] add CI check to generate YAML workflows from templates, similar to https://github.com/pytorch/pytorch/issues/55171 - [ ] uncomment the rest of the matrix Pull Request resolved: https://github.com/pytorch/pytorch/pull/55176 Reviewed By: walterddr Differential Revision: D27803529 Pulled By: seemethere fbshipit-source-id: 52a65ec8f7a83b929fed47f0bbdca544210ec9c2 --- .../scripts/generate_linux_ci_workflows.py | 164 +++++++++++++++++ .github/scripts/install_nvidia_utils_linux.sh | 43 +++++ .github/scripts/report_git_status.sh | 5 + .github/templates/linux_ci_workflow.yml.in | 174 ++++++++++++++++++ .github/workflows/lint.yml | 24 ++- .../pytorch-linux-xenial-py3.6-gcc5.4.yml | 174 ++++++++++++++++++ .gitignore | 9 + .jenkins/pytorch/build.sh | 49 ++--- .jenkins/pytorch/common.sh | 15 +- .jenkins/pytorch/macos-test.sh | 6 +- .jenkins/pytorch/test.sh | 44 ++++- .jenkins/pytorch/win-test.sh | 8 +- Makefile | 11 ++ 13 files changed, 677 insertions(+), 49 deletions(-) create mode 100755 .github/scripts/generate_linux_ci_workflows.py create mode 100755 .github/scripts/install_nvidia_utils_linux.sh create mode 100755 .github/scripts/report_git_status.sh create mode 100644 .github/templates/linux_ci_workflow.yml.in create mode 100644 .github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml diff --git a/.github/scripts/generate_linux_ci_workflows.py b/.github/scripts/generate_linux_ci_workflows.py new file mode 100755 index 000000000000..135034f24b07 --- /dev/null +++ b/.github/scripts/generate_linux_ci_workflows.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python + +from pathlib import Path + +import jinja2 + +DOCKER_REGISTRY = 
"308535385114.dkr.ecr.us-east-1.amazonaws.com" + +GITHUB_DIR = Path(__file__).parent.parent + +CPU_TEST_RUNNER = "linux.2xlarge" +CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu" + + +class PyTorchLinuxWorkflow: + def __init__(self, build_environment: str, docker_image_base: str): + self.build_environment = build_environment + self.docker_image_base = docker_image_base + self.test_runner_type = CPU_TEST_RUNNER + if "cuda" in build_environment: + self.test_runner_type = CUDA_TEST_RUNNER + + def generate_workflow_file( + self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment + ) -> Path: + output_file_path = GITHUB_DIR.joinpath( + f"workflows/{self.build_environment}.yml" + ) + with open(output_file_path, "w") as output_file: + output_file.write( + workflow_template.render( + build_environment=self.build_environment, + docker_image_base=self.docker_image_base, + test_runner_type=self.test_runner_type + ) + ) + output_file.write('\n') + return output_file_path + + +WORKFLOWS = [ + PyTorchLinuxWorkflow( + build_environment="pytorch-linux-xenial-py3.6-gcc5.4", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3.6-gcc7", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-asan", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang7-onnx", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-bionic-py3.6-clang9-noarch", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-xla-linux-bionic-py3.6-clang9", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage", + # 
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-bionic-rocm3.9-py3.6", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-mobile", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), + # PyTorchLinuxWorkflow( + # build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + # ), +] + +if __name__ == "__main__": + jinja_env = jinja2.Environment( + variable_start_string="!{{", + loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))), + ) + workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in") + for workflow in WORKFLOWS: + print( + workflow.generate_workflow_file( + workflow_template=workflow_template, + jinja_env=jinja_env + ) + ) diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh new file mode 100755 index 000000000000..69337e08dac9 --- /dev/null +++ b/.github/scripts/install_nvidia_utils_linux.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -eou pipefail + +DISTRIBUTION=$(. 
/etc/os-release;echo $ID$VERSION_ID) \ +DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run" +YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo" + +install_nvidia_docker2_amzn2() { + ( + set -x + # Needed for yum-config-manager + sudo yum install -y yum-utils + sudo yum-config-manager --add-repo "${YUM_REPO_URL}" + sudo yum install -y nvidia-docker2 + sudo systemctl restart docker + ) +} + +install_nvidia_driver() { + ( + set -x + sudo yum groupinstall -y "Development Tools" + curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" + sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) + nvidia-smi + ) +} + +# Install container toolkit based on distribution +echo "== Installing nvidia container toolkit for ${DISTRIBUTION} ==" +case "${DISTRIBUTION}" in + amzn*) + install_nvidia_docker2_amzn2 + ;; + *) + echo "ERROR: Unknown distribution ${DISTRIBUTION}" + exit 1 + ;; +esac + +echo "== Installing nvidia driver ${DRIVER_FN} ==" +install_nvidia_driver diff --git a/.github/scripts/report_git_status.sh b/.github/scripts/report_git_status.sh new file mode 100755 index 000000000000..357bacfecb24 --- /dev/null +++ b/.github/scripts/report_git_status.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +CHANGES=$(git status --porcelain) +echo "$CHANGES" +git diff +[ -z "$CHANGES" ] diff --git a/.github/templates/linux_ci_workflow.yml.in b/.github/templates/linux_ci_workflow.yml.in new file mode 100644 index 000000000000..a816af1d4100 --- /dev/null +++ b/.github/templates/linux_ci_workflow.yml.in @@ -0,0 +1,174 @@ +# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually +# +# Template is at: .github/templates/linux_ci_workflow.yml +# Generation script: .github/scripts/generate_linux_ci_workflows.py +name: Linux CI (!{{ build_environment }}) + +on: + # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + # pull_request: + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: !{{ build_environment }} + DOCKER_IMAGE_BASE: !{{ docker_image_base }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + +jobs: + calculate-docker-image: + runs-on: ubuntu-18.04 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Checkout PyTorch + uses: actions/checkout@v2 + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + build: + runs-on: linux.2xlarge + needs: calculate-docker-image + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + steps: + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . 
+ - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch + run: | + SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) + MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM + export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS )) + docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS \ + -e SCCACHE_BUCKET \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -q -r artifacts.zip dist build + - uses: actions/upload-artifact@v2 + name: Store PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 30 + if-no-files-found: error + path: + artifacts.zip + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + test: + runs-on: !{{ test_runner_type }} + needs: + - calculate-docker-image + - build + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + steps: + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + - name: Checkout PyTorch + uses: actions/checkout@v2 + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -q artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Test PyTorch + run: | + SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) + MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM + export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? 
MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS )) + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e IN_CI \ + -e MAX_JOBS \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh' + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5047246453a8..e39b8cb8fc36 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -91,6 +91,23 @@ jobs: run: | python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch." + templates: + runs-on: ubuntu-18.04 + steps: + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.x + architecture: x64 + - name: Install Jinja2 + run: pip install Jinja2 + - name: Checkout PyTorch + uses: actions/checkout@v2 + - name: Regenerate workflows + run: .github/scripts/generate_linux_ci_workflows.py + - name: Assert that regenerating the workflows didn't change them + run: .github/scripts/report_git_status.sh + toc: runs-on: ubuntu-18.04 # https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687 @@ -111,12 +128,7 @@ jobs: markdown-toc --bullets='-' -i "$FILE" done - name: Assert that regenerating the ToCs didn't change them - run: | - set -eux - CHANGES=$(git status --porcelain) - echo "$CHANGES" - git diff - [ -z "$CHANGES" ] + run: .github/scripts/report_git_status.sh flake8-py3: runs-on: ubuntu-18.04 diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml new file mode 100644 index 000000000000..d799a9aeebd7 --- /dev/null +++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml @@ -0,0 +1,174 @@ +# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually +# +# Template is at: .github/templates/linux_ci_workflow.yml +# Generation script: .github/scripts/generate_linux_ci_workflows.py +name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4) + +on: + # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + # pull_request: + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + +jobs: + calculate-docker-image: + runs-on: ubuntu-18.04 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Checkout PyTorch + uses: actions/checkout@v2 + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + build: + runs-on: linux.2xlarge + needs: calculate-docker-image + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + steps: + - name: Chown workspace + run: 
| + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch + run: | + SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) + MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM + export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS )) + docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS \ + -e SCCACHE_BUCKET \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -q -r artifacts.zip dist build + - uses: actions/upload-artifact@v2 + name: Store PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 30 + if-no-files-found: error + path: + artifacts.zip + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + test: + runs-on: linux.2xlarge + needs: + - calculate-docker-image + - build + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + steps: + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + - name: Checkout PyTorch + uses: actions/checkout@v2 + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -q artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Test PyTorch + run: | + SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) + MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM + export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? 
MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS )) + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e IN_CI \ + -e MAX_JOBS \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh' + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.gitignore b/.gitignore index 3210a8ce062f..01cd062c0f51 100644 --- a/.gitignore +++ b/.gitignore @@ -288,3 +288,12 @@ TAGS # bazel symlinks bazel-* + +# generated shellcheck directories +.shellcheck_generated*/ + +# zip archives +*.zip + +# core dump files +core.* diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3e6f6178d3d7..81ee4516be98 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -59,13 +59,20 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then export BUILD_SPLIT_CUDA=ON fi +if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then + export BUILD_CAFFE2=OFF +fi + +if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + export ATEN_THREADING=TBB + export USE_TBB=1 +elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + export ATEN_THREADING=NATIVE +fi + # TODO: Don't run this... pip_install -r requirements.txt || true -# Enable LLVM dependency for TensorExpr testing -export USE_LLVM=/opt/llvm -export LLVM_DIR=/opt/llvm/lib/cmake/llvm - # TODO: Don't install this here if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with @@ -229,40 +236,6 @@ else cp build/.ninja_log dist fi - # Build custom operator tests. - CUSTOM_OP_BUILD="$PWD/../custom-op-build" - CUSTOM_OP_TEST="$PWD/test/custom_operator" - python --version - SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - mkdir "$CUSTOM_OP_BUILD" - pushd "$CUSTOM_OP_BUILD" - cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" - make VERBOSE=1 - popd - assert_git_not_dirty - - # Build jit hook tests - JIT_HOOK_BUILD="$PWD/../jit-hook-build" - JIT_HOOK_TEST="$PWD/test/jit_hooks" - python --version - SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - mkdir "$JIT_HOOK_BUILD" - pushd "$JIT_HOOK_BUILD" - cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" - make VERBOSE=1 - popd - assert_git_not_dirty - - # Build custom backend tests. 
- CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build" - CUSTOM_BACKEND_TEST="$PWD/test/custom_backend" - python --version - mkdir "$CUSTOM_BACKEND_BUILD" - pushd "$CUSTOM_BACKEND_BUILD" - cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" - make VERBOSE=1 - popd - assert_git_not_dirty else # Test standalone c10 build if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda10.1-cudnn7-py3* ]]; then diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 57cb554b8196..1755efee03f6 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then # Save sccache logs to file sccache --stop-server || true rm ~/sccache_error.log || true - if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then + if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then + # sccache --start-server seems to hang forever on self hosted runners for GHA + # so let's just go ahead and skip the --start-server altogether since it seems + # as though sccache still gets used even when the sscache server isn't started + # explicitly + echo "Skipping sccache server initialization, setting environment variables" + export SCCACHE_IDLE_TIMEOUT=1200 + export SCCACHE_ERROR_LOG=~/sccache_error.log + export RUST_LOG=sccache::server=error + elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server else # increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR: @@ -147,3 +156,7 @@ fi retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } + +# Enable LLVM dependency for TensorExpr testing +export USE_LLVM=/opt/llvm +export LLVM_DIR=/opt/llvm/lib/cmake/llvm diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 45051e697195..d1ae02525847 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -51,7 +51,11 @@ test_python_all() { export GLOO_SOCKET_IFNAME=lo0 echo "Ninja version: $(ninja --version)" - if [ -n "$CIRCLE_PULL_REQUEST" ]; then + # Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second + # CIRCLE_PULL_REQUEST comes from CircleCI + # GITHUB_HEAD_REF comes from Github Actions + IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}} + if [ -n "$IN_PULL_REQUEST" ]; then DETERMINE_FROM=$(mktemp) file_diff_from_base "$DETERMINE_FROM" fi diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index f24f9c90d4c7..583d7654c200 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then export ATEN_CPU_CAPABILITY=avx fi -if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then +# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second +# CIRCLE_PULL_REQUEST comes from CircleCI +# GITHUB_HEAD_REF comes from Github Actions +IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}} +if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then DETERMINE_FROM=$(mktemp) file_diff_from_base "$DETERMINE_FROM" fi @@ -257,6 +261,18 @@ test_rpc() { test_custom_backend() { if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + echo "Building custom backends tests" + # Build custom backend tests. 
+ CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build" + CUSTOM_BACKEND_TEST="$PWD/test/custom_backend" + python --version + mkdir "$CUSTOM_BACKEND_BUILD" + pushd "$CUSTOM_BACKEND_BUILD" + cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + make VERBOSE=1 + popd + assert_git_not_dirty + echo "Testing custom backends" CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build" pushd test/custom_backend @@ -274,6 +290,19 @@ test_custom_backend() { test_custom_script_ops() { if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + # Build custom operator tests. + echo "Building custom script operators tests" + CUSTOM_OP_BUILD="$PWD/../custom-op-build" + CUSTOM_OP_TEST="$PWD/test/custom_operator" + python --version + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$CUSTOM_OP_BUILD" + pushd "$CUSTOM_OP_BUILD" + cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + make VERBOSE=1 + popd + assert_git_not_dirty + echo "Testing custom script operators" CUSTOM_OP_BUILD="$PWD/../custom-op-build" pushd test/custom_operator @@ -290,6 +319,19 @@ test_custom_script_ops() { test_jit_hooks() { if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + echo "Building jit hooks in cpp tests" + # Build jit hook tests + JIT_HOOK_BUILD="$PWD/../jit-hook-build" + JIT_HOOK_TEST="$PWD/test/jit_hooks" + python --version + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$JIT_HOOK_BUILD" + pushd "$JIT_HOOK_BUILD" + cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + make VERBOSE=1 + popd + assert_git_not_dirty + echo "Testing jit hooks in cpp" HOOK_BUILD="$PWD/../jit-hook-build" pushd test/jit_hooks diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index 9b2b05403f80..6152021099c4 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -42,12 +42,16 @@ fi export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers -if [ -n "$CIRCLE_PULL_REQUEST" ]; then +# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second +# CIRCLE_PULL_REQUEST comes from CircleCI +# GITHUB_HEAD_REF comes from Github Actions +IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}} +if [ -n "$IN_PULL_REQUEST" ]; then DETERMINE_FROM="${TMP_DIR}/determine_from" file_diff_from_base "$DETERMINE_FROM" fi -if [[ "${CIRCLE_JOB}" == *11* ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then export BUILD_SPLIT_CUDA=ON fi diff --git a/Makefile b/Makefile index 13755ce544c6..3fe69bf14cfb 100644 --- a/Makefile +++ b/Makefile @@ -14,8 +14,19 @@ ios: clean: # This will remove ALL build folders. @rm -r build*/ + @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. 
You can install cloc with " && \ echo " sudo apt-get install cloc" + +SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha +shellcheck-gha: + @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) + tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER) + tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER) + +generate-gha-workflows: + ./.github/scripts/generate_linux_ci_workflows.py + $(MAKE) shellcheck-gha From 42f5d66080ef2cc89d2a1164957eee80e8491c80 Mon Sep 17 00:00:00 2001 From: Can Balioglu Date: Thu, 15 Apr 2021 17:14:04 -0700 Subject: [PATCH 43/45] [DDP] Fixes flaky tests caused by incorrect floating-point comparison (#56192) Summary: Fixes https://github.com/pytorch/pytorch/issues/50699. The root cause was that some floating-point assertions had a "greater than or **equal to**" condition. The "equal to" part was causing flakiness due to strict equality check (`==`) in `TestCase.assertGreaterEqual()`. This PR introduces a new assertion method called `assertGreaterAlmostEqual()` in `common_utils.py` that mitigates the problem by behaving similar to `TestCase.assertAlmostEqual()`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/56192 Reviewed By: zhaojuanmao Differential Revision: D27804724 Pulled By: cbalioglu fbshipit-source-id: bc44a41ca4ce45dfee62fb3769fb47bfd9028831 --- torch/testing/_internal/common_utils.py | 30 +++++++++++++++++++ .../_internal/distributed/distributed_test.py | 4 +-- .../_internal/distributed/rpc/jit/rpc_test.py | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 7dd1785e87b7..972084f524a6 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1558,6 +1558,36 @@ def assertExpectedStripMangled(self, s, subname=None): s = re.sub(r'__torch__[^ ]+', '', s) self.assertExpected(s, subname) + def assertGreaterAlmostEqual(self, first, second, places=None, msg=None, delta=None): + """Assert that ``first`` is greater than or almost equal to ``second``. + + The equality of ``first`` and ``second`` is determined in a similar way to + the ``assertAlmostEqual`` function of the standard library. + """ + if delta is not None and places is not None: + raise TypeError("specify delta or places not both") + + if first >= second: + return + + diff = second - first + if delta is not None: + if diff <= delta: + return + + standardMsg = f"{first} not greater than or equal to {second} within {delta} delta" + else: + if places is None: + places = 7 + + if round(diff, places) == 0: + return + + standardMsg = f"{first} not greater than or equal to {second} within {places} places" + + msg = self._formatMessage(msg, standardMsg) + raise self.failureException(msg) + # run code in subprocess and capture exceptions. 
@staticmethod def run_process_no_exception(code, env=None): diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index a7978556c501..e99cbe5b945b 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -626,7 +626,7 @@ def _test_barrier_timeout(self, group_id, timeout): expected_time = time.time() + timeout.total_seconds() with self.assertRaisesRegex(Exception, " (Timed out|closed|timeout) "): dist.barrier(group_id) - self.assertGreaterEqual(time.time(), expected_time) + self.assertGreaterAlmostEqual(time.time(), expected_time, delta=0.05) else: time.sleep(timeout.total_seconds()) @@ -2659,7 +2659,7 @@ def _test_barrier_helper( else: dist.broadcast(expected_time, dest, group_id) dist.barrier(group_id) - self.assertGreaterEqual( + self.assertGreaterAlmostEqual( float(time.time()), float(expected_time[0]), "destination rank: %d, my rank: %d" % (dest, rank) + diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py index f8af30836624..8d86ddfbbb74 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py @@ -1283,7 +1283,7 @@ def test_record_function_jit_end_callbacks_with_fork(self): self.assertEqual(sleep_event.name, "foo") # Validate that callbacks were fired at the right time by checking the # profiling event cpu time - self.assertGreaterEqual(sleep_event.cpu_time * 1e-6, sleep_interval) + self.assertGreaterAlmostEqual(sleep_event.cpu_time * 1e-6, sleep_interval) def test_call_fork_in_jit_with_profiling(self): # Ensures that we can call torch.ops.profiler._call_end_callbacks_on_jit_fut on a jit From 94ce10f732460bc1ada314ed4b677263c01d5b47 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Thu, 15 Apr 2021 17:34:17 -0700 Subject: [PATCH 44/45] [iOS GPU] Use setTexture() rather than copyTexture() (#56069) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56069 It's more efficient to capture a MPSImage object than copying a one from outside. 
ghstack-source-id: 126552396 Test Plan: - All operator tests pass - Sandcastle - CircleCI Reviewed By: SS-JIA Differential Revision: D27694542 fbshipit-source-id: e1bbbffc3f8c109816cb117aebd0aae8576c6c5c --- .../ATen/native/metal/mpscnn/MPSImageWrapper.h | 1 + .../ATen/native/metal/mpscnn/MPSImageWrapper.mm | 5 +++++ .../native/metal/ops/MetalBinaryElementwise.mm | 15 +++------------ aten/src/ATen/native/metal/ops/MetalClamp.mm | 2 +- .../src/ATen/native/metal/ops/MetalConvolution.mm | 2 +- aten/src/ATen/native/metal/ops/MetalHardswish.mm | 2 +- aten/src/ATen/native/metal/ops/MetalNeurons.mm | 2 +- 7 files changed, 13 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h index 054ce8433cba..8b842b090943 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h @@ -25,6 +25,7 @@ class API_AVAILABLE(ios(10.0), macos(10.13)) MPSImageWrapper { void setCommandBuffer(MetalCommandBuffer* buffer); MetalCommandBuffer* commandBuffer() const; IntArrayRef textureSizes() const; + void setTexture(MPSImage* image); MPSImage* image() const; void recycleImage(); void synchronize(); diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm index f683faf4912d..f46fbd60a806 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm @@ -70,6 +70,11 @@ } } +void MPSImageWrapper::setTexture(MPSImage* image) { + TORCH_CHECK(image); + _image = image; +} + void MPSImageWrapper::synchronize() { if ([_image isTemporaryImage]) { _image = diff --git a/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm b/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm index 2e8c5fa41d5a..862722bd721d 100644 --- a/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm +++ b/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm @@ -96,7 +96,7 @@ Tensor binaryElementwiseShaderKernel( [X2 markRead]; MetalTensorImpl* impl = (MetalTensorImpl*)input1.unsafeGetTensorImpl(); MetalTensorImplStorage& implStorage = impl->unsafe_opaque_handle(); - implStorage.texture()->copyFromTexture(Y); + implStorage.texture()->setTexture(Y); return input1; } @@ -142,13 +142,11 @@ Tensor binaryElementwiseMPSCNNKernel( if (broadCastFirstInput(X1, X2)) { outputSize = input2.sizes().vec(); } - MetalTensorImplStorage mt{outputSize}; MetalCommandBuffer* cb1 = getCommandBufferFromTensor(input1); MetalCommandBuffer* cb2 = getCommandBufferFromTensor(input2); TORCH_CHECK( [cb1 isEqual:cb2], @"inputs have different Metal command buffers"); - mt.texture()->allocateTemporaryTextureStorage(outputSize, cb1); - MPSImage* Y = mt.texture()->image(); + MPSImage* Y = createTemporaryImage(cb1, outputSize); T* kernel = [[T alloc] initWithDevice:[MPSCNNContext sharedInstance].device]; kernel.primaryStrideInPixelsY = X1.height == 1 ? 0 : 1; kernel.primaryStrideInPixelsX = X1.width == 1 ? 
0 : 1; @@ -160,13 +158,12 @@ Tensor binaryElementwiseMPSCNNKernel( destinationImage:Y]; MetalTensorImpl* impl = (MetalTensorImpl*)input1.unsafeGetTensorImpl(); MetalTensorImplStorage& implStorage = impl->unsafe_opaque_handle(); - implStorage.texture()->copyFromTexture(Y); + implStorage.texture()->setTexture(Y); return input1; } Tensor add_Tensor(const Tensor& input1, const Tensor& input2, const Scalar& alpha) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel(input1, input2_); @@ -178,7 +175,6 @@ Tensor add_Tensor(const Tensor& input1, const Tensor& input2, const Scalar& alph Tensor& add__Tensor(Tensor& input1, const Tensor& input2, const Scalar& alpha) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel_(input1, input2_); @@ -202,7 +198,6 @@ Tensor sub_Tensor(const Tensor& input1, const Tensor& input2, const Scalar& alph Tensor& sub__Tensor(Tensor& input1, const Tensor& input2, const Scalar& alpha) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel_(input1, input2_); @@ -214,7 +209,6 @@ Tensor sub_Tensor(const Tensor& input1, const Tensor& input2, const Scalar& alph Tensor mul_Tensor(const Tensor& input1, const Tensor& input2) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel(input1, input2_); @@ -226,7 +220,6 @@ Tensor mul_Tensor(const Tensor& input1, const Tensor& input2) { Tensor& mul__Tensor(Tensor& input1, const Tensor& input2) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel_(input1, input2_); @@ -238,7 +231,6 @@ Tensor mul_Tensor(const Tensor& input1, const Tensor& input2) { Tensor div_Tensor(const Tensor& input1, const Tensor& input2) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel(input1, input2_); @@ -250,7 +242,6 @@ Tensor div_Tensor(const Tensor& input1, const Tensor& input2) { Tensor& div__Tensor(Tensor& input1, const Tensor& input2) { TORCH_CHECK(input1.is_metal()); - TORCH_CHECK(input1.dim() == input2.dim()); auto input2_ = input2.is_metal() ? 
input2 : input2.metal(); if (@available(iOS 11.3, *)) { return binaryElementwiseMPSCNNKernel_(input1, input2_); diff --git a/aten/src/ATen/native/metal/ops/MetalClamp.mm b/aten/src/ATen/native/metal/ops/MetalClamp.mm index e13ce3341135..2511f090cdd0 100644 --- a/aten/src/ATen/native/metal/ops/MetalClamp.mm +++ b/aten/src/ATen/native/metal/ops/MetalClamp.mm @@ -25,7 +25,7 @@ using MetalTensorImpl = at::MetalTensorImpl; MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl(); MetalTensorImplStorage& implStorage = impl->unsafe_opaque_handle(); - implStorage.texture()->copyFromTexture(Y); + implStorage.texture()->setTexture(Y); return input; } diff --git a/aten/src/ATen/native/metal/ops/MetalConvolution.mm b/aten/src/ATen/native/metal/ops/MetalConvolution.mm index a437e3ae45af..3b363c0ebe68 100644 --- a/aten/src/ATen/native/metal/ops/MetalConvolution.mm +++ b/aten/src/ATen/native/metal/ops/MetalConvolution.mm @@ -91,7 +91,7 @@ Tensor conv2d(const Tensor& input, Conv2dOpContext& context) { MPSCNNClampOp* clampOp = [MPSCNNClampOp newWithTextures:@[ Y1, Y2 ] Args:@[ @(min), @(max) ]]; [clampOp encode:commandBuffer.buffer]; - mt.texture()->copyFromTexture(Y2); + mt.texture()->setTexture(Y2); } auto output = makeTensor(std::move(mt), input.options()); return output; diff --git a/aten/src/ATen/native/metal/ops/MetalHardswish.mm b/aten/src/ATen/native/metal/ops/MetalHardswish.mm index a4ad15b15a4d..aeb891782969 100644 --- a/aten/src/ATen/native/metal/ops/MetalHardswish.mm +++ b/aten/src/ATen/native/metal/ops/MetalHardswish.mm @@ -44,7 +44,7 @@ [X markRead]; MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl(); MetalTensorImplStorage& implStorage = impl->unsafe_opaque_handle(); - implStorage.texture()->copyFromTexture(Y); + implStorage.texture()->setTexture(Y); return input; } diff --git a/aten/src/ATen/native/metal/ops/MetalNeurons.mm b/aten/src/ATen/native/metal/ops/MetalNeurons.mm index c05e9d1001bb..4aa6d9bc3029 100644 --- a/aten/src/ATen/native/metal/ops/MetalNeurons.mm +++ b/aten/src/ATen/native/metal/ops/MetalNeurons.mm @@ -41,7 +41,7 @@ Tensor neuronKernel(const Tensor& input, MPSCNNNeuron* neuron) { destinationImage:Y]; MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl(); MetalTensorImplStorage& implStorage = impl->unsafe_opaque_handle(); - implStorage.texture()->copyFromTexture(Y); + implStorage.texture()->setTexture(Y); return input; } From bd3c63aeeb72b565fec707ce8af71230a0727e21 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Thu, 15 Apr 2021 17:49:29 -0700 Subject: [PATCH 45/45] [PyTorch Edge] Move torch::jit::mobile::_export_operator_list() from serialization/export_module.cpp to mobile/import.cpp (#56044) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56044 We want to be able to drop the dependence of full-jit deps in the auto-generated unit tests for 2 reasons: 1. Running bloaty on the auto-generated unit tests should be somewhat representative of the actual size. 2. The runtime environment of the auto-generated unit tests should be as close to the production environment as possible to ensure that we are running the tests in a production-like runtime. Due to the dependece on full-jit, we aren't there yet. For the auto-generated tests, we probably don't need to depend on `_export_operator_list()` evetually, but for now we do since it is used to decide whether the model being run is a Metal GPU model or a CPU model, and gates whether the test runs that model or not. 
Eventually, we can stop doing this in the test and instead do it in the codegen from PTM-CLI (by fetching the operators from that tool and writing out to the BUCK file which backend(s) the model targets). However, that will take some time to land, so in the spirit of expediency, this change is being proposed.
This method runs through the bytecode for all methods - * in the specified model (module), and extracts all the root - * operator names. Root operators are operators that are called - * directly by the model (as opposed to non-root operators, which - * may be called transitively by the root operators). - * - */ -TORCH_API std::set _export_operator_list( - torch::jit::mobile::Module& module); - -} // namespace mobile } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index 73ac285fe3e0..c7488d5d6127 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -658,26 +658,5 @@ std::vector export_opnames(const script::Module& m) { return std::vector(names.begin(), names.end()); } -namespace mobile { - -std::set _export_operator_list( - torch::jit::mobile::Module& module) { - std::set operator_list; - for (Method func : module.get_methods()) { - const Function& function = func.function(); - const std::shared_ptr cptr = function.get_code(); - // op_names below isn't a list of unique operator names. In fact - // it can contain the same operator name many many times, so we need - // to de-dup the list by adding all the operator names into - // an std::set. - std::vector const& op_names = cptr->op_names_; - for (auto& op_name : op_names) { - operator_list.insert(toString(op_name)); - } - } - return operator_list; -} - -} // namespace mobile } // namespace jit } // namespace torch